powerpc/pseries: Add WARN_ON() to request_event_sources_irqs() on irq allocation/request failure

2010-05-27 Thread Mark Nelson
At the moment if request_event_sources_irqs() can't allocate or request
the interrupt, it just does a KERN_ERR printk. This may be fine for the
existing RAS code, where missing an EPOW event just means that the event
won't be logged, and missing one of the RAS errors means we could miss
an event that we perhaps should take action on.

But for the upcoming IO events code that will use event-sources, failing
to allocate or request the interrupt means we'd potentially miss an
interrupt from the device. So, let's add a WARN_ON() in this error case
so that we're a bit more vocal when something's amiss.

While we're at it, also use pr_err() to neaten the code up a bit.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/platforms/pseries/event_sources.c |   23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/event_sources.c
===
--- upstream.orig/arch/powerpc/platforms/pseries/event_sources.c
+++ upstream/arch/powerpc/platforms/pseries/event_sources.c
@@ -41,9 +41,12 @@ void request_event_sources_irqs(struct d
if (count > 15)
break;
virqs[count] = irq_create_mapping(NULL, *(opicprop++));
-   if (virqs[count] == NO_IRQ)
-   printk(KERN_ERR "Unable to allocate interrupt "
-  "number for %s\n", np->full_name);
+   if (virqs[count] == NO_IRQ) {
+   pr_err("event-sources: Unable to allocate "
+  "interrupt number for %s\n",
+  np->full_name);
+   WARN_ON(1);
+   }
else
count++;
 
@@ -59,9 +62,12 @@ void request_event_sources_irqs(struct d
virqs[count] = irq_create_of_mapping(oirq.controller,
oirq.specifier,
oirq.size);
-   if (virqs[count] == NO_IRQ)
-   printk(KERN_ERR "Unable to allocate interrupt "
-  "number for %s\n", np->full_name);
+   if (virqs[count] == NO_IRQ) {
+   pr_err("event-sources: Unable to allocate "
+  "interrupt number for %s\n",
+  np->full_name);
+   WARN_ON(1);
+   }
else
count++;
}
@@ -70,8 +76,9 @@ void request_event_sources_irqs(struct d
/* Now request them */
for (i = 0; i < count; i++) {
if (request_irq(virqs[i], handler, 0, name, NULL)) {
-   printk(KERN_ERR "Unable to request interrupt %d for "
-  "%s\n", virqs[i], np->full_name);
+   pr_err("event-sources: Unable to request interrupt "
+  "%d for %s\n", virqs[i], np->full_name);
+   WARN_ON(1);
return;
}
}


[PATCH] powerpc/pseries: Rename RAS_VECTOR_OFFSET to RTAS_VECTOR_EXTERNAL_INTERRUPT and move to rtas.h

2010-05-27 Thread Mark Nelson
The RAS code has a #define, RAS_VECTOR_OFFSET, that's used in the
check-exception RTAS call for the vector offset of the exception.

We'll be using this same vector offset for the upcoming IO Event interrupts
code (0x500) so let's move it to include/asm/rtas.h and call it
RTAS_VECTOR_EXTERNAL_INTERRUPT.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/include/asm/rtas.h  |3 +++
 arch/powerpc/platforms/pseries/ras.c |5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)

Index: upstream/arch/powerpc/include/asm/rtas.h
===
--- upstream.orig/arch/powerpc/include/asm/rtas.h
+++ upstream/arch/powerpc/include/asm/rtas.h
@@ -137,6 +137,9 @@ struct rtas_t {
 #define RTAS_TYPE_PMGM_CONFIG_CHANGE   0x70
 #define RTAS_TYPE_PMGM_SERVICE_PROC0x71
 
+/* RTAS check-exception vector offset */
+#define RTAS_VECTOR_EXTERNAL_INTERRUPT 0x500
+
 struct rtas_error_log {
unsigned long version:8;/* Architectural version */
unsigned long severity:3;   /* Severity level of error */
Index: upstream/arch/powerpc/platforms/pseries/ras.c
===
--- upstream.orig/arch/powerpc/platforms/pseries/ras.c
+++ upstream/arch/powerpc/platforms/pseries/ras.c
@@ -61,7 +61,6 @@ static int ras_check_exception_token;
 
 #define EPOW_SENSOR_TOKEN  9
 #define EPOW_SENSOR_INDEX  0
-#define RAS_VECTOR_OFFSET  0x500
 
 static irqreturn_t ras_epow_interrupt(int irq, void *dev_id);
 static irqreturn_t ras_error_interrupt(int irq, void *dev_id);
@@ -121,7 +120,7 @@ static irqreturn_t ras_epow_interrupt(in
spin_lock(&ras_log_buf_lock);
 
status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-  RAS_VECTOR_OFFSET,
+  RTAS_VECTOR_EXTERNAL_INTERRUPT,
   irq_map[irq].hwirq,
   RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS,
   critical, __pa(ras_log_buf),
@@ -156,7 +155,7 @@ static irqreturn_t ras_error_interrupt(i
spin_lock(&ras_log_buf_lock);
 
status = rtas_call(ras_check_exception_token, 6, 1, NULL,
-  RAS_VECTOR_OFFSET,
+  RTAS_VECTOR_EXTERNAL_INTERRUPT,
   irq_map[irq].hwirq,
   RTAS_INTERNAL_ERROR, 1 /*Time Critical */,
   __pa(ras_log_buf),


Re: [PATCH] powerpc/pseries: Make request_ras_irqs() available to other pseries code

2010-05-19 Thread Mark Nelson
Hi Michael,

Thanks for looking over these patches!

On Tuesday 18 May 2010 22:40:51 Michael Ellerman wrote:
 On Mon, 2010-05-17 at 22:33 +1000, Mark Nelson wrote:
  At the moment only the RAS code uses event-sources interrupts (for EPOW
  events and internal errors) so request_ras_irqs() (which actually requests
  the event-sources interrupts) is found in ras.c and is static.
 
 Hi Mark,
 
 Just a few niggles,
 
 ...
 
  +
  +void request_event_sources_irqs(struct device_node *np,
  +   irq_handler_t handler,
  +   const char *name)
  +{
  +   int i, index, count = 0;
  +   struct of_irq oirq;
  +   const u32 *opicprop;
  +   unsigned int opicplen;
  +   unsigned int virqs[16];
  +
  +   /* Check for obsolete open-pic-interrupt property. If present, then
  +* map those interrupts using the default interrupt host and default
  +* trigger
  +*/
+   opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
  +   if (opicprop) {
  +   opicplen /= sizeof(u32);
+   for (i = 0; i < opicplen; i++) {
+   if (count > 15)
  +   break;
  +   virqs[count] = irq_create_mapping(NULL, *(opicprop++));
  +   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
  +   else
  +   count++;
  +
  +   }
  +   }
  +   /* Else use normal interrupt tree parsing */
  +   else {
  +   /* First try to do a proper OF tree parsing */
+   for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
  +index++) {
+   if (count > 15)
  +   break;
  +   virqs[count] = irq_create_of_mapping(oirq.controller,
  +   oirq.specifier,
  +   oirq.size);
  +   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
  +   else
  +   count++;
  +   }
  +   }
  +
  +   /* Now request them */
+   for (i = 0; i < count; i++) {
  +   if (request_irq(virqs[i], handler, 0, name, NULL)) {
+   printk(KERN_ERR "Unable to request interrupt %d for "
+  "%s\n", virqs[i], np->full_name);
  +   return;
  +   }
  +   }
 
 Existing code I know, but the error handling in here is a little lax,
 what's not going to work if we miss some or all of the interrupts?

That's a good point. For the existing code, if we miss an EPOW event
it just means that the event won't be logged (as that's all we do with
those events at the moment, although there is a comment saying
that it should be fixed to take appropriate action depending upon the
type of power failure); but it's a bigger problem if we miss one of the
RAS errors because then we could miss a fatal event that we should halt
the machine on. And for the upcoming IO events it's even worse as we'd
miss an interrupt from the device...

I would do it in a follow-on patch rather than this one, but what would
be a good course of action if we can't request the interrupt?

 
  Index: upstream/arch/powerpc/platforms/pseries/event_sources.h
  ===
  --- /dev/null
  +++ upstream/arch/powerpc/platforms/pseries/event_sources.h
  @@ -0,0 +1,29 @@
  +/*
  + * Copyright (C) 2001 Dave Engebretsen IBM Corporation
  + *
  + * This program is free software; you can redistribute it and/or modify
  + * it under the terms of the GNU General Public License as published by
  + * the Free Software Foundation; either version 2 of the License, or
  + * (at your option) any later version.
  + *
  + * This program is distributed in the hope that it will be useful,
  + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  + * GNU General Public License for more details.
  + *
  + * You should have received a copy of the GNU General Public License
  + * along with this program; if not, write to the Free Software
  + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  + */
  +
  +#ifndef _POWERPC_EVENT_SOURCES_H
  +#define _POWERPC_EVENT_SOURCES_H
  +
+#include <linux/interrupt.h>
  +
  +struct device_node;
  +
  +extern void request_event_sources_irqs(struct device_node *np,
  +  irq_handler_t handler, const char *name);
 
 This could just go in platforms/pseries/pseries.h

That's true - it is only one function. I'll move it.

 
  Index: upstream/arch/powerpc/platforms/pseries/ras.c

Re: [PATCH] powerpc/pseries: Make request_ras_irqs() available to other pseries code

2010-05-19 Thread Mark Nelson
On Wednesday 19 May 2010 16:35:54 Michael Ellerman wrote:
 On Wed, 2010-05-19 at 16:35 +1000, Mark Nelson wrote:
  Hi Michael,
  
  Thanks for looking over these patches!
 ..
   
   Existing code I know, but the error handling in here is a little lax,
   what's not going to work if we miss some or all of the interrupts?
  
  That's a good point. For the existing code, if we miss an EPOW event
  it just means that the event won't be logged (as that's all we do with
  those events at the moment, although there is a comment saying
  that it should be fixed to take appropriate action depending upon the
  type of power failure); but it's a bigger problem if we miss one of the
  RAS errors because then we could miss a fatal event that we should halt
  the machine on. And for the upcoming IO events it's even worse as we'd
  miss an interrupt from the device...
 
 Yeah that's what I was thinking.
 
  I would do it in a follow-on patch rather than this one, but what would
  be a good course of action if we can't request the interrupt?
 
 Yes a follow on patch is the way to do it.
 
 There shouldn't be that many reasons the request fails, other than
 ENOMEM, or broken device tree perhaps. It's definitely worth a
 WARN_ON(), people notice those at least.

That sounds good. I'll do a simple follow-on patch that adds the
WARN_ON().

Thanks!
Mark


[PATCH v2] powerpc/pseries: Make request_ras_irqs() available to other pseries code

2010-05-19 Thread Mark Nelson
At the moment only the RAS code uses event-sources interrupts (for EPOW
events and internal errors) so request_ras_irqs() (which actually requests
the event-sources interrupts) is found in ras.c and is static.

We want to be able to use event-sources interrupts in other pseries code,
so let's rename request_ras_irqs() to request_event_sources_irqs() and
move it to event_sources.c.

This will be used in an upcoming patch that adds support for IO Event
interrupts that come through as event sources.
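For reference, a caller then ends up looking something like the following
sketch (the foo_* names are hypothetical; the node path is the one ras.c
already uses for internal errors):

static irqreturn_t foo_error_interrupt(int irq, void *dev_id)
{
	/* query and log the event, as ras_error_interrupt() does */
	return IRQ_HANDLED;
}

static int __init foo_init_IRQ(void)
{
	struct device_node *np;

	np = of_find_node_by_path("/event-sources/internal-errors");
	if (np) {
		request_event_sources_irqs(np, foo_error_interrupt,
					   "FOO_ERROR");
		of_node_put(np);
	}
	return 0;
}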

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---

Changes v1 - v2:
 * move the prototype for request_event_sources_irqs() to pseries.h

 arch/powerpc/platforms/pseries/Makefile|2 
 arch/powerpc/platforms/pseries/event_sources.c |   79 +
 arch/powerpc/platforms/pseries/pseries.h   |7 ++
 arch/powerpc/platforms/pseries/ras.c   |   62 ---
 4 files changed, 90 insertions(+), 60 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/event_sources.c
===
--- /dev/null
+++ upstream/arch/powerpc/platforms/pseries/event_sources.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "pseries.h"
+
+void request_event_sources_irqs(struct device_node *np,
+   irq_handler_t handler,
+   const char *name)
+{
+   int i, index, count = 0;
+   struct of_irq oirq;
+   const u32 *opicprop;
+   unsigned int opicplen;
+   unsigned int virqs[16];
+
+   /* Check for obsolete open-pic-interrupt property. If present, then
+* map those interrupts using the default interrupt host and default
+* trigger
+*/
+   opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
+   if (opicprop) {
+   opicplen /= sizeof(u32);
+   for (i = 0; i < opicplen; i++) {
+   if (count > 15)
+   break;
+   virqs[count] = irq_create_mapping(NULL, *(opicprop++));
+   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
+   else
+   count++;
+
+   }
+   }
+   /* Else use normal interrupt tree parsing */
+   else {
+   /* First try to do a proper OF tree parsing */
+   for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
+index++) {
+   if (count > 15)
+   break;
+   virqs[count] = irq_create_of_mapping(oirq.controller,
+   oirq.specifier,
+   oirq.size);
+   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
+   else
+   count++;
+   }
+   }
+
+   /* Now request them */
+   for (i = 0; i < count; i++) {
+   if (request_irq(virqs[i], handler, 0, name, NULL)) {
+   printk(KERN_ERR "Unable to request interrupt %d for "
+  "%s\n", virqs[i], np->full_name);
+   return;
+   }
+   }
+}
+
Index: upstream/arch/powerpc/platforms/pseries/Makefile
===
--- upstream.orig/arch/powerpc/platforms/pseries/Makefile
+++ upstream/arch/powerpc/platforms/pseries/Makefile
@@ -7,7 +7,7 @@ EXTRA_CFLAGS+= -DDEBUG
 endif
 
 obj-y  := lpar.o hvCall.o nvram.o reconfig.o \
-  setup.o iommu.o ras.o \
+  setup.o iommu.o event_sources.o ras.o \
   firmware.o power.o dlpar.o
 obj-$(CONFIG_SMP)  += smp.o
 obj-$(CONFIG_XICS) += xics.o
Index: upstream/arch/powerpc/platforms/pseries

Re: [PATCH] powerpc/pseries: Add support for IO Event interrupt drivers

2010-05-19 Thread Mark Nelson
On Tuesday 18 May 2010 23:37:31 Michael Ellerman wrote:
 On Mon, 2010-05-17 at 22:53 +1000, Mark Nelson wrote:
  This patch adds support for handling IO Event interrupts which come
  through at the /event-sources/ibm,io-events device tree node.
 
 Hi Mark,
 
 You'll have to explain to me offline sometime how it is we ran out of
 interrupts and started needing to multiplex them ..
 
  There is one ibm,io-events interrupt, but this interrupt might be used
  for multiple I/O devices, each with their own separate driver. So, we
  create a platform interrupt handler that will do the RTAS check-exception
  call and then call the appropriate driver's interrupt handler (the one(s)
  that registered with a scope that matches the scope of the incoming
  interrupt).
  
  So, a driver for a device that uses IO Event interrupts will register
  its interrupt service routine (or interrupt handler) with the platform
  code using ioei_register_isr(). This register function takes a function
  pointer to the driver's handler and the scope that the driver is
  interested in (scopes defined in arch/powerpc/include/asm/io_events.h).
  The driver's handler must take a pointer to a struct io_events_section
  and must not return anything.
  
  The platform code registers io_event_interrupt() as the interrupt handler
  for the ibm,io-events interrupt. Upon receiving an IO Event interrupt, it
  checks the scope of the incoming interrupt and only calls those drivers'
  handlers that have registered as being interested in that scope.
 
 The "checks the scope" requires an RTAS call, which takes a global lock
 (and you add another) - these aren't going to be used for anything
 performance critical I hope?

I've been told they're not performance critical but I'll have to go
back and have a closer look at the locking again - I probably am being
a bit overzealous.

 
  It is possible for a single driver to register the same function pointer
  more than once with different scopes if it is interested in more than one
  type of IO Event interrupt. If a handler wants to be notified of all
  incoming IO Event interrupts it can register with IOEI_SCOPE_ANY.
  
  A driver can unregister to stop receiving the IO Event interrupts using
  ioei_unregister_isr(), passing it the same function pointer to the
  driver's handler and the scope the driver was registered with.
  
  Signed-off-by: Mark Nelson ma...@au1.ibm.com
  ---
   arch/powerpc/include/asm/io_events.h   |   40 +
   arch/powerpc/platforms/pseries/Makefile|2 
   arch/powerpc/platforms/pseries/io_events.c |  205 
  +
   3 files changed, 246 insertions(+), 1 deletion(-)
  
  Index: upstream/arch/powerpc/include/asm/io_events.h
  ===
  --- /dev/null
  +++ upstream/arch/powerpc/include/asm/io_events.h
  @@ -0,0 +1,40 @@
  +/*
  + * Copyright 2010 IBM Corporation, Mark Nelson
 
 I usually have name, corp, but I'm not sure if it matters.

I'll find out what the officially sanctioned way is :)

 
  + *  This program is free software; you can redistribute it and/or
  + *  modify it under the terms of the GNU General Public License
  + *  as published by the Free Software Foundation; either version
  + *  2 of the License, or (at your option) any later version.
  + */
  +
  +#ifndef _ASM_POWERPC_IOEVENTS_H
  +#define _ASM_POWERPC_IOEVENTS_H
  +
  +struct io_events_section {
+   u16 id; // Unique section identifier    x00-x01
+   u16 length; // Section length (bytes)   x02-x03
+   u8  version;// Section Version  x04-x04
+   u8  sec_subtype;// Section subtype  x05-x05
+   u16 creator_id; // Creator Component ID x06-x07
+   u8  event_type; // IO-Event Type    x08-x08
+   u8  rpc_field_len;  // PRC Field Length x09-x09
+   u8  scope;  // Error/Event Scope    x0A-x0A
+   u8  event_subtype;  // I/O-Event Sub-Type   x0B-x0B
+   u32 drc_index;  // DRC Index    x0C-x0F
+   u32 rpc_data[]; // RPC Data (optional)  x10-...
  +};
 
 I know it's idiomatic in that sort of code, but C++ comments?

Yeah, I'm not too fazed - I was just copying what lppaca.h did...

 
  +#define IOEI_SCOPE_NOT_APP 0x00
  +#define IOEI_SCOPE_RIO_HUB 0x36
  +#define IOEI_SCOPE_RIO_BRIDGE  0x37
  +#define IOEI_SCOPE_PHB 0x38
  +#define IOEI_SCOPE_EADS_GLOBAL 0x39
  +#define IOEI_SCOPE_EADS_SLOT   0x3A
  +#define IOEI_SCOPE_TORRENT_HUB 0x3B
+#define IOEI_SCOPE_SERVICE_PROC    0x51
  +#define IOEI_SCOPE_ANY -1
  +
  +int ioei_register_isr(void (*isr)(struct io_events_section *), int scope);
+int ioei_unregister_isr(void (*isr)(struct io_events_section *), int scope);
 
 Given

Re: [PATCH] powerpc/pseries: Add support for IO Event interrupt drivers

2010-05-19 Thread Mark Nelson
On Wednesday 19 May 2010 14:27:44 Sonny Rao wrote:
 On Tue, 18 May 2010 23:37:31 +1000, Michael Ellerman wrote:
  
  On Mon, 2010-05-17 at 22:53 +1000, Mark Nelson wrote:
   This patch adds support for handling IO Event interrupts which come
   through at the /event-sources/ibm,io-events device tree node.
  
  Hi Mark,
  
  You'll have to explain to me offline sometime how it is we ran out of
  interrupts and started needing to multiplex them ..
 
 Firmware has decided to multiplex all i/o error reporting through a single
 interrupt for reasons unknown, that is the primary reason for this patch.
 
 One question is, we already register a few RAS interrupts which call
 RTAS using check-exception for getting error information.  Those live
 in platforms/pseries/ras.c -- should we combine the two into a common
 implementation somehow?

I developed in ras.c but then moved it out just for functional
separation but you could be right in that they should live together.
I'll have another look at it when I've fixed it up.

 
   There is one ibm,io-events interrupt, but this interrupt might be used
   for multiple I/O devices, each with their own separate driver. So, we
   create a platform interrupt handler that will do the RTAS check-exception
   call and then call the appropriate driver's interrupt handler (the one(s)
   that registered with a scope that matches the scope of the incoming
   interrupt).
   
   So, a driver for a device that uses IO Event interrupts will register
   its interrupt service routine (or interrupt handler) with the platform
   code using ioei_register_isr(). This register function takes a function
   pointer to the driver's handler and the scope that the driver is
   interested in (scopes defined in arch/powerpc/include/asm/io_events.h).
   The driver's handler must take a pointer to a struct io_events_section
   and must not return anything.
   
   The platform code registers io_event_interrupt() as the interrupt handler
   for the ibm,io-events interrupt. Upon receiving an IO Event interrupt, it
   checks the scope of the incoming interrupt and only calls those drivers'
   handlers that have registered as being interested in that scope.
  
  The "checks the scope" requires an RTAS call, which takes a global lock
  (and you add another) - these aren't going to be used for anything
  performance critical I hope?
 
 Nope it shouldn't be performance critical, but it does raise the point
 that the current RTAS implementation in Linux *always* uses the global
 lock.  There is a set of calls which are not required to be serialized
 against each other.  This would be a totally different patchset to fix that
 problem.  The check-exception call is one that doesn't require the global
 RTAS lock.  I might work on that someday :-)
 
 snip
 
   + /* check to see if we've already registered this function with
   +  * this scope. If we have, don't register it again
   +  */
   + iter = ioei_isr_list;
   + while (iter) {
+ if (iter->ioei_isr == isr && iter->scope == scope)
+ break;
+ iter = iter->next;
   + }
   +
   + if (iter) {
   + ret = -EEXIST;
   + goto out;
   + }
   +
   + cons = kmalloc(sizeof(struct ioei_consumer), GFP_KERNEL);
  
  But you don't want to kmalloc while holding the lock and with interrupts
  off.
 
 Well, he could use GFP_ATOMIC but that's the wrong approach.  You should
 allocate the buffer (using GFP_KERNEL) before taking the spin lock.
 

And then free it before returning in the case that we've got a
duplicate, right?
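Something like this sketch, I guess (ioei_isr_list and struct ioei_consumer
are from the patch; the lock name and flags variable here are my guesses):

	cons = kmalloc(sizeof(struct ioei_consumer), GFP_KERNEL);
	if (!cons)
		return -ENOMEM;

	spin_lock_irqsave(&ioei_isr_list_lock, flags);
	for (iter = ioei_isr_list; iter; iter = iter->next) {
		if (iter->ioei_isr == isr && iter->scope == scope) {
			/* duplicate registration: drop the lock and
			 * free the now-unused allocation */
			spin_unlock_irqrestore(&ioei_isr_list_lock, flags);
			kfree(cons);
			return -EEXIST;
		}
	}
	/* ... fill in cons and link it onto ioei_isr_list ... */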

 snip
 
+#define EXT_INT_VECTOR_OFFSET    0x500
   +#define RTAS_TYPE_IO_EVENT   0xE1
 
 These defines should probably go in asm/rtas.h
 
 I noticed the code in ras.c has its own define too, RAS_VECTOR_OFFSET,
 for 0x500, and asm/rtas.h actually has RTAS_TYPE_IO for 0xE1

I'm not sure how I missed RTAS_TYPE_IO in asm/rtas.h but I'll use
that and then add EXT_INT_VECTOR_OFFSET and update the RAS code to use
it too.

 
   +
   +static irqreturn_t io_event_interrupt(int irq, void *dev_id)
   +{
   + struct rtas_error_log *rtas_elog;
   + struct io_events_section *ioei_sec;
   + char *ch_ptr;
   + int status;
   + u16 *sec_len;
   +
+ spin_lock(&ioei_log_buf_lock);
   +
   + status = rtas_call(ioei_check_exception_token, 6, 1, NULL,
   +EXT_INT_VECTOR_OFFSET,
   +irq_map[irq].hwirq,
  
  This is going to be slow anyway, you may as well use virq_to_hw().
  
   +RTAS_IO_EVENTS, 1 /*Time Critical */,
  
  Missing space before the T  ^
  
   +__pa(ioei_log_buf),
  
  Does the buffer need to be aligned, and/or inside the RMO? I'd guess
  yes.
 
 The docs for check-exception don't particularly specify alignment
 requirements, but RTAS is generally going to want it in the RMO
 
 Also, if we're going to go ahead and use rtas_call() which locks
 its own buffer which meets the requirements, why do we even need

[PATCH] powerpc/pseries: Make request_ras_irqs() available to other pseries code

2010-05-17 Thread Mark Nelson
At the moment only the RAS code uses event-sources interrupts (for EPOW
events and internal errors) so request_ras_irqs() (which actually requests
the event-sources interrupts) is found in ras.c and is static.

We want to be able to use event-sources interrupts in other pseries code,
so let's rename request_ras_irqs() to request_event_sources_irqs() and
move it to event_sources.c.

This will be used in an upcoming patch that adds support for IO Event
interrupts that come through as event sources.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/platforms/pseries/Makefile|2 
 arch/powerpc/platforms/pseries/event_sources.c |   79 +
 arch/powerpc/platforms/pseries/event_sources.h |   29 +
 arch/powerpc/platforms/pseries/ras.c   |   63 +--
 4 files changed, 113 insertions(+), 60 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/event_sources.c
===
--- /dev/null
+++ upstream/arch/powerpc/platforms/pseries/event_sources.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <asm/prom.h>
+
+#include "event_sources.h"
+
+void request_event_sources_irqs(struct device_node *np,
+   irq_handler_t handler,
+   const char *name)
+{
+   int i, index, count = 0;
+   struct of_irq oirq;
+   const u32 *opicprop;
+   unsigned int opicplen;
+   unsigned int virqs[16];
+
+   /* Check for obsolete open-pic-interrupt property. If present, then
+* map those interrupts using the default interrupt host and default
+* trigger
+*/
+   opicprop = of_get_property(np, "open-pic-interrupt", &opicplen);
+   if (opicprop) {
+   opicplen /= sizeof(u32);
+   for (i = 0; i < opicplen; i++) {
+   if (count > 15)
+   break;
+   virqs[count] = irq_create_mapping(NULL, *(opicprop++));
+   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
+   else
+   count++;
+
+   }
+   }
+   /* Else use normal interrupt tree parsing */
+   else {
+   /* First try to do a proper OF tree parsing */
+   for (index = 0; of_irq_map_one(np, index, &oirq) == 0;
+index++) {
+   if (count > 15)
+   break;
+   virqs[count] = irq_create_of_mapping(oirq.controller,
+   oirq.specifier,
+   oirq.size);
+   if (virqs[count] == NO_IRQ)
+   printk(KERN_ERR "Unable to allocate interrupt "
+  "number for %s\n", np->full_name);
+   else
+   count++;
+   }
+   }
+
+   /* Now request them */
+   for (i = 0; i < count; i++) {
+   if (request_irq(virqs[i], handler, 0, name, NULL)) {
+   printk(KERN_ERR "Unable to request interrupt %d for "
+  "%s\n", virqs[i], np->full_name);
+   return;
+   }
+   }
+}
+
Index: upstream/arch/powerpc/platforms/pseries/event_sources.h
===
--- /dev/null
+++ upstream/arch/powerpc/platforms/pseries/event_sources.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2001 Dave Engebretsen IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS

[PATCH] powerpc/pseries: Add support for IO Event interrupt drivers

2010-05-17 Thread Mark Nelson
This patch adds support for handling IO Event interrupts which come
through at the /event-sources/ibm,io-events device tree node.

There is one ibm,io-events interrupt, but this interrupt might be used
for multiple I/O devices, each with their own separate driver. So, we
create a platform interrupt handler that will do the RTAS check-exception
call and then call the appropriate driver's interrupt handler (the one(s)
that registered with a scope that matches the scope of the incoming
interrupt).

So, a driver for a device that uses IO Event interrupts will register
its interrupt service routine (or interrupt handler) with the platform
code using ioei_register_isr(). This register function takes a function
pointer to the driver's handler and the scope that the driver is
interested in (scopes defined in arch/powerpc/include/asm/io_events.h).
The driver's handler must take a pointer to a struct io_events_section
and must not return anything.

The platform code registers io_event_interrupt() as the interrupt handler
for the ibm,io-events interrupt. Upon receiving an IO Event interrupt, it
checks the scope of the incoming interrupt and only calls those drivers'
handlers that have registered as being interested in that scope.

It is possible for a single driver to register the same function pointer
more than once with different scopes if it is interested in more than one
type of IO Event interrupt. If a handler wants to be notified of all
incoming IO Event interrupts it can register with IOEI_SCOPE_ANY.

A driver can unregister to stop receiving the IO Event interrupts using
ioei_unregister_isr(), passing it the same function pointer to the
driver's handler and the scope the driver was registered with.
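As an illustration, a driver using this interface would look roughly like
the following sketch (the foo_* names are hypothetical):

#include <asm/io_events.h>

/* called by the platform code for each PHB-scope IO Event */
static void foo_io_event_handler(struct io_events_section *sec)
{
	pr_info("foo: IO event type 0x%x for DRC index 0x%x\n",
		sec->event_type, sec->drc_index);
}

static int __init foo_init(void)
{
	/* only interested in PHB-scope events */
	return ioei_register_isr(foo_io_event_handler, IOEI_SCOPE_PHB);
}

static void __exit foo_exit(void)
{
	ioei_unregister_isr(foo_io_event_handler, IOEI_SCOPE_PHB);
}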

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/include/asm/io_events.h   |   40 +
 arch/powerpc/platforms/pseries/Makefile|2 
 arch/powerpc/platforms/pseries/io_events.c |  205 +
 3 files changed, 246 insertions(+), 1 deletion(-)

Index: upstream/arch/powerpc/include/asm/io_events.h
===
--- /dev/null
+++ upstream/arch/powerpc/include/asm/io_events.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2010 IBM Corporation, Mark Nelson
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ASM_POWERPC_IOEVENTS_H
+#define _ASM_POWERPC_IOEVENTS_H
+
+struct io_events_section {
+   u16 id; // Unique section identifier    x00-x01
+   u16 length; // Section length (bytes)   x02-x03
+   u8  version;// Section Version  x04-x04
+   u8  sec_subtype;// Section subtype  x05-x05
+   u16 creator_id; // Creator Component ID x06-x07
+   u8  event_type; // IO-Event Type    x08-x08
+   u8  rpc_field_len;  // PRC Field Length x09-x09
+   u8  scope;  // Error/Event Scope    x0A-x0A
+   u8  event_subtype;  // I/O-Event Sub-Type   x0B-x0B
+   u32 drc_index;  // DRC Index    x0C-x0F
+   u32 rpc_data[]; // RPC Data (optional)  x10-...
+};
+
+#define IOEI_SCOPE_NOT_APP 0x00
+#define IOEI_SCOPE_RIO_HUB 0x36
+#define IOEI_SCOPE_RIO_BRIDGE  0x37
+#define IOEI_SCOPE_PHB 0x38
+#define IOEI_SCOPE_EADS_GLOBAL 0x39
+#define IOEI_SCOPE_EADS_SLOT   0x3A
+#define IOEI_SCOPE_TORRENT_HUB 0x3B
+#define IOEI_SCOPE_SERVICE_PROC    0x51
+#define IOEI_SCOPE_ANY -1
+
+int ioei_register_isr(void (*isr)(struct io_events_section *), int scope);
+int ioei_unregister_isr(void (*isr)(struct io_events_section *), int scope);
+
+#endif /* _ASM_POWERPC_IOEVENTS_H */
Index: upstream/arch/powerpc/platforms/pseries/io_events.c
===
--- /dev/null
+++ upstream/arch/powerpc/platforms/pseries/io_events.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2010 IBM Corporation, Mark Nelson
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/of.h>
+
+#include <asm/io_events.h>
+#include <asm/rtas.h>
+#include <asm/irq.h>
+
+#include "event_sources.h"
+
+struct ioei_consumer {
+   void (*ioei_isr)(struct io_events_section *);
+   int scope;
+   struct ioei_consumer *next

[PATCH v3] powerpc: Track backing pages allocated by vmemmap_populate()

2010-04-21 Thread Mark Nelson
We need to keep track of the backing pages that get allocated by
vmemmap_populate() so that when we use kdump, the dump-capture kernel knows
where these pages are.

We use a simple linked list of structures that contain the physical address
of the backing page and the corresponding virtual address to track the
backing pages.
To save space, we just use a pointer to the next struct vmemmap_backing; we
can do this because we never remove nodes. The pointer is called "list"
to be compatible with changes made to the crash utility.

vmemmap_populate() is called either at boot-time or on a memory hotplug
operation. We don't have to worry about the boot-time calls because they
will be inherently single-threaded, and for a memory hotplug operation
vmemmap_populate() is called through:
sparse_add_one_section()
|
V
kmalloc_section_memmap()
|
V
sparse_mem_map_populate()
|
V
vmemmap_populate()
and in sparse_add_one_section() we're protected by pgdat_resize_lock().
So, we don't need a spinlock to protect the vmemmap_list.

We allocate space for the vmemmap_backing structs by allocating whole pages
in vmemmap_list_alloc() and then handing out chunks of this to
vmemmap_list_populate().

This means that we waste at most just under one page, but this keeps the
code simple.
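For example, anything that later needs the backing pages (the dump-capture
side, or debug code) can walk the list with just (a sketch):

	struct vmemmap_backing *vmem_back;

	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list)
		pr_debug("vmemmap backing: virt %lx phys %lx\n",
			 vmem_back->virt_addr, vmem_back->phys);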

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---

changes since v2:
 - use a pointer to the next struct vmemmap_backing instead of an hlist
 - add vmemmap_list_alloc() to allocate space for vmemmap_backing structs
   a page at-a-time when needed 

changes since v1:
 - use an hlist to save space in the structure
 - remove the spinlock because it's not needed

 arch/powerpc/include/asm/pgalloc-64.h |6 
 arch/powerpc/mm/init_64.c |   43 ++
 2 files changed, 49 insertions(+)

Index: upstream/arch/powerpc/include/asm/pgalloc-64.h
===
--- upstream.orig/arch/powerpc/include/asm/pgalloc-64.h
+++ upstream/arch/powerpc/include/asm/pgalloc-64.h
@@ -11,6 +11,12 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
 
+struct vmemmap_backing {
+   struct vmemmap_backing *list;
+   unsigned long phys;
+   unsigned long virt_addr;
+};
+
 /*
  * Functions that deal with pagetables that could be at any level of
  * the table need to be passed an index_size so they know how to
Index: upstream/arch/powerpc/mm/init_64.c
===
--- upstream.orig/arch/powerpc/mm/init_64.c
+++ upstream/arch/powerpc/mm/init_64.c
@@ -252,6 +252,47 @@ static void __meminit vmemmap_create_map
 }
 #endif /* CONFIG_PPC_BOOK3E */
 
+struct vmemmap_backing *vmemmap_list;
+
+static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
+{
+   static struct vmemmap_backing *next;
+   static int num_left;
+
+   /* allocate a page when required and hand out chunks */
+   if (!next || !num_left) {
+   next = vmemmap_alloc_block(PAGE_SIZE, node);
+   if (unlikely(!next)) {
+   WARN_ON(1);
+   return NULL;
+   }
+   num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
+   }
+
+   num_left--;
+
+   return next++;
+}
+
+static __meminit void vmemmap_list_populate(unsigned long phys,
+   unsigned long start,
+   int node)
+{
+   struct vmemmap_backing *vmem_back;
+
+   vmem_back = vmemmap_list_alloc(node);
+   if (unlikely(!vmem_back)) {
+   WARN_ON(1);
+   return;
+   }
+
+   vmem_back->phys = phys;
+   vmem_back->virt_addr = start;
+   vmem_back->list = vmemmap_list;
+
+   vmemmap_list = vmem_back;
+}
+
 int __meminit vmemmap_populate(struct page *start_page,
   unsigned long nr_pages, int node)
 {
@@ -276,6 +317,8 @@ int __meminit vmemmap_populate(struct pa
if (!p)
return -ENOMEM;
 
+   vmemmap_list_populate(__pa(p), start, node);
+
pr_debug("  * %016lx..%016lx allocated at %p\n",
 start, start + page_size, p);
 


Re: [PATCH v2] powerpc: Track backing pages allocated by vmemmap_populate()

2010-04-13 Thread Mark Nelson
On Tuesday 13 April 2010 21:16:44 Benjamin Herrenschmidt wrote:
 On Tue, 2010-04-13 at 16:02 +1000, Mark Nelson wrote:
  That's a good question, and one that I probably should have added to
  the commit message.
  But, following through, it looks like we end up calling into
  __remove_section() from mm/memory_hotplug.c and if
  CONFIG_SPARSEMEM_VMEMMAP is enabled we just return EBUSY as freeing
  memmap with vmemmap isn't implemented yet.
  
  So for the moment, I'm not sure we have to worry about it.
 
 We probably don't. IE. The vmemmap will remain for those struct pages,
 which means they won't need to be allocated again if some memory is
 plugged back there. If not, we just waste a bit of memory. Not a big
 deal.
 
Excellent! That makes sense.

Thanks Ben!

Mark.


[PATCH v2] powerpc: Track backing pages allocated by vmemmap_populate()

2010-04-12 Thread Mark Nelson
We need to keep track of the backing pages that get allocated by
vmemmap_populate() so that when we use kdump, the dump-capture kernel knows
where these pages are.

We use a linked list of structures that contain the physical address of the
backing page and corresponding virtual address to track the backing pages.
We can use an hlist to save space, because we never remove nodes.

vmemmap_populate() is called either at boot-time or on a memory hotplug
operation. We don't have to worry about the boot-time calls because they
will be inherently single-threaded, and for a memory hotplug operation
vmemmap_populate() is called through:
sparse_add_one_section()
|
V
kmalloc_section_memmap()
|
V
sparse_mem_map_populate()
|
V
vmemmap_populate()
and in sparse_add_one_section() we're protected by pgdat_resize_lock().
So, we don't need a spinlock to protect the vmemmap_list.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---

changes since v1:
 - use an hlist to save space in the structure
 - remove the spinlock because it's not needed

 arch/powerpc/include/asm/pgalloc-64.h |7 +++
 arch/powerpc/mm/init_64.c |   24 
 2 files changed, 31 insertions(+)

Index: upstream/arch/powerpc/include/asm/pgalloc-64.h
===
--- upstream.orig/arch/powerpc/include/asm/pgalloc-64.h
+++ upstream/arch/powerpc/include/asm/pgalloc-64.h
@@ -10,6 +10,13 @@
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
+#include <linux/list.h>
+
+struct vmemmap_backing {
+   unsigned long phys;
+   unsigned long virt_addr;
+   struct hlist_node hlist;
+};
 
 /*
  * Functions that deal with pagetables that could be at any level of
Index: upstream/arch/powerpc/mm/init_64.c
===
--- upstream.orig/arch/powerpc/mm/init_64.c
+++ upstream/arch/powerpc/mm/init_64.c
@@ -43,6 +43,7 @@
#include <linux/lmb.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
+#include <linux/list.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -252,6 +253,27 @@ static void __meminit vmemmap_create_map
 }
 #endif /* CONFIG_PPC_BOOK3E */
 
+HLIST_HEAD(vmemmap_list);
+
+static __meminit void vmemmap_list_populate(unsigned long phys,
+   unsigned long start,
+   int node)
+{
+   struct vmemmap_backing *vmem_back;
+
+   vmem_back = vmemmap_alloc_block(sizeof(struct vmemmap_backing), node);
+   if (unlikely(!vmem_back)) {
+   WARN_ON(1);
+   return;
+   }
+
+   vmem_back->phys = phys;
+   vmem_back->virt_addr = start;
+   INIT_HLIST_NODE(&vmem_back->hlist);
+
+   hlist_add_head(&vmem_back->hlist, &vmemmap_list);
+}
+
 int __meminit vmemmap_populate(struct page *start_page,
   unsigned long nr_pages, int node)
 {
@@ -276,6 +298,8 @@ int __meminit vmemmap_populate(struct pa
if (!p)
return -ENOMEM;
 
+   vmemmap_list_populate(__pa(p), start, node);
+
pr_debug("  * %016lx..%016lx allocated at %p\n",
 start, start + page_size, p);
 


Re: [PATCH v2] powerpc: Track backing pages allocated by vmemmap_populate()

2010-04-12 Thread Mark Nelson
On Tuesday 13 April 2010 15:24:17 Michael Ellerman wrote:
 On Tue, 2010-04-13 at 14:16 +1000, Mark Nelson wrote:
  We need to keep track of the backing pages that get allocated by
  vmemmap_populate() so that when we use kdump, the dump-capture kernel knows
  where these pages are.
  
  We use a linked list of structures that contain the physical address of the
  backing page and corresponding virtual address to track the backing pages.
  We can use an hlist to save space, because we never remove nodes.
 
 What if we remove_memory() ? Or does that not do what I think it does?

That's a good question, and one that I probably should have added to the
commit message.
But, following through, it looks like we end up calling into
__remove_section() from mm/memory_hotplug.c and if
CONFIG_SPARSEMEM_VMEMMAP is enabled we just return EBUSY as freeing
memmap with vmemmap isn't implemented yet.

So for the moment, I'm not sure we have to worry about it.

Thanks!
Mark.


[PATCH] powerpc: Track backing pages allocated by vmemmap_populate()

2010-03-26 Thread Mark Nelson
We need to keep track of the backing pages that get allocated by
vmemmap_populate() so that when we use kdump, the dump-capture kernel can
find these pages in memory.

We use a linked list of structures that contain the physical address of the
backing page and corresponding virtual address to track the backing pages.
And we use a simple spinlock to protect the vmemmap_list.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/include/asm/pgalloc-64.h |7 +++
 arch/powerpc/mm/init_64.c |   27 +++
 2 files changed, 34 insertions(+)

Index: upstream/arch/powerpc/include/asm/pgalloc-64.h
===
--- upstream.orig/arch/powerpc/include/asm/pgalloc-64.h
+++ upstream/arch/powerpc/include/asm/pgalloc-64.h
@@ -10,6 +10,13 @@
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/percpu.h>
+#include <linux/list.h>
+
+struct vmemmap_backing {
+   unsigned long phys;
+   unsigned long virt_addr;
+   struct list_head list;
+};
 
 /*
  * Functions that deal with pagetables that could be at any level of
Index: upstream/arch/powerpc/mm/init_64.c
===
--- upstream.orig/arch/powerpc/mm/init_64.c
+++ upstream/arch/powerpc/mm/init_64.c
@@ -42,6 +42,7 @@
#include <linux/poison.h>
#include <linux/lmb.h>
#include <linux/hugetlb.h>
+#include <linux/list.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
@@ -251,6 +252,30 @@ static void __meminit vmemmap_create_map
 }
 #endif /* CONFIG_PPC_BOOK3E */
 
+LIST_HEAD(vmemmap_list);
+DEFINE_SPINLOCK(vmemmap_list_lock);
+
+static __meminit void vmemmap_list_populate(unsigned long phys,
+   unsigned long start,
+   int node)
+{
+   struct vmemmap_backing *vmem_back;
+
+   vmem_back = vmemmap_alloc_block(sizeof(struct vmemmap_backing), node);
+   if (unlikely(!vmem_back)) {
+   WARN_ON(1);
+   return;
+   }
+
+   vmem_back->phys = phys;
+   vmem_back->virt_addr = start;
+   INIT_LIST_HEAD(&vmem_back->list);
+
+   spin_lock(&vmemmap_list_lock);
+   list_add(&vmem_back->list, &vmemmap_list);
+   spin_unlock(&vmemmap_list_lock);
+}
+
 int __meminit vmemmap_populate(struct page *start_page,
   unsigned long nr_pages, int node)
 {
@@ -275,6 +300,8 @@ int __meminit vmemmap_populate(struct pa
if (!p)
return -ENOMEM;
 
+   vmemmap_list_populate(__pa(p), start, node);
+
pr_debug("  * %016lx..%016lx allocated at %p\n",
 start, start + page_size, p);
 


Re: [PATCH] powerpc: pair loads and stores in copy_4k_page

2010-02-10 Thread Mark Nelson
Hi Anton,

On Thursday 11 February 2010 15:07:54 Anton Blanchard wrote:
 
 A number of our chips like loads and stores to be paired. A small kernel
 module testcase shows the improvement of pairing loads and stores in 
 copy_4k_page:
 
 POWER6: +9%
 POWER7: +1.5%

I just tried this on one of our QS22 cell blades and it seems to cause
about half a percent speedup but that looks like it's within the noise
of the results that I'm getting.

In any case it doesn't look like it has a negative effect for cell.

Looks good!

Mark


powerpc/pseries: Fix kexec regression caused by CPPR tracking

2010-02-07 Thread Mark Nelson
The code to track the CPPR values added by commit
49bd3647134ea47420067aea8d1401e722bf2aac ("powerpc/pseries: Track previous
CPPR values to correctly EOI interrupts") broke kexec on pseries because
the kexec code in xics.c calls xics_set_cpu_priority() before the IPI has
been EOI'ed. This wasn't a problem previously but it now triggers a BUG_ON
in xics_set_cpu_priority() because os_cppr->index isn't 0:

Oops: Exception in kernel mode, sig: 5 [#1]
SMP NR_CPUS=128 NUMA
kernel BUG at arch/powerpc/platforms/pseries/xics.c:791!
pSeries
Modules linked in: ehea dm_mirror dm_region_hash dm_log dm_zero dm_snapshot parport_pc parport dm_multipath autofs4
NIP: c00461bc LR: c0046260 CTR: c004bc08
REGS: cfffb770 TRAP: 0700   Not tainted  (2.6.33-rc6-autokern1)
MSR: 80021032 <ME,CE,IR,DR>  CR: 4822  XER: 0001
TASK = c0aef700[0] 'swapper' THREAD: c0bcc000 CPU: 0
GPR00: 0001 cfffb9f0 c0bcc9e8
GPR04:   00dc 0002
GPR08: c40036e8 c4002e40 0898
GPR12: 0002 c0bf8480 0350 c0792f28
GPR16: c07915e0  00419000 03da8990
GPR20: c08a8990 0010 c0ae92c0 0010
GPR24:  c0be2380  00200200
GPR28: 0001 0001 c0b249e8
NIP [c00461bc] .xics_set_cpu_priority+0x38/0xb8
LR [c0046260] .xics_teardown_cpu+0x24/0xa4
Call Trace:
[cfffb9f0] [ebf3] 0xebf3 (unreliable)
[cfffba60] [c0046260] .xics_teardown_cpu+0x24/0xa4
[cfffbae0] [c0046330] .xics_kexec_teardown_cpu+0x18/0xb4
[cfffbb60] [c004a150] .pseries_kexec_cpu_down_xics+0x20/0x38
[cfffbbf0] [c002e5b8] .kexec_smp_down+0x48/0x7c
[cfffbc70] [c00b2dd0] .generic_smp_call_function_interrupt+0xf4/0x1b4
[cfffbd20] [c002aed0] .smp_message_recv+0x48/0x100
[cfffbda0] [c0046ae0] .xics_ipi_dispatch+0x84/0x148
[cfffbe30] [c00d62dc] .handle_IRQ_event+0xc8/0x248
[cfffbf00] [c00d8eb4] .handle_percpu_irq+0x80/0xf4
[cfffbf90] [c0029048] .call_handle_irq+0x1c/0x2c
[c0bcfa30] [c000ec84] .do_IRQ+0x1b8/0x2a4
[c0bcfae0] [c0004804] hardware_interrupt_entry+0x1c/0x98

Fix this problem by setting the index on the CPPR stack to 0 before calling
xics_set_cpu_priority() in xics_teardown_cpu().

Also make it clear that we only want to set the priority when there's just
one CPPR value in the stack, and enforce it by updating the value of
os_cppr->stack[0] rather than os_cppr->stack[os_cppr->index].

While we're at it change the BUG_ON to a WARN_ON.
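The shape of the fix, sketched from the description above:

static void xics_set_cpu_priority(unsigned char cppr)
{
	struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);

	/* only set the priority when there's just one CPPR value
	 * on the stack */
	WARN_ON(os_cppr->index != 0);
	os_cppr->stack[0] = cppr;
	/* ... firmware/hardware CPPR write as before ... */
}

void xics_teardown_cpu(void)
{
	struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);

	/* we won't return from the IPI we're currently servicing,
	 * so reset the CPPR stack index before dropping the priority */
	os_cppr->index = 0;
	xics_set_cpu_priority(0);
	/* ... rest of teardown unchanged ... */
}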

Reported-by: Anton Blanchard an...@samba.org
Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
Ben, if it's not too late for 2.6.33 this would be really nice to have
as without it we can't kexec on pseries.

 arch/powerpc/platforms/pseries/xics.c |   14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/xics.c
===
--- upstream.orig/arch/powerpc/platforms/pseries/xics.c
+++ upstream/arch/powerpc/platforms/pseries/xics.c
@@ -784,9 +784,13 @@ static void xics_set_cpu_priority(unsign
 {
struct xics_cppr

[PATCH v2] powerpc/pseries: Track previous CPPR values to correctly EOI interrupts

2009-12-07 Thread Mark Nelson
At the moment when we EOI an interrupt we set the CPPR back to 0xFF
regardless of its previous value. This could lead to problems if we
take an interrupt with a priority of 5, but before EOIing it we get
an IPI which has a priority of 4. The problem is that at the moment
when we EOI the IPI we will set the CPPR to 0xFF, but it should
really be set back to 5 (the previous priority).

To keep track of the previous CPPR values we create the xics_cppr
structure that has an array for CPPR values and an index pointing
to the current priority. This can easily grow if new priorities get
added in the future.

This will also be useful because the partition adjunct option of
upcoming machines will update the H_XIRR hcall to accept the CPPR
as a parameter.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---

changes since v1:
* simplified function names to push_cppr() and pop_cppr()
* added checking for empty and full stack of cppr
* make sure to set cpu priority only when we're not already taking an interrupt

 arch/powerpc/platforms/pseries/xics.c |   56 ++
 1 file changed, 51 insertions(+), 5 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/xics.c
===
--- upstream.orig/arch/powerpc/platforms/pseries/xics.c
+++ upstream/arch/powerpc/platforms/pseries/xics.c
@@ -20,6 +20,7 @@
#include <linux/cpu.h>
#include <linux/msi.h>
#include <linux/of.h>
+#include <linux/percpu.h>

#include <asm/firmware.h>
#include <asm/io.h>
@@ -46,6 +47,12 @@ static struct irq_host *xics_host;
  */
 #define IPI_PRIORITY   4
 
+/* The least favored priority */
+#define LOWEST_PRIORITY    0xFF
+
+/* The number of priorities defined above */
+#define MAX_NUM_PRIORITIES 3
+
 static unsigned int default_server = 0xFF;
 static unsigned int default_distrib_server = 0;
 static unsigned int interrupt_server_size = 8;
@@ -56,6 +63,12 @@ static int ibm_set_xive;
 static int ibm_int_on;
 static int ibm_int_off;
 
+struct xics_cppr {
+   unsigned char stack[MAX_NUM_PRIORITIES];
+   int index;
+};
+
+static DEFINE_PER_CPU(struct xics_cppr, xics_cppr);
 
 /* Direct hardware low level accessors */
 
@@ -284,6 +297,19 @@ static inline unsigned int xics_xirr_vec
return xirr & 0x00ffffff;
 }
 
+static void push_cppr(unsigned int vec)
+{
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+
+   if (WARN_ON(os_cppr->index >= MAX_NUM_PRIORITIES - 1))
+   return;
+
+   if (vec == XICS_IPI)
+   os_cppr->stack[++os_cppr->index] = IPI_PRIORITY;
+   else
+   os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY;
+}
+
 static unsigned int xics_get_irq_direct(void)
 {
unsigned int xirr = direct_xirr_info_get();
@@ -294,8 +320,10 @@ static unsigned int xics_get_irq_direct(
return NO_IRQ;
 
irq = irq_radix_revmap_lookup(xics_host, vec);
-   if (likely(irq != NO_IRQ))
+   if (likely(irq != NO_IRQ)) {
+   push_cppr(vec);
return irq;
+   }
 
/* We don't have a linux mapping, so have rtas mask it. */
xics_mask_unknown_vec(vec);
@@ -315,8 +343,10 @@ static unsigned int xics_get_irq_lpar(vo
return NO_IRQ;
 
irq = irq_radix_revmap_lookup(xics_host, vec);
-   if (likely(irq != NO_IRQ))
+   if (likely(irq != NO_IRQ)) {
+   push_cppr(vec);
return irq;
+   }
 
/* We don't have a linux mapping, so have RTAS mask it. */
xics_mask_unknown_vec(vec);
@@ -326,12 +356,22 @@ static unsigned int xics_get_irq_lpar(vo
return NO_IRQ;
 }
 
+static unsigned char pop_cppr(void)
+{
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+
+   if (WARN_ON(os_cppr->index < 1))
+   return LOWEST_PRIORITY;
+
+   return os_cppr->stack[--os_cppr->index];
+}
+
 static void xics_eoi_direct(unsigned int virq)
 {
unsigned int irq = (unsigned int)irq_map[virq].hwirq;
 
iosync();
-   direct_xirr_info_set((0xff << 24) | irq);
+   direct_xirr_info_set((pop_cppr() << 24) | irq);
 }
 
 static void xics_eoi_lpar(unsigned int virq)
@@ -339,7 +379,7 @@ static void xics_eoi_lpar(unsigned int v
unsigned int irq = (unsigned int)irq_map[virq].hwirq;
 
iosync();
-   lpar_xirr_info_set((0xff << 24) | irq);
+   lpar_xirr_info_set((pop_cppr() << 24) | irq);
 }
 
 static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
@@ -746,6 +786,12 @@ void __init xics_init_IRQ(void)
 
 static void xics_set_cpu_priority(unsigned char cppr)
 {
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+
+   BUG_ON(os_cppr->index != 0);
+
+   os_cppr->stack[os_cppr->index] = cppr;
+
if (firmware_has_feature(FW_FEATURE_LPAR))
lpar_cppr_info(cppr);
else
@@ -772,7 +818,7 @@ static void xics_set_cpu_giq(unsigned in
 
 void xics_setup_cpu(void

[PATCH] powerpc/pseries: Track previous CPPR values to correctly EOI interrupts

2009-12-02 Thread Mark Nelson
At the moment when we EOI an interrupt we set the CPPR back to 0xFF
regardless of its previous value. This could lead to problems if we
take an interrupt with a priority of 5, but before EOIing it we get
an IPI which has a priority of 4. The problem is that at the moment
when we EOI the IPI we will set the CPPR to 0xFF, but it should
really be set back to 5 (the previous priority).

To keep track of the previous CPPR values we create the xics_cppr
structure that has an array for CPPR values and an index pointing
to the current priority. This can easily grow if new priorities get
added in the future.

This will also be useful because the partition adjunct option of
upcoming machines will update the H_XIRR hcall to accept the CPPR
as a parameter.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/platforms/pseries/xics.c |   48 ++
 1 file changed, 43 insertions(+), 5 deletions(-)

Index: upstream/arch/powerpc/platforms/pseries/xics.c
===
--- upstream.orig/arch/powerpc/platforms/pseries/xics.c
+++ upstream/arch/powerpc/platforms/pseries/xics.c
@@ -20,6 +20,7 @@
#include <linux/cpu.h>
#include <linux/msi.h>
#include <linux/of.h>
+#include <linux/percpu.h>

#include <asm/firmware.h>
#include <asm/io.h>
@@ -46,6 +47,12 @@ static struct irq_host *xics_host;
  */
 #define IPI_PRIORITY   4
 
+/* The least favored priority */
+#define LOWEST_PRIORITY    0xFF
+
+/* The number of priorities defined above */
+#define MAX_NUM_PRIORITIES 3
+
 static unsigned int default_server = 0xFF;
 static unsigned int default_distrib_server = 0;
 static unsigned int interrupt_server_size = 8;
@@ -56,6 +63,12 @@ static int ibm_set_xive;
 static int ibm_int_on;
 static int ibm_int_off;
 
+struct xics_cppr {
+   unsigned char stack[MAX_NUM_PRIORITIES];
+   int index;
+};
+
+static DEFINE_PER_CPU(struct xics_cppr, xics_cppr);
 
 /* Direct hardware low level accessors */
 
@@ -284,6 +297,16 @@ static inline unsigned int xics_xirr_vec
return xirr & 0x00ffffff;
 }
 
+static void get_irq_update_cppr(unsigned int vec)
+{
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+
+   if (vec == XICS_IPI)
+   os_cppr->stack[++os_cppr->index] = IPI_PRIORITY;
+   else
+   os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY;
+}
+
 static unsigned int xics_get_irq_direct(void)
 {
unsigned int xirr = direct_xirr_info_get();
@@ -294,8 +317,10 @@ static unsigned int xics_get_irq_direct(
return NO_IRQ;
 
irq = irq_radix_revmap_lookup(xics_host, vec);
-   if (likely(irq != NO_IRQ))
+   if (likely(irq != NO_IRQ)) {
+   get_irq_update_cppr(vec);
return irq;
+   }
 
/* We don't have a linux mapping, so have rtas mask it. */
xics_mask_unknown_vec(vec);
@@ -315,8 +340,10 @@ static unsigned int xics_get_irq_lpar(vo
return NO_IRQ;
 
irq = irq_radix_revmap_lookup(xics_host, vec);
-   if (likely(irq != NO_IRQ))
+   if (likely(irq != NO_IRQ)) {
+   get_irq_update_cppr(vec);
return irq;
+   }
 
/* We don't have a linux mapping, so have RTAS mask it. */
xics_mask_unknown_vec(vec);
@@ -326,12 +353,19 @@ static unsigned int xics_get_irq_lpar(vo
return NO_IRQ;
 }
 
+static unsigned char eoi_update_and_get_cppr(void)
+{
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+
+   return os_cppr->stack[--os_cppr->index];
+}
+
 static void xics_eoi_direct(unsigned int virq)
 {
unsigned int irq = (unsigned int)irq_map[virq].hwirq;
 
iosync();
-   direct_xirr_info_set((0xff << 24) | irq);
+   direct_xirr_info_set((eoi_update_and_get_cppr() << 24) | irq);
 }
 
 static void xics_eoi_lpar(unsigned int virq)
@@ -339,7 +373,7 @@ static void xics_eoi_lpar(unsigned int v
unsigned int irq = (unsigned int)irq_map[virq].hwirq;
 
iosync();
-   lpar_xirr_info_set((0xff << 24) | irq);
+   lpar_xirr_info_set((eoi_update_and_get_cppr() << 24) | irq);
 }
 
 static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
@@ -746,6 +780,10 @@ void __init xics_init_IRQ(void)
 
 static void xics_set_cpu_priority(unsigned char cppr)
 {
+   struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr);
+   os_cppr->index = 0;
+   os_cppr->stack[os_cppr->index] = cppr;
+
if (firmware_has_feature(FW_FEATURE_LPAR))
lpar_cppr_info(cppr);
else
@@ -772,7 +810,7 @@ static void xics_set_cpu_giq(unsigned in
 
 void xics_setup_cpu(void)
 {
-   xics_set_cpu_priority(0xff);
+   xics_set_cpu_priority(LOWEST_PRIORITY);
 
xics_set_cpu_giq(default_distrib_server, 1);
 }
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: dma_ops->map_page == NULL

2009-07-07 Thread Mark Nelson
On Tuesday 07 July 2009 21:08:25 Benjamin Herrenschmidt wrote:
 On Tue, 2009-07-07 at 10:15 +1000, Mark Nelson wrote:
  
  When the 32 and 64bit DMA code was merged in .28, map_/unmap_page() was
  added in favour of map_/unmap_single() (which was later removed in .29)
  so you'll have to replace your calls to dma_map_single() with
  dma_map_page(). Just pass it the page and offset rather than the address.
 
 Wait a minute ... dma_map_single() should still work, it will just call
 dma_map_page() underneath. All dma_ops should have a ->map_page
 callback.

Sorry my mistake - I was thinking of when we removed the map/unmap_single()
from the dma_mapping_ops.

Mark.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: dma_ops->map_page == NULL

2009-07-06 Thread Mark Nelson
On Tuesday 07 July 2009 03:51:00 Kári Davíðsson wrote:
 I am doing a driver that uses dma_map_single().
 
 After changing to linux 2.6.29.3 I am getting
 segfaults in dma_map_single() because dma_ops->map_page is NULL.
 Actually dma_ops looks funky too.

When the 32 and 64bit DMA code was merged in .28, map_/unmap_page() was
added in favour of map_/unmap_single() (which was later removed in .29)
so you'll have to replace your calls to dma_map_single() with
dma_map_page(). Just pass it the page and offset rather than the address.
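
For example, the conversion would look something like this (a sketch
with hypothetical driver variables 'buf' and 'len'):

	/* before (pre-.28 style) */
	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	/* after */
	handle = dma_map_page(dev, virt_to_page(buf),
			      offset_in_page(buf), len, DMA_TO_DEVICE);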

Hope that helps!
Mark.

 
 The driver is an of_platform_driver which is declared as an child of
 the lbp (fsl,lpb) node of the device tree.
 
 This is on powerpc 5200b platform.
 
 rg
 kd
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] powerpc/wdrtas: Update wdrtas_get_interval to use rtas_data_buf

2009-03-24 Thread Mark Nelson
The buffer passed to the ibm,get-system-parameter RTAS call must be
in the RMA. To ensure we pass an address in the RMA use rtas_data_buf
for the actual RTAS call and then copy the result to value. We can't
just make it static because this can be compiled in as a module.

Also add the WDRTAS_SP_SPI_LEN so we don't litter '4' throughout the
function.
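
To see the difference, compare the two call patterns (a sketch;
rtas_data_buf and rtas_data_buf_lock are the existing kernel globals):

	/* wrong for modules: 'value' may live outside the RMA */
	char value[WDRTAS_SP_SPI_LEN];
	result = rtas_call(wdrtas_token_get_sp, 3, 1, NULL,
			   WDRTAS_SP_SPI, __pa(value), WDRTAS_SP_SPI_LEN);

	/* right: rtas_data_buf is guaranteed to sit in the RMA */
	spin_lock(&rtas_data_buf_lock);
	result = rtas_call(wdrtas_token_get_sp, 3, 1, NULL,
			   WDRTAS_SP_SPI, __pa(rtas_data_buf),
			   WDRTAS_SP_SPI_LEN);
	memcpy(value, rtas_data_buf, WDRTAS_SP_SPI_LEN);
	spin_unlock(&rtas_data_buf_lock);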

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---

Adrian, does this patch cause any problems for your pxcabs?

Thanks!
Mark

 drivers/watchdog/wdrtas.c |   13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Index: upstream/drivers/watchdog/wdrtas.c
===
--- upstream.orig/drivers/watchdog/wdrtas.c
+++ upstream/drivers/watchdog/wdrtas.c
@@ -106,6 +106,8 @@ static int wdrtas_set_interval(int inter
return result;
 }
 
+#define WDRTAS_SP_SPI_LEN 4
+
 /**
  * wdrtas_get_interval - returns the current watchdog interval
  * @fallback_value: value (in seconds) to use, if the RTAS call fails
@@ -119,10 +121,17 @@ static int wdrtas_set_interval(int inter
 static int wdrtas_get_interval(int fallback_value)
 {
long result;
-   char value[4];
+   char value[WDRTAS_SP_SPI_LEN];
 
+   spin_lock(&rtas_data_buf_lock);
+   memset(rtas_data_buf, 0, WDRTAS_SP_SPI_LEN);
result = rtas_call(wdrtas_token_get_sp, 3, 1, NULL,
-  WDRTAS_SP_SPI, (void *)__pa(value), 4);
+  WDRTAS_SP_SPI, __pa(rtas_data_buf),
+  WDRTAS_SP_SPI_LEN);
+
+   memcpy(value, rtas_data_buf, WDRTAS_SP_SPI_LEN);
+   spin_unlock(&rtas_data_buf_lock);
+
	if (value[0] != 0 || value[1] != 2 || value[3] != 0 || result < 0) {
		printk(KERN_WARNING "wdrtas: could not get sp_spi watchdog "
		       "timeout (%li). Continuing\n", result);
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [PATCH] powerpc/wdrtas: Update wdrtas_get_interval to use rtas_data_buf

2009-03-24 Thread Mark Nelson
On Tue, 24 Mar 2009 11:31:31 pm Adrian Reber wrote:
 On Tue, Mar 24, 2009 at 05:30:41PM +1100, Mark Nelson wrote:
  The buffer passed to the ibm,get-system-parameter RTAS call must be
  in the RMA. To ensure we pass an address in the RMA use rtas_data_buf
  for the actual RTAS call and then copy the result to value. We can't
  just make it static because this can be compiled in as a module.
  
  Also add the WDRTAS_SP_SPI_LEN so we don't litter '4' throughout the
  function.
  
  Signed-off-by: Mark Nelson ma...@au1.ibm.com
  ---
  
  Adrian, does this patch cause any problems for your pxcabs?
 
 No, it even helps. I have no tried the watchdog until now, but without the
 patch I get: 
 
wdrtas: could not get sp_spi watchdog timeout (0). Continuing
 
 and with the patch it reads the correct value. So only with your patch
 it works like it is supposed to. Thanks!
 
 Tested-by: Adrian Reber adr...@lisas.de

Good to hear

Thanks for testing!
Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [PATCH] powerpc/wdrtas: Update wdrtas_get_interval to use rtas_data_buf

2009-03-24 Thread Mark Nelson
On Wed, 25 Mar 2009 01:31:32 am Utz Bacher wrote:
 Adrian Reber adr...@lisas.de wrote on 24.03.2009 13:31:31:
  On Tue, Mar 24, 2009 at 05:30:41PM +1100, Mark Nelson wrote:
   The buffer passed to the ibm,get-system-parameter RTAS call must be
   in the RMA. To ensure we pass an address in the RMA use rtas_data_buf
   for the actual RTAS call and then copy the result to value. We can't
   just make it static because this can be compiled in as a module.
   
   Also add the WDRTAS_SP_SPI_LEN so we don't litter '4' throughout the
   function.
   
   Signed-off-by: Mark Nelson ma...@au1.ibm.com
   ---
   
   Adrian, does this patch cause any problems for your pxcabs?
  
  No, it even helps. I have no tried the watchdog until now, but without 
 the
  patch I get: 
  
 wdrtas: could not get sp_spi watchdog timeout (0). Continuing
  
  and with the patch it reads the correct value. So only with your patch
  it works like it is supposed to. Thanks!
  
  Tested-by: Adrian Reber adr...@lisas.de
 
 looks good to me.
 
 Acked-by: Utz Bacher utz.bac...@de.ibm.com

Thanks for looking over it!
Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-25 Thread Mark Nelson
On Wed, 25 Feb 2009 08:50:46 pm Geert Uytterhoeven wrote:
 On Wed, 25 Feb 2009, Mark Nelson wrote:
  On Tue, 24 Feb 2009 05:38:37 pm Sachin P. Sant wrote:
   Jan Kara wrote:
  Hmm, OK. But then I'm not sure how that can happen. Obviously, memcpy
somehow got beyond end of the page referenced by bh->b_data. So it means
that le16_to_cpu(entry->e_value_offs) + size > page_size. But
ext3_xattr_find_entry() calls ext3_xattr_check_entry() which in
particular checks whether e_value_offs + e_value_size isn't greater than
bh->b_size. So I see no way how memcpy can get beyond end of the page.
  Sachin, is the problem reproducible? If yes, can you send us contents
  
   Yes, i am able to recreate this problem easily. As i had mentioned if the
   earlier kernel is booted with selinux enabled and then 2.6.29-rc6 is 
   booted
   i get this crash. But if i specify selinux=0 at command line, 2.6.29-rc6 
   boots
   without any problem.
  
  Hi Sachin and Geert,
  
  Does the patch below fix the problems you're seeing? If it does I'll send
  a properly written up and formatted patch to linuxppc-dev (as well as
  another one to fix the same problem in copy_tofrom_user()).
 
 Unfortunately not, now it crashes while accessing the memory pointed to by
 GPR16, in
 
  NIP: copy_page_range+0x608/0x628
 LR:  dup_mm+0x2e4/0x428
 Trace: debug_table+0xcc70/0x1afe0 (unreliable)
 dup_mm+0x2e4/0x428
 copy_process+0x86c/0xf9c
 do_fork+0x188/0x39c
 sys_clone+0x58/0x70
 ppc_clone+0x8/0xc
 
 However, after reverting 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556, I still see
 similar problems as above (crash in copy_page_range()).
 Which makes me think that
   1. Your new patch fixes the problem introduced by 25d6e2d7,
   2. There's still another issue than the one introduced by 25d6e2d7.

Does the following patch fix the errors you're seeing? (it applies the
same fix as the previous patch but this time to copy_tofrom_user, which
I updated in a4e22f02f5b6518c1484faea1f88d81802b9feac)

Thanks!

Mark

---
 arch/powerpc/lib/copyuser_64.S |   38 +++---
 1 file changed, 31 insertions(+), 7 deletions(-)

Index: upstream/arch/powerpc/lib/copyuser_64.S
===
--- upstream.orig/arch/powerpc/lib/copyuser_64.S
+++ upstream/arch/powerpc/lib/copyuser_64.S
@@ -62,18 +62,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 72:std r8,8(r3)
beq+3f
addir3,r3,16
-23:ld  r9,8(r4)
 .Ldo_tail:
bf  cr7*4+1,1f
-   rotldi  r9,r9,32
+23:lwz r9,8(r4)
+   addir4,r4,4
 73:stw r9,0(r3)
addir3,r3,4
 1: bf  cr7*4+2,2f
-   rotldi  r9,r9,16
+44:lhz r9,8(r4)
+   addir4,r4,2
 74:sth r9,0(r3)
addir3,r3,2
 2: bf  cr7*4+3,3f
-   rotldi  r9,r9,8
+45:lbz r9,8(r4)
 75:stb r9,0(r3)
 3: li  r3,0
blr
@@ -141,11 +142,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 6: cmpwi   cr1,r5,8
addir3,r3,32
sld r9,r9,r10
-   ble cr1,.Ldo_tail
+   ble cr1,7f
 34:ld  r0,8(r4)
srd r7,r0,r11
or  r9,r7,r9
-   b   .Ldo_tail
+7:
+   bf  cr7*4+1,1f
+   rotldi  r9,r9,32
+94:stw r9,0(r3)
+   addir3,r3,4
+1: bf  cr7*4+2,2f
+   rotldi  r9,r9,16
+95:sth r9,0(r3)
+   addir3,r3,2
+2: bf  cr7*4+3,3f
+   rotldi  r9,r9,8
+96:stb r9,0(r3)
+3: li  r3,0
+   blr
 
 .Ldst_unaligned:
PPC_MTOCRF  0x01,r6 /* put #bytes to 8B bdry into cr7 */
@@ -218,7 +232,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 121:
 132:
addir3,r3,8
-123:
 134:
 135:
 138:
@@ -226,6 +239,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 140:
 141:
 142:
+123:
+144:
+145:
 
 /*
  * here we have had a fault on a load and r3 points to the first
@@ -309,6 +325,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 187:
 188:
 189:   
+194:
+195:
+196:
 1:
ld  r6,-24(r1)
ld  r5,-8(r1)
@@ -329,7 +348,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
.llong  72b,172b
.llong  23b,123b
.llong  73b,173b
+   .llong  44b,144b
.llong  74b,174b
+   .llong  45b,145b
.llong  75b,175b
.llong  24b,124b
.llong  25b,125b
@@ -347,6 +368,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
.llong  79b,179b
.llong  80b,180b
.llong  34b,134b
+   .llong  94b,194b
+   .llong  95b,195b
+   .llong  96b,196b
.llong  35b,135b
.llong  81b,181b
.llong  36b,136b
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-25 Thread Mark Nelson
On Wed, 25 Feb 2009 10:08:22 pm Sachin P. Sant wrote:
 Mark Nelson wrote:
  Hi Sanchin and Geert,
 
  Does the patch below fix the problems you're seeing? If it does I'll send
  a properly written up and formatted patch to linuxppc-dev (as well as
  another one to fix the same problem in copy_tofrom_user()).

 This patch fixes the issue at my side. I tried booting the system few times
 and every single time it came up clean.

Good to hear. Thanks for testing, Sachin!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-25 Thread Mark Nelson
On Thu, 26 Feb 2009 12:31:20 am Geert Uytterhoeven wrote:
 On Wed, 25 Feb 2009, Mark Nelson wrote:
  On Wed, 25 Feb 2009 08:50:46 pm Geert Uytterhoeven wrote:
   On Wed, 25 Feb 2009, Mark Nelson wrote:
On Tue, 24 Feb 2009 05:38:37 pm Sachin P. Sant wrote:
 Jan Kara wrote:
Hmm, OK. But then I'm not sure how that can happen. Obviously, 
  memcpy
  somehow got beyond end of the page referenced by bh->b_data. So it 
  means
  that le16_to_cpu(entry->e_value_offs) + size > page_size. But
  ext3_xattr_find_entry() calls ext3_xattr_check_entry() which in
  particular checks whether e_value_offs + e_value_size isn't greater 
  than
  bh->b_size. So I see no way how memcpy can get beyond end of the 
  page.
Sachin, is the problem reproducible? If yes, can you send us 
  contents

 Yes, i am able to recreate this problem easily. As i had mentioned if 
 the
 earlier kernel is booted with selinux enabled and then 2.6.29-rc6 is 
 booted
 i get this crash. But if i specify selinux=0 at command line, 
 2.6.29-rc6 boots
 without any problem.

Hi Sachin and Geert,

Does the patch below fix the problems you're seeing? If it does I'll 
send
a properly written up and formatted patch to linuxppc-dev (as well as
another one to fix the same problem in copy_tofrom_user()).
   
   Unfortunately not, now it crashes while accessing the memory pointed to by
   GPR16, in
   
   NIP: copy_page_range+0x608/0x628
   LR:  dup_mm+0x2e4/0x428
   Trace: debug_table+0xcc70/0x1afe0 (unreliable)
   dup_mm+0x2e4/0x428
   copy_process+0x86c/0xf9c
   do_fork+0x188/0x39c
   sys_clone+0x58/0x70
   ppc_clone+0x8/0xc
   
   However, after reverting 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556, I 
   still see
   similar problems as above (crash in copy_page_range()).
   Which makes me think that
 1. Your new patch fixes the problem introduced by 25d6e2d7,
 2. There's still another issue than the one introduced by 25d6e2d7.
  
  Does the following patch fix the errors you're seeing? (it applies the
  same fix as the previous patch but this time to copy_tofrom_user, which
  I updated in a4e22f02f5b6518c1484faea1f88d81802b9feac)
 
 Thanks, but I still get crashes in copy_page_range().
 

Hmmm... I'm out of ideas for the moment, but thanks for testing anyway!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-25 Thread Mark Nelson
On Thu, 26 Feb 2009 09:45:41 am Mark Nelson wrote:
 On Thu, 26 Feb 2009 12:31:20 am Geert Uytterhoeven wrote:
  On Wed, 25 Feb 2009, Mark Nelson wrote:
   On Wed, 25 Feb 2009 08:50:46 pm Geert Uytterhoeven wrote:
On Wed, 25 Feb 2009, Mark Nelson wrote:
 On Tue, 24 Feb 2009 05:38:37 pm Sachin P. Sant wrote:
  Jan Kara wrote:
 Hmm, OK. But then I'm not sure how that can happen. Obviously, 
   memcpy
   somehow got beyond end of the page referenced by bh->b_data. So 
   it means
   that le16_to_cpu(entry->e_value_offs) + size > page_size. But
   ext3_xattr_find_entry() calls ext3_xattr_check_entry() which in
   particular checks whether e_value_offs + e_value_size isn't 
   greater than
   bh->b_size. So I see no way how memcpy can get beyond end of the 
   page.
 Sachin, is the problem reproducible? If yes, can you send us 
   contents
 
  Yes, i am able to recreate this problem easily. As i had mentioned 
  if the
  earlier kernel is booted with selinux enabled and then 2.6.29-rc6 
  is booted
  i get this crash. But if i specify selinux=0 at command line, 
  2.6.29-rc6 boots
  without any problem.
 
  Hi Sachin and Geert,
 
 Does the patch below fix the problems you're seeing? If it does I'll 
 send
 a properly written up and formatted patch to linuxppc-dev (as well as
 another one to fix the same problem in copy_tofrom_user()).

Unfortunately not, now it crashes while accessing the memory pointed to 
by
GPR16, in

 NIP: copy_page_range+0x608/0x628
LR:  dup_mm+0x2e4/0x428
Trace: debug_table+0xcc70/0x1afe0 (unreliable)
dup_mm+0x2e4/0x428
copy_process+0x86c/0xf9c
do_fork+0x188/0x39c
sys_clone+0x58/0x70
ppc_clone+0x8/0xc

However, after reverting 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556, I 
still see
similar problems as above (crash in copy_page_range()).
Which makes me think that
  1. Your new patch fixes the problem introduced by 25d6e2d7,
  2. There's still another issue than the one introduced by 25d6e2d7.
   
   Does the following patch fix the errors you're seeing? (it applies the
   same fix as the previous patch but this time to copy_tofrom_user, which
   I updated in a4e22f02f5b6518c1484faea1f88d81802b9feac)
  
  Thanks, but I still get crashes in copy_page_range().
  
 
 Hmmm... I'm out of ideas for the moment, but thanks for testing anyway!
 
 Mark
 ___
 Linuxppc-dev mailing list
 Linuxppc-dev@ozlabs.org
 https://ozlabs.org/mailman/listinfo/linuxppc-dev
 

If you revert both 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556 and
a4e22f02f5b6518c1484faea1f88d81802b9feac, does it help? You could also
try to revert 57dda6ef5bd5b9e60410477ad29e654097e2cca1 just in case I
need to keep wearing the brown paper bag for a bit longer :)

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH] powerpc: Fix 64bit memcpy() regression

2009-02-25 Thread Mark Nelson
This fixes a regression introduced by commit
25d6e2d7c58ddc4a3b614fc5381591c0cfe66556 (powerpc: Update 64bit memcpy()
using CPU_FTR_UNALIGNED_LD_STD).

This commit allowed CPUs that have the CPU_FTR_UNALIGNED_LD_STD CPU
feature bit present to do the memcpy() with unaligned load doubles. But,
along with this came a bug where our final load double would read bytes
beyond a page boundary and into the next (unmapped) page. This was caught
by enabling CONFIG_DEBUG_PAGEALLOC, 

The fix was to read only the number of bytes that we need to store rather
than reading a full 8-byte doubleword and storing only a portion of that.

In order to minimise the amount of existing code touched we use the
original do_tail for the src_unaligned case.
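
In C terms, the idea of the fix is roughly the following (a sketch
only; the real change is in the assembly tail below, and src, dst and
len are illustrative names):

	/* before: one 8-byte load could read past src + len and fault
	 * on the next (unmapped) page */
	u64 w = *(const u64 *)src;
	/* ...then store only the low 'len' bytes of w piecewise... */

	/* after: load exactly the bytes that will be stored */
	if (len & 4) { *(u32 *)dst = *(const u32 *)src; src += 4; dst += 4; }
	if (len & 2) { *(u16 *)dst = *(const u16 *)src; src += 2; dst += 2; }
	if (len & 1) { *(u8 *)dst = *(const u8 *)src; }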

Below is an example of the regression, as reported by Sachin Sant:

Unable to handle kernel paging request for data at address 0xc0003f38
Faulting instruction address: 0xc0039574
cpu 0x1: Vector: 300 (Data Access) at [c0003baf3020]
pc: c0039574: .memcpy+0x74/0x244
lr: d244916c: .ext3_xattr_get+0x288/0x2f4 [ext3]
sp: c0003baf32a0
   msr: 80009032
   dar: c0003f38
 dsisr: 4000
  current = 0xc0003e54b010
  paca= 0xc0a53680
pid   = 1840, comm = readahead
enter ? for help
[link register   ] d244916c .ext3_xattr_get+0x288/0x2f4 [ext3]
[c0003baf32a0] d2449104 .ext3_xattr_get+0x220/0x2f4 [ext3]
(unreliable)
[c0003baf3390] d244a6e8 .ext3_xattr_security_get+0x40/0x5c [ext3]
[c0003baf3400] c0148154 .generic_getxattr+0x74/0x9c
[c0003baf34a0] c0333400 .inode_doinit_with_dentry+0x1c4/0x678
[c0003baf3560] c032c6b0 .security_d_instantiate+0x50/0x68
[c0003baf35e0] c013c818 .d_instantiate+0x78/0x9c
[c0003baf3680] c013ced0 .d_splice_alias+0xf0/0x120
[c0003baf3720] d243e05c .ext3_lookup+0xec/0x134 [ext3]
[c0003baf37c0] c0131e74 .do_lookup+0x110/0x260
[c0003baf3880] c0134ed0 .__link_path_walk+0xa98/0x1010
[c0003baf3970] c01354a0 .path_walk+0x58/0xc4
[c0003baf3a20] c0135720 .do_path_lookup+0x138/0x1e4
[c0003baf3ad0] c013645c .path_lookup_open+0x6c/0xc8
[c0003baf3b70] c0136780 .do_filp_open+0xcc/0x874
[c0003baf3d10] c01251e0 .do_sys_open+0x80/0x140
[c0003baf3dc0] c016aaec .compat_sys_open+0x24/0x38
[c0003baf3e30] c000855c syscall_exit+0x0/0x40
--- Exception: c01 (System Call) at 0ff0ef18
SP (ffc6f4b0) is in userspace
1:mon

Signed-off-by: Mark Nelson ma...@au1.ibm.com
Reported-by: Sachin Sant sach...@in.ibm.com
Tested-by: Sachin Sant sach...@in.ibm.com
---
 arch/powerpc/lib/memcpy_64.S |   26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

Index: upstream/arch/powerpc/lib/memcpy_64.S
===
--- upstream.orig/arch/powerpc/lib/memcpy_64.S
+++ upstream/arch/powerpc/lib/memcpy_64.S
@@ -53,18 +53,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 3: std r8,8(r3)
beq 3f
addir3,r3,16
-   ld  r9,8(r4)
 .Ldo_tail:
bf  cr7*4+1,1f
-   rotldi  r9,r9,32
+   lwz r9,8(r4)
+   addir4,r4,4
stw r9,0(r3)
addir3,r3,4
 1: bf  cr7*4+2,2f
-   rotldi  r9,r9,16
+   lhz r9,8(r4)
+   addir4,r4,2
sth r9,0(r3)
addir3,r3,2
 2: bf  cr7*4+3,3f
-   rotldi  r9,r9,8
+   lbz r9,8(r4)
stb r9,0(r3)
 3: ld  r3,48(r1)   /* return dest pointer */
blr
@@ -133,11 +134,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
cmpwi   cr1,r5,8
addir3,r3,32
sld r9,r9,r10
-   ble cr1,.Ldo_tail
+   ble cr1,6f
ld  r0,8(r4)
srd r7,r0,r11
or  r9,r7,r9
-   b   .Ldo_tail
+6:
+   bf  cr7*4+1,1f
+   rotldi  r9,r9,32
+   stw r9,0(r3)
+   addir3,r3,4
+1: bf  cr7*4+2,2f
+   rotldi  r9,r9,16
+   sth r9,0(r3)
+   addir3,r3,2
+2: bf  cr7*4+3,3f
+   rotldi  r9,r9,8
+   stb r9,0(r3)
+3: ld  r3,48(r1)   /* return dest pointer */
+   blr
 
 .Ldst_unaligned:
PPC_MTOCRF  0x01,r6 # put #bytes to 8B bdry into cr7
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH] powerpc: Fix 64bit __copy_tofrom_user() regression

2009-02-25 Thread Mark Nelson
This fixes a regression introduced by commit
a4e22f02f5b6518c1484faea1f88d81802b9feac (powerpc: Update 64bit
__copy_tofrom_user() using CPU_FTR_UNALIGNED_LD_STD).

The same bug that existed in the 64bit memcpy() also exists here so fix
it here too. The fix is the same as that applied to memcpy() with the
addition of fixes for the exception handling code required for
__copy_tofrom_user().

This stops us reading beyond the end of the source region we were told
to copy.

Signed-off-by: Mark Nelson ma...@au1.ibm.com
---
 arch/powerpc/lib/copyuser_64.S |   38 +++---
 1 file changed, 31 insertions(+), 7 deletions(-)

Index: upstream/arch/powerpc/lib/copyuser_64.S
===
--- upstream.orig/arch/powerpc/lib/copyuser_64.S
+++ upstream/arch/powerpc/lib/copyuser_64.S
@@ -62,18 +62,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 72:std r8,8(r3)
beq+3f
addir3,r3,16
-23:ld  r9,8(r4)
 .Ldo_tail:
bf  cr7*4+1,1f
-   rotldi  r9,r9,32
+23:lwz r9,8(r4)
+   addir4,r4,4
 73:stw r9,0(r3)
addir3,r3,4
 1: bf  cr7*4+2,2f
-   rotldi  r9,r9,16
+44:lhz r9,8(r4)
+   addir4,r4,2
 74:sth r9,0(r3)
addir3,r3,2
 2: bf  cr7*4+3,3f
-   rotldi  r9,r9,8
+45:lbz r9,8(r4)
 75:stb r9,0(r3)
 3: li  r3,0
blr
@@ -141,11 +142,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 6: cmpwi   cr1,r5,8
addir3,r3,32
sld r9,r9,r10
-   ble cr1,.Ldo_tail
+   ble cr1,7f
 34:ld  r0,8(r4)
srd r7,r0,r11
or  r9,r7,r9
-   b   .Ldo_tail
+7:
+   bf  cr7*4+1,1f
+   rotldi  r9,r9,32
+94:stw r9,0(r3)
+   addir3,r3,4
+1: bf  cr7*4+2,2f
+   rotldi  r9,r9,16
+95:sth r9,0(r3)
+   addir3,r3,2
+2: bf  cr7*4+3,3f
+   rotldi  r9,r9,8
+96:stb r9,0(r3)
+3: li  r3,0
+   blr
 
 .Ldst_unaligned:
PPC_MTOCRF  0x01,r6 /* put #bytes to 8B bdry into cr7 */
@@ -218,7 +232,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 121:
 132:
addir3,r3,8
-123:
 134:
 135:
 138:
@@ -226,6 +239,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 140:
 141:
 142:
+123:
+144:
+145:
 
 /*
  * here we have had a fault on a load and r3 points to the first
@@ -309,6 +325,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 187:
 188:
 189:   
+194:
+195:
+196:
 1:
ld  r6,-24(r1)
ld  r5,-8(r1)
@@ -329,7 +348,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
.llong  72b,172b
.llong  23b,123b
.llong  73b,173b
+   .llong  44b,144b
.llong  74b,174b
+   .llong  45b,145b
.llong  75b,175b
.llong  24b,124b
.llong  25b,125b
@@ -347,6 +368,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
.llong  79b,179b
.llong  80b,180b
.llong  34b,134b
+   .llong  94b,194b
+   .llong  95b,195b
+   .llong  96b,196b
.llong  35b,135b
.llong  81b,181b
.llong  36b,136b
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-24 Thread Mark Nelson
On Wed, 25 Feb 2009 02:51:20 am Jan Kara wrote:
   Hello,
 
 On Tue 24-02-09 12:08:37, Sachin P. Sant wrote:
  Jan Kara wrote:
Hmm, OK. But then I'm not sure how that can happen. Obviously, memcpy
  somehow got beyond end of the page referenced by bh->b_data. So it means
  that le16_to_cpu(entry->e_value_offs) + size > page_size. But
  ext3_xattr_find_entry() calls ext3_xattr_check_entry() which in
  particular checks whether e_value_offs + e_value_size isn't greater than
  bh->b_size. So I see no way how memcpy can get beyond end of the page.
Sachin, is the problem reproducible? If yes, can you send us contents

  Yes, i am able to recreate this problem easily. As i had mentioned if the
  earlier kernel is booted with selinux enabled and then 2.6.29-rc6 is booted
  i get this crash. But if i specify selinux=0 at command line, 2.6.29-rc6 
  boots
  without any problem.
 
  of the page just before the faulting address (i.e., for current fault it
  would be 0xc0003f37-0xc0003f37). As far as I can
  remember powerpc monitor could dump it.

  Here is the page dump. This time it crashed while accessing address
  0xc0002d67.
   Thanks for the dump.
 
  Unable to handle kernel paging request for data at address 0xc0002d67
  Faulting instruction address: 0xc0039574
  cpu 0x1: Vector: 300 (Data Access) at [c0004288b0b0]
 pc: c0039574: .memcpy+0x74/0x244
 lr: c01b497c: .ext3_xattr_get+0x288/0x2f4
 sp: c0004288b330
msr: 80009032
 
  1:mon d 0xc0002d66
  ... SNIP ...
 
  c0002d66efd0    ||
  c0002d66efe0    ||
  c0002d66eff0    ||
  c0002d66f000 02ea0004 0100e200d20a  ||
  c0002d66f010    ||
  c0002d66f020 0706e40f 1b00e200d20a  ||
  c0002d66f030 73656c696e757800   |selinux.|
  c0002d66f040    ||
  c0002d66f050    ||
  c0002d66f060    ||
 
  ... SNIP ...
 
  c0002d66ff60    ||
  c0002d66ff70    ||
  c0002d66ff80    ||
  c0002d66ff90    ||
  c0002d66ffa0    ||
  c0002d66ffb0    ||
  c0002d66ffc0    ||
  c0002d66ffd0    ||
  c0002d66ffe0 73797374 656d5f753a6f626a  |system_u:obj|
  c0002d66fff0 6563745f723a7573 725f743a7330  |ect_r:usr_t:s0..|
  c0002d67    ||
  1:mon r
  R00 = e40f   R16 = 005d
  R01 = c0004288b330   R17 = 
  R02 = c09f59b8   R18 = fffbfe9e
  R03 = c00044aa34a0   R19 = 10042638
  R04 = c0002d66fff4   R20 = 10041610
  R05 = 0003   R21 = 00ff
  R06 =    R22 = 0006
  R07 = 0001   R23 = c07d27c1
  R08 = 723a7573725f743a   R24 = c0002c0cd758
  R09 = 3a6f626a6563745f   R25 = c00044aa3488
  R10 = c017b43c   R26 = c0002c0cd6f0
  R11 = c0002d66f020   R27 = c0002c0cd860
  R12 = d23c14b0   R28 = c0002c0b0840
  R13 = c0a93680   R29 = 001b
  R14 = 41ed   R30 = c09880b0
  R15 = 1004   R31 = ffde
  pc  = c0039574 .memcpy+0x74/0x244
  lr  = c01b497c .ext3_xattr_get+0x288/0x2f4
  msr = 80009032   cr  = 4400044b
  ctr =    xer = 2001   trap =  300
  dar = c0002d67   dsisr = 4000
  1:mon zr
 
BTW, I suppose you use 4KB blocksize on the filesystem, right?

  Yes.
 
  dumpe2fs /dev/sda3 | grep -i "block size"
  dumpe2fs 1.39 (29-May-2006)
  Block size:   4096
   OK. The xattr block causing oops is completely correct. To me it seems
 more like some problem in powerpc memcpy() (I saw there went some changes
 into it in the end of December) - we call it to copy 27 bytes from
 address 0xc0002d66ffe4 (which is one byte before end of the page).
 Could some of the powerpc guys have a look whether this could be the case?
 I'm not quite fluent in the powerpc assembly so it would take me ages ;).

You're right - it's a problem with the 64bit memcpy().

Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-24 Thread Mark Nelson
On Wed, 25 Feb 2009 05:01:59 am Geert Uytterhoeven wrote:
 On Mon, 23 Feb 2009, Paul Mackerras wrote:
  Andrew Morton writes:
   It looks like we died in ext3_xattr_block_get():
   
	memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
size);
   
   Perhaps entry->e_value_offs is no good.  I wonder if the filesystem is
   corrupted and this snuck through the defenses.
   
   I also wonder if there is enough info in that trace for a ppc person to
   be able to determine whether the faulting address is in the source or
   destination of the memcpy() (please)?
  
  It appears to have faulted on a load, implicating the source.  The
  address being referenced (0xc0003f38) doesn't look
  outlandish.  I wonder if this kernel has CONFIG_DEBUG_PAGEALLOC turned
  on, and what page size is selected?
 
 I'm seeing a similar thing on PS3, but not in ext3. During early userspace
 setup (udevd), it crashes accessing a 0xc00* address in:
 
 | NIP setup+0x20/0x130
 | LR copy_user_page+0x18/0x6c
 | Call trace:
 | do_wp_page+0x5b4/0x89c
 | do_page_fault+0x3a8/0x58c
 | handle_page_fault+0x20/0x5c
 
 I have CONFIG_DEBUG_PAGEALLOC=y. If I disable it, the system boots fine.
 
 If needed, I can probably bisect this tomorrow. It definitely didn't happen in
 2.6.29-rc5.

No need to bisect - it was 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556, my
commit that optimised 64bit memcpy() for Power6 and Cell.

The bug was in -rc1, but if your copies were 8-byte aligned with respect
to the source the problem wouldn't have been seen... Could this have
been why you didn't see it in -rc5?

I'll work on a fix now.

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: Crash (ext3 ) during 2.6.29-rc6 boot

2009-02-24 Thread Mark Nelson
On Tue, 24 Feb 2009 05:38:37 pm Sachin P. Sant wrote:
 Jan Kara wrote:
Hmm, OK. But then I'm not sure how that can happen. Obviously, memcpy
  somehow got beyond end of the page referenced by bh->b_data. So it means
  that le16_to_cpu(entry->e_value_offs) + size > page_size. But
  ext3_xattr_find_entry() calls ext3_xattr_check_entry() which in
  particular checks whether e_value_offs + e_value_size isn't greater than
  bh->b_size. So I see no way how memcpy can get beyond end of the page.
Sachin, is the problem reproducible? If yes, can you send us contents

 Yes, i am able to recreate this problem easily. As i had mentioned if the
 earlier kernel is booted with selinux enabled and then 2.6.29-rc6 is booted
 i get this crash. But if i specify selinux=0 at command line, 2.6.29-rc6 boots
 without any problem.

Hi Sachin and Geert,

Does the patch below fix the problems you're seeing? If it does I'll send
a properly written up and formatted patch to linuxppc-dev (as well as
another one to fix the same problem in copy_tofrom_user()).

Thanks and sorry again!

Mark

---
 arch/powerpc/lib/memcpy_64.S |   26 --
 1 file changed, 20 insertions(+), 6 deletions(-)

Index: upstream/arch/powerpc/lib/memcpy_64.S
===
--- upstream.orig/arch/powerpc/lib/memcpy_64.S
+++ upstream/arch/powerpc/lib/memcpy_64.S
@@ -53,18 +53,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
 3: std r8,8(r3)
beq 3f
addir3,r3,16
-   ld  r9,8(r4)
 .Ldo_tail:
bf  cr7*4+1,1f
-   rotldi  r9,r9,32
+   lwz r9,8(r4)
+   addir4,r4,4
stw r9,0(r3)
addir3,r3,4
 1: bf  cr7*4+2,2f
-   rotldi  r9,r9,16
+   lhz r9,8(r4)
+   addir4,r4,2
sth r9,0(r3)
addir3,r3,2
 2: bf  cr7*4+3,3f
-   rotldi  r9,r9,8
+   lbz r9,8(r4)
stb r9,0(r3)
 3: ld  r3,48(r1)   /* return dest pointer */
blr
@@ -133,11 +134,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_
cmpwi   cr1,r5,8
addir3,r3,32
sld r9,r9,r10
-   ble cr1,.Ldo_tail
+   ble cr1,6f
ld  r0,8(r4)
srd r7,r0,r11
or  r9,r7,r9
-   b   .Ldo_tail
+6:
+   bf  cr7*4+1,1f
+   rotldi  r9,r9,32
+   stw r9,0(r3)
+   addir3,r3,4
+1: bf  cr7*4+2,2f
+   rotldi  r9,r9,16
+   sth r9,0(r3)
+   addir3,r3,2
+2: bf  cr7*4+3,3f
+   rotldi  r9,r9,8
+   stb r9,0(r3)
+3: ld  r3,48(r1)   /* return dest pointer */
+   blr
 
 .Ldst_unaligned:
PPC_MTOCRF  0x01,r6 # put #bytes to 8B bdry into cr7
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH] powerpc: Update 64bit __copy_tofrom_user() using CPU_FTR_UNALIGNED_LD_STD

2008-11-11 Thread Mark Nelson
In exactly the same way that we updated memcpy() with new feature sections
in commit 25d6e2d7c58ddc4a3b614fc5381591c0cfe66556 we do the same thing
here for __copy_tofrom_user(). Once again this is purely a performance
tweak for Cell and Power6 - this has no effect on all the other 64bit
powerpc chips.

We can make these same changes to __copy_tofrom_user() because the basic
copy algorithm is the same as in memcpy() - this version just has all the
exception handling logic needed when copying to or from userspace as well
as a special case for copying whole 4K pages that are page aligned.

The CPU_FTR_UNALIGNED_LD_STD CPU feature bit was added in commit
4ec577a28980a0790df3c3dfe9c81f6eacfb.

We also make the same simple one line change from cmpldi r1,... to cmpldi
cr1,... for consistency.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/lib/copyuser_64.S |   17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

Index: upstream/arch/powerpc/lib/copyuser_64.S
===
--- upstream.orig/arch/powerpc/lib/copyuser_64.S
+++ upstream/arch/powerpc/lib/copyuser_64.S
@@ -26,11 +26,24 @@ _GLOBAL(__copy_tofrom_user)
andi.   r6,r6,7
PPC_MTOCRF  0x01,r5
blt cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+ * CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+ * cleared.
+ * At the time of writing the only CPU that has this combination of bits
+ * set is Power6.
+ */
+BEGIN_FTR_SECTION
+   nop
+FTR_SECTION_ELSE
bne .Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+   CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-   andi.   r0,r4,7
addir3,r3,-16
+BEGIN_FTR_SECTION
+   andi.   r0,r4,7
bne .Lsrc_unaligned
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
srdir7,r5,4
 20:ld  r9,0(r4)
addir4,r4,-8
@@ -138,7 +151,7 @@ _GLOBAL(__copy_tofrom_user)
PPC_MTOCRF  0x01,r6 /* put #bytes to 8B bdry into cr7 */
subfr5,r6,r5
li  r7,0
-   cmpldi  r1,r5,16
+   cmpldi  cr1,r5,16
bf  cr7*4+3,1f
 35:lbz r0,0(r4)
 81:stb r0,0(r3)
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 1/2] powerpc: Update remaining dma_mapping_ops to use map/unmap_page

2008-10-28 Thread Mark Nelson
After the merge of the 32 and 64bit DMA code, dma_direct_ops lost their
map/unmap_single() functions but gained map/unmap_page(). This caused a
problem for Cell because Cell's dma_iommu_fixed_ops called the dma_direct_ops
if the fixed linear mapping was to be used or the iommu ops if the dynamic
window was to be used. So in order to fix this problem we need to update the
64bit DMA code to use map/unmap_page.

First, we update the generic IOMMU code so that iommu_map_single() becomes
iommu_map_page() and iommu_unmap_single() becomes iommu_unmap_page(). Then we
propagate these changes up through all the callers of these two functions and
in the process update all the dma_mapping_ops so that they have map/unmap_page
rahter than map/unmap_single. We can do this because on 64bit there is no
HIGHMEM memory so map/unmap_page ends up performing exactly the same function
as map/unmap_single, just taking different arguments.

This has no effect on drivers because the dma_map_single_attrs() just ends up
calling the map_page() function of the appropriate dma_mapping_ops and
similarly the dma_unmap_single_attrs() calls unmap_page().
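
At the dma_map_single_attrs() level the call simply becomes (this is
the code the follow-up cleanup patch below makes unconditional; dma_ops
here is get_dma_ops(dev)):

	return dma_ops->map_page(dev, virt_to_page(cpu_addr),
				 (unsigned long)cpu_addr % PAGE_SIZE,
				 size, direction, attrs);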

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
Is it possible to get this in for 2.6.28 because currently Cell blades oops
on boot because they're calling dma_direct_ops.map_single (which is NULL)?

 arch/powerpc/include/asm/iommu.h|   15 ++--
 arch/powerpc/kernel/dma-iommu.c |   34 +
 arch/powerpc/kernel/ibmebus.c   |   29 -
 arch/powerpc/kernel/iommu.c |   22 ++-
 arch/powerpc/kernel/vio.c   |   25 +++--
 arch/powerpc/platforms/cell/iommu.c |   37 +++-
 arch/powerpc/platforms/iseries/iommu.c  |7 +++---
 arch/powerpc/platforms/ps3/system-bus.c |   36 ---
 8 files changed, 105 insertions(+), 100 deletions(-)

Index: upstream/arch/powerpc/include/asm/iommu.h
===
--- upstream.orig/arch/powerpc/include/asm/iommu.h
+++ upstream/arch/powerpc/include/asm/iommu.h
@@ -92,13 +92,14 @@ extern void *iommu_alloc_coherent(struct
  unsigned long mask, gfp_t flag, int node);
 extern void iommu_free_coherent(struct iommu_table *tbl, size_t size,
void *vaddr, dma_addr_t dma_handle);
-extern dma_addr_t iommu_map_single(struct device *dev, struct iommu_table *tbl,
-  void *vaddr, size_t size, unsigned long mask,
-  enum dma_data_direction direction,
-  struct dma_attrs *attrs);
-extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
-  size_t size, enum dma_data_direction direction,
-  struct dma_attrs *attrs);
+extern dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
+struct page *page, unsigned long offset,
+size_t size, unsigned long mask,
+enum dma_data_direction direction,
+struct dma_attrs *attrs);
+extern void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
+size_t size, enum dma_data_direction direction,
+struct dma_attrs *attrs);
 
 extern void iommu_init_early_pSeries(void);
 extern void iommu_init_early_iSeries(void);
Index: upstream/arch/powerpc/kernel/dma-iommu.c
===
--- upstream.orig/arch/powerpc/kernel/dma-iommu.c
+++ upstream/arch/powerpc/kernel/dma-iommu.c
@@ -30,28 +30,26 @@ static void dma_iommu_free_coherent(stru
 }
 
 /* Creates TCEs for a user provided buffer.  The user buffer must be
- * contiguous real kernel storage (not vmalloc).  The address of the buffer
- * passed here is the kernel (virtual) address of the buffer.  The buffer
- * need not be page aligned, the dma_addr_t returned will point to the same
- * byte within the page as vaddr.
+ * contiguous real kernel storage (not vmalloc).  The address passed here
+ * comprises a page address and offset into that page. The dma_addr_t
+ * returned will point to the same byte within the page as was passed in.
  */
-static dma_addr_t dma_iommu_map_single(struct device *dev, void *vaddr,
-  size_t size,
-  enum dma_data_direction direction,
-  struct dma_attrs *attrs)
+static dma_addr_t dma_iommu_map_page(struct device *dev, struct page *page,
+unsigned long offset, size_t size,
+enum dma_data_direction direction,
+struct dma_attrs *attrs

[PATCH 2/2] powerpc: Remove map_/unmap_single() from dma_mapping_ops

2008-10-28 Thread Mark Nelson
Now that all of the remaining dma_mapping_ops have had their map_/unmap_single
functions updated to become map/unmap_page functions, there is no need to have
the map_/unmap_single function pointers in the dma_mapping_ops.

So, take this opportunity to remove them and also remove the code that does
the checking for which set of functions to use.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
This is just a cleanup so can wait for 2.6.29

 arch/powerpc/include/asm/dma-mapping.h |   36 -
 1 file changed, 5 insertions(+), 31 deletions(-)

Index: upstream/arch/powerpc/include/asm/dma-mapping.h
===
--- upstream.orig/arch/powerpc/include/asm/dma-mapping.h
+++ upstream/arch/powerpc/include/asm/dma-mapping.h
@@ -60,12 +60,6 @@ struct dma_mapping_ops {
dma_addr_t *dma_handle, gfp_t flag);
void(*free_coherent)(struct device *dev, size_t size,
void *vaddr, dma_addr_t dma_handle);
-   dma_addr_t  (*map_single)(struct device *dev, void *ptr,
-   size_t size, enum dma_data_direction direction,
-   struct dma_attrs *attrs);
-   void(*unmap_single)(struct device *dev, dma_addr_t dma_addr,
-   size_t size, enum dma_data_direction direction,
-   struct dma_attrs *attrs);
int (*map_sg)(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction direction,
struct dma_attrs *attrs);
@@ -149,10 +143,9 @@ static inline int dma_set_mask(struct de
 }
 
 /*
- * TODO: map_/unmap_single will ideally go away, to be completely
- * replaced by map/unmap_page.   Until then, we allow dma_ops to have
- * one or the other, or both by checking to see if the specific
- * function requested exists; and if not, falling back on the other set.
+ * map_/unmap_single actually call through to map/unmap_page now that all the
+ * dma_mapping_ops have been converted over. We just have to get the page and
+ * offset to pass through to map_page
  */
 static inline dma_addr_t dma_map_single_attrs(struct device *dev,
  void *cpu_addr,
@@ -164,10 +157,6 @@ static inline dma_addr_t dma_map_single_
 
BUG_ON(!dma_ops);
 
-   if (dma_ops->map_single)
-   return dma_ops->map_single(dev, cpu_addr, size, direction,
-  attrs);
-
	return dma_ops->map_page(dev, virt_to_page(cpu_addr),
 (unsigned long)cpu_addr % PAGE_SIZE, size,
 direction, attrs);
@@ -183,11 +172,6 @@ static inline void dma_unmap_single_attr
 
BUG_ON(!dma_ops);
 
-   if (dma_ops->unmap_single) {
-   dma_ops->unmap_single(dev, dma_addr, size, direction, attrs);
-   return;
-   }
-
	dma_ops->unmap_page(dev, dma_addr, size, direction, attrs);
 }
 
@@ -201,12 +185,7 @@ static inline dma_addr_t dma_map_page_at
 
BUG_ON(!dma_ops);
 
-   if (dma_ops->map_page)
-   return dma_ops->map_page(dev, page, offset, size, direction,
-attrs);
-
-   return dma_ops->map_single(dev, page_address(page) + offset, size,
-  direction, attrs);
+   return dma_ops->map_page(dev, page, offset, size, direction, attrs);
 }
 
 static inline void dma_unmap_page_attrs(struct device *dev,
@@ -219,12 +198,7 @@ static inline void dma_unmap_page_attrs(
 
BUG_ON(!dma_ops);
 
-   if (dma_ops->unmap_page) {
-   dma_ops->unmap_page(dev, dma_address, size, direction, attrs);
-   return;
-   }
-
-   dma_ops->unmap_single(dev, dma_address, size, direction, attrs);
+   dma_ops->unmap_page(dev, dma_address, size, direction, attrs);
 }
 
 static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 1/2] powerpc: Add new CPU feature: CPU_FTR_UNALIGNED_LD_STD

2008-10-27 Thread Mark Nelson
Add a new CPU feature bit, CPU_FTR_UNALIGNED_LD_STD, to be added
to the 64bit powerpc chips that can do unaligned load double and
store double (and not take a performance hit from it).

This is added to Power6 and Cell and will be used in an upcoming
patch to do the alignment in memcpy() only on CPUs that require
it.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/include/asm/cputable.h |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

Index: upstream/arch/powerpc/include/asm/cputable.h
===
--- upstream.orig/arch/powerpc/include/asm/cputable.h
+++ upstream/arch/powerpc/include/asm/cputable.h
@@ -194,6 +194,7 @@ extern const char *powerpc_base_platform
 #define CPU_FTR_VSX			LONG_ASM_CONST(0x0010)
 #define CPU_FTR_SAO			LONG_ASM_CONST(0x0020)
 #define CPU_FTR_CP_USE_DCBTZ   LONG_ASM_CONST(0x0040)
+#define CPU_FTR_UNALIGNED_LD_STD   LONG_ASM_CONST(0x0080)
 
 #ifndef __ASSEMBLY__
 
@@ -404,7 +405,7 @@ extern const char *powerpc_base_platform
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
-   CPU_FTR_DSCR)
+   CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -415,7 +416,8 @@ extern const char *powerpc_base_platform
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | \
-   CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ)
+   CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ | \
+   CPU_FTR_UNALIGNED_LD_STD)
 #define CPU_FTRS_PA6T (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_CI_LARGE_PAGE | \
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 2/2] powerpc: Update 64bit memcpy() using CPU_FTR_UNALIGNED_LD_STD

2008-10-27 Thread Mark Nelson
Update memcpy() to add two new feature sections: one for aligning the
destination before copying and one for copying using aligned load
and store doubles.

These new feature sections will only affect Power6 and Cell because
the CPU feature bit was only added to these two processors.

Power6 gets its best performance in memcpy() when aligning neither the
source nor the destination, while Cell gets its best performance when
just the destination is aligned. But in order to save on CPU feature
bits we can use the previously added CPU_FTR_CP_USE_DCBTZ feature bit
to differentiate between Power6 and Cell (because CPU_FTR_CP_USE_DCBTZ
was added to Cell but not Power6).

The first feature section acts to nop out the branch that takes us to
the code that aligns us to an eight byte boundary for the destination.
We only want to nop out this branch on Power6.

So the ALT_FTR_SECTION_END() for this feature section creates a test
mask of the two feature bits ORed together and provides an expected
result of just CPU_FTR_UNALIGNED_LD_STD, thus we nop out the branch
if we're on a CPU that has CPU_FTR_UNALIGNED_LD_STD set and
CPU_FTR_CP_USE_DCBTZ unset.
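
In C terms, the test the fixup code applies at boot is roughly (a
sketch; cur_cpu_spec->cpu_features holds the bits listed in
cputable.h):

	if ((cur_cpu_spec->cpu_features &
	     (CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ)) ==
	    CPU_FTR_UNALIGNED_LD_STD)
		/* keep the BEGIN_FTR_SECTION block: the nop */;
	else
		/* patch in the FTR_SECTION_ELSE block: the bne */;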

For the second feature section added, if we're on a CPU that has the
CPU_FTR_UNALIGNED_LD_STD bit set then we don't want to do the copy
with aligned loads and stores (and the appropriate shifting left and
right instructions), so we want to nop out the branch to
.Lsrc_unaligned.

The andi. used for this branch is moved to just above the branch
because this allows us to nop out both instructions with just one
feature section which gives us better performance and doesn't hurt
readability which two separate feature sections did.

Moving the andi. to just above the branch doesn't have any noticeable
negative effect on the remaining 64bit processors (the ones that
didn't have this feature bit added).

On Cell this simple modification results in an improvement to measured
memcpy() bandwidth of up to 50% in the hot cache case and up to 15% in
the cold cache case

On Power6 we get memory bandwidth results that are up to three times
faster in the hot cache case and up to 50% faster in the cold cache
case.

CPU_FTR_CP_USE_DCBTZ was added in commit
2a9294369bd020db89bfdf78b84c3615b39a5c84.

To say that Cell gets its best performance in memcpy() with just the
destination aligned is true but only for the reason that the indirect
shift and rotate instructions, sld and srd, are microcoded on Cell.
This means that either the destination or the source can be aligned,
but not both, and seeing as we get better performance with the
destination aligned we choose this option.

While we're at it make a one line change from cmpldi r1,... to
cmpldi cr1,... for consistency.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/lib/memcpy_64.S |   16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

Index: upstream/arch/powerpc/lib/memcpy_64.S
===
--- upstream.orig/arch/powerpc/lib/memcpy_64.S
+++ upstream/arch/powerpc/lib/memcpy_64.S
@@ -18,11 +18,23 @@ _GLOBAL(memcpy)
andi.   r6,r6,7
dcbt0,r4
blt cr1,.Lshort_copy
+/* Below we want to nop out the bne if we're on a CPU that has the
+   CPU_FTR_UNALIGNED_LD_STD bit set and the CPU_FTR_CP_USE_DCBTZ bit
+   cleared.
+   At the time of writing the only CPU that has this combination of bits
+   set is Power6. */
+BEGIN_FTR_SECTION
+   nop
+FTR_SECTION_ELSE
bne .Ldst_unaligned
+ALT_FTR_SECTION_END(CPU_FTR_UNALIGNED_LD_STD | CPU_FTR_CP_USE_DCBTZ, \
+CPU_FTR_UNALIGNED_LD_STD)
 .Ldst_aligned:
-   andi.   r0,r4,7
addir3,r3,-16
+BEGIN_FTR_SECTION
+   andi.   r0,r4,7
bne .Lsrc_unaligned
+END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
srdir7,r5,4
ld  r9,0(r4)
addir4,r4,-8
@@ -131,7 +143,7 @@ _GLOBAL(memcpy)
PPC_MTOCRF  0x01,r6 # put #bytes to 8B bdry into cr7
subfr5,r6,r5
li  r7,0
-   cmpldi  r1,r5,16
+   cmpldi  cr1,r5,16
bf  cr7*4+3,1f
lbz r0,0(r4)
stb r0,0(r3)
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 0/2] powerpc: new copy_4K_page()

2008-08-21 Thread Mark Nelson
On Thu, 14 Aug 2008 04:17:32 pm Mark Nelson wrote:
 Hi All,
 
 What follows is an updated version of copy_4K_page that has been tuned
 for the Cell processor. With this new routine it was found that the
 system time measured when compiling a 2.6.26 pseries_defconfig was
 reduced by ~10s:
 
 mainline (2.6.27-rc1-00632-g2e1e921):
 
 real17m8.727s
 user59m48.693s
 sys 3m56.089s
 
 real17m9.350s
 user59m44.822s
 sys 3m56.666s
 
 new routine:
 
 real17m7.311s
 user59m51.339s
 sys 3m47.043s
 
 real17m7.863s
 user59m49.028s
 sys 3m46.608s
 
 This same routine was also found to improve performance on 970 CPUs
 too (but by a much smaller amount):
 
 mainline (2.6.27-rc1-00632-g2e1e921):
 
 real16m8.545s
 user14m38.134s
 sys 1m55.156s
 
 real16m7.089s
 user14m37.974s
 sys 1m55.010s
 
 new routine:
 
 real16m11.641s
 user14m37.251s
 sys 1m52.618s
 
 real16m6.139s
 user14m38.282s
 sys 1m53.184s
 
 
 I also did testing on Power{3..6} and I found that Power3, Power5 and
 Power6 did better with this new routine when the dcbt and dcbz
 weren't used (in which case they achieved performance comparable to
 the existing kernel copy_4K_page routine). Power4 on the other hand
 performed slightly better with the dcbt and dcbz included (still
 comparable to the current kernel copy_4K_page).
 
 So in order to get the best performance across the board I created a
 new CPU feature that will govern whether the dcbt and dcbz are used
 (and un-creatively named it CPU_FTR_CP_USE_DCBTZ). I added it to the
 CPU features of Cell, Power4 and 970.
 Unfortunately I don't have access to a PA6T but judging by the
 marketing material I could find, it looks like it has a strong enough
 hardware prefetcher that it probably wouldn't benefit from the dcbt
 and dcbz...
 
 Okay, that's probably enough prattling along - you can all go and look
 at the code now.
 
 All comments appreciated
 
 [I decided to post the whole copy routine rather than a diff between
 it and the current one because I found the diff quite unreadable. I'll post
 a real patchset after I've addressed any comments.]
 
 Many thanks!
 

The actual patches for the new copy_4K_page() follow this.

Note: I changed the order of the patches so that the new CPU feature
bit is introduced in the first patch and then the new copy_4K_page
is introduced in the second patch.
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 1/2] powerpc: add new CPU feature: CPU_FTR_CP_USE_DCBTZ

2008-08-21 Thread Mark Nelson
Add a new CPU feature bit, CPU_FTR_CP_USE_DCBTZ, to be added to the
64bit powerpc chips that benefit from having dcbt and dcbz
instructions used in their memory copy routines.

This will be used in a subsequent patch that updates copy_4K_page().
The new bit is added to Cell, PPC970 and Power4 because they show
better performance with the new copy_4K_page() when dcbt and dcbz
instructions are used.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/include/asm/cputable.h |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

Index: upstream/arch/powerpc/include/asm/cputable.h
===
--- upstream.orig/arch/powerpc/include/asm/cputable.h
+++ upstream/arch/powerpc/include/asm/cputable.h
@@ -192,6 +192,7 @@ extern const char *powerpc_base_platform
 #define CPU_FTR_NO_SLBIE_B LONG_ASM_CONST(0x0008)
 #define CPU_FTR_VSX			LONG_ASM_CONST(0x0010)
 #define CPU_FTR_SAO			LONG_ASM_CONST(0x0020)
+#define CPU_FTR_CP_USE_DCBTZ   LONG_ASM_CONST(0x0040)
 
 #ifndef __ASSEMBLY__
 
@@ -387,10 +388,11 @@ extern const char *powerpc_base_platform
CPU_FTR_MMCRA | CPU_FTR_CTRL)
 #define CPU_FTRS_POWER4(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
-   CPU_FTR_MMCRA)
+   CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_PPC970(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
-   CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA)
+   CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
+   CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_POWER5(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -411,7 +413,8 @@ extern const char *powerpc_base_platform
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
-   CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | CPU_FTR_CELL_TB_BUG)
+   CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | \
+   CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_PA6T (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_CI_LARGE_PAGE | \
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 2/2] powerpc: new copy_4K_page()

2008-08-21 Thread Mark Nelson
This new copy_4K_page() function was originally tuned for the best
performance on the Cell processor, but after testing on more 64bit
powerpc chips it was found that with a small modification it either
matched the performance offered by the current mainline version or
bettered it by a small amount.

It was found that on a Cell-based QS22 blade the amount of system
time measured when compiling a 2.6.26 pseries_defconfig decreased
by 4%. Using the same test, a 4-way 970MP machine saw a decrease of
2% in system time. No noticeable change was seen on Power4, Power5
or Power6.

The 4096 byte page is copied in thirty-two 128 byte strides. An
initial setup loop executes dcbt instructions for the whole source
page and dcbz instructions for the whole destination page. To do
this, the cache line size is retrieved from ppc64_caches.

A new CPU feature bit, CPU_FTR_CP_USE_DCBTZ, (introduced in the
previous patch) is used to make the modification to this new copy
routine - on Power4, 970 and Cell the feature bit is set so the
setup loop is executed, but on all other 64bit chips the setup
loop is nop'ed out.
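
The setup loop is equivalent to the following C (a sketch; src, dst
and line_size are illustrative names, with line_size coming from
ppc64_caches at run time):

	unsigned long off;

	/* prefetch the whole source page and zero the whole destination
	 * page, one cache line at a time, before the copy starts */
	for (off = 0; off < 4096; off += line_size) {
		asm volatile("dcbt 0,%0" : : "r" (src + off));
		asm volatile("dcbz 0,%0" : : "r" (dst + off) : "memory");
	}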

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/lib/copypage_64.S |  198 +++--
 1 file changed, 93 insertions(+), 105 deletions(-)

Index: upstream/arch/powerpc/lib/copypage_64.S
===
--- upstream.orig/arch/powerpc/lib/copypage_64.S
+++ upstream/arch/powerpc/lib/copypage_64.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2002 Paul Mackerras, IBM Corp.
+ * Copyright (C) 2008 Mark Nelson, IBM Corp.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -8,112 +8,100 @@
  */
 #include <asm/processor.h>
 #include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
 
 _GLOBAL(copy_4K_page)
-   std r31,-8(1)
-   std r30,-16(1)
-   std r29,-24(1)
-   std r28,-32(1)
-   std r27,-40(1)
-   std r26,-48(1)
-   std r25,-56(1)
-   std r24,-64(1)
-   std r23,-72(1)
-   std r22,-80(1)
-   std r21,-88(1)
-   std r20,-96(1)
-   li  r5,4096/32 - 1
+   li  r5,4096 /* 4K page size */
+BEGIN_FTR_SECTION
+   ld  r10,PPC64_CACHES@toc(r2)
+   lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line size */
+   lwz r12,DCACHEL1LINESIZE(r10)   /* get cache line size */
+   li  r9,0
+   srd r8,r5,r11
+
+   mtctr   r8
+setup:
+   dcbtr9,r4
+   dcbzr9,r3
+   add r9,r9,r12
+   bdnzsetup
+END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
addir3,r3,-8
-   li  r12,5
-0: addir5,r5,-24
-   mtctr   r12
-   ld  r22,640(4)
-   ld  r21,512(4)
-   ld  r20,384(4)
-   ld  r11,256(4)
-   ld  r9,128(4)
-   ld  r7,0(4)
-   ld  r25,648(4)
-   ld  r24,520(4)
-   ld  r23,392(4)
-   ld  r10,264(4)
-   ld  r8,136(4)
-   ldu r6,8(4)
-   cmpwi   r5,24
-1: std r22,648(3)
-   std r21,520(3)
-   std r20,392(3)
-   std r11,264(3)
-   std r9,136(3)
-   std r7,8(3)
-   ld  r28,648(4)
-   ld  r27,520(4)
-   ld  r26,392(4)
-   ld  r31,264(4)
-   ld  r30,136(4)
-   ld  r29,8(4)
-   std r25,656(3)
-   std r24,528(3)
-   std r23,400(3)
-   std r10,272(3)
-   std r8,144(3)
-   std r6,16(3)
-   ld  r22,656(4)
-   ld  r21,528(4)
-   ld  r20,400(4)
-   ld  r11,272(4)
-   ld  r9,144(4)
-   ld  r7,16(4)
-   std r28,664(3)
-   std r27,536(3)
-   std r26,408(3)
-   std r31,280(3)
-   std r30,152(3)
-   stdur29,24(3)
-   ld  r25,664(4)
-   ld  r24,536(4)
-   ld  r23,408(4)
-   ld  r10,280(4)
-   ld  r8,152(4)
-   ldu r6,24(4)
+   srdir8,r5,7 /* page is copied in 128 byte strides */
+   addir8,r8,-1/* one stride copied outside loop */
+
+   mtctr   r8
+
+   ld  r5,0(r4)
+   ld  r6,8(r4)
+   ld  r7,16(r4)
+   ldu r8,24(r4)
+1: std r5,8(r3)
+   ld  r9,8(r4)
+   std r6,16(r3)
+   ld  r10,16(r4)
+   std r7,24(r3)
+   ld  r11,24(r4)
+   std r8,32(r3)
+   ld  r12,32(r4)
+   std r9,40(r3)
+   ld  r5,40(r4)
+   std r10,48(r3)
+   ld  r6,48(r4)
+   std r11,56(r3)
+   ld  r7,56(r4)
+   std r12,64(r3)
+   ld  r8,64(r4)
+   std r5,72(r3)
+   ld  r9,72(r4)
+   std r6,80

Re: [RFC 2/2] powerpc: copy_4K_page tweaked for Cell - add CPU feature

2008-08-15 Thread Mark Nelson
On Thu, 14 Aug 2008 10:10:48 pm Michael Ellerman wrote:
 On Thu, 2008-08-14 at 21:48 +1000, Mark Nelson wrote:
  Hi Michael,
  
  On Thu, 14 Aug 2008 08:51:35 pm Michael Ellerman wrote:
   On Thu, 2008-08-14 at 16:18 +1000, Mark Nelson wrote:
Add a new CPU feature, CPU_FTR_CP_USE_DCBTZ, to be added to the CPUs 
that benefit
from having dcbt and dcbz instructions used in copy_4K_page(). So far 
Cell, PPC970
and Power4 benefit.

This way all the other 64bit powerpc chips will have the whole 
prefetching loop
nop'ed out.
   
Index: upstream/arch/powerpc/lib/copypage_64.S
===
--- upstream.orig/arch/powerpc/lib/copypage_64.S
+++ upstream/arch/powerpc/lib/copypage_64.S
@@ -18,6 +18,7 @@ PPC64_CACHES:
 
 _GLOBAL(copy_4K_page)
li  r5,4096 /* 4K page size */
+BEGIN_FTR_SECTION
ld	r10,PPC64_CACHES@toc(r2)
lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line 
size */
lwz r12,DCACHEL1LINESIZE(r10)   /* Get cache line size 
*/
@@ -30,7 +31,7 @@ setup:
dcbz	r9,r3
add	r9,r9,r12
bdnz	setup
-
+END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
addi	r3,r3,-8
srdi	r8,r5,7	/* page is copied in 128 byte strides */
addi	r8,r8,-1	/* one stride copied outside loop */
   
   Instead of nop'ing it out, we could use an alternative feature section
   to either run it or jump over it. It would look something like:
   
   
   _GLOBAL(copy_4K_page)
   BEGIN_FTR_SECTION
   li  r5,4096 /* 4K page size */
ld	r10,PPC64_CACHES@toc(r2)
   lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line 
   size */
   lwz r12,DCACHEL1LINESIZE(r10)   /* Get cache line size */
   li  r9,0
   srd r8,r5,r11
   
   mtctr   r8
   setup:
dcbt	r9,r4
dcbz	r9,r3
add	r9,r9,r12
bdnz	setup
   FTR_SECTION_ELSE
 b   1f
   ALT_FTR_SECTION_END_IFSET(CPU_FTR_CP_USE_DCBTZ)
   1:
   addir3,r3,-8
   
   So in the no-dcbtz case you'd get a branch instead of 11 nops.
   
   Of course you'd need to benchmark it to see if skipping the nops is
   better than executing them ;P
  
  Thanks for looking through this.
  
  That does look a lot better. In the first version there wasn't quite
  as much to nop out (the cache line size was hardcoded to 128
  bytes) so I wasn't so worried but I'll definitely try this with an
  alternative section like you describe.
  
  The jump probably will turn out to be better because I'd imagine
  that the same chips that don't need the dcbt and dcbz because
  they've got beefy enough hardware prefetchers also won't be
  disturbed by the jump (but benchmarks tomorrow will confirm;
  or prove me wrong :) )
 
 Yeah, that would make sense. But you never know :)

It turns out that on Power6 using the alternative section doesn't
have any noticeable effect on performance. On Power5 though it
actually made the compile test a tiny bit slower:

with alternative feature section:

real5m7.549s
user17m24.256s
sys 1m0.621s

real5m7.829s
user17m24.879s
sys 1m0.465s

original implementation:

real5m6.468s
user17m22.891s
sys 0m59.765s

real5m6.965s
user17m23.160s
sys 0m59.844s

So I guess I'll leave it the way it is...

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[RFC 0/2] powerpc: copy_4K_page tweaked for Cell

2008-08-14 Thread Mark Nelson
Hi All,

What follows is an updated version of copy_4K_page that has been tuned
for the Cell processor. With this new routine it was found that the
system time measured when compiling a 2.6.26 pseries_defconfig was
reduced by ~10s:

mainline (2.6.27-rc1-00632-g2e1e921):

real17m8.727s
user59m48.693s
sys 3m56.089s

real17m9.350s
user59m44.822s
sys 3m56.666s

new routine:

real17m7.311s
user59m51.339s
sys 3m47.043s

real17m7.863s
user59m49.028s
sys 3m46.608s

This same routine was also found to improve performance on 970 CPUs
too (but by a much smaller amount):

mainline (2.6.27-rc1-00632-g2e1e921):

real16m8.545s
user14m38.134s
sys 1m55.156s

real16m7.089s
user14m37.974s
sys 1m55.010s

new routine:

real16m11.641s
user14m37.251s
sys 1m52.618s

real16m6.139s
user14m38.282s
sys 1m53.184s


I also did testing on Power{3..6} and I found that Power3, Power5 and
Power6 did better with this new routine when the dcbt and dcbz
weren't used (in which case they achieved performance comparable to
the existing kernel copy_4K_page routine). Power4 on other hand
performed slightly better with the dcbt and dcbz included (still
comparable to the current kernel copy_4K_page).

So in order to get the best performance across the board I created a
new CPU feature that will govern whether the dcbt and dcbz are used
(and un-creatively named it CPU_FTR_CP_USE_DCBTZ). I added it to the
CPU features of Cell, Power4 and 970.
Unfortunately I don't have access to a PA6T but judging by the
marketing material I could find, it looks like it has a strong enough
hardware prefetcher that it probably wouldn't benefit from the dcbt
and dcbz...

Okay, that's probably enough prattling along - you can all go and look
at the code now.

All comments appreciated

[I decided to post the whole copy routine rather than a diff between
it and the current one because I found the diff quite unreadable. I'll post
a real patchset after I've addressed any comments.]

Many thanks!
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[RFC 1/2] powerpc: copy_4K_page tweaked for Cell

2008-08-14 Thread Mark Nelson
/*
 * Copyright (C) 2008 Mark Nelson, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/processor.h>
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>

	.section	".toc","aw"
PPC64_CACHES:
	.tc		ppc64_caches[TC],ppc64_caches
	.section	".text"


_GLOBAL(copy_4K_page)
li  r5,4096 /* 4K page size */
	ld	r10,PPC64_CACHES@toc(r2)
	lwz	r11,DCACHEL1LOGLINESIZE(r10)	/* log2 of cache line size */
	lwz	r12,DCACHEL1LINESIZE(r10)	/* Get cache line size */
	li	r9,0
	srd	r8,r5,r11

	mtctr	r8
setup:
	dcbt	r9,r4
	dcbz	r9,r3
	add	r9,r9,r12
	bdnz	setup

	addi	r3,r3,-8
	srdi	r8,r5,7		/* page is copied in 128 byte strides */
	addi	r8,r8,-1	/* one stride copied outside loop */

mtctr   r8

ld  r5,0(r4)
ld  r6,8(r4)
ld  r7,16(r4)
ldu r8,24(r4)
1:  std r5,8(r3)
ld  r9,8(r4)
std r6,16(r3)
ld  r10,16(r4)
std r7,24(r3)
ld  r11,24(r4)
std r8,32(r3)
ld  r12,32(r4)
std r9,40(r3)
ld  r5,40(r4)
std r10,48(r3)
ld  r6,48(r4)
std r11,56(r3)
ld  r7,56(r4)
std r12,64(r3)
ld  r8,64(r4)
std r5,72(r3)
ld  r9,72(r4)
std r6,80(r3)
ld  r10,80(r4)
std r7,88(r3)
ld  r11,88(r4)
std r8,96(r3)
ld  r12,96(r4)
std r9,104(r3)
ld  r5,104(r4)
std r10,112(r3)
ld  r6,112(r4)
std r11,120(r3)
ld  r7,120(r4)
	stdu	r12,128(r3)
	ldu	r8,128(r4)
	bdnz	1b

std r5,8(r3)
ld  r9,8(r4)
std r6,16(r3)
ld  r10,16(r4)
std r7,24(r3)
ld  r11,24(r4)
std r8,32(r3)
ld  r12,32(r4)
std r9,40(r3)
ld  r5,40(r4)
std r10,48(r3)
ld  r6,48(r4)
std r11,56(r3)
ld  r7,56(r4)
std r12,64(r3)
ld  r8,64(r4)
std r5,72(r3)
ld  r9,72(r4)
std r6,80(r3)
ld  r10,80(r4)
std r7,88(r3)
ld  r11,88(r4)
std r8,96(r3)
ld  r12,96(r4)
std r9,104(r3)
std r10,112(r3)
std r11,120(r3)
std r12,128(r3)
blr
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[RFC 2/2] powerpc: copy_4K_page tweaked for Cell - add CPU feature

2008-08-14 Thread Mark Nelson
Add a new CPU feature, CPU_FTR_CP_USE_DCBTZ, to be added to the CPUs that
benefit from having dcbt and dcbz instructions used in copy_4K_page(). So
far Cell, PPC970 and Power4 benefit.

This way all the other 64bit powerpc chips will have the whole prefetching loop
nop'ed out.
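
For anyone unfamiliar with how the nop'ing happens, below is a simplified
C sketch of the boot-time fixup pass. The real implementation (in recent
kernels, arch/powerpc/lib/feature-fixups.c) records relative offsets and
flushes the icache afterwards, so treat this as illustration only, not
the actual kernel code:

/* Simplified sketch: each BEGIN/END_FTR_SECTION pair records an entry;
 * at boot, sections whose feature bits don't match get overwritten
 * with nops. Real entries use relative offsets, and the icache must
 * be flushed after patching. */
struct fixup_entry {
	unsigned long	mask;	/* feature bits to test */
	unsigned long	value;	/* required value under mask */
	unsigned int	*start;	/* first instruction of the section */
	unsigned int	*end;	/* one past the last instruction */
};

static void apply_feature_fixups(unsigned long cpu_features,
				 struct fixup_entry *fcur,
				 struct fixup_entry *fend)
{
	for (; fcur < fend; fcur++) {
		unsigned int *p;

		if ((cpu_features & fcur->mask) == fcur->value)
			continue;		/* feature present: keep the code */
		for (p = fcur->start; p < fcur->end; p++)
			*p = 0x60000000;	/* ppc "nop" (ori r0,r0,0) */
	}
}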

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/include/asm/cputable.h |9 ++---
 arch/powerpc/lib/copypage_64.S  |3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

Index: upstream/arch/powerpc/include/asm/cputable.h
===
--- upstream.orig/arch/powerpc/include/asm/cputable.h
+++ upstream/arch/powerpc/include/asm/cputable.h
@@ -192,6 +192,7 @@ extern const char *powerpc_base_platform
 #define CPU_FTR_NO_SLBIE_B LONG_ASM_CONST(0x0008)
 #define CPU_FTR_VSXLONG_ASM_CONST(0x0010)
 #define CPU_FTR_SAOLONG_ASM_CONST(0x0020)
+#define CPU_FTR_CP_USE_DCBTZ   LONG_ASM_CONST(0x0040)
 
 #ifndef __ASSEMBLY__
 
@@ -387,10 +388,11 @@ extern const char *powerpc_base_platform
CPU_FTR_MMCRA | CPU_FTR_CTRL)
 #define CPU_FTRS_POWER4(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
-   CPU_FTR_MMCRA)
+   CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_PPC970(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
-   CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA)
+   CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \
+   CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_POWER5(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -411,7 +413,8 @@ extern const char *powerpc_base_platform
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
-   CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | CPU_FTR_CELL_TB_BUG)
+   CPU_FTR_PAUSE_ZERO | CPU_FTR_CI_LARGE_PAGE | \
+   CPU_FTR_CELL_TB_BUG | CPU_FTR_CP_USE_DCBTZ)
 #define CPU_FTRS_PA6T (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_HPTE_TABLE | CPU_FTR_PPCAS_ARCH_V2 | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_CI_LARGE_PAGE | \
Index: upstream/arch/powerpc/lib/copypage_64.S
===
--- upstream.orig/arch/powerpc/lib/copypage_64.S
+++ upstream/arch/powerpc/lib/copypage_64.S
@@ -18,6 +18,7 @@ PPC64_CACHES:
 
 _GLOBAL(copy_4K_page)
li  r5,4096 /* 4K page size */
+BEGIN_FTR_SECTION
+	ld	r10,PPC64_CACHES@toc(r2)
lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line size */
lwz r12,DCACHEL1LINESIZE(r10)   /* Get cache line size */
@@ -30,7 +31,7 @@ setup:
	dcbz	r9,r3
	add	r9,r9,r12
	bdnz	setup
-
+END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
	addi	r3,r3,-8
	srdi	r8,r5,7		/* page is copied in 128 byte strides */
	addi	r8,r8,-1	/* one stride copied outside loop */
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC 2/2] powerpc: copy_4K_page tweaked for Cell - add CPU feature

2008-08-14 Thread Mark Nelson
Hi Michael,

On Thu, 14 Aug 2008 08:51:35 pm Michael Ellerman wrote:
 On Thu, 2008-08-14 at 16:18 +1000, Mark Nelson wrote:
  Add a new CPU feature, CPU_FTR_CP_USE_DCBTZ, to be added to the CPUs that 
  benefit
  from having dcbt and dcbz instructions used in copy_4K_page(). So far Cell, 
  PPC970
  and Power4 benefit.
  
  This way all the other 64bit powerpc chips will have the whole prefetching 
  loop
  nop'ed out.
 
  Index: upstream/arch/powerpc/lib/copypage_64.S
  ===
  --- upstream.orig/arch/powerpc/lib/copypage_64.S
  +++ upstream/arch/powerpc/lib/copypage_64.S
  @@ -18,6 +18,7 @@ PPC64_CACHES:
   
   _GLOBAL(copy_4K_page)
  li  r5,4096 /* 4K page size */
  +BEGIN_FTR_SECTION
ld	r10,PPC64_CACHES@toc(r2)
  lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line size */
  lwz r12,DCACHEL1LINESIZE(r10)   /* Get cache line size */
  @@ -30,7 +31,7 @@ setup:
dcbz	r9,r3
add	r9,r9,r12
bdnz	setup
  -
  +END_FTR_SECTION_IFSET(CPU_FTR_CP_USE_DCBTZ)
addi	r3,r3,-8
srdi	r8,r5,7	/* page is copied in 128 byte strides */
addi	r8,r8,-1	/* one stride copied outside loop */
 
 Instead of nop'ing it out, we could use an alternative feature section
 to either run it or jump over it. It would look something like:
 
 
 _GLOBAL(copy_4K_page)
 BEGIN_FTR_SECTION
 li  r5,4096 /* 4K page size */
ld	r10,PPC64_CACHES@toc(r2)
 lwz r11,DCACHEL1LOGLINESIZE(r10)/* log2 of cache line size */
 lwz r12,DCACHEL1LINESIZE(r10)   /* Get cache line size */
 li  r9,0
 srd r8,r5,r11
 
 mtctr   r8
 setup:
dcbt	r9,r4
dcbz	r9,r3
add	r9,r9,r12
bdnz	setup
 FTR_SECTION_ELSE
   b   1f
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_CP_USE_DCBTZ)
 1:
addi	r3,r3,-8
 
 So in the no-dcbtz case you'd get a branch instead of 11 nops.
 
 Of course you'd need to benchmark it to see if skipping the nops is
 better than executing them ;P

Thanks for looking through this.

That does look a lot better. In the first version there wasn't quite
as much to nop out (the cache line size was hardcoded to 128
bytes) so I wasn't so worried but I'll definitely try this with an
alternative section like you describe.

The jump probably will turn out to be better because I'd imagine
that the same chips that don't need the dcbt and dcbz because
they've got beefy enough hardware prefetchers also won't be
disturbed by the jump (but benchmarks tomorrow will confirm;
or prove me wrong :) )

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH] powerpc/cell: set fixed mapping to weak if we find a pcie-endpoint

2008-07-23 Thread Mark Nelson
From: Mark Nelson [EMAIL PROTECTED]

At the moment the fixed mapping is by default strongly ordered (the
iommu_fixed=weak boot option must be used to make the fixed mapping weakly
ordered). If we're on a setup where the southbridge is being used in
endpoint mode (triblade and CAB boards) the default should be a weakly
ordered fixed mapping.

This adds a check so that if a node of type pcie-endpoint can be found in
the device tree the fixed mapping is set to be weak by default (but can be
overridden using iommu_fixed=strong).

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
Acked-by: Arnd Bergmann [EMAIL PROTECTED]
---
 arch/powerpc/platforms/cell/iommu.c |   13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

Index: upstream/arch/powerpc/platforms/cell/iommu.c
===
--- upstream.orig/arch/powerpc/platforms/cell/iommu.c
+++ upstream/arch/powerpc/platforms/cell/iommu.c
@@ -1150,12 +1150,23 @@ static int iommu_fixed_disabled;
 
 static int __init setup_iommu_fixed(char *str)
 {
+   struct device_node *pciep = NULL;
+
	if (strcmp(str, "off") == 0)
		iommu_fixed_disabled = 1;

-	else if (strcmp(str, "weak") == 0)
+   /* If we can find a pcie-endpoint in the device tree assume that
+* we're on a triblade or a CAB so by default the fixed mapping
+* should be set to be weakly ordered; but only if the boot
+* option WASN'T set for strong ordering
+*/
+	pciep = of_find_node_by_type(NULL, "pcie-endpoint");
+
+	if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
iommu_fixed_is_weak = 1;
 
+   of_node_put(pciep);
+
return 1;
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH v2] powerpc/cell: set fixed mapping to weak if we find a pcie-endpoint

2008-07-23 Thread Mark Nelson
From: Mark Nelson [EMAIL PROTECTED]

At the moment the fixed mapping is by default strongly ordered (the
iommu_fixed=weak boot option must be used to make the fixed mapping weakly
ordered). If we're on a setup where the southbridge is being used in
endpoint mode (triblade and CAB boards) the default should be a weakly
ordered fixed mapping.

This adds a check so that if a node of type pcie-endpoint can be found in
the device tree the fixed mapping is set to be weak by default (but can be
overridden using iommu_fixed=strong).

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
Acked-by: Arnd Bergmann [EMAIL PROTECTED]
---
Updated thanks to Stephen Rothwell's tip that pciep didn't need to be 
initialized

Sorry for any confusion

 arch/powerpc/platforms/cell/iommu.c |   13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

Index: upstream/arch/powerpc/platforms/cell/iommu.c
===
--- upstream.orig/arch/powerpc/platforms/cell/iommu.c
+++ upstream/arch/powerpc/platforms/cell/iommu.c
@@ -1150,12 +1150,23 @@ static int iommu_fixed_disabled;
 
 static int __init setup_iommu_fixed(char *str)
 {
+   struct device_node *pciep;
+
	if (strcmp(str, "off") == 0)
		iommu_fixed_disabled = 1;

-	else if (strcmp(str, "weak") == 0)
+   /* If we can find a pcie-endpoint in the device tree assume that
+* we're on a triblade or a CAB so by default the fixed mapping
+* should be set to be weakly ordered; but only if the boot
+* option WASN'T set for strong ordering
+*/
+	pciep = of_find_node_by_type(NULL, "pcie-endpoint");
+
+	if (strcmp(str, "weak") == 0 || (pciep && strcmp(str, "strong") != 0))
iommu_fixed_is_weak = 1;
 
+   of_node_put(pciep);
+
return 1;
 }
 __setup("iommu_fixed=", setup_iommu_fixed);
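
For reference, the boot option's resulting behaviour (summarising the
description above; not part of the diff):

    (no option)         weak if a pcie-endpoint node is found, else strong
    iommu_fixed=weak    force a weakly ordered fixed mapping (as before)
    iommu_fixed=strong  force strong ordering, overriding the new default
    iommu_fixed=off     disable the fixed mapping entirely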
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC 0/3] powerpc: memory copy routines tweaked for Cell

2008-06-20 Thread Mark Nelson
On Fri, 20 Jun 2008 09:49:29 am Mark Nelson wrote:
 On Thu, 19 Jun 2008 09:53:16 pm Arnd Bergmann wrote:
  On Thursday 19 June 2008, Mark Nelson wrote:
   The plan is to use Michael Ellerman's code patching work so that at 
   runtime
   if we're running on a Cell machine the new routines are called but 
   otherwise
   the existing memory copy routines are used.
  
  Have you tried running this code on other platforms to see if it
  actually performs worse on any of them? I would guess that the
  older code also doesn't work too well on Power 5 and Power 6, so the
  cell optimized version could give us a significant advantage as well,
  albeit less than another CPU specific version.
  
  Arnd 
  
 
 I did run the tests on Power 5 and Power 6, and on Power 5 with the
 new routines, the iperf bandwidth increased to 7.9 GBits/sec up from
 7.5 GBits/sec; but on Power 6 the bandwidth with the old routines
 was 13.6 GBits/sec compared to 12.8 GBits/sec...

After running the tests again I get a similar result, where on Power 6
the new routine is slower than the old one.

I'll spend some time doing tests to see if we can come up with a
routine that works well on Power 6 too.

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[RFC 3/3] powerpc: copy_4K_page tweaked for Cell

2008-06-19 Thread Mark Nelson
/*
 * Copyright (C) 2008 Gunnar von Boehn, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 *
 * copy_4K_page routine optimized for CELL-BE-PPC
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL: 1st level data cache = 32K - 2nd level data cache = 512K
 * - 3rd level data cache = 0K
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 *
 * The below code is loop unrolled for the CELL cache line of 128 bytes.
 */

#include <asm/processor.h>
#include <asm/ppc_asm.h>

#define PREFETCH_AHEAD 6
#define ZERO_AHEAD 4

.align  7
_GLOBAL(copy_4K_page)
	dcbt	0,r4		/* Prefetch ONE SRC cacheline */

	addi	r6,r3,-8	/* prepare for stdu */
	addi	r4,r4,-8	/* prepare for ldu */

	li	r10,32		/* copy 32 cache lines for a 4K page */
	li	r12,128+8	/* prefetch distance */

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD

	mtctr	r10
.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC

.Louterloop:			/* copy whole cache lines */
	mtctr	r11

	li	r11,128*ZERO_AHEAD +8	/* DCBZ dist */

.align  4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop: /* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)	/* 4 register stride copy */
	ld	r8, 0x18(r4)	/* 4 are optimal to hide 1st level cache latency */
ld  r0, 0x20(r4)
std r9, 0x08(r6)
std r7, 0x10(r6)
std r8, 0x18(r6)
std r0, 0x20(r6)
ld  r9, 0x28(r4)
ld  r7, 0x30(r4)
ld  r8, 0x38(r4)
ld  r0, 0x40(r4)
std r9, 0x28(r6)
std r7, 0x30(r6)
std r8, 0x38(r6)
std r0, 0x40(r6)
ld  r9, 0x48(r4)
ld  r7, 0x50(r4)
ld  r8, 0x58(r4)
ld  r0, 0x60(r4)
std r9, 0x48(r6)
std r7, 0x50(r6)
std r8, 0x58(r6)
std r0, 0x60(r6)
ld  r9, 0x68(r4)
ld  r7, 0x70(r4)
ld  r8, 0x78(r4)
ldu r0, 0x80(r4)
std r9, 0x68(r6)
std r7, 0x70(r6)
std r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	.Lloop

	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride */
mtctr   r10
.Lloop2:/* Copy aligned body */
ld  r9, 0x08(r4)
ld  r7, 0x10(r4)
ld  r8, 0x18(r4)
ldu r0, 0x20(r4)
std r9, 0x08(r6)
std r7, 0x10(r6)
std r8, 0x18(r6)
	stdu	r0, 0x20(r6)

	bdnz	.Lloop2

.Lendloop2:
blr
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[RFC 2/3] powerpc: memcpy tweaked for Cell

2008-06-19 Thread Mark Nelson
/*
 * Copyright (C) 2008 Gunnar von Boehn, IBM Corp.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 *
 * memcpy (and copy_4K_page) routine optimized for CELL-BE-PPC
 *
 * The CELL PPC core has 1 integer unit and 1 load/store unit
 * CELL: 1st level data cache = 32K - 2nd level data cache = 512K
 * - 3rd level data cache = 0K
 * To improve copy performance we need to prefetch source data
 * far ahead to hide this latency
 * For best performance instruction forms ending in "." like "andi."
 * should be avoided as they are implemented in microcode on CELL.
 *
 * The below code is loop unrolled for the CELL cache line of 128 bytes.
 */

#include <asm/processor.h>
#include <asm/ppc_asm.h>

#define PREFETCH_AHEAD 6
#define ZERO_AHEAD 4

.align  7
_GLOBAL(memcpy)
	dcbt	0,r4		/* Prefetch ONE SRC cacheline */
	cmpldi	cr1,r5,16	/* is size < 16 ? */
	mr	r6,r3
	blt+	cr1,.Lshortcopy

.Lbigcopy:
	neg	r8,r3		/* LS 3 bits = # bytes to 8-byte dest bdry */
	clrldi	r8,r8,64-4	/* align to 16 byte boundary */
	sub	r7,r4,r3
	cmpldi	cr0,r8,0
	beq+	.Ldst_aligned

.Ldst_unaligned:
	mtcrf	0x01,r8		/* put #bytes to boundary into cr7 */
	subf	r5,r8,r5

	bf	cr7*4+3,1f
	lbzx	r0,r7,r6	/* copy 1 byte */
	stb	r0,0(r6)
	addi	r6,r6,1
1:	bf	cr7*4+2,2f
	lhzx	r0,r7,r6	/* copy 2 byte */
	sth	r0,0(r6)
	addi	r6,r6,2
2:	bf	cr7*4+1,4f
	lwzx	r0,r7,r6	/* copy 4 byte */
	stw	r0,0(r6)
	addi	r6,r6,4
4:	bf	cr7*4+0,8f
	ldx	r0,r7,r6	/* copy 8 byte */
	std	r0,0(r6)
	addi	r6,r6,8
8:
	add	r4,r7,r6

.Ldst_aligned:

	cmpdi	cr5,r5,128-1

	neg	r7,r6
	addi	r6,r6,-8	/* prepare for stdu */
	addi	r4,r4,-8	/* prepare for ldu */

	clrldi	r7,r7,64-7	/* align to cacheline boundary */
	ble+	cr5,.Llessthancacheline


	cmpldi	cr6,r7,0
	subf	r5,r7,r5
	srdi	r7,r7,4		/* divide size by 16 */
	srdi	r10,r5,7	/* number of cache lines to copy */


	cmpldi	r10,0
	li	r11,0		/* number cachelines to copy with prefetch */
	beq	.Lnocacheprefetch

	cmpldi	r10,PREFETCH_AHEAD
	li	r12,128+8	/* prefetch distance */
	ble	.Llessthanmaxprefetch

	subi	r11,r10,PREFETCH_AHEAD
	li	r10,PREFETCH_AHEAD
.Llessthanmaxprefetch:

	mtctr	r10
.LprefetchSRC:
	dcbt	r12,r4
	addi	r12,r12,128
	bdnz	.LprefetchSRC
.Lnocacheprefetch:


	mtctr	r7
	cmpldi	cr1,r5,128
	clrldi	r5,r5,64-7

	beq	cr6,.Lcachelinealigned
.Laligntocacheline:
	ld	r9,0x08(r4)
	ldu	r7,0x10(r4)
	std	r9,0x08(r6)
	stdu	r7,0x10(r6)
	bdnz	.Laligntocacheline


.Lcachelinealigned:		/* copy whole cache lines */


	blt-	cr1,.Llessthancacheline	/* size < 128 */

.Louterloop:
	cmpdi	r11,0
	mtctr	r11
	beq-	.Lendloop

li  r11,128*ZERO_AHEAD +8   /* DCBZ dist */

.align  4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
.Lloop: /* Copy aligned body */
	dcbt	r12,r4		/* PREFETCH SOURCE some cache lines ahead */
	ld	r9, 0x08(r4)
	dcbz	r11,r6
	ld	r7, 0x10(r4)	/* 4 register stride copy */
	ld	r8, 0x18(r4)	/* 4 are optimal to hide 1st level cache latency */
ld  r0, 0x20(r4)
std r9, 0x08(r6)
std r7, 0x10(r6)
std r8, 0x18(r6)
std r0, 0x20(r6)
ld  r9, 0x28(r4)
ld  r7, 0x30(r4)
ld  r8, 0x38(r4)
ld  r0, 0x40(r4)
std r9, 0x28(r6)
std r7, 0x30(r6)
std r8, 0x38(r6)
std r0, 0x40(r6)
ld  r9, 0x48(r4)
ld  r7, 0x50(r4)
ld  r8, 0x58(r4)
ld  r0, 0x60(r4)
std r9, 0x48(r6)
std r7, 0x50(r6)
std r8, 0x58(r6)
std r0, 0x60(r6)
ld  r9, 0x68(r4)
ld  r7, 0x70(r4)
ld  r8, 0x78(r4)
ldu r0, 0x80(r4)
std r9, 0x68(r6)
std r7, 0x70(r6)
std r8, 0x78(r6)
	stdu	r0, 0x80(r6)

	bdnz	.Lloop
.Lendloop:


cmpdi   r10,0
	sldi	r10,r10,2	/* adjust from 128 to 32 byte stride */

Re: [RFC 0/3] powerpc: memory copy routines tweaked for Cell

2008-06-19 Thread Mark Nelson
On Thu, 19 Jun 2008 09:53:16 pm Arnd Bergmann wrote:
 On Thursday 19 June 2008, Mark Nelson wrote:
  The plan is to use Michael Ellerman's code patching work so that at runtime
  if we're running on a Cell machine the new routines are called but otherwise
  the existing memory copy routines are used.
 
 Have you tried running this code on other platforms to see if it
 actually performs worse on any of them? I would guess that the
 older code also doesn't work too well on Power 5 and Power 6, so the
 cell optimized version could give us a significant advantage as well,
 albeit less than another CPU specific version.
 
   Arnd 
 

I did run the tests on Power 5 and Power 6, and on Power 5 with the
new routines, the iperf bandwidth increased to 7.9 GBits/sec up from
7.5 GBits/sec; but on Power 6 the bandwidth with the old routines
was 13.6 GBits/sec compared to 12.8 GBits/sec...

I also couldn't get the updated routines to boot on 970MP without
removing the dcbz instructions.

I'll investigate more and also rerun the tests again

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC 0/3] powerpc: memory copy routines tweaked for Cell

2008-06-19 Thread Mark Nelson
On Fri, 20 Jun 2008 12:53:49 am Olof Johansson wrote:
 
 On Jun 19, 2008, at 8:59 AM, Arnd Bergmann wrote:
 
  I assume it has suffered from bitrot and nobody tried to do better
  since the Power3 days. AFAICT, it hasn't seen any update since your
  original Power4 version from 2002.
 
 I've got an out-of-tree optimized version for pa6t as well that I  
 haven't bothered posting yet.
 
 The real pain with the usercopy code is all the exception cases. If  
 anyone has made a test harness to make sure they're all right, please  
 do post it for others to use as well...

I second that request - I verified (to the best that I could) with
pen and paper that the exception handling on this new version
is correct but it would be great to have a better way to test it.

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC 1/3] powerpc: __copy_tofrom_user tweaked for Cell

2008-06-19 Thread Mark Nelson
 * The naming of the labels (with just numbers) is rather confusing,
 it would be good to have something better, but I must admit that
 I don't have a good idea either.

I will admit that at first glance the label naming with numbers
does look confusing but when you notice that all the loads start
at 20 and all the stores start at 60 and that to get the exception
handler for those instructions you just add 100 I think it makes
sense, but that could be because I've been looking at it way too
long...

(I thought I had a comment in there to that effect but it must
have gotten lost along the way. I'll add a new comment
explaining the above, that should help)

 
 * The trick of using the condition code in cr7 for the last bytes
 is really cute, but are the four branches actually better than a
 single computed branch into the middle of 15 byte wise copies?

The original copy_tofrom_user does this also, which I guess is
carried over to this new version...
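
(For the curious, the equivalent logic in C is below -- a sketch only,
assuming r holds the residual byte count whose low four bits the asm
loads into cr7 with mtcrf 0x01,r8:)

#include <string.h>

/* Each bit of the 0..15 residue selects one power-of-two sized copy,
 * so at most four copies (and four branches) finish any remainder. */
static void copy_tail(char *dst, const char *src, unsigned int r)
{
	if (r & 1) { memcpy(dst, src, 1); dst += 1; src += 1; }
	if (r & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
	if (r & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
	if (r & 8) { memcpy(dst, src, 8); }
}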

Gunnar did you have an old version that did something similar
to this?

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [Cbe-oss-dev] [RFC 3/3] powerpc: copy_4K_page tweaked for Cell

2008-06-19 Thread Mark Nelson
On Fri, 20 Jun 2008 07:28:50 am Arnd Bergmann wrote:
 On Thursday 19 June 2008, Mark Nelson wrote:
  .align  7
  _GLOBAL(copy_4K_page)
  dcbt0,r4/* Prefetch ONE SRC cacheline */
  
  addir6,r3,-8/* prepare for stdu */
  addir4,r4,-8/* prepare for ldu */
  
  li  r10,32  /* copy 32 cache lines for a 4K page */
  li  r12,128+8   /* prefetch distance*/
 
 Since you have a loop here anyway instead of the fully unrolled
 code, why not provide a copy_64K_page function as well, jumping in
 here?

That is a good idea. What effect will that have on how the code
patching will work?

 
 The inline 64k copy_page function otherwise just adds code size,
 as well as being a tiny bit slower. It may even be good to
 have an out-of-line copy_64K_page for the regular code, just
 calling copy_4K_page repeatedly.

Doing that sounds like it'll make the code patching easier.

Thanks!

Mark
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC] POWERPC: Merge 32 and 64-bit dma code

2008-05-28 Thread Mark Nelson
On Fri, 23 May 2008 06:06:50 pm Mark Nelson wrote:
 On Thu, 1 May 2008 09:36:43 am Becky Bruce wrote:
  
  I essentially adopt the 64-bit dma code, with some changes to support
  32-bit systems, including HIGHMEM.  dma functions on 32-bit are now
  invoked via accessor functions which call the correct op for a device based
  on archdata dma_ops.  Currently, this defaults to dma_direct_ops, but this
  structure will make it easier to do iommu/swiotlb on 32-bit going
  forward.
  
  In addition, the dma_map/unmap_page functions are added to dma_ops on
  HIGHMEM-enabled configs because we can't just fall back on map/unmap_single
  when HIGHMEM is enabled.  Adding these to dma_ops makes it cleaner to
  substitute different functionality once we have iommu/swiotlb support.
  
  This code conflicts with the dma_attrs code that Mark Nelson just pushed.
  At this point, I'm just looking for some review, and suggestions on how 
  this code might be improved.  I'll uprev it to work with Mark's code once
  that goes in.
 
 The last patch of my series may be in question so it could end up that this
 patch makes it in first. It shouldn't be too hard to respin my patches after
 your merge so no worries there though.
 
  
  There will be other patches that precede this one - I plan to duplicate 
  dma_mapping.h into include/asm-ppc to avoid breakage there.  There will 
  also be a patch to rename dma_64.c to dma.c, and a series of patches to 
  modify drivers that pass NULL dev pointers.
  
  Dma experts, please review this when you can - I was a dma newbie 
  until very recently, and the odds that I'm missing some subtlety 
  in this merge are fairly high.
  
 
 I'm far from a DMA expert but this all looks sane to me - I can't really
 comment on the 32bit side of things but I don't think it's going to break
 anything on 64bit (it compiles fine on cell and pseries).
 
 I'll try and test boot it on Monday.

Not quite Monday, but it boots fine on the QS22 and QS21 Cell blades
and on a Power6 box.

Mark.

 
 We should get BenH to look at it but he's travelling at the moment...
 
 Mark.
 
  Cheers,
  Becky
  
   ---
   arch/powerpc/kernel/Makefile  |4 +-
   arch/powerpc/kernel/dma_64.c  |   80 ++-
   arch/powerpc/kernel/pci-common.c  |   53 +
   arch/powerpc/kernel/pci_32.c  |7 ++
   arch/powerpc/kernel/pci_64.c  |   49 
   include/asm-powerpc/dma-mapping.h |  156 
  +
   include/asm-powerpc/machdep.h |5 +-
   include/asm-powerpc/pci.h |   14 ++--
   8 files changed, 186 insertions(+), 182 deletions(-)
 snip
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [RFC] POWERPC: Merge 32 and 64-bit dma code

2008-05-23 Thread Mark Nelson
On Thu, 1 May 2008 09:36:43 am Becky Bruce wrote:
 
 I essentially adopt the 64-bit dma code, with some changes to support
 32-bit systems, including HIGHMEM.  dma functions on 32-bit are now
 invoked via accessor functions which call the correct op for a device based
 on archdata dma_ops.  Currently, this defaults to dma_direct_ops, but this
 structure will make it easier to do iommu/swiotlb on 32-bit going
 forward.
 
 In addition, the dma_map/unmap_page functions are added to dma_ops on
 HIGHMEM-enabled configs because we can't just fall back on map/unmap_single
 when HIGHMEM is enabled.  Adding these to dma_ops makes it cleaner to
 substitute different functionality once we have iommu/swiotlb support.
 
 This code conflicts with the dma_attrs code that Mark Nelson just pushed.
 At this point, I'm just looking for some review, and suggestions on how 
 this code might be improved.  I'll uprev it to work with Mark's code once
 that goes in.

The last patch of my series may be in question so it could end up that this
patch makes it in first. It shouldn't be too hard to respin my patches after
your merge so no worries there though.

 
 There will be other patches that precede this one - I plan to duplicate 
 dma_mapping.h into include/asm-ppc to avoid breakage there.  There will 
 also be a patch to rename dma_64.c to dma.c, and a series of patches to 
 modify drivers that pass NULL dev pointers.
 
 Dma experts, please review this when you can - I was a dma newbie 
 until very recently, and the odds that I'm missing some subtlety 
 in this merge are fairly high.
 

I'm far from a DMA expert but this all looks sane to me - I can't really
comment on the 32bit side of things but I don't think it's going to break
anything on 64bit (it compiles fine on cell and pseries).

I'll try and test boot it on Monday.

We should get BenH to look at it but he's travelling at the moment...

Mark.

 Cheers,
 Becky
 
  ---
  arch/powerpc/kernel/Makefile  |4 +-
  arch/powerpc/kernel/dma_64.c  |   80 ++-
  arch/powerpc/kernel/pci-common.c  |   53 +
  arch/powerpc/kernel/pci_32.c  |7 ++
  arch/powerpc/kernel/pci_64.c  |   49 
  include/asm-powerpc/dma-mapping.h |  156 
 +
  include/asm-powerpc/machdep.h |5 +-
  include/asm-powerpc/pci.h |   14 ++--
  8 files changed, 186 insertions(+), 182 deletions(-)
snip
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [PATCH 5/6] [POWERPC] Move device_to_mask() to dma-mapping.h

2008-05-01 Thread Mark Nelson
On Thu, 1 May 2008 07:04:30 pm Segher Boessenkool wrote:
  Move device_to_mask() to dma-mapping.h because we need to use it from
  outside dma_64.c in a later patch.
 
 Why does this need to become an inline function?
 
 
 Segher
 

I'm not sure exactly what you mean - it was inline before the move.

But, I honestly didn't think about it too much (although I possibly should
have)... I figured that it was static inline before my patch so I should leave
it as such.

But if everybody thinks it would be better to leave it in dma_64.c and just
expose it for use outside, I'm fine with that.

Thanks!

Mark.
___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [PATCH] [POWERPC] Add struct iommu_table argument to iommu_map_sg()

2008-04-29 Thread Mark Nelson

Olof Johansson wrote:

On Tue, Apr 29, 2008 at 03:17:45PM +1000, Mark Nelson wrote:

Make iommu_map_sg take a struct iommu_table. It did so before commit
740c3ce66700640a6e6136ff679b067e92125794 (iommu sg merging: ppc: make
iommu respect the segment size limits).

This stops the function looking in the archdata.dma_data for the iommu
table because in the future it will be called with a device that has
no table there.


The logical thing would be to add the archdata.dma_data to said device
instead, no? Without seeing the rest of the code that makes use of it
it's hard to tell anyway, so please post that.


I'll post the follow on series of patches as an RFC to give you a better
understanding of what I'm trying to do; but basically I'm building on
the new dma_*map*_attrs() interfaces that Andrew Morton sent to Linus
last night and implementing them for powerpc (in particular the Cell
platform). To cut a longish story short, the fixed linear mapping for
the Cell's IOMMU can be set to either be weakly or strongly ordered,
and so a device that has been identified as being able to use the
fixed mapping can later request a mapping that cannot be satisfied using
the fixed mapping (eg: it may request a weakly ordered mapping when
the fixed mapping is strongly ordered, or vice versa). Devices that
use the fixed mapping use the dma_direct_ops with their dma_data being
the necessary offset, but for the case above they have to use the
iommu_map*() functions that the dma_iommu_ops use. So that's how we end
up calling iommu_map_sg() with a device that doesn't have the table
in the dma_data (and already has something useful in dma_data).
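
A rough pseudo-C sketch of the dispatch being described (helper names
here are hypothetical, invented just to show the shape; the real code
is in the follow-on series):

/* Hypothetical sketch only: a device on the fixed linear mapping keeps
 * dma_direct_ops, with archdata.dma_data holding the fixed-window
 * offset. A request whose ordering doesn't match the fixed window
 * can't use that path, so it drops to the dynamic iommu window and
 * must pass the table explicitly -- dma_data is already taken. */
static int cell_map_sg(struct device *dev, struct scatterlist *sgl,
		       int nents, enum dma_data_direction dir,
		       int want_weak_ordering)
{
	if (fixed_mapping_ordering_ok(dev, want_weak_ordering))	/* hypothetical */
		return dma_direct_ops.map_sg(dev, sgl, nents, dir);

	return iommu_map_sg(dev, cell_get_iommu_table(dev),	/* hypothetical */
			    sgl, nents, device_to_mask(dev), dir);
}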

I'll try to get the follow on series of patches cleaned up today and
send them out asap, so if my ramblings above turned out to be too
obscure the code will hopefully clear things up.

Thanks!

Mark.




This also has the nice side effect of making iommu_map_sg() match the
other map functions.


Consistency is good, but I wonder if the opposite wouldn't be the better
way to go here: always just pass down just the dev pointer instead. The
table can be reached from it.


-Olof


___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH] [POWERPC] Add struct iommu_table argument to iommu_map_sg()

2008-04-28 Thread Mark Nelson

Make iommu_map_sg take a struct iommu_table. It did so before commit
740c3ce66700640a6e6136ff679b067e92125794 (iommu sg merging: ppc: make
iommu respect the segment size limits).

This stops the function looking in the archdata.dma_data for the iommu
table because in the future it will be called with a device that has
no table there.

This also has the nice side effect of making iommu_map_sg() match the
other map functions.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
arch/powerpc/kernel/dma_64.c |2 +-
arch/powerpc/kernel/iommu.c  |7 +++
include/asm-powerpc/iommu.h  |6 +++---
3 files changed, 7 insertions(+), 8 deletions(-)

Index: upstream/arch/powerpc/kernel/dma_64.c
===
--- upstream.orig/arch/powerpc/kernel/dma_64.c
+++ upstream/arch/powerpc/kernel/dma_64.c
@@ -68,7 +68,7 @@ static void dma_iommu_unmap_single(struc
static int dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
int nelems, enum dma_data_direction direction)
{
-   return iommu_map_sg(dev, sglist, nelems,
+	return iommu_map_sg(dev, dev->archdata.dma_data, sglist, nelems,
device_to_mask(dev), direction);
}

Index: upstream/arch/powerpc/kernel/iommu.c
===
--- upstream.orig/arch/powerpc/kernel/iommu.c
+++ upstream/arch/powerpc/kernel/iommu.c
@@ -267,11 +267,10 @@ static void iommu_free(struct iommu_tabl
	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}

-int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
-int nelems, unsigned long mask,
-enum dma_data_direction direction)
+int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+struct scatterlist *sglist, int nelems,
+unsigned long mask, enum dma_data_direction direction)
{
-	struct iommu_table *tbl = dev->archdata.dma_data;
dma_addr_t dma_next = 0, dma_addr;
unsigned long flags;
struct scatterlist *s, *outs, *segstart;
Index: upstream/include/asm-powerpc/iommu.h
===
--- upstream.orig/include/asm-powerpc/iommu.h
+++ upstream/include/asm-powerpc/iommu.h
@@ -79,9 +79,9 @@ extern void iommu_free_table(struct iomm
extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
int nid);

-extern int iommu_map_sg(struct device *dev, struct scatterlist *sglist,
-   int nelems, unsigned long mask,
-   enum dma_data_direction direction);
+extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+   struct scatterlist *sglist, int nelems,
+   unsigned long mask, enum dma_data_direction direction);
extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
   int nelems, enum dma_data_direction direction);

___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


Re: [patch 1/2] Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE #define

2007-10-11 Thread Mark Nelson
Kumar Gala wrote:
 
 On Oct 11, 2007, at 2:15 AM, [EMAIL PROTECTED] wrote:
 
 Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE in the coredump code which
 allows for more flexibility in the note type for the state of 'extended
 floating point' implementations in coredumps. New note types can now be
 added with an appropriate #define.

 #define ELF_CORE_XFPREG_TYPE to be NT_PRXFPREG in all current users so
there's no change in behaviour.
 
Can we make this ELF_CORE_VECREG_TYPE or something that is not so coupled to
 the x86 specific name?
 

 Signed-off-by: Mark Nelson [EMAIL PROTECTED]
 ---
  arch/ia64/ia32/elfcore32.h |1 +
  arch/x86_64/ia32/ia32_binfmt.c |1 +
  fs/binfmt_elf.c|4 ++--
  include/asm-i386/elf.h |1 +
  4 files changed, 5 insertions(+), 2 deletions(-)

 Index: linux/arch/ia64/ia32/elfcore32.h
 ===
 --- linux.orig/arch/ia64/ia32/elfcore32.h
 +++ linux/arch/ia64/ia32/elfcore32.h
 @@ -117,6 +117,7 @@ elf_core_copy_task_fpregs(struct task_st
  }

  #define ELF_CORE_COPY_XFPREGS 1
 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
  static inline int
  elf_core_copy_task_xfpregs(struct task_struct *tsk, elf_fpxregset_t
 *xfpu)
  {
 Index: linux/arch/x86_64/ia32/ia32_binfmt.c
 ===
 --- linux.orig/arch/x86_64/ia32/ia32_binfmt.c
 +++ linux/arch/x86_64/ia32/ia32_binfmt.c
 @@ -188,6 +188,7 @@ elf_core_copy_task_fpregs(struct task_st
  }

  #define ELF_CORE_COPY_XFPREGS 1
 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
  static inline int
  elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
  {
 Index: linux/fs/binfmt_elf.c
 ===
 --- linux.orig/fs/binfmt_elf.c
 +++ linux/fs/binfmt_elf.c
 @@ -1446,8 +1446,8 @@ static int elf_dump_thread_status(long s

  #ifdef ELF_CORE_COPY_XFPREGS
  if (elf_core_copy_task_xfpregs(p, t-xfpu)) {
 -fill_note(t-notes[2], LINUX, NT_PRXFPREG, sizeof(t-xfpu),
 -  t-xfpu);
 +fill_note(t-notes[2], LINUX, ELF_CORE_XFPREG_TYPE,
 +  sizeof(t-xfpu), t-xfpu);
  t-num_notes++;
  sz += notesize(t-notes[2]);
  }
 
 You've only fixed up one of 4 NT_PRXFPREG uses in the kernel.

Ooops... Right you are - very good pickup :) Fixed in a new version (will send 
promptly).


Thanks!

Mark.

 
 Also, I'm not a fan of your proposed mechanism to overload the
 elf_fpxregset_t.  I'd rather see us introduce a new elf_vecregset_t and
 have it typedef'd to be elf_fpxregset_t on i386, x86_64, ia64
 
 Index: linux/include/asm-i386/elf.h
 ===
 --- linux.orig/include/asm-i386/elf.h
 +++ linux/include/asm-i386/elf.h
 @@ -129,6 +129,7 @@ extern int dump_task_extended_fpu (struc
  #define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk,
 elf_regs)
  #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk,
 elf_fpregs)
  #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs)
 dump_task_extended_fpu(tsk, elf_xfpregs)
 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG

 #define VDSO_HIGH_BASE		(__fix_to_virt(FIX_VDSO))
 #define VDSO_CURRENT_BASE	((unsigned long)current->mm->context.vdso)

 -- 
 ___
 Linuxppc-dev mailing list
 Linuxppc-dev@ozlabs.org
 https://ozlabs.org/mailman/listinfo/linuxppc-dev
 

___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev


[PATCH 1/2] [V2] Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE #define

2007-10-11 Thread Mark Nelson
Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE in the coredump code which
allows for more flexibility in the note type for the state of 'extended
floating point' implementations in coredumps.  New note types can now be
added with an appropriate #define.

This does #define ELF_CORE_XFPREG_TYPE to be NT_PRXFPREG in all
current users so there's no change in behaviour.

This will let us use different note types on powerpc for the
Altivec/VMX state that some PowerPC cpus have (G4, PPC970, POWER6) and
for the SPE (signal processing extension) state that some embedded
PowerPC cpus from Freescale have.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/ia64/ia32/elfcore32.h |1 +
 arch/x86_64/ia32/ia32_binfmt.c |1 +
 fs/binfmt_elf.c|8 
 fs/binfmt_elf_fdpic.c  |6 +++---
 include/asm-i386/elf.h |1 +
 5 files changed, 10 insertions(+), 7 deletions(-)

Index: linux/arch/ia64/ia32/elfcore32.h
===
--- linux.orig/arch/ia64/ia32/elfcore32.h
+++ linux/arch/ia64/ia32/elfcore32.h
@@ -117,6 +117,7 @@ elf_core_copy_task_fpregs(struct task_st
 }
 
 #define ELF_CORE_COPY_XFPREGS 1
+#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 static inline int
 elf_core_copy_task_xfpregs(struct task_struct *tsk, elf_fpxregset_t *xfpu)
 {
Index: linux/arch/x86_64/ia32/ia32_binfmt.c
===
--- linux.orig/arch/x86_64/ia32/ia32_binfmt.c
+++ linux/arch/x86_64/ia32/ia32_binfmt.c
@@ -188,6 +188,7 @@ elf_core_copy_task_fpregs(struct task_st
 }
 
 #define ELF_CORE_COPY_XFPREGS 1
+#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 static inline int 
 elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
 {
Index: linux/fs/binfmt_elf.c
===
--- linux.orig/fs/binfmt_elf.c
+++ linux/fs/binfmt_elf.c
@@ -1411,7 +1411,7 @@ struct elf_thread_status
elf_fpregset_t fpu; /* NT_PRFPREG */
struct task_struct *thread;
 #ifdef ELF_CORE_COPY_XFPREGS
-   elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
+   elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
 #endif
struct memelfnote notes[3];
int num_notes;
@@ -1446,8 +1446,8 @@ static int elf_dump_thread_status(long s
 
 #ifdef ELF_CORE_COPY_XFPREGS
	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
-			  &t->xfpu);
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(t->xfpu), &t->xfpu);
		t->num_notes++;
		sz += notesize(&t->notes[2]);
	}
@@ -1624,7 +1624,7 @@ static int elf_core_dump(long signr, str
 #ifdef ELF_CORE_COPY_XFPREGS
if (elf_core_copy_task_xfpregs(current, xfpu))
fill_note(notes + numnote++,
-			  "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu);
+			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
 #endif 
   
fs = get_fs();
Index: linux/fs/binfmt_elf_fdpic.c
===
--- linux.orig/fs/binfmt_elf_fdpic.c
+++ linux/fs/binfmt_elf_fdpic.c
@@ -1417,7 +1417,7 @@ struct elf_thread_status
elf_fpregset_t fpu; /* NT_PRFPREG */
struct task_struct *thread;
 #ifdef ELF_CORE_COPY_XFPREGS
-   elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
+   elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
 #endif
struct memelfnote notes[3];
int num_notes;
@@ -1453,7 +1453,7 @@ static int elf_dump_thread_status(long s
 
 #ifdef ELF_CORE_COPY_XFPREGS
if (elf_core_copy_task_xfpregs(p, t-xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(t->xfpu),
			  &t->xfpu);
t-num_notes++;
sz += notesize(t-notes[2]);
@@ -1690,7 +1690,7 @@ static int elf_fdpic_core_dump(long sign
 #ifdef ELF_CORE_COPY_XFPREGS
if (elf_core_copy_task_xfpregs(current, xfpu))
fill_note(notes + numnote++,
-			  "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu);
+			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
 #endif
 
fs = get_fs();
Index: linux/include/asm-i386/elf.h
===
--- linux.orig/include/asm-i386/elf.h
+++ linux/include/asm-i386/elf.h
@@ -129,6 +129,7 @@ extern int dump_task_extended_fpu (struc
 #define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs)
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, 
elf_xfpregs)
+#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 
 #define

Re: [PATCH 1/2] [V2] Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE #define

2007-10-11 Thread Mark Nelson
Sorry for the patch noise but please disregard this patch - a line is longer 
than 80 characters and I'd hate to be brought up on that...

V3 will be the perfect version then :)


Thanks and apologies again!

Mark.

Mark Nelson wrote:
 Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE in the coredump code which
 allows for more flexibility in the note type for the state of 'extended
 floating point' implementations in coredumps.  New note types can now be
 added with an appropriate #define.
 
 This does #define ELF_CORE_XFPREG_TYPE to be NT_PRXFPREG in all
current users so there's no change in behaviour.
 
 This will let us use different note types on powerpc for the
 Altivec/VMX state that some PowerPC cpus have (G4, PPC970, POWER6) and
 for the SPE (signal processing extension) state that some embedded
 PowerPC cpus from Freescale have.
 
 Signed-off-by: Mark Nelson [EMAIL PROTECTED]
 ---
  arch/ia64/ia32/elfcore32.h |1 +
  arch/x86_64/ia32/ia32_binfmt.c |1 +
  fs/binfmt_elf.c|8 
  fs/binfmt_elf_fdpic.c  |6 +++---
  include/asm-i386/elf.h |1 +
  5 files changed, 10 insertions(+), 7 deletions(-)
 
 Index: linux/arch/ia64/ia32/elfcore32.h
 ===
 --- linux.orig/arch/ia64/ia32/elfcore32.h
 +++ linux/arch/ia64/ia32/elfcore32.h
 @@ -117,6 +117,7 @@ elf_core_copy_task_fpregs(struct task_st
  }
  
  #define ELF_CORE_COPY_XFPREGS 1
 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
  static inline int
  elf_core_copy_task_xfpregs(struct task_struct *tsk, elf_fpxregset_t *xfpu)
  {
 Index: linux/arch/x86_64/ia32/ia32_binfmt.c
 ===
 --- linux.orig/arch/x86_64/ia32/ia32_binfmt.c
 +++ linux/arch/x86_64/ia32/ia32_binfmt.c
 @@ -188,6 +188,7 @@ elf_core_copy_task_fpregs(struct task_st
  }
  
  #define ELF_CORE_COPY_XFPREGS 1
 +#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
  static inline int 
  elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
  {
 Index: linux/fs/binfmt_elf.c
 ===
 --- linux.orig/fs/binfmt_elf.c
 +++ linux/fs/binfmt_elf.c
 @@ -1411,7 +1411,7 @@ struct elf_thread_status
   elf_fpregset_t fpu; /* NT_PRFPREG */
   struct task_struct *thread;
  #ifdef ELF_CORE_COPY_XFPREGS
 - elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
 + elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
  #endif
   struct memelfnote notes[3];
   int num_notes;
 @@ -1446,8 +1446,8 @@ static int elf_dump_thread_status(long s
  
  #ifdef ELF_CORE_COPY_XFPREGS
   if (elf_core_copy_task_xfpregs(p, t-xfpu)) {
 - fill_note(t-notes[2], LINUX, NT_PRXFPREG, sizeof(t-xfpu),
 -   t-xfpu);
 + fill_note(t-notes[2], LINUX, ELF_CORE_XFPREG_TYPE,
 +   sizeof(t-xfpu), t-xfpu);
   t-num_notes++;
   sz += notesize(t-notes[2]);
   }
 @@ -1624,7 +1624,7 @@ static int elf_core_dump(long signr, str
  #ifdef ELF_CORE_COPY_XFPREGS
   if (elf_core_copy_task_xfpregs(current, xfpu))
   fill_note(notes + numnote++,
 -   LINUX, NT_PRXFPREG, sizeof(*xfpu), xfpu);
 +   LINUX, ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
  #endif   

   fs = get_fs();
 Index: linux/fs/binfmt_elf_fdpic.c
 ===
 --- linux.orig/fs/binfmt_elf_fdpic.c
 +++ linux/fs/binfmt_elf_fdpic.c
 @@ -1417,7 +1417,7 @@ struct elf_thread_status
   elf_fpregset_t fpu; /* NT_PRFPREG */
   struct task_struct *thread;
  #ifdef ELF_CORE_COPY_XFPREGS
 - elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
 + elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
  #endif
   struct memelfnote notes[3];
   int num_notes;
 @@ -1453,7 +1453,7 @@ static int elf_dump_thread_status(long s
  
  #ifdef ELF_CORE_COPY_XFPREGS
   if (elf_core_copy_task_xfpregs(p, t-xfpu)) {
 - fill_note(t-notes[2], LINUX, NT_PRXFPREG, sizeof(t-xfpu),
 + fill_note(t-notes[2], LINUX, ELF_CORE_XFPREG_TYPE, 
 sizeof(t-xfpu),

the line above is the offender...

 t-xfpu);
   t-num_notes++;
   sz += notesize(t-notes[2]);
 @@ -1690,7 +1690,7 @@ static int elf_fdpic_core_dump(long sign
  #ifdef ELF_CORE_COPY_XFPREGS
   if (elf_core_copy_task_xfpregs(current, xfpu))
   fill_note(notes + numnote++,
 -   LINUX, NT_PRXFPREG, sizeof(*xfpu), xfpu);
 +   LINUX, ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
  #endif
  
   fs = get_fs();
 Index: linux/include/asm-i386/elf.h
 ===
 --- linux.orig/include/asm-i386/elf.h
 +++ linux/include/asm-i386/elf.h
 @@ -129,6 +129,7 @@ extern int

[PATCH 1/2] [V3] Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE #define

2007-10-11 Thread Mark Nelson
Replace NT_PRXFPREG with ELF_CORE_XFPREG_TYPE in the coredump code which
allows for more flexibility in the note type for the state of 'extended
floating point' implementations in coredumps.  New note types can now be
added with an appropriate #define.

This does #define ELF_CORE_XFPREG_TYPE to be NT_PRXFPREG in all
current users so there's no change in behaviour.

This will let us use different note types on powerpc for the
Altivec/VMX state that some PowerPC cpus have (G4, PPC970, POWER6) and
for the SPE (signal processing extension) state that some embedded
PowerPC cpus from Freescale have.
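
(As a purely illustrative aside, the powerpc side could then end up as
something like the sketch below -- the note-type name is hypothetical,
chosen only to show the shape of the hook:)

/* Hypothetical powerpc glue, for illustration only: reuse the coredump
 * hook but emit the Altivec/VMX registers under a dedicated note type
 * instead of NT_PRXFPREG. NT_PPC_VMX is an assumed name here. */
#define ELF_CORE_COPY_XFPREGS 1
#define ELF_CORE_XFPREG_TYPE	NT_PPC_VMX
#define elf_core_copy_task_xfpregs(tsk, regs)	dump_task_altivec(tsk, regs)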

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/ia64/ia32/elfcore32.h |1 +
 arch/x86_64/ia32/ia32_binfmt.c |1 +
 fs/binfmt_elf.c|8 
 fs/binfmt_elf_fdpic.c  |8 
 include/asm-i386/elf.h |1 +
 5 files changed, 11 insertions(+), 8 deletions(-)

Index: linux/arch/ia64/ia32/elfcore32.h
===
--- linux.orig/arch/ia64/ia32/elfcore32.h
+++ linux/arch/ia64/ia32/elfcore32.h
@@ -117,6 +117,7 @@ elf_core_copy_task_fpregs(struct task_st
 }
 
 #define ELF_CORE_COPY_XFPREGS 1
+#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 static inline int
 elf_core_copy_task_xfpregs(struct task_struct *tsk, elf_fpxregset_t *xfpu)
 {
Index: linux/arch/x86_64/ia32/ia32_binfmt.c
===
--- linux.orig/arch/x86_64/ia32/ia32_binfmt.c
+++ linux/arch/x86_64/ia32/ia32_binfmt.c
@@ -188,6 +188,7 @@ elf_core_copy_task_fpregs(struct task_st
 }
 
 #define ELF_CORE_COPY_XFPREGS 1
+#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG
 static inline int 
 elf_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu)
 {
Index: linux/fs/binfmt_elf.c
===
--- linux.orig/fs/binfmt_elf.c
+++ linux/fs/binfmt_elf.c
@@ -1411,7 +1411,7 @@ struct elf_thread_status
elf_fpregset_t fpu; /* NT_PRFPREG */
struct task_struct *thread;
 #ifdef ELF_CORE_COPY_XFPREGS
-   elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
+   elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
 #endif
struct memelfnote notes[3];
int num_notes;
@@ -1446,8 +1446,8 @@ static int elf_dump_thread_status(long s
 
 #ifdef ELF_CORE_COPY_XFPREGS
	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
-			  &t->xfpu);
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(t->xfpu), &t->xfpu);
		t->num_notes++;
		sz += notesize(&t->notes[2]);
	}
@@ -1624,7 +1624,7 @@ static int elf_core_dump(long signr, str
 #ifdef ELF_CORE_COPY_XFPREGS
if (elf_core_copy_task_xfpregs(current, xfpu))
fill_note(notes + numnote++,
-			  "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu);
+			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
 #endif 
   
fs = get_fs();
Index: linux/fs/binfmt_elf_fdpic.c
===
--- linux.orig/fs/binfmt_elf_fdpic.c
+++ linux/fs/binfmt_elf_fdpic.c
@@ -1417,7 +1417,7 @@ struct elf_thread_status
elf_fpregset_t fpu; /* NT_PRFPREG */
struct task_struct *thread;
 #ifdef ELF_CORE_COPY_XFPREGS
-   elf_fpxregset_t xfpu;   /* NT_PRXFPREG */
+   elf_fpxregset_t xfpu;   /* ELF_CORE_XFPREG_TYPE */
 #endif
struct memelfnote notes[3];
int num_notes;
@@ -1453,8 +1453,8 @@ static int elf_dump_thread_status(long s
 
 #ifdef ELF_CORE_COPY_XFPREGS
	if (elf_core_copy_task_xfpregs(p, &t->xfpu)) {
-		fill_note(&t->notes[2], "LINUX", NT_PRXFPREG, sizeof(t->xfpu),
-			  &t->xfpu);
+		fill_note(&t->notes[2], "LINUX", ELF_CORE_XFPREG_TYPE,
+			  sizeof(t->xfpu), &t->xfpu);
		t->num_notes++;
		sz += notesize(&t->notes[2]);
	}
@@ -1690,7 +1690,7 @@ static int elf_fdpic_core_dump(long sign
 #ifdef ELF_CORE_COPY_XFPREGS
if (elf_core_copy_task_xfpregs(current, xfpu))
fill_note(notes + numnote++,
-			  "LINUX", NT_PRXFPREG, sizeof(*xfpu), xfpu);
+			  "LINUX", ELF_CORE_XFPREG_TYPE, sizeof(*xfpu), xfpu);
 #endif
 
fs = get_fs();
Index: linux/include/asm-i386/elf.h
===
--- linux.orig/include/asm-i386/elf.h
+++ linux/include/asm-i386/elf.h
@@ -129,6 +129,7 @@ extern int dump_task_extended_fpu (struc
 #define ELF_CORE_COPY_TASK_REGS(tsk, elf_regs) dump_task_regs(tsk, elf_regs)
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 #define ELF_CORE_COPY_XFPREGS(tsk, elf_xfpregs) dump_task_extended_fpu(tsk, 
elf_xfpregs)
+#define

Re: [PATCH] add Altivec/VMX state to coredumps

2007-09-25 Thread Mark Nelson
Kumar Gala wrote:
 
 On Sep 24, 2007, at 11:03 PM, Mark Nelson wrote:
 
 Update dump_task_altivec() (that has so far never been put to use)
 so that it dumps the Altivec/VMX registers (VR[0] - VR[31], VSCR
 and VRSAVE) in the same format as the ptrace get_vrregs() and add
 the appropriate glue typedefs and #defines to
 include/asm-powerpc/elf.h for it to work.
 
 Is there some way to tell if the core dump has altivec registers state
 in it?
 
 I'm wondering how we distinguish a core dump w/altivec state vs one with
 SPE state.
 
 - k
 
 

If the core dump has the Altivec registers saved in there it will have a
note called LINUX as shown below:

$ readelf -n core

Notes at offset 0x02b4 with length 0x05c8:
  Owner Data size   Description
  CORE  0x010c  NT_PRSTATUS (prstatus structure)
  CORE  0x0080  NT_PRPSINFO (prpsinfo structure)
  CORE  0x00b0  NT_AUXV (auxiliary vector)
  CORE  0x0108  NT_FPREGSET (floating point registers)
  LINUX 0x0220  NT_PRXFPREG (user_xfpregs structure)

This mirrors what occurs with the SSE registers on i386 core dumps in
order to keep things as similar as possible.

I can't find any place where dump_spe() is called at the moment, but I
suppose if it were to be hooked up in the future it could cause confusion.
The Altivec register state in the core file is much larger than what
would be dumped by the current dump_spe(), but I'm not sure if that
matters...

There's a patch for GDB that currently reads the contents of these vector
registers from the core file, but it's being held until this patch has
been commented on and/or approved, so if it comes to that the note name
could be changed to "ALTIVEC" (or something similar).


Thanks!
Mark.


Re: [PATCH] add Altivec/VMX state to coredumps

2007-09-25 Thread Mark Nelson
Kumar Gala wrote:
 
 On Sep 25, 2007, at 8:22 PM, Mark Nelson wrote:
 
 Kumar Gala wrote:

 On Sep 24, 2007, at 11:03 PM, Mark Nelson wrote:

 Update dump_task_altivec() (that has so far never been put to use)
 so that it dumps the Altivec/VMX registers (VR[0] - VR[31], VSCR
 and VRSAVE) in the same format as the ptrace get_vrregs() and add
 the appropriate glue typedefs and #defines to
 include/asm-powerpc/elf.h for it to work.

 Is there some way to tell if the core dump has Altivec register state
 in it?

 I'm wondering how we distinguish a core dump w/altivec state vs one with
 SPE state.

 - k



 If the core dump has the Altivec registers saved in there, it will have a
 note named "LINUX", as shown below:

 $ readelf -n core

 Notes at offset 0x02b4 with length 0x05c8:
   Owner Data size   Description
   CORE  0x010c  NT_PRSTATUS (prstatus structure)
   CORE  0x0080  NT_PRPSINFO (prpsinfo structure)
   CORE  0x00b0  NT_AUXV (auxiliary vector)
   CORE  0x0108  NT_FPREGSET (floating point registers)
   LINUX 0x0220  NT_PRXFPREG (user_xfpregs structure)

 This mirrors what occurs with the SSE registers on i386 core dumps in
 order to keep things as similar as possible.

 I can't find any place where dump_spe() is called at the moment, but I
 suppose if it were to be hooked up in the future it could cause
 confusion.
 The Altivec register state in the core file is much larger than what
 would be dumped by the current dump_spe(), but I'm not sure if that
 matters...

 There's a patch for GDB that currently reads the contents of these vector
 registers from the core file, but it's being held until this patch has
 been commented on and/or approved, so if it comes to that the note name
 could be changed to "ALTIVEC" (or something similar).
 
 I think we should NOT overload NT_PRXFPREG and add proper note types
 NT_ALTIVEC & NT_SPE for those register sets.
 
 Who on the GDB side would we need to coordinate such a change with?
 
 - k
 

You're probably right :)

What cores have SPE at the moment? Also, perhaps more importantly, are there 
any plans to have Altivec and SPE in the same core?

I've been working with Carlos Eduardo Seo (Cc'ed on this mail) on the GDB side 
of this.
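
To make the suggestion concrete, here's a toy sketch of what distinct
note types could look like from the reader's side. The NT_* values are
placeholders, not an agreed ABI; the point is only that a reader could
branch on n_type instead of inferring the contents from the note size:

#include <stdio.h>

#define NT_PPC_ALTIVEC 0x100 /* placeholder value, not an ABI claim */
#define NT_PPC_SPE     0x101 /* placeholder value, not an ABI claim */

static const char *regset_name(unsigned int n_type)
{
	switch (n_type) {
	case NT_PPC_ALTIVEC:
		return "Altivec/VMX (VR0-VR31, VSCR, VRSAVE)";
	case NT_PPC_SPE:
		return "SPE/EVR";
	default:
		return "unknown register set";
	}
}

int main(void)
{
	printf("0x100 -> %s\n", regset_name(NT_PPC_ALTIVEC));
	return 0;
}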


Thanks!

Mark.


[PATCH] add Altivec/VMX state to coredumps

2007-09-24 Thread Mark Nelson
Update dump_task_altivec() (that has so far never been put to use)
so that it dumps the Altivec/VMX registers (VR[0] - VR[31], VSCR
and VRSAVE) in the same format as the ptrace get_vrregs() and add
the appropriate glue typedefs and #defines to
include/asm-powerpc/elf.h for it to work.

Signed-off-by: Mark Nelson [EMAIL PROTECTED]
---
 arch/powerpc/kernel/process.c |   28 +---
 include/asm-powerpc/elf.h |7 +++
 2 files changed, 32 insertions(+), 3 deletions(-)

Index: linux/arch/powerpc/kernel/process.c
===
--- linux.orig/arch/powerpc/kernel/process.c
+++ linux/arch/powerpc/kernel/process.c
@@ -149,10 +149,32 @@ void flush_altivec_to_thread(struct task
}
 }
 
-int dump_task_altivec(struct pt_regs *regs, elf_vrregset_t *vrregs)
+int dump_task_altivec(struct task_struct *tsk, elf_vrregset_t *vrregs)
 {
-   flush_altivec_to_thread(current);
-	memcpy(vrregs, &current->thread.vr[0], sizeof(*vrregs));
+   /* ELF_NVRREG includes the VSCR and VRSAVE which we need to save
+	 * separately, see below */
+   const int nregs = ELF_NVRREG - 2;
+   elf_vrreg_t *reg;
+   u32 *dest;
+
+   if (tsk == current)
+   flush_altivec_to_thread(tsk);
+
+   reg = (elf_vrreg_t *)vrregs;
+
+   /* copy the 32 vr registers */
+	memcpy(reg, &tsk->thread.vr[0], nregs * sizeof(*reg));
+   reg += nregs;
+
+   /* copy the vscr */
+	memcpy(reg, &tsk->thread.vscr, sizeof(*reg));
+   reg++;
+
+   /* vrsave is stored in the high 32bit slot of the final 128bits */
+   memset(reg, 0, sizeof(*reg));
+   dest = (u32 *)reg;
+	*dest = tsk->thread.vrsave;
+
return 1;
 }
 #endif /* CONFIG_ALTIVEC */
Index: linux/include/asm-powerpc/elf.h
===
--- linux.orig/include/asm-powerpc/elf.h
+++ linux/include/asm-powerpc/elf.h
@@ -212,6 +212,13 @@ static inline int dump_task_regs(struct 
 extern int dump_task_fpu(struct task_struct *, elf_fpregset_t *); 
 #define ELF_CORE_COPY_FPREGS(tsk, elf_fpregs) dump_task_fpu(tsk, elf_fpregs)
 
+typedef elf_vrregset_t elf_fpxregset_t;
+
+#ifdef CONFIG_ALTIVEC
+extern int dump_task_altivec(struct task_struct *, elf_vrregset_t *vrregs);
+#define ELF_CORE_COPY_XFPREGS(tsk, regs) dump_task_altivec(tsk, regs)
+#endif
+
 #endif /* __KERNEL__ */
 
 /* ELF_HWCAP yields a mask that user programs can use to figure out what
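
For reference, the note payload dump_task_altivec() lays out can be
modelled from userspace as below. This is a sketch assuming the
big-endian layout described in the patch, where VRSAVE occupies the
most significant 32-bit word (offset 0) of the final 128-bit slot; the
544 (0x220) byte total matches the NT_PRXFPREG note size in the readelf
output earlier in the thread:

#include <stdint.h>
#include <stdio.h>

struct vmx_note_payload {
	uint8_t  vr[32][16];	/* VR0..VR31 */
	uint8_t  vscr[16];	/* VSCR in its own 128-bit slot */
	uint32_t vrsave;	/* high word of the last 128-bit slot */
	uint8_t  pad[12];	/* remainder of the last slot, zeroed */
};

int main(void)
{
	/* 32*16 + 16 + 16 = 544 = 0x220 bytes */
	printf("payload size: %#zx bytes\n",
	       sizeof(struct vmx_note_payload));
	return 0;
}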