[PATCH] sound: use bitmap_weight

2012-11-16 Thread Joe Perches
Use bitmap_weight to count the total number of bits set in bitmap.

Signed-off-by: Joe Perches 
---
 sound/usb/endpoint.c |7 +--
 1 files changed, 1 insertions(+), 6 deletions(-)

diff --git a/sound/usb/endpoint.c b/sound/usb/endpoint.c
index 34de6f2..51a9aa3 100644
--- a/sound/usb/endpoint.c
+++ b/sound/usb/endpoint.c
@@ -485,15 +485,10 @@ __exit_unlock:
 static int wait_clear_urbs(struct snd_usb_endpoint *ep)
 {
unsigned long end_time = jiffies + msecs_to_jiffies(1000);
-   unsigned int i;
int alive;
 
do {
-   alive = 0;
-   for (i = 0; i < ep->nurbs; i++)
-   if (test_bit(i, &ep->active_mask))
-   alive++;
-
+   alive = bitmap_weight(&ep->active_mask, ep->nurbs);
if (!alive)
break;
 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pwm_backlight/general pwm issue.

2012-11-16 Thread Thierry Reding
On Sat, Nov 17, 2012 at 05:07:39PM +1300, Tony Prisk wrote:
> Hi Thierry,
> 
> Looking for a little feedback regarding a problem introduced with the pwm
> patch I sent converting the vt8500 pwm driver to devicetree.
> 
> One of the recommendations you made was to enable/disable the pwm clock
> in pwm_enable/pwm_disable, rather than at driver probe, to reduce power
> usage. Unfortunately, when the last pwm is disabled, the clock is
> disabled which prevents the pwm module from responding to register
> read/writes. This would be fine if pwm_enable was called before any
> other functions.
> 
> The pwm_backlight driver calls pwm_config before pwm_enable, which
> doesn't work because the pwm module has been disabled. I can appreciate
> that no one wants to enable a pwm before it's configured so I don't
> think this is particularly a driver issue.
> 
> 
> My recommendation is to re-enable the previous behaviour which was to
> enable the clock during driver probe, and disable during driver unload.
> 
> Looking for your thoughts (or anyone else that wants to chime in).

What other drivers do is explicitly make sure that the clock is enabled
before accessing registers if the hardware requires so. Does the driver
work if you change it to do so?

In the end I'll leave it up to you how you want to handle this. If the
power savings aren't an issue on vt8500 (and I suppose keeping the
peripheral clock running all the time doesn't save you *that* much) I'm
willing to take a patch that fixes things for you.

Thierry


pgpQCpiKRMfRV.pgp
Description: PGP signature


Re: [patch 7/7] fs, notify: Add procfs fdinfo helper v6

2012-11-16 Thread Cyrill Gorcunov
On Fri, Nov 16, 2012 at 03:56:03PM -0800, Andrew Morton wrote:
> 
> This is a lousy output format.  It's sort-of like a sensible set of
> name-value tuples: "name:value name:value name:value" but
> 
> a) it has lots of random pointless whitespace after the colons and
> 
> b) several of the labels have spaces in them, just to make life
>harder for parsing code and
> 
> c) inotify-wd is secretly printed in decimal while everything else
>is in hex.
> 
> What happens if we do something like the below (which will require a
> changelog update)?

Looks good for me, Andrew. The only reason for such whitespace rich format
was to make output column aligned. But it's fine to have name:val as well.
I'll update. Thanks!

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] staging/serqt_usb2: Refactor qt_status_change_check() in serqt_usb2.c

2012-11-16 Thread YAMANE Toshiaki
Modify qt_status_change_check() and delete qt_status_change().

Signed-off-by: YAMANE Toshiaki 
---
 drivers/staging/serqt_usb2/serqt_usb2.c |   53 +--
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/drivers/staging/serqt_usb2/serqt_usb2.c 
b/drivers/staging/serqt_usb2/serqt_usb2.c
index f68a855..13722b2 100644
--- a/drivers/staging/serqt_usb2/serqt_usb2.c
+++ b/drivers/staging/serqt_usb2/serqt_usb2.c
@@ -291,33 +291,6 @@ static void qt_interrupt_callback(struct urb *urb)
/* FIXME */
 }
 
-static int qt_status_change(unsigned int limit,
-   unsigned char *data,
-   int i,
-   struct quatech_port *qt_port,
-   struct usb_serial_port *port)
-{
-   void (*fn)(struct quatech_port *, unsigned char);
-
-   if (0x00 == data[i + 2]) {
-   dev_dbg(&port->dev, "Line status status.\n");
-   fn = ProcessLineStatus;
-   } else {
-   dev_dbg(&port->dev, "Modem status status.\n");
-   fn = ProcessModemStatus;
-   }
-
-   if (i > limit) {
-   dev_dbg(&port->dev,
-   "Illegal escape seuences in received data\n");
-   return 0;
-   }
-
-   (*fn)(qt_port, data[i + 3]);
-
-   return 1;
-}
-
 static void qt_status_change_check(struct tty_struct *tty,
   struct urb *urb,
   struct quatech_port *qt_port,
@@ -334,11 +307,29 @@ static void qt_status_change_check(struct tty_struct *tty,
flag = 0;
switch (data[i + 2]) {
case 0x00:
+   if (i > (RxCount - 4)) {
+   dev_dbg(&port->dev,
+   "Illegal escape sequences in 
received data\n");
+   break;
+   }
+
+   i += 3;
+   ProcessLineStatus(qt_port, data[i]);
+
+   flag = 1;
+   break;
+
case 0x01:
-   flag = qt_status_change((RxCount - 4), data, i,
-   qt_port, port);
-   if (flag == 1)
-   i += 3;
+   if (i > (RxCount - 4)) {
+   dev_dbg(&port->dev,
+   "Illegal escape sequences in 
received data\n");
+   break;
+   }
+
+   i += 3;
+   ProcessModemStatus(qt_port, data[i]);
+
+   flag = 1;
break;
 
case 0xff:
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 3/3 UPDATED] i2c / ACPI: add ACPI enumeration support

2012-11-16 Thread Bjorn Helgaas
On Fri, Nov 16, 2012 at 10:28 AM, Mika Westerberg
 wrote:
> ...
> From: Mika Westerberg 
> Date: Mon, 10 Sep 2012 12:12:32 +0300
> Subject: [PATCH] i2c / ACPI: add ACPI enumeration support
>
> ACPI 5 introduced I2cSerialBus resource that makes it possible to enumerate
> and configure the I2C slave devices behind the I2C controller. This patch
> adds helper functions to support I2C slave enumeration.
>
> An ACPI enabled I2C controller driver only needs to call 
> acpi_i2c_register_devices()
> in order to get its slave devices enumerated, created and bound to the
> corresponding ACPI handle.

I must admit I don't understand the strategy here.  Likely it's only
because I haven't been paying enough attention, but I'll ask anyway in
case anybody else is similarly confused.

The callchain when we enumerate these slave devices looks like this:

acpi_i2c_register_devices(struct i2c_adapter *)
  acpi_walk_namespace(adapter->dev.acpi_handle, acpi_i2c_add_device)
acpi_i2c_add_device
  acpi_bus_get_device
  acpi_bus_get_status
  acpi_dev_get_resources(..., acpi_i2c_add_resource, ...)
  
  acpi_dev_free_resources
  i2c_new_device
client = kzalloc
client->dev = ...
device_register(&client->dev)

Is the ACPI namespace in question something like the following?

Device {# i2C master, i.e., the i2c_adapter
  _HID PNP
  Device {  # I2C slave 1, i.e.,  a client
_HID PNPsss1
_CRS
  SerialBus/I2C addr addr1, mode mode1
  IRQ irq1
  }
  Device {  # I2C slave 2
_HID PNPsss2
_CRS
  SerialBus/I2C addr addr2, mode mode2
  IRQ irq2
  }
}

_CRS is a device configuration method, so I would expect that it
exists within the scope of a Device() object.  The way I'm used to
this working is for a driver to specify "I know about PNPsss1
devices."

But it looks like acpi_i2c_register() walks the namespace below an i2c
master device, registering a new i2c device (a slave) for every ACPI
device node with a _CRS method that contains a SERIAL_BUS/TYPE_I2C
descriptor.  It seems like you're basically claiming those devices
nodes based on the contents of their _CRS, not based on their PNP IDs,
which seems strange to me.

We have to be able to hook device enumeration into udev so we can
autoload drivers.  It's obvious how to do that with _HID and _CID --
we just emit a uevent saying "we found a new device with PNP IDs
x,y,z".  I don't see how to do anything similar based on the _CRS
contents.  Again, probably I'm completely missing the point here, and
I'm sorry to be dense.

I guess this isn't really "enumeration" -- the ACPI core has
previously walked this namespace and built acpi_devices for the
Device() nodes, and I think it emits uevents at that time.  So this is
more of a "claim" than an "enumerate."  But the Device() node for the
I2C slave still exists, and it has _HID/_CID, doesn't it?  Do we use
that _HID anywhere?

In any event, after acpi_i2c_register(), I think we have a set of
i2c_client devices (with the above namespace, I assume we'd have two
of them).  I guess acpi_i2c_find_device() is useful now -- it looks
like it takes a "struct device *" (e.g., &client->dev from a struct
i2c_client), and and gives you back the acpi_handle corresponding to
it?

Here's the callchain of that path:

acpi_i2c_find_device(struct device *)   # acpi_i2c_bus.find_device
  i2c_verify_client(dev)
  acpi_walk_namespace
acpi_i2c_find_child
  acpi_bus_get_device
  acpi_bus_get_status
  acpi_dev_get_resources(..., acpi_i2c_find_child_address, ...)
acpi_i2c_find_child_address
found if (SERIAL_BUS && SERIAL_TYPE_I2C && slave_address == xx)
  acpi_dev_free_resource_list
  *handle = handle

That seems like an awful lot of work to do just to map a struct device
* back to the acpi_handle.  But I don't have any suggestion; just that
observation.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v8 3/3] Take maintainership of power sequences

2012-11-16 Thread Alexandre Courbot
On Sat, Nov 17, 2012 at 2:09 AM, Stephen Warren  wrote:
> Acked-by: Stephen Warren 

Thanks!

>> +POWER SEQUENCES
>> +M:   Alexandre Courbot 
>> +S:   Maintained
>
> Given you're presumably working on this on NVIDIA's time, perhaps make
> that "Supported" not "Maintained"?

Absolutely.

Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Improving the security of Linux processes

2012-11-16 Thread Scott Wisniewski
I'm working on an idea I had for improving the security of processes
in Linux. What I'm trying to do is a little complex, and I'm new to
Kernel development, so I figured it might be a good idea to reach out
to the Kernel community before I got too deep into the development.
Basically, I was hoping to have a high level architecture discussion
and get a feel for whether or not what I'm thinking about is the kind
of thing you guys would be receptive to adding to the kernel
(eventually).

I'm working on something that I like to call "Address Space Layout
Randomization Extreme", or ASLRX. ASLRX aims to extend ASLR by adding
support for dynamically randomizing the CONTENTS of an image, and not
just its base address.

I've included a copy of the feature's "README" file below. It's been
written as if all of the required components have already been
implemented (they haven't). The general idea is just to provide a
quick description of what I'd like to like to implement. I'd
appreciate any comments or suggestions you might have. In particular,
I'm wondering:

1. Does this sound interesting to anyone?
2. Are there any major "philosophical" barriers to including this sort
of thing in the Kernel?
3. Do you have any advice?

Thanks,

-Scott



Intro
==

ASLRX, or Address Space Layout Randomization Extreme, is an enhanced form of
address space layout randomization designed to improve the security of programs
running under Linux. Traditional ASLR (address space layout randomization) works
by selecting a random address for the stack space used by a program and
selecting random base addresses for each image loaded into a process (including
the "main" image). When combined with non executable data pages, it can
frustrate many attacks.

Unfortunately, because only base addresses are randomized, processes using ASLR
are still vulnerable to many attacks, including return-to-libc attacks. In
particular, if an attacker is able to guess a base address, he can deduce the
address of every function in an image.

There are many forms of information leakage that can be exploited to allow
attackers to infer a base address. It can be done with a web server, for
example, by sending iterative attack payloads that attempt to jump to a
particular (even innocuous) address inside libc. By examining the response
behavior of the server (error codes, timings, etc), an adversary can learn the
address of a function, and hence the base address of the target image. Once the
base address is known, an attacker can prepare a "return-oriented program" to
invoke arbitrary code in the target process. That defeats the benefits of ASLR.

The purpose of ASLRX is to eliminate (or extremely frustrate) such attack
vectors. It works by randomizing the _contents_ of an image, not just its base
address. Thus, even if an attacker is able to deduce the address of a single
function, it (alone) will not tell him the address of other functions in the
image.

An attacker must either deduce each target function individually, or
deduce a large number of functions to enable a high probability of inference. In
either case, the difficulty of success, the length of time required to
perform an attack, and the
probability of early detection are all increased. Future versions of
ASLRX may also
add the ability to dynamically re-randomize an image as it is running, thus
enabling such attacks to be thwarted without taking target systems offline.
Hardened stack smashing prevention is also possible.

How Does It Work?
==

ASLRX works using a technique known as Software Dynamic Translation, or SDT. It
"rewrites" executables as they are running, in a manner similar to a JIT
compiler. However, instead of translating from an IR to native code (as in a JIT
compiler), or between CPU architectures (as in typical SDT systems), its source
and target languages are the same (X64 machine code).

When an image is loaded under ASLRX, its executable code is not loaded into the
process. Instead, functions are inserted into the process, on demand, as they
are first executed, into random locations within random pages. Function calls
(and other symbolic references to addresses in other functions) are initially
translated into system calls. When the system calls are executed, the kernel
ensures the target address is loaded into the process (at a random location),
and then modifies the call site to reference the address of the translated
function.

The kernel based SDT scheme offers several advantages:

1. Both the meta-data used to perform translation, and the translation code, is
not accessible to user space. This increases the difficulty of attacks on the
translation infrastructure itself.

2. Given suitable meta-data, arbitrarily complex rewrites are possible. For
example, code can be modified to maintain a shadow stack of return
addresses. This enables dynamic re-randomization of images 

Re: ACPI errors with 3.7-rc3

2012-11-16 Thread Robert Hancock

On 11/09/2012 10:36 AM, Feng Tang wrote:

On Fri, Nov 09, 2012 at 10:30:43PM +0800, Moore, Robert wrote:

The ACPI Global Lock is in fact intended to provide exclusion between the BIOS 
and the OS.
Bob


Thanks for the info.

And per my check, most of ACPI FW don't implement this lock, say
after driver probe, the ec->global_lock will be 0.


The DSDT is supposed to define the _GLK control method on the EC if the 
BIOS needs to perform its own access which may conflict with the OS 
usage. If it doesn't, then it should be the case that either the BIOS 
doesn't touch the EC itself or it uses a separate interface that doesn't 
cause conflicts with what the OS is doing.




- Feng





-Original Message-
From: Tang, Feng
Sent: Friday, November 09, 2012 1:29 AM
To: Rafael J. Wysocki
Cc: Greg KH; Azat Khuzhin; linux-a...@vger.kernel.org; Linux Kernel
Mailing List; Zheng, Lv; Len Brown; Moore, Robert
Subject: Re: ACPI errors with 3.7-rc3

On Thu, Nov 08, 2012 at 05:49:40AM +0800, Rafael J. Wysocki wrote:

On Tuesday, November 06, 2012 01:48:26 PM Greg KH wrote:

On Tue, Nov 06, 2012 at 04:42:24PM +0400, Azat Khuzhin wrote:

I'v also have such errors on my macbook pro.

$ dmesg | tail
[17056.008564] ACPI Error: Method parse/execution failed
[\_SB_.PCI0.LPCB.EC__.SMB0.SBRW] (Node 88026547ea10), AE_TIME
(20120711/psparse-536)
[17056.011194] ACPI Error: Method parse/execution failed
[\_SB_.BAT0.UBST] (Node 88026547e678), AE_TIME
(20120711/psparse-536)
[17056.013793] ACPI Error: Method parse/execution failed
[\_SB_.BAT0._BST] (Node 88026547e740), AE_TIME
(20120711/psparse-536)
[17056.016383] ACPI Exception: AE_TIME, Evaluating _BST
(20120711/battery-464) [17056.511373] ACPI: EC: input buffer is
not empty, aborting transaction [17056.512672] ACPI Exception:
AE_TIME, Returned by Handler for [EmbeddedControl]
(20120711/evregion-501) [17056.515256] ACPI Error: Method
parse/execution failed [\_SB_.PCI0.LPCB.EC__.SMB0.SBRW] (Node
88026547ea10), AE_TIME
(20120711/psparse-536)
[17056.517886] ACPI Error: Method parse/execution failed
[\_SB_.BAT0.UBST] (Node 88026547e678), AE_TIME
(20120711/psparse-536)
[17056.520479] ACPI Error: Method parse/execution failed
[\_SB_.BAT0._BST] (Node 88026547e740), AE_TIME
(20120711/psparse-536)
[17056.523070] ACPI Exception: AE_TIME, Evaluating _BST
(20120711/battery-464)


I'm seeing this again right now.  I'm wondering if it's because I'm
running on battery power at the moment:

[41694.309264] ACPI Exception: AE_TIME, Returned by Handler for
[EmbeddedControl] (20120913/evregion-501) [41694.309282] ACPI Error:
Method parse/execution failed [\_SB_.PCI0.LPCB.EC__.SMB0.SBRW] (Node
88045cc64618), AE_TIME (20120913/psparse-536) [41694.309300]
ACPI Error: Method parse/execution failed [\_SB_.BAT0.UBST] (Node
88045cc64988), AE_TIME (20120913/psparse-536) [41694.309310]
ACPI Error: Method parse/execution failed [\_SB_.BAT0._BST] (Node
88045cc648c0), AE_TIME (20120913/psparse-536) [41694.309324]
ACPI Exception: AE_TIME, Evaluating _BST (20120913/battery-464)
[41694.809093] ACPI: EC: input buffer is not empty, aborting
transaction

ec_storm_threshold is still set to 8 in /sys/module/acpi/parameters/
so that's not the issue here.


And also loadavg is too high ~ 10
While there is no process that load CPU up to 100% or like that.
I think that this because of processes that is done in kernel space.
(basically that one who write such errors)

$ uname -a
Linux macbook-pro-sq 3.6.5macbook-pro-custom-v0.1 #4 SMP Sun Nov 4
12:39:03 UTC 2012 x86_64 GNU/Linux


Ah, ok, that means it's not something new in 3.7-rc, so maybe it's
just never worked properly for this hardware :)

So it's not a regression, just an ACPI issue, any ACPI developer
have an idea about this?


Can you please send the output of acpidump from the affected machine(s)?


I doubt this problem is sometimes inevitable for some machines, because
AFAIK most modern machines have the race problem for EC HW controller, as
both OS side and the BIOS may access the EC HW at the same time
without any race control.

For this case, usually the battery and thermal modules (which may be
controlled through EC) are always monitored by BIOS, when OS also
frequently visit them too, the EC's own state machine may be broken and
not responsive due to the race, then cause the timeout error.
And how severe the problem will be depends on the EC HW, the quality of
BIOS code and OS/driver code.

Myself have seen the similar "ACPI: EC: input buffer is not empty,
aborting transaction" error message on one laptop when its EC is busy
visited by OS.

btw, in EC driver I see a "ec->global_lock", don't know if it was designed
to control the race between OS and BIOS.

Thanks,
Feng

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
To unsubscribe from this list: send the line "unsubscribe 

[PATCH 5/5] balancenuma: no task swap in finding placement

2012-11-16 Thread Hillf Danton
Node is selected on behalf of given task, but no reason to punish
the currently running tasks on other nodes. That punishment may be a benefit,
who knows. Better if they are treated not in a random way.

Signed-off-by: Hillf Danton 
---

--- a/kernel/sched/fair.c   Sat Nov 17 12:29:08 2012
+++ b/kernel/sched/fair.c   Sat Nov 17 12:34:52 2012
@@ -872,7 +872,7 @@ static inline unsigned long balancenuma_

 /*
  * Examines all other nodes examining remote tasks to see if there would
- * be fewer remote numa faults if tasks swapped home nodes
+ * be fewer remote numa faults
  */
 static void task_numa_find_placement(struct task_struct *p)
 {
@@ -933,13 +933,6 @@ static void task_numa_find_placement(str
continue;
}

-   /* Ensure the other task can be swapped */
-   if (!cpumask_test_cpu(this_cpu,
- tsk_cpus_allowed(other_task))) {
-   raw_spin_unlock_irq(&rq->lock);
-   continue;
-   }
-
/*
 * Read the fault statistics. If the remote task is a
 * thread in the process then use the task statistics.
@@ -973,8 +966,7 @@ compare_other:
this_diff = this_weight - p_weight;

/*
-* Would swapping the tasks reduce the overall
-* cross-node NUMA faults?
+* Would nid reduce the overall cross-node NUMA faults?
 */
if (other_diff > 0 && this_diff > 0) {
long weight_diff = other_diff + this_diff;
@@ -995,11 +987,8 @@ compare_other:
}
}

-   /* Swap the task on the selected target node */
if (selected_nid != -1) {
sched_setnode(p, selected_nid);
-   if (selected_task)
-   sched_setnode(selected_task, this_nid);
}
 }

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] balancenuma: check mem node in finding placement

2012-11-16 Thread Hillf Danton
No point to migrate task from node A to node A.

Signed-off-by: Hillf Danton 
---

--- a/kernel/sched/fair.c   Sat Nov 17 12:25:44 2012
+++ b/kernel/sched/fair.c   Sat Nov 17 12:25:54 2012
@@ -891,6 +891,8 @@ static void task_numa_find_placement(str
/* Examine a task on every other node */
for_each_online_node(nid) {
int cpu;
+   if (nid == this_nid)
+   continue;
for_each_cpu_and(cpu, cpumask_of_node(nid), allowed) {
struct rq *rq;
struct mm_struct *other_mm;
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] balancenuma: cleanup in resetting numa scan

2012-11-16 Thread Hillf Danton
Checking ->mm is not needed, but too late if necessary.

Signed-off-by: Hillf Danton 
---

--- a/kernel/sched/fair.c   Sat Nov 17 12:12:08 2012
+++ b/kernel/sched/fair.c   Sat Nov 17 12:14:26 2012
@@ -1125,7 +1125,7 @@ static void reset_ptenuma_scan(struct ta
ACCESS_ONCE(p->mm->numa_scan_seq)++;
p->mm->numa_scan_offset = 0;

-   if (p->mm && p->mm->mm_balancenuma)
+   if (p->mm->mm_balancenuma)
p->mm->mm_balancenuma->mm_numa_fault_tot >>= 1;
if (p->task_balancenuma) {
int nid;
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] mmc: Bad device can cause mmc driver to hang

2012-11-16 Thread Trey Ramsay
On 11/16/2012 06:37 PM, Chris Ball wrote:
> Hi Trey, thanks for the analysis,
> 
> On Fri, Nov 16 2012, Trey Ramsay wrote:
>> Good question.  In regards to the original problem where it was hung in
>> mmc_blk_err_check, the new code path will timeout after 10 minutes, log
>> an error, issue a hardware reset and abort the request. Is the hardware
>> reset enough or will that even work when the device isn't coming out of
>> program state? Should we try to refuse all new I/O?
> 
> mmc_hw_reset() only works for eMMC devices with a hooked up reset GPIO
> -- not SD cards -- and at the moment there's only one system (Intel
> Medfield) that supplies a GPIO, so that's not a general solution.
> 
> Maybe we should just merge your patch for now; we'll definitely get at
> least a pr_err() explaining what's going on, which is an improvement.
> Next time someone hits this (if anyone has an SD card that exhibits
> this problem, it'd be very valuable for testing) we can look at going
> farther, such as immediately setting host->flags |= SDHCI_DEVICE_DEAD.
> What do you think?
> 
> - Chris.
> 

Hi Chris,
Sounds good.  Thanks for the explanation. Setting host->flags |=
SDHCI_DEVICE_DEAD is a great idea.  I'll check with my team to see if we
have any hardware that exhibits this problem.  If we do, I can do some
testing on the code you suggested.

Thanks,
Trey

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] balancenuma: fix numa fault statistics

2012-11-16 Thread Hillf Danton
Simply use @pages as we handle both pte and pmd numa faults.

Signed-off-by: Hillf Danton 
---

--- a/kernel/sched/fair.c   Sat Nov 17 11:40:36 2012
+++ b/kernel/sched/fair.c   Sat Nov 17 12:05:06 2012
@@ -1112,10 +1112,10 @@ void task_numa_fault(int node, int pages
}

/* Record fault statistics */
-   p->task_balancenuma->task_numa_fault_tot++;
-   p->task_balancenuma->task_numa_fault[node]++;
-   p->mm->mm_balancenuma->mm_numa_fault_tot++;
-   p->mm->mm_balancenuma->mm_numa_fault[node]++;
+   p->task_balancenuma->task_numa_fault_tot+= pages;
+   p->task_balancenuma->task_numa_fault[node]  += pages;
+   p->mm->mm_balancenuma->mm_numa_fault_tot+= pages;
+   p->mm->mm_balancenuma->mm_numa_fault[node]  += pages;

task_numa_placement(p);
 }
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


When is it safe to print console messages?

2012-11-16 Thread Woody Wu
Hi,

I want to use print some trace messages to the console in the kernel
setup/init stages. But I don't know what is the earliest time when safe
to do that.  Should be somewhere after decompress_kernel() invocation?

And, when should I use putstr() and when to use printk()? Thanks.

-- 
-woody
I can't go back to yesterday - because I was a different person then.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[git patches] libata fixes

2012-11-16 Thread Jeff Garzik

If you were going to shoot me for not sending these earlier, you would be
right.  -rc6 beat me by ~2 hours it seems, and they really should have
gone out to libata-dev.git and you long before that.

These have been in libata-dev.git for a day or so (unfortunately
linux-next is on vacation).  The main one is #1, with the others being
minor bits.  #1 has multiple tested-by, and can be considered a
regression fix IMO.

1) Fix ACPI oops, https://bugzilla.kernel.org/show_bug.cgi?id=48211

2) Temporary WARN_ONCE() debugging patch for further ACPI debugging.
   The code already oopses here, and so this merely gives slightly
   better info.  Related to https://bugzilla.kernel.org/show_bug.cgi?id=49151
   which has been bisected down to a patch that _exposes_ a latent bug,
   but said bisection target does not actually appear to be the root cause
   itself.

3) sata_svw: fix longstanding error recovery bug, which was
   preventing kdump, by adding missing DMA-start bit check.  Core
   code was already checking DMA-start, but ancillary, less-used
   routines were not.  Fixed.

4) sata_highbank: fix minor __init/__devinit warning

5) Fix minor warning, if CONFIG_PM is set, but CONFIG_PM_SLEEP is not set

6) pata_arasan: proper functioning requires clock setting

Please pull 29448ec129c5c9c7ece2ef28c72a0dafd70c8af2 from
git://git.kernel.org/pub/scm/linux/kernel/git/jgarzik/libata-dev.git 
tags/upstream-linus


to receive the following updates:

 drivers/ata/ahci_platform.c  |  2 +-
 drivers/ata/libata-acpi.c| 11 ---
 drivers/ata/libata-core.c|  4 
 drivers/ata/pata_arasan_cf.c |  8 +++-
 drivers/ata/sata_highbank.c  |  4 ++--
 drivers/ata/sata_svw.c   | 35 +++
 6 files changed, 57 insertions(+), 7 deletions(-)

Aaron Lu (1):
  libata-acpi: Fix NULL ptr derference in ata_acpi_dev_handle

Arnd Bergmann (1):
  sata_highbank: mark ahci_highbank_probe as __devinit

Borislav Petkov (1):
  libata debugging: Warn when unable to find timing descriptor based on 
xfer_mode

David Milburn (1):
  sata_svw: check DMA start bit before reset

Vipul Kumar Samar (1):
  pata_arasan: Initialize cf clock to 166MHz

Yuanhan Liu (1):
  [libata] PM callbacks should be conditionally compiled on CONFIG_PM_SLEEP

diff --git a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
index b1ae480..b7078af 100644
--- a/drivers/ata/ahci_platform.c
+++ b/drivers/ata/ahci_platform.c
@@ -238,7 +238,7 @@ static int __devexit ahci_remove(struct platform_device 
*pdev)
return 0;
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int ahci_suspend(struct device *dev)
 {
struct ahci_platform_data *pdata = dev_get_platdata(dev);
diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c
index fd9ecf7..5b0ba3f 100644
--- a/drivers/ata/libata-acpi.c
+++ b/drivers/ata/libata-acpi.c
@@ -1105,10 +1105,15 @@ static int ata_acpi_bind_device(struct ata_port *ap, 
struct scsi_device *sdev,
struct acpi_device *acpi_dev;
struct acpi_device_power_state *states;
 
-   if (ap->flags & ATA_FLAG_ACPI_SATA)
-   ata_dev = &ap->link.device[sdev->channel];
-   else
+   if (ap->flags & ATA_FLAG_ACPI_SATA) {
+   if (!sata_pmp_attached(ap))
+   ata_dev = &ap->link.device[sdev->id];
+   else
+   ata_dev = &ap->pmp_link[sdev->channel].device[sdev->id];
+   }
+   else {
ata_dev = &ap->link.device[sdev->id];
+   }
 
*handle = ata_dev_acpi_handle(ata_dev);
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 3cc7096..f46fbd3 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2942,6 +2942,10 @@ const struct ata_timing *ata_timing_find_mode(u8 
xfer_mode)
 
if (xfer_mode == t->mode)
return t;
+
+   WARN_ONCE(true, "%s: unable to find timing for xfer_mode 0x%x\n",
+   __func__, xfer_mode);
+
return NULL;
 }
 
diff --git a/drivers/ata/pata_arasan_cf.c b/drivers/ata/pata_arasan_cf.c
index 26201eb..371fd2c 100644
--- a/drivers/ata/pata_arasan_cf.c
+++ b/drivers/ata/pata_arasan_cf.c
@@ -317,6 +317,12 @@ static int cf_init(struct arasan_cf_dev *acdev)
return ret;
}
 
+   ret = clk_set_rate(acdev->clk, 166000000);
+   if (ret) {
+   dev_warn(acdev->host->dev, "clock set rate failed");
+   return ret;
+   }
+
spin_lock_irqsave(&acdev->host->lock, flags);
/* configure CF interface clock */
writel((pdata->cf_if_clk <= CF_IF_CLK_200M) ? pdata->cf_if_clk :
@@ -908,7 +914,7 @@ static int __devexit arasan_cf_remove(struct 
platform_device *pdev)
return 0;
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
 static int arasan_cf_suspend(struct device *dev)
 {
struct ata_host *host = dev_get_drvdata(dev);
diff --git a/drivers/ata/sata_highbank.c b/drivers/ata/sata_highbank.c
index 

[PATCH 1/5] balancenuma: fix typo in handling pmd numa fault

2012-11-16 Thread Hillf Danton
s/ptep/pmdp/

Also a cleanup packed: use haddr directly.

Btw, will spin if you no longer need the debug trap.

Signed-off-by: Hillf Danton 
---

--- a/mm/huge_memory.c  Sat Nov 17 11:37:36 2012
+++ b/mm/huge_memory.c  Sat Nov 17 11:54:18 2012
@@ -1055,9 +1055,9 @@ clear_pmdnuma:
goto out_unlock;

pmd = pmd_mknonnuma(pmd);
-   set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmdp, pmd);
+   set_pmd_at(mm, haddr, pmdp, pmd);
VM_BUG_ON(pmd_numa(*pmdp));
-   update_mmu_cache_pmd(vma, addr, ptep);
+   update_mmu_cache_pmd(vma, addr, pmdp);

 out_unlock:
spin_unlock(&mm->page_table_lock);
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [sqlite] light weight write barriers

2012-11-16 Thread Vladislav Bolkhovitin


Chris Friesen, on 11/15/2012 05:35 PM wrote:

The easiest way to implement this fsync would involve three things:
1. Schedule writes for all dirty pages in the fs cache that belong to
the affected file, wait for the device to report success, issue a cache
flush to the device (or request ordering commands, if available) to make
it tell the truth, and wait for the device to report success. AFAIK this
already happens, but without taking advantage of any request ordering
commands.
2. The requesting thread returns as soon as the kernel has identified
all data that will be written back. This is new, but pretty similar to
what AIO already does.
3. No write is allowed to enqueue any requests at the device that
involve the same file, until all outstanding fsync complete [3]. This is
new.


This sounds interesting as a way to expose some useful semantics to userspace.

I assume we'd need to come up with a new syscall or something since it doesn't
match the behaviour of posix fsync().


This is how I would export cache sync and requests ordering abstractions to the 
user space:


For async IO (io_submit() and friends) I would extend struct iocb by flags, which 
would allow to set the required capabilities, i.e. if this request is FUA, or full 
cache sync, immediate [1] or not, ORDERED or not, or all at the same time, per 
each iocb.


For the regular read()/write() I would add to "flags" parameter of 
sync_file_range() one more flag: if this sync is immediate or not.


To enforce ordering rules I would add one more command to fcntl(). It would make 
the latest submitted write in this fd ORDERED.


All together those should provide the requested functionality in a simple, 
effective, unambiguous and backward compatible manner.


Vlad

1. See my other today's e-mail about what is immediate cache sync.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [sqlite] light weight write barriers

2012-11-16 Thread Vladislav Bolkhovitin

David Lang, on 11/15/2012 07:07 AM wrote:

There's no such thing as "barrier". It is fully artificial abstraction. After
all, at the bottom of your stack, you will have to translate it either to cache
flush, or commands order enforcement, or both.


When people talk about barriers, they are talking about order enforcement.


Not correct. When people are talking about barriers, they are meaning different 
things. For instance, Alan Cox few e-mails ago was meaning cache flush.


That's the problem with the barriers concept: barriers are ambiguous. There's no 
barrier which can fit all requirements.



the hardware capabilities are not directly accessable from userspace (and they
probably shouldn't be)


The discussion is not about to directly provide storage hardware capabilities to 
the user space. The discussion is to replace fully inadequate barriers 
abstractions to a set of other, adequate abstractions.


For instance:

1. Cache flush primitives:

1.1. FUA

1.2. Non-immediate cache flush, i.e. don't return until all data hit non-volatile 
media


1.3. Immediate cache flush, i.e. return ASAP after the cache sync started, 
possibly before all data hit non-volatile media.


2. ORDERED attribute for requests. It provides the following behavior rules:

A.  All requests without this attribute can be executed in parallel and be freely 
reordered.


B. No ORDERED command can be completed before any previous not-ORDERED or ORDERED 
command completed.


Those abstractions can naturally fit all storage capabilities. For instance:

 - On simple WT cache hardware not supporting ordering commands, (1) translates 
to NOP and (2) to queue draining.


 - On full features HW, both (1) and (2) translates to the appropriate storage 
capabilities.


On FTL storage (B) can be further optimized by doing data transfers for ORDERED 
commands in parallel, but commit them in the requested order.



barriers keep getting mentioned because they are a easy concept to understand.


Well, concept of flat Earth and Sun rotating around it is also easy to understand. 
So, why isn't it used?


Vlad
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [sqlite] light weight write barriers

2012-11-16 Thread Vladislav Bolkhovitin

杨苏立 Yang Su Li, on 11/15/2012 11:14 AM wrote:

1. fsync actually does two things at the same time: ordering writes (in a
barrier-like manner), and forcing cached writes to disk. This makes it very
difficult to implement fsync efficiently.


Exactly!


However, logically they are two distinctive functionalities


Exactly!

Those two points are exactly why concept of barriers must be forgotten for sake of 
productivity and be replaced by finer-grained abstractions, as well as why they
were removed from the Linux kernel


Vlad
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] tmpfs: fix shmem_getpage_gfp VM_BUG_ON

2012-11-16 Thread Hugh Dickins
Further offtopic..

On Fri, 16 Nov 2012, Jaegeuk Hanse wrote:
> Some questions about your shmem/tmpfs: misc and fallocate patchset.
> 
> - Since shmem_setattr can truncate tmpfs files, why need add another similar
> codes in function shmem_fallocate? What's the trick?

I don't know if I understand you.  In general, hole-punching is different
from truncation.  Supporting the hole-punch mode of the fallocate system
call is different from supporting truncation.  They're closely related,
and share code, but meet different specifications.

> - in tmpfs: support fallocate preallocation patch changelog:
>   "Christoph Hellwig: What for exactly?  Please explain why preallocating on
> tmpfs would make any sense.
>   Kay Sievers: To be able to safely use mmap(), regarding SIGBUS, on files on
> the /dev/shm filesystem.  The glibc fallback loop for -ENOSYS [or
> -EOPNOTSUPP] on fallocate is just ugly."
>   Could shmem/tmpfs fallocate prevent one process truncate the file which the
> second process mmap() and get SIGBUS when the second process access mmap but
> out of current size of file?

Again, I don't know if I understand you.  fallocate does not prevent
truncation or races or SIGBUS.  I believe that Kay meant that without
using fallocate to allocate the memory in advance, systemd found it hard
to protect itself from the possibility of getting a SIGBUS, if access to
a shmem mapping happened to run out of memory/space in the middle.

I never grasped why writing the file in advance was not good enough:
fallocate happened to be what they hoped to use, and it was hard to
deny it, given that tmpfs already supported hole-punching, and was
about to convert to the fallocate interface for that.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[usb_storage][mmc] - Genesys Logic / SilverStone 6-in-1 Card Reader/Writer - Can't mount storage

2012-11-16 Thread Shawn Starr
Hello kernel folks,

I have an old 2.0 USB card reader, Linux detects the device but is not able to 
mount the storage device.

I tried it in both a KVM Windows VM and in Linux to mount the storage but 
device hangs, if I take out card, it detects all 6 device storage capabilities, 
then it detects the mmc card if i re-insert the card but is unable to mount it.

I am using a MMC adapter that can attach a micro-SD card inside it. This USB 
card reader/writer device comes with a 3.5" bay slot w/ a USB connector to 
motherboard and can detached for portable reading.

Anyone seen this problem before? Does it need any quirks used?

Thanks,
Shawn.

--- SNIP ---

Kernel conversation:

[523520.622075] usb 1-1: new high-speed USB device number 8 using ehci_hcd
[523520.749731] usb 1-1: New USB device found, idVendor=05e3, idProduct=0760
[523520.749735] usb 1-1: New USB device strings: Mfr=0, Product=3, 
SerialNumber=4
[523520.749738] usb 1-1: Product: Flash Reader
[523520.749739] usb 1-1: SerialNumber: 37
[523520.751674] scsi31 : usb-storage 1-1:1.0
[523521.753494] scsi 31:0:0:0: Direct-Access Generic  STORAGE DEVICE   0113 
PQ: 0 ANSI: 0
[523521.884032] usb 1-1: reset high-speed USB device number 8 using ehci_hcd
[523522.020128] scsi 31:0:0:1: Direct-Access Generic  STORAGE DEVICE   0113 
PQ: 0 ANSI: 0
[523522.157055] usb 1-1: reset high-speed USB device number 8 using ehci_hcd
[523522.292672] scsi 31:0:0:2: Direct-Access Generic  STORAGE DEVICE   0113 
PQ: 0 ANSI: 0
[523522.426047] usb 1-1: reset high-speed USB device number 8 using ehci_hcd
[523522.561250] scsi 31:0:0:3: Direct-Access Generic  STORAGE DEVICE   0113 
PQ: 0 ANSI: 0
[523522.561828] sd 31:0:0:0: Attached scsi generic sg2 type 0
[523522.562020] sd 31:0:0:1: Attached scsi generic sg3 type 0
[523522.562158] sd 31:0:0:2: Attached scsi generic sg4 type 0
[523522.562294] sd 31:0:0:3: Attached scsi generic sg5 type 0
[523526.062174] usb 1-1: USB disconnect, device number 8
[523526.066148] sd 31:0:0:1: [sdc] READ CAPACITY failed
[523526.066151] sd 31:0:0:1: [sdc]  
[523526.066153] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
[523526.066154] sd 31:0:0:1: [sdc] Sense not available.
[523526.066162] sd 31:0:0:3: [sde] READ CAPACITY failed
[523526.066164] sd 31:0:0:3: [sde]  
[523526.066165] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
[523526.066166] sd 31:0:0:3: [sde] Sense not available.
[523526.066180] sd 31:0:0:3: [sde] Write Protect is off
[523526.066181] sd 31:0:0:2: [sdd] READ CAPACITY failed
[523526.066183] sd 31:0:0:2: [sdd]  
[523526.066184] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
[523526.066185] sd 31:0:0:2: [sdd] Sense not available.
[523526.066187] sd 31:0:0:3: [sde] Mode Sense: 00 00 00 00
[523526.066193] sd 31:0:0:1: [sdc] Write Protect is off
[523526.066195] sd 31:0:0:1: [sdc] Mode Sense: 00 00 00 00
[523526.066201] sd 31:0:0:2: [sdd] Write Protect is on
[523526.066203] sd 31:0:0:2: [sdd] Mode Sense: 90 67 cf b2
[523526.066205] sd 31:0:0:1: [sdc] Asking for cache data failed
[523526.066208] sd 31:0:0:1: [sdc] Assuming drive cache: write through
[523526.066213] sd 31:0:0:2: [sdd] Asking for cache data failed
[523526.066215] sd 31:0:0:2: [sdd] Assuming drive cache: write through
[523526.066261] sd 31:0:0:3: [sde] Asking for cache data failed
[523526.066264] sd 31:0:0:3: [sde] Assuming drive cache: write through
[523526.066411] sd 31:0:0:2: [sdd] READ CAPACITY failed
[523526.066414] sd 31:0:0:2: [sdd]  
[523526.066415] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
[523526.066416] sd 31:0:0:2: [sdd] Sense not available.
[523526.066450] sd 31:0:0:3: [sde] Attached SCSI removable disk
[523526.066452] sd 31:0:0:2: [sdd] Asking for cache data failed
[523526.066454] sd 31:0:0:2: [sdd] Assuming drive cache: write through
[523526.066456] sd 31:0:0:2: [sdd] Attached SCSI removable disk
[523526.066457] sd 31:0:0:1: [sdc] Attached SCSI removable disk
[523526.072084] sd 31:0:0:0: [sdb] READ CAPACITY failed
[523526.072088] sd 31:0:0:0: [sdb]  
[523526.072089] Result: hostbyte=DID_NO_CONNECT driverbyte=DRIVER_OK
[523526.072091] sd 31:0:0:0: [sdb] Sense not available.
[523526.072102] sd 31:0:0:0: [sdb] Write Protect is off
[523526.072104] sd 31:0:0:0: [sdb] Mode Sense: 00 00 00 00
[523526.072114] sd 31:0:0:0: [sdb] Asking for cache data failed
[523526.072116] sd 31:0:0:0: [sdb] Assuming drive cache: write through
[523526.072274] sd 31:0:0:0: [sdb] Attached SCSI removable disk
[523526.410077] hub 1-0:1.0: unable to enumerate USB device on port 1
[523527.324054] usb 1-1: new high-speed USB device number 10 using ehci_hcd
[523527.450935] usb 1-1: New USB device found, idVendor=05e3, idProduct=0760
[523527.450941] usb 1-1: New USB device strings: Mfr=0, Product=3, 
SerialNumber=4
[523527.450944] usb 1-1: Product: Flash Reader
[523527.450947] usb 1-1: SerialNumber: 37
[523527.453972] scsi32 : usb-storage 1-1:1.0
[523528.458085] scsi 32:0:0:0: Direct-Access Generic  STORAGE DEVICE   0113 
PQ: 0 ANSI: 0

Re: [RFC] MIPS: BCM63XX: add Device Tree clock definitions

2012-11-16 Thread Stephen Warren
On 11/14/2012 05:11 AM, Jonas Gorski wrote:
> On 13 November 2012 06:02, Stephen Warren  wrote:
>> On 11/11/2012 05:50 AM, Jonas Gorski wrote:
>>> Add definitions for the clocks found and used in all supported SoCs.
>>
>>> diff --git a/arch/mips/bcm63xx/dts/bcm6328.dtsi 
>>> b/arch/mips/bcm63xx/dts/bcm6328.dtsi
>>
>>> + clocks {
>>> + #address-cells = <1>;
>>> + #size-cells = <0>;
>>> +
>>> + periph: pll {
>>> + compatible = "brcm,bcm63xx-clock";
>>> + #clock-cells = <0>;
>>> + clock-frequency = <50000000>;
>>> + clock-output-names = "periph";
>>> + };
>>
>> Here too, it seems like some reg properties would be required.
> 
> This is more or less a dummy clock with no real backing for it, but
> some of the drivers expect this clock to be present (even just to get
> the frequency).

Should compatible="fixed-clock" then if this is just a dummy? Ideally
though, nothing "dummy" would be added to the DT; the kernel would
continue to provide the dummy values via code until the DT was able to
fully represent the actual HW.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] MIPS: BCM63XX: add Device Tree glue code for IRQ handling

2012-11-16 Thread Stephen Warren
On 11/14/2012 05:09 AM, Jonas Gorski wrote:
> On 13 November 2012 06:00, Stephen Warren  wrote:
>> On 11/11/2012 05:50 AM, Jonas Gorski wrote:
>>> Register IRQ domains through Device Tree for the internal and external
>>> interrupt controllers. Register the same IRQ ranges as previously to
>>> provide backward compatibility for non-DT drivers.
>>
>>> diff --git a/Documentation/devicetree/bindings/mips/bcm63xx/epic.txt 
>>> b/Documentation/devicetree/bindings/mips/bcm63xx/epic.txt
>>
>> Rather than putting binding docs in an arch-specific directory, perhaps
>> put them into a device-type-specific directory, such as
>> bindings/interrupt-controller/brcm,bcm63xx-epic.txt?
> 
> Almost everyone has their interrupt-controller bindings in
> $arch/$platform, but if interrupt-controller is the preferred
> location, I can certainly move it there; I have no hard preference for
> any location.

Yes, people have been putting them in arch/platform, but I think there's
a move to more type-based locations.

>>> diff --git a/arch/mips/bcm63xx/dts/bcm6328.dtsi 
>>> b/arch/mips/bcm63xx/dts/bcm6328.dtsi
>>
>>>   ranges = <0 0x1000 0x2>;
>>>   compatible = "simple-bus";
>>> +
>>> + interrupt-parent = <&epic>;
>>> +
>>> + perf@0 {
>>> + epic: interrupt-controller@18 {
>>
>> Don't you need some reg properties in the perf and interrupt-controller
>> nodes so that the register address can be determined?
> 
> Since there is no support code for that property yet I did not add it.
> I haven't quite finished yet how the final bindings will be (since
> there are/were a few things I haven't finished researching yet, e.g.
> how this controller works in SMP context, and how interrupt
> controllers are supposed to work).
> 
> I can add all expected properties now and add support for them later,
> but I feel that this might add properties that will then never
> supported, and nobody updates the documentation for that, so I'd
> rather like to keep the documentation/dts(i) in sync with what the
> actual code expects/supports.

The DT bindings and DT content are supposed to be fully defined the
first time around, such that even if the kernel doesn't use the reg
property yet, if you were to use the DT created now with a future kernel
that does use the reg property, it's already there.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


pwm_backlight/general pwm issue.

2012-11-16 Thread Tony Prisk
Hi Thierry,

Looking for a little feedback regarding a problem introduced with the pwm
patch I sent converting the vt8500 pwm driver to devicetree.

One of the recommendations you made was to enable/disable the pwm clock
in pwm_enable/pwm_disable, rather than at driver probe, to reduce power
usage. Unfortunately, when the last pwm is disabled, the clock is
disabled which prevents the pwm module from responding to register
read/writes. This would be fine if pwm_enable was called before any
other functions.

The pwm_backlight driver calls pwm_config before pwm_enable, which
doesn't work because the pwm module has been disabled. I can appreciate
that no one wants to enable a pwm before it's configured so I don't
think this is particularly a driver issue.


My recommendation is to re-enable the previous behaviour which was to
enable the clock during driver probe, and disable during driver unload.

Looking for your thoughts (or anyone else that wants to chime in).

Regards
Tony Prisk

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Linux 3.2.34

2012-11-16 Thread Ben Hutchings
I'm announcing the release of the 3.2.34 kernel.

All users of the 3.2 kernel series should upgrade.

The updated 3.2.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git 
linux-3.2.y
and can be browsed at the normal kernel.org git web browser:
http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git

Ben.



 Documentation/feature-removal-schedule.txt  |8 ---
 Makefile|2 +-
 arch/arm/mach-at91/at91rm9200_devices.c |2 +-
 arch/arm/mach-at91/at91sam9260_devices.c|2 +-
 arch/arm/mach-at91/at91sam9261_devices.c|2 +-
 arch/arm/mach-at91/at91sam9263_devices.c|2 +-
 arch/arm/mach-at91/at91sam9rl_devices.c |2 +-
 arch/arm/mach-at91/setup.c  |2 +-
 arch/x86/include/asm/system.h   |7 --
 arch/x86/kernel/process.c   |   24 ---
 arch/x86/xen/mmu.c  |   21 +-
 crypto/cryptd.c |   11 +++-
 drivers/block/floppy.c  |   48 --
 drivers/gpio/gpio-timberdale.c  |4 +-
 drivers/gpu/drm/drm_fops.c  |5 +-
 drivers/gpu/drm/i915/intel_drv.h|4 +-
 drivers/gpu/drm/i915/intel_overlay.c|   14 +++-
 drivers/gpu/drm/i915/intel_sdvo.c   |   59 -
 drivers/gpu/drm/i915/intel_sdvo_regs.h  |2 +
 drivers/gpu/drm/nouveau/nouveau_drv.c   |   34 ++
 drivers/gpu/drm/nouveau/nouveau_state.c |4 +-
 drivers/gpu/drm/nouveau/nv04_dac.c  |8 +--
 drivers/gpu/drm/nouveau/nv04_dfp.c  |6 +-
 drivers/gpu/drm/nouveau/nv04_tv.c   |4 +-
 drivers/gpu/drm/radeon/evergreen.c  |2 +-
 drivers/gpu/drm/radeon/radeon_legacy_encoders.c |1 +
 drivers/gpu/drm/vmwgfx/vmwgfx_dmabuf.c  |2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c |5 ++
 drivers/hid/hid-microsoft.c |   18 --
 drivers/hwmon/w83627ehf.c   |1 +
 drivers/input/touchscreen/tsc40.c   |1 -
 drivers/net/ethernet/marvell/sky2.c |4 +-
 drivers/net/ethernet/realtek/r8169.c|7 +-
 drivers/net/usb/usbnet.c|3 +
 drivers/net/wireless/ath/ath9k/xmit.c   |   10 ++-
 drivers/net/wireless/rt2x00/rt2800lib.c |2 +-
 drivers/target/iscsi/iscsi_target.c |4 +-
 drivers/target/iscsi/iscsi_target_core.h|1 +
 drivers/target/iscsi/iscsi_target_login.c   |1 +
 drivers/target/iscsi/iscsi_target_util.c|   22 ++-
 drivers/target/iscsi/iscsi_target_util.h|1 +
 drivers/target/target_core_configfs.c   |3 +-
 drivers/target/target_core_device.c |   18 +++---
 drivers/usb/serial/mos7840.c|1 -
 drivers/xen/gntdev.c|   36 ++-
 fs/cifs/cifsacl.c   |   49 ++
 fs/ecryptfs/main.c  |   23 ++-
 fs/nfs/dns_resolve.c|5 +-
 fs/nfs/internal.h   |5 +-
 fs/nfs/mount_clnt.c |2 +-
 fs/nfs/namespace.c  |   19 --
 fs/nfs/nfs4namespace.c  |3 +-
 fs/nfs/nfs4proc.c   |   40 +++-
 fs/nfs/super.c  |2 +-
 fs/nfsd/export.c|6 +-
 fs/notify/fanotify/fanotify.c   |1 +
 fs/xfs/xfs_log_recover.c|2 +-
 include/linux/if_link.h |1 +
 include/linux/rtnetlink.h   |3 +
 include/linux/sunrpc/cache.h|   16 +
 include/net/cfg80211.h  |9 +++
 include/net/rtnetlink.h |2 +-
 include/sound/core.h|3 +
 include/trace/events/xen.h  |8 +++
 kernel/module.c |   24 +++
 mm/vmscan.c |2 +
 net/bluetooth/hci_conn.c|2 +
 net/core/dev.c  |2 +-
 net/core/rtnetlink.c|   78 +--
 net/ipv4/tcp.c  |8 +--
 net/ipv4/tcp_illinois.c |8 ++-
 net/ipv6/ndisc.c|3 +-
 net/ipv6/route.c|4 +-
 net/l2tp/l2tp_eth.c |1 +
 net/mac80211/ibss.c |2 +-
 net/mac80211/rx.c   |   71 

Re: [PATCH v8 1/3] Runtime Interpreted Power Sequences

2012-11-16 Thread Alexandre Courbot
Hi Mark,

On Fri, Nov 16, 2012 at 7:35 PM, Mark Rutland  wrote:
> Given there are several ARM platforms that may have an interest in this, 
> please
> consider posting this to the ARM mailing list:
> linux-arm-ker...@lists.infradead.org.

That's right. New revision on the way.

>> +Similarly, each power sequence declares its steps as sub-nodes of itself. 
>> Steps
>> +must be named sequentially, with the first step named step0, the second 
>> step1,
>> +etc. Failure to follow this rule will result in a parsing error.
>
> Could we not encode the step number in the unit-address? i.e. step@N rather 
> than
> stepN.

That was the way I did it initially, but it has been pointed out that
doing so would require to have #address-cells and #size-cells in every
power sequence, as well as a "reg" property in every step (see
https://lkml.org/lkml/2012/7/31/454 ). Although I'd prefer to use the
@ notation too (and neither dtc nor the kernel complained when I did
it), I tend to think the current solution is less burdensome than
having these redundant properties.

>> +"gpio" type required properties:
>> +  - gpio: phandle of the GPIO to use.
>> +  - value: value this GPIO should take. Must be 0 or 1.
>
> Is there any reason for id to be a name rather than a phandle? It seems
> inconsistent with the gpio case.

That's another long story. But to make it short, I'd like to make it
possible for power sequences to be referenced and shared between
devices of the same type (as everybody knows, copy/pasting is bad). If
we use phandles in steps, the power sequence becomes tied to the
referred resources and thus cannot be shared with another instance of
the same device. On the other hand, using an identifier that is
resolved at runtime (through e.g. regulator_get(device *, char *)
leverages the existing frameworks and makes things more flexible.

GPIO is currently the exception. It is the only framework for which
you cannot currently resolve a resource from a device and an
identifier. So at the moment we have to use a phandle - but we are
also working with Linus Walleij to provide an alternative GPIO API
that will be more like what we have for regulators/pinctrl/PWM/etc.

Another problem with phandles is that some of the functions that
resolve them are not publicly exported (i.e. AFAIK there is no public
function that returns a regulator from a phandle - the only to obtain
one is through regulator_get)

> I also see from the example below that the gpio property is not just a 
> phandle,
> as it has the gpio-specifier appended. Is there a better way of describing 
> this
> format in the documentation?

This is already clearly be defined in
Documentation/devicetree/bindings/gpio/, isn't it?

Thanks,
Alex.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 02/06] input/rmi4: Core files

2012-11-16 Thread Christopher Heiny
rmi_bus.c implements the basic functionality of the RMI bus.  This file is
greatly simplified compared to the previous patch - we've switched from
"do it yourself" device/driver binding to using device_type to distinguish
between the two kinds of devices on the bus (sensor devices and function
specific devices) and using the standard bus implementation to manage devices
and drivers.


rmi_driver.c is a driver for the general functionality of the RMI sensor as a
whole, managing those behaviors (including IRQ handling) that are not specific
to any RMI4 function.  It has some unavoidable dependencies on F01 behavior,
though we have worked to minimize those as far as possible.


The header file rmi_driver.h provides definitions that are shared among
the modules of the RMI implementation, but not thought to be necessary
outside it.


Greg KH - Linus Walleij recommended that we seek your input on these core
files, particularly the bus implementation.


Signed-off-by: Christopher Heiny 

Cc: Greg Kroah-Hartman 
Cc: Dmitry Torokhov 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 

---

 drivers/input/rmi4/rmi_bus.c|  248 ++
 drivers/input/rmi4/rmi_driver.c | 1663 +++
 drivers/input/rmi4/rmi_driver.h |  139 
 include/uapi/linux/input.h  |1 +
 4 files changed, 2051 insertions(+), 0 deletions(-)

diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c
new file mode 100644
index 000..a912349
--- /dev/null
+++ b/drivers/input/rmi4/rmi_bus.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2011, 2012 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "rmi_driver.h"
+
+DEFINE_MUTEX(rmi_bus_mutex);
+
+static struct attribute *function_dev_attrs[] = {
+   NULL,
+};
+
+static struct attribute_group function_dev_attr_group = {
+   .attrs = function_dev_attrs,
+};
+
+static const struct attribute_group *function_dev_attr_groups[] = {
+   &function_dev_attr_group,
+   NULL,
+};
+
+struct device_type rmi_function_type = {
+   .name = "rmi_function",
+   .groups = function_dev_attr_groups,
+};
+EXPORT_SYMBOL_GPL(rmi_function_type);
+
+static struct attribute *sensor_dev_attrs[] = {
+   NULL,
+};
+static struct attribute_group sensor_dev_attr_group = {
+   .attrs = sensor_dev_attrs,
+};
+
+static const struct attribute_group *sensor_dev_attr_groups[] = {
+   &sensor_dev_attr_group,
+   NULL,
+};
+
+struct device_type rmi_sensor_type = {
+   .name = "rmi_sensor",
+   .groups = sensor_dev_attr_groups,
+};
+EXPORT_SYMBOL_GPL(rmi_sensor_type);
+
+static atomic_t physical_device_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_RMI4_DEBUG
+static struct dentry *rmi_debugfs_root;
+#endif
+
+#ifdef CONFIG_PM
+static int rmi_bus_suspend(struct device *dev)
+{
+   struct device_driver *driver = dev->driver;
+   const struct dev_pm_ops *pm;
+
+   if (!driver)
+   return 0;
+
+   pm = driver->pm;
+   if (pm && pm->suspend)
+   return pm->suspend(dev);
+   if (driver->suspend)
+   return driver->suspend(dev, PMSG_SUSPEND);
+
+   return 0;
+}
+
+static int rmi_bus_resume(struct device *dev)
+{
+   struct device_driver *driver = dev->driver;
+   const struct dev_pm_ops *pm;
+
+   if (!driver)
+   return 0;
+
+   pm = driver->pm;
+   if (pm && pm->resume)
+   return pm->resume(dev);
+   if (driver->resume)
+   return driver->resume(dev);
+
+   return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(rmi_bus_pm_ops,
+rmi_bus_suspend, rmi_bus_resume);
+
+struct bus_type rmi_bus_type = {
+   .name   = "rmi",
+   .pm = &rmi_bus_pm_ops
+};
+EXPORT_SYMBOL_GPL(rmi_bus_type);
+
+static void release_rmidev_device(struct device *dev)
+{
+   device_unregister(dev);
+}
+
+/**
+ * rmi_register_phys_device - register a physical device connection on the RMI
+ * bus.  Physical drivers provide communication from the devices on the bus to
+ * the RMI4 sensor on a bus such as SPI, I2C, and so on.
+ *
+ * @phys: the physical device to register
+ */
+int 

[RFC PATCH 05/06] input/rmi4: F01 - device control

2012-11-16 Thread Christopher Heiny
RMI Function 01 implements basic device control and power management
behaviors for the RMI4 sensor.

rmi_f01.h exports definitions that we expect to be used by other functionality
in the future (such as firmware reflash).


Signed-off-by: Christopher Heiny 

Cc: Dmitry Torokhov 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 


---

 drivers/input/rmi4/rmi_f01.c | 1348 ++
 drivers/input/rmi4/rmi_f01.h |  160 +
 2 files changed, 1508 insertions(+), 0 deletions(-)

diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c
new file mode 100644
index 000..038266c
--- /dev/null
+++ b/drivers/input/rmi4/rmi_f01.c
@@ -0,0 +1,1348 @@
+/*
+ * Copyright (c) 2011-2012 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "rmi_driver.h"
+#include "rmi_f01.h"
+
+#define FUNCTION_NUMBER 0x01
+
+/**
+ * @reset - set this bit to force a firmware reset of the sensor.
+ */
+struct f01_device_commands {
+   bool reset:1;
+   u8 reserved:7;
+};
+
+/**
+ * @ctrl0 - see documentation in rmi_f01.h.
+ * @interrupt_enable - A mask of per-function interrupts on the touch sensor.
+ * @doze_interval - controls the interval between checks for finger presence
+ * when the touch sensor is in doze mode, in units of 10ms.
+ * @wakeup_threshold - controls the capacitance threshold at which the touch
+ * sensor will decide to wake up from that low power state.
+ * @doze_holdoff - controls how long the touch sensor waits after the last
+ * finger lifts before entering the doze state, in units of 100ms.
+ */
+struct f01_device_control {
+   struct f01_device_control_0 ctrl0;
+   u8 *interrupt_enable;
+   u8 doze_interval;
+   u8 wakeup_threshold;
+   u8 doze_holdoff;
+};
+
+/**
+ * @has_ds4_queries - if true, the query registers relating to Design Studio 4
+ * features are present.
+ * @has_multi_phy - if true, multiple physical communications interfaces are
+ * supported.
+ * @has_guest - if true, a "guest" device is supported.
+ */
+struct f01_query_42 {
+   bool has_ds4_queries:1;
+   bool has_multi_phy:1;
+   bool has_guest:1;
+   u8 reserved:5;
+} __attribute__((__packed__));
+
+/**
+ * @length - the length of the remaining Query43.* register block, not
+ * including the first register.
+ * @has_package_id_query -  the package ID query data will be accessible from
+ * inside the ProductID query registers.
+ * @has_packrat_query -  the packrat query data will be accessible from inside
+ * the ProductID query registers.
+ * @has_reset_query - the reset pin related registers are valid.
+ * @has_maskrev_query - the silicon mask revision number will be reported.
+ * @has_i2c_control - the register F01_RMI_Ctrl6 will exist.
+ * @has_spi_control - the register F01_RMI_Ctrl7 will exist.
+ * @has_attn_control - the register F01_RMI_Ctrl8 will exist.
+ * @reset_enabled - the hardware reset pin functionality has been enabled
+ * for this device.
+ * @reset_polarity - If this bit reports as ‘0’, it means that the reset state
+ * is active low. A ‘1’ means that the reset state is active high.
+ * @pullup_enabled - If set, it indicates that a built-in weak pull up has
+ * been enabled on the Reset pin; clear means that no pull-up is present.
+ * @reset_pin_number - This field represents which GPIO pin number has been
+ * assigned the reset functionality.
+ */
+struct f01_ds4_queries {
+   u8 length:4;
+   u8 reserved_1:4;
+
+   bool has_package_id_query:1;
+   bool has_packrat_query:1;
+   bool has_reset_query:1;
+   bool has_maskrev_query:1;
+   u8 reserved_2:4;
+
+   bool has_i2c_control:1;
+   bool has_spi_control:1;
+   bool has_attn_control:1;
+   u8 reserved_3:5;
+
+   bool reset_enabled:1;
+   bool reset_polarity:1;
+   bool pullup_enabled:1;
+   u8 reserved_4:1;
+   u8 reset_pin_number:4;
+} __attribute__((__packed__));
+
+struct f01_data {
+   struct f01_device_control device_control;
+   struct f01_basic_queries basic_queries;
+   struct f01_device_status device_status;
+   u8 product_id[RMI_PRODUCT_ID_LENGTH+1];
+
+   

[RFC PATCH 03/06] input/rmi4: I2C physical interface

2012-11-16 Thread Christopher Heiny

rmi_i2c.c abstracts an RMI4 device on some arbitrary I2C bus as a logical
device in the RMI bus.  It handles reads/writes from/to the RMI4 devices,
and manages the page select register setting (since the meaning of page
select is dependent on the physical layer used to communicate with the RMi4
device).


Signed-off-by: Christopher Heiny 

Cc: Dmitry Torokhov 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 

Acked-by: Jean Delvare 

---

 drivers/input/rmi4/rmi_i2c.c |  490 ++
 1 files changed, 490 insertions(+), 0 deletions(-)

diff --git a/drivers/input/rmi4/rmi_i2c.c b/drivers/input/rmi4/rmi_i2c.c
new file mode 100644
index 000..ca32101
--- /dev/null
+++ b/drivers/input/rmi4/rmi_i2c.c
@@ -0,0 +1,490 @@
+/*
+ * Copyright (c) 2011, 2012 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "rmi_driver.h"
+
+#define BUFFER_SIZE_INCREMENT 32
+/**
+ * struct rmi_i2c_data - stores information for i2c communication
+ *
+ * @page_mutex: Locks current page to avoid changing pages in unexpected ways.
+ * @page: Keeps track of the current virtual page
+ * @phys: Pointer to the physical interface
+ *
+ * @tx_buf: Buffer used for transmitting data to the sensor over i2c.
+ * @tx_buf_size: Size of the buffer
+ * @debug_buf: Buffer used for exposing buffer contents using dev_dbg
+ * @debug_buf_size: Size of the debug buffer.
+ *
+ * @comms_debug: Latest data read/written for debugging I2C communications
+ * @debugfs_comms: Debugfs file for debugging I2C communications
+ *
+ */
+struct rmi_i2c_data {
+   struct mutex page_mutex;
+   int page;
+   struct rmi_phys_device *phys;
+
+   u8 *tx_buf;
+   int tx_buf_size;
+   u8 *debug_buf;
+   int debug_buf_size;
+
+   bool comms_debug;
+#ifdef CONFIG_RMI4_DEBUG
+   struct dentry *debugfs_comms;
+#endif
+};
+
+#ifdef CONFIG_RMI4_DEBUG
+
+
+/**
+ * struct i2c_debugfs_data - stores information for debugfs
+ *
+ * @done: Indicates that we are done reading debug data. Subsequent reads
+ * will return EOF.
+ * @i2c_data: Pointer to the i2c data
+ *
+ */
+struct i2c_debugfs_data {
+   bool done;
+   struct rmi_i2c_data *i2c_data;
+};
+
+static int debug_open(struct inode *inodep, struct file *filp)
+{
+   struct i2c_debugfs_data *data;
+
+   data = kzalloc(sizeof(struct i2c_debugfs_data), GFP_KERNEL);
+   if (!data)
+   return -ENOMEM;
+
+   data->i2c_data = inodep->i_private;
+   filp->private_data = data;
+   return 0;
+}
+
+static int debug_release(struct inode *inodep, struct file *filp)
+{
+   kfree(filp->private_data);
+   return 0;
+}
+
+static ssize_t comms_debug_read(struct file *filp, char __user *buffer,
+   size_t size, loff_t *offset) {
+   int retval;
+   char *local_buf;
+   struct i2c_debugfs_data *dfs = filp->private_data;
+   struct rmi_i2c_data *data = dfs->i2c_data;
+
+   if (dfs->done)
+   return 0;
+
+   local_buf = kcalloc(size, sizeof(u8), GFP_KERNEL);
+   if (!local_buf)
+   return -ENOMEM;
+
+   dfs->done = 1;
+
+   retval = snprintf(local_buf, PAGE_SIZE, "%u\n", data->comms_debug);
+
+   if (retval <= 0 || copy_to_user(buffer, local_buf, retval))
+   retval = -EFAULT;
+   kfree(local_buf);
+
+   return retval;
+}
+
+static ssize_t comms_debug_write(struct file *filp, const char __user *buffer,
+  size_t size, loff_t *offset) {
+   int retval;
+   char *local_buf;
+   unsigned int new_value;
+   struct i2c_debugfs_data *dfs = filp->private_data;
+   struct rmi_i2c_data *data = dfs->i2c_data;
+
+   local_buf = kcalloc(size, sizeof(u8), GFP_KERNEL);
+   if (!local_buf)
+   return -ENOMEM;
+   retval = copy_from_user(local_buf, buffer, size);
+   if (retval) {
+   kfree(local_buf);
+   return -EFAULT;
+   }
+
+   retval = sscanf(local_buf, "%u", &new_value);
+   kfree(local_buf);
+   if (retval != 1 || new_value > 1)
+   return -EINVAL;
+
+   

[RFC PATCH 04/06] input/rmi4: Config files and makefiles

2012-11-16 Thread Christopher Heiny

Infrastructure files for configuration and building.


Signed-off-by: Christopher Heiny 

Cc: Dmitry Torokhov 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 


---

 drivers/input/Kconfig   |2 +
 drivers/input/Makefile  |3 ++
 drivers/input/rmi4/Kconfig  |   76 +++
 drivers/input/rmi4/Makefile |   22 
 4 files changed, 103 insertions(+), 0 deletions(-)

diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index 55f7e57..2c543c0 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -188,6 +188,8 @@ source "drivers/input/touchscreen/Kconfig"

 source "drivers/input/misc/Kconfig"

+source "drivers/input/rmi4/Kconfig"
+
 endif

 menu "Hardware I/O ports"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 5ca3f63..88354fc 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -24,4 +24,7 @@ obj-$(CONFIG_INPUT_TABLET)+= tablet/
 obj-$(CONFIG_INPUT_TOUCHSCREEN)+= touchscreen/
 obj-$(CONFIG_INPUT_MISC)   += misc/

+obj-y += rmi4/
+
 obj-$(CONFIG_INPUT_APMPOWER)   += apm-power.o
+
diff --git a/drivers/input/rmi4/Kconfig b/drivers/input/rmi4/Kconfig
new file mode 100644
index 000..41cbbee
--- /dev/null
+++ b/drivers/input/rmi4/Kconfig
@@ -0,0 +1,76 @@
+#
+# RMI4 configuration
+#
+config RMI4_BUS
+   bool "Synaptics RMI4 bus support"
+   help
+ Say Y here if you want to support the Synaptics RMI4 bus.  This is
+ required for all RMI4 device support.
+
+ If unsure, say Y.
+
+ This feature is not currently available as a loadable module.
+
+config RMI4_DEBUG
+   bool "RMI4 Debugging"
+   depends on RMI4_BUS
+   select DEBUG_FS
+   help
+ Say Y here to enable debug feature in the RMI4 driver.
+
+ Note that the RMI4 driver debug features can generate a lot of
+ output (potentially clogging up your dmesg output) and generally
+ slow down driver operation.  It's recommended to enable them only
+ if you are actively developing/debugging RMI4 features.
+
+ If unsure, say N.
+
+config RMI4_I2C
+   bool "RMI4 I2C Support"
+   depends on RMI4_BUS && I2C
+   help
+ Say Y here if you want to support RMI4 devices connected to an I2C
+ bus.
+
+ If unsure, say Y.
+
+ This feature is not currently available as a loadable module.
+
+config RMI4_GENERIC
+   bool "RMI4 Generic driver"
+   depends on RMI4_BUS
+   help
+ Say Y here if you want to support generic RMI4 devices.
+
+ This is pretty much required if you want to do anything useful with
+ your RMI device.
+
+ This feature is not currently available as a loadable module.
+
+config RMI4_F11
+   tristate "RMI4 Function 11 (2D pointing)"
+   depends on RMI4_BUS && RMI4_GENERIC
+   help
+ Say Y here if you want to add support for RMI4 function 11.
+
+ Function 11 provides 2D multifinger pointing for touchscreens and
+ touchpads.  For sensors that support relative pointing, F11 also
+ provides mouse input.
+
+ To compile this driver as a module, choose M here: the
+ module will be called rmi-f11.
+
+config RMI4_F11_PEN
+   bool "RMI4 F11 Pen Support"
+   depends on RMI4_F11
+   help
+ Say Y here to add support for pen input to RMI4 function 11.
+
+ If this feature is enabled, when pen inputs are detected they
+ will be reported to the input stream as MT_TOOL_PEN.  Otherwise,
+ pens will be treated the same as fingers.
+
+ Not all UI implementations deal gracefully with pen discrimination.
+ If your system is not recognizing pen touches and you know your
+ sensor supports pen input, you probably want to turn this feature
+ off.
diff --git a/drivers/input/rmi4/Makefile b/drivers/input/rmi4/Makefile
new file mode 100644
index 000..8882c3d
--- /dev/null
+++ b/drivers/input/rmi4/Makefile
@@ -0,0 +1,22 @@
+obj-$(CONFIG_RMI4_BUS) += rmi_bus.o
+obj-$(CONFIG_RMI4_I2C) += rmi_i2c.o
+obj-$(CONFIG_RMI4_GENERIC) += rmi_driver.o rmi_f01.o
+obj-$(CONFIG_RMI4_F11) += rmi_f11.o
+
+ccflags-$(CONFIG_RMI4_DEBUG) += -DDEBUG
+
+ifeq ($(KERNELRELEASE),)
+
+# KERNELDIR ?= /home/
+PWD := $(shell pwd)
+
+.PHONY: build clean
+
+build:
+   $(MAKE) -C $(KERNELDIR) M=$(PWD) modules
+
+clean:
+   rm -rf *.o *~ core .depend .*.cmd *.ko *.mod.c
+
+endif
+
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH 01/06] input/rmi4: Public header and documentation

2012-11-16 Thread Christopher Heiny
rmi.h provides public definitions required by the RMI bus implementation and
modules that interact with it.

debugfs and sysfs attributes are documented in files in
Documentation/ABI/testing.  There's two files, one for debugfs and one for
sysfs.


Signed-off-by: Christopher Heiny 

Cc: Dmitry Torokhov 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 

---

 Documentation/ABI/testing/debugfs-rmi4 |   99 ++
 Documentation/ABI/testing/sysfs-rmi4   |  103 ++
 include/linux/rmi.h|  596 
 3 files changed, 798 insertions(+), 0 deletions(-)

diff --git a/Documentation/ABI/testing/debugfs-rmi4 
b/Documentation/ABI/testing/debugfs-rmi4
new file mode 100644
index 000..ef0739d
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-rmi4
@@ -0,0 +1,99 @@
+What:  /sys/kernel/debug/rmi/devices
+Date:  October 2012
+KernelVersion: 3.x
+Contact:   Christopher Heiny 
+Description:
+
+  The RMI4 driver implementation exposes a set of informational and control
+  parameters via debugfs.  These parameters are those that typically are only
+  viewed or adjusted during product development, tuning, and debug.
+  For parameters that are  referenced and/or adjusted during normal operation,
+  please see sysfs-rmi4 in this directory.
+
+  General debugging parameters for a particular RMI4 sensor are found in
+  /sys/kernel/debug/rmi/sensorXX/, where XX is a the device's ID as a two
+  digit number (padded with leading zeros).  Function specific parameters
+  for an RMI4 sensor are found in /sys/kernel/debug/rmi/devices/FYY/, where
+  XX is a the device's ID as a two digit number (padded with leading zeros)
+  and YY is the hexdecimal function number (for example, F11 for RMI function
+  F11).
+
+  For RMI4 functions that support multiple sensor instances (such as F11),
+  the parameters for individual sensors have .Z appended to them, where Z is
+  the index of the sensor instance (for example, clip.0, clip.1, clip.2, and
+  so on).
+
+  Some of the parameters exposed here are described in detail in the
+  RMI4 Specification, which is found here:
+http://www.synaptics.com/sites/default/files/511-000136-01_revD.pdf
+  For such parameters, we'll reference you to that document, rather than
+  copying the contents here.
+
+  /sys/kernel/debug/rmi/
+  /sensorXX/
+  attn_count - (ro) Shows the number of ATTN interrupts handled so far.
+  comms_debug - (rw) Write 1 to this dump information about register
+  reads and writes to the console.  Write 0 to this to turn
+  this feature off.  WARNING: Imposes a major performance
+  penalty when turned on.
+  irq_debug - (rw) Write 1 to this dump information about interrupts
+  to the console.  Write 0 to this to turn this feature off.
  WARNING: Imposes a major performance penalty when turned on.
+  phys - (ro) Presents information about the physical connection of
+  this device.  It has one line, with the format:
+
+   prot tx_count tx_bytes tx_errors rx_count rx_bytes 
rx_errors attn
+
+  Where
+   prot is one of i2c, spi1, or spi2
+   tx_count is the number of write operations
+   tx_bytes is the number of bytes written
+   tx_errors is the number of write operations that 
encountered errors
+   rx_count is the number of read operations
+   rx_bytes is the total number of bytes read
+   rx_errors is the number of read operations that 
encountered errors
+
+  All counts are 64-bit unsigned values, and are set to zero
+  when the physical layer driver is initialized.
+
+  /sensorXX/F01/
+  interrupt_enable - (rw) allows you to read or modify the F01
+  interrupt enable mask (the F01_RMI_Ctrl1 register(s)).
+
+  /sensorXX/F11/
+  clip.Z - (rw) Controls in-driver coordinate clipping for the 2D
+  sensor Z.  This is a set of four unsigned values in the
+  range [0..65535], representing the lower bounds on X, the
+  upper bounds on X, the lower bounds on Y, and the upper
+  bounds on Y.  Coordinates will be clipped to these ranges.
+  If enabled, clip is the final transformation to be applied
+  to the coordinates. The default upper and lower bounds for
+  clip are 0 and 65535 respectively for both axes.
+  delta_threshold.Z - (rw) Controls the F11 distance thresholds. This
+  contains two values, corresponding to F11_2D_Ctrl2 and
+  F11_2D_Ctrl3.  Se the spec for more details.
+  flip.Z - (rw) This parameter is a pair of single binary digits (for
+  

[RFC PATCH 00/06] input: Synaptics RMI4 Touchscreen Driver

2012-11-16 Thread Christopher Heiny
This patch implements a driver supporting Synaptics ClearPad and other
touchscreen sensors that use the RMI4 protocol, as defined here:


http://www.synaptics.com/sites/default/files/511-000136-01-Rev-E-RMI4%20Intrfacing%20Guide.pdf

as well as successor documents that haven't made their way through to
publication yet.

This code supersedes the patch submitted on 2012-10-05.  For all files
included in this patch, we believe that all outstanding issues arising
from the previous submissions have been addressed, except as follows:

* we've investigated using irq_chip to manage chip interrupt dispatch, and
that certainly is a good idea.  However, we need to support kernels back to
3.0.x, and the required functionality is not yet present in those older kernels.
Once we no longer need to support 3.0.x, we'll jump onto irq_chip right
away.

* some of the requested changes to rmi_f11.c are simply not possible while
still retaining general driver functionality.  We've clarified existing comments
and added new ones to explain why that is the case.


This patch is against the v3.7-rc5 tag of Linus' kernel tree, object
77b67063bb6bce6d475e910d3b886a606d0d91f7.  It should work fine with that kernel,
but will not work with earlier kernels due to changes in the input subsystem.



Included in this patch are:
- full support for an RMI virtual bus as a standard kernel bus

- physical layer implementation for I2C

- device driver for general RMI4 sensor functionality

- function implementations for the following RMI4 functions:
* F01 device control
* F11 multifinger pointing

The driver supports a system having one or more RMI sensors attached to it.
Most devices have just a single touch sensor, but some have more than one.
An example is the Fuse concept phone, which has 4 RMI sensors in it.

Each sensor is presented as a device on the RMI logical bus (/sys/bus/rmi).
Devices are named/numbered in the order they are discovered on the bus,
starting with /sys/bus/rmi/devices/sensor00 for the first one, .../sensor01
for the second one, and so on.

Individual RMI functions are presented as child devices of the sensor device.
For example, sensor00.fn01, sensor00.fn11, and so on.  Control of an RMI
function's operating parameters is implemented via sysfs or debugfs (depending
on whether the parameters are used during normal operation or system
development/prototyping).



The amount of feedback received on previous patches precludes addressing each
item individually.  However, major changes for this patch are:
- elimination of sysfs management macros
- elimination of roll-your-own bitmask management
- moved potentially large arrays and structs from the stack to the heap
- elimination of the union/struct idiom for mapping register groups
- corrected identification of input devices, including adding a BUS_RMI
  bus type.


We've broken this patch into 6 parts, as follows:
01 - public header files and documentation
02 - core sensor and bus implementation
03 - I2C physical layer driver
04 - Kconfigs and Makefiles
05..06 - drivers for individual RMI functions


Comments and other feedback on this driver are welcomed.

Christopher Heiny and the Synaptics RMI4 driver team

Signed-off-by: Christopher Heiny 

Cc: Jean Delvare 
Cc: Linus Walleij 
Cc: Naveen Kumar Gaddipati 
Cc: Joeri de Gram 

---
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 01/46] x86, mm: Add global page_size_mask and probe one time only

2012-11-16 Thread Yinghai Lu
Now we pass around use_gbpages and use_pse for calculating page table size,
Later we will need to call init_memory_mapping for every ram range one by one,
that mean those calculation will be done several times.

That information is the same for all ram ranges, so it can be stored in
page_size_mask and probed one time only.

Move that probing code out of init_memory_mapping into separated function
probe_page_size_mask(), and call it before all init_memory_mapping.

Suggested-by: Ingo Molnar 
Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/include/asm/pgtable.h |1 +
 arch/x86/kernel/setup.c|1 +
 arch/x86/mm/init.c |   55 ++-
 3 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a1f780d..98ac76d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -602,6 +602,7 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
+void probe_page_size_mask(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..01fb5f9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -913,6 +913,7 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
 
init_gbpages();
+   probe_page_size_mask();
 
/* max_pfn_mapped is updated here */
	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);

[PATCH v8 07/46] x86, mm: Find early page table buffer together

2012-11-16 Thread Yinghai Lu
We should not do that on every call to init_memory_mapping.

At the same time need to move down early_memtest, and could remove after_bootmem
checking.

-v2: fix one early_memtest with 32bit by passing max_pfn_mapped instead.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   66 ++-
 1 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 51f919f..1ce0d03 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -274,16 +274,6 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
memset(mr, 0, sizeof(mr));
nr_range = split_mem_range(mr, 0, start, end);
 
-   /*
-* Find space for the kernel direct mapping tables.
-*
-* Later we should allocate these tables in the local node of the
-* memory mapped. Unfortunately this is done currently before the
-* nodes are discovered.
-*/
-   if (!after_bootmem)
-   find_early_table_space(start, end);
-
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
   mr[i].page_size_mask);
@@ -296,6 +286,36 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 
__flush_tlb_all();
 
+   return ret >> PAGE_SHIFT;
+}
+
+void __init init_mem_mapping(void)
+{
+   probe_page_size_mask();
+
+   /*
+* Find space for the kernel direct mapping tables.
+*
+* Later we should allocate these tables in the local node of the
+* memory mapped. Unfortunately this is done currently before the
+* nodes are discovered.
+*/
+#ifdef CONFIG_X86_64
+   find_early_table_space(0, max_pfn< max_low_pfn) {
+   max_pfn_mapped = init_memory_mapping(1UL<<32,
+max_pfn< pgt_buf_start)
+   if (pgt_buf_end > pgt_buf_start)
x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
PFN_PHYS(pgt_buf_end));
 
-   if (!after_bootmem)
-   early_memtest(start, end);
+   /* stop the wrong using */
+   pgt_buf_top = 0;
 
-   return ret >> PAGE_SHIFT;
-}
-
-void __init init_mem_mapping(void)
-{
-   probe_page_size_mask();
-
-   /* max_pfn_mapped is updated here */
-   max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) {
-   max_pfn_mapped = init_memory_mapping(1UL<<32,
-max_pfn

[PATCH v8 05/46] x86, mm: Revert back good_end setting for 64bit

2012-11-16 Thread Yinghai Lu
After

| commit 8548c84da2f47e71bbbe300f55edb768492575f7
| Author: Takashi Iwai 
| Date:   Sun Oct 23 23:19:12 2011 +0200
|
|x86: Fix S4 regression
|
|Commit 4b239f458 ("x86-64, mm: Put early page table high") causes a S4
|regression since 2.6.39, namely the machine reboots occasionally at S4
|resume.  It doesn't happen always, overall rate is about 1/20.  But,
|like other bugs, once when this happens, it continues to happen.
|
|This patch fixes the problem by essentially reverting the memory
|assignment in the older way.

Have some page table around 512M again, that will prevent kdump to find 512M
under 768M.

We need revert that reverting, so we could put page table high again for 64bit.

Takashi agreed that S4 regression could be something else.

https://lkml.org/lkml/2012/6/15/182

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 9e17f9e..dbef4ff 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -234,8 +234,8 @@ static void __init find_early_table_space(struct map_range 
*mr, int nr_range)
 #ifdef CONFIG_X86_32
/* for fixmap */
tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
good_end = max_pfn_mapped << PAGE_SHIFT;
+#endif
 
base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
if (!base)
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 02/46] x86, mm: Split out split_mem_range from init_memory_mapping

2012-11-16 Thread Yinghai Lu
So make init_memory_mapping smaller and readable.

-v2: use 0 instead of nr_range as input parameter found by Yasuaki Ishimatsu.

Suggested-by: Ingo Molnar 
Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
Cc: Yasuaki Ishimatsu 
---
 arch/x86/mm/init.c |   41 +
 1 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index aa5b0da..6368b86 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,25 +146,13 @@ static int __meminit save_mr(struct map_range *mr, int 
nr_range,
return nr_range;
 }
 
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-  unsigned long end)
+static int __meminit split_mem_range(struct map_range *mr, int nr_range,
+unsigned long start,
+unsigned long end)
 {
unsigned long start_pfn, end_pfn;
-   unsigned long ret = 0;
unsigned long pos;
-   struct map_range mr[NR_RANGE_MR];
-   int nr_range, i;
-
-   printk(KERN_INFO "init_memory_mapping: [mem %#010lx-%#010lx]\n",
-  start, end - 1);
-
-   memset(mr, 0, sizeof(mr));
-   nr_range = 0;
+   int i;
 
/* head if not big page alignment ? */
start_pfn = start >> PAGE_SHIFT;
@@ -258,6 +246,27 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
(mr[i].page_size_mask & (1

[PATCH v8 08/46] x86, mm: Separate out calculate_table_space_size()

2012-11-16 Thread Yinghai Lu
It should take physical address range that will need to be mapped.
find_early_table_space should take range that pgt buff should be in.

Separate page table size calculation from finding the early page table,
to reduce confusion.

Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/mm/init.c |   38 +++---
 1 files changed, 27 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1ce0d03..7b961d0 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -196,12 +196,10 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
  * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
  * pages. Then find enough contiguous space for those page tables.
  */
-static void __init find_early_table_space(unsigned long start, unsigned long 
end)
+static unsigned long __init calculate_table_space_size(unsigned long start, 
unsigned long end)
 {
int i;
unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-   unsigned long good_end;
-   phys_addr_t base;
struct map_range mr[NR_RANGE_MR];
int nr_range;
 
@@ -240,9 +238,17 @@ static void __init find_early_table_space(unsigned long 
start, unsigned long end
 #ifdef CONFIG_X86_32
/* for fixmap */
tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-   good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
 
+   return tables;
+}
+
+static void __init find_early_table_space(unsigned long start,
+ unsigned long good_end,
+ unsigned long tables)
+{
+   phys_addr_t base;
+
base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
if (!base)
panic("Cannot find space for the kernel page tables");
@@ -250,10 +256,6 @@ static void __init find_early_table_space(unsigned long 
start, unsigned long end
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-
-   printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem 
%#010lx-%#010lx]\n",
-   mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
-   (pgt_buf_top << PAGE_SHIFT) - 1);
 }
 
 /*
@@ -291,6 +293,8 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 
 void __init init_mem_mapping(void)
 {
+   unsigned long tables, good_end, end;
+
probe_page_size_mask();
 
/*
@@ -301,10 +305,18 @@ void __init init_mem_mapping(void)
 * nodes are discovered.
 */
 #ifdef CONFIG_X86_64
-   find_early_table_space(0, max_pfn< pgt_buf_start)
+   if (pgt_buf_end > pgt_buf_start) {
+   printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ 
[mem %#010lx-%#010lx] final\n",
+   end - 1, pgt_buf_start << PAGE_SHIFT,
+   (pgt_buf_end << PAGE_SHIFT) - 1);
x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
PFN_PHYS(pgt_buf_end));
+   }
 
/* stop the wrong using */
pgt_buf_top = 0;
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 10/46] x86, mm: if kernel .text .data .bss are not marked as E820_RAM, complain and fix

2012-11-16 Thread Yinghai Lu
From: Jacob Shin 

There could be cases where user supplied memmap=exactmap memory
mappings do not mark the region where the kernel .text .data and
.bss reside as E820_RAM, as reported here:

https://lkml.org/lkml/2012/8/14/86

Handle it by complaining, and adding the range back into the e820.

Signed-off-by: Jacob Shin 
Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/kernel/setup.c |   14 ++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4bd8921..d85cbd9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -832,6 +832,20 @@ void __init setup_arch(char **cmdline_p)
insert_resource(_resource, _resource);
insert_resource(_resource, _resource);
 
+   /*
+* Complain if .text .data and .bss are not marked as E820_RAM and
+* attempt to fix it by adding the range. We may have a confused BIOS,
+* or the user may have incorrectly supplied it via memmap=exactmap. If
+* we really are running on top non-RAM, we will crash later anyways.
+*/
+   if (!e820_all_mapped(code_resource.start, __pa(__brk_limit), E820_RAM)) 
{
+   pr_warn(".text .data .bss are not marked as E820_RAM!\n");
+
+   e820_add_region(code_resource.start,
+   __pa(__brk_limit) - code_resource.start + 1,
+   E820_RAM);
+   }
+
trim_bios_range();
 #ifdef CONFIG_X86_32
if (ppro_with_ram_bug()) {
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 12/46] x86, mm: use pfn_range_is_mapped() with CPA

2012-11-16 Thread Yinghai Lu
We are going to map ram only, so being under max_low_pfn_mapped, or
between 4g and max_pfn_mapped, does not mean a range is mapped at all.

Use pfn_range_is_mapped() directly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/pageattr.c |   16 +++-
 1 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d..44acfcd 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -551,16 +551,10 @@ static int split_large_page(pte_t *kpte, unsigned long 
address)
for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc)
set_pte([i], pfn_pte(pfn, ref_prot));
 
-   if (address >= (unsigned long)__va(0) &&
-   address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT))
+   if (pfn_range_is_mapped(PFN_DOWN(__pa(address)),
+   PFN_DOWN(__pa(address)) + 1))
split_page_count(level);
 
-#ifdef CONFIG_X86_64
-   if (address >= (unsigned long)__va(1UL<<32) &&
-   address < (unsigned long)__va(max_pfn_mapped << PAGE_SHIFT))
-   split_page_count(level);
-#endif
-
/*
 * Install the new, split up pagetable.
 *
@@ -729,13 +723,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
unsigned long vaddr;
int ret;
 
-   if (cpa->pfn >= max_pfn_mapped)
+   if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
return 0;
 
-#ifdef CONFIG_X86_64
-   if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT)))
-   return 0;
-#endif
/*
 * No need to redo, when the primary call touched the direct
 * mapping already:
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 13/46] x86, mm: use pfn_range_is_mapped() with gart

2012-11-16 Thread Yinghai Lu
We are going to map ram only, so being under max_low_pfn_mapped, or
between 4g and max_pfn_mapped, does not mean a range is mapped at all.

Use pfn_range_is_mapped() directly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/amd_gart_64.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e663112..b574b29 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -768,10 +768,9 @@ int __init gart_iommu_init(void)
aper_base   = info.aper_base;
end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
 
-   if (end_pfn > max_low_pfn_mapped) {
-   start_pfn = (aper_base>>PAGE_SHIFT);
+   start_pfn = PFN_DOWN(aper_base);
+   if (!pfn_range_is_mapped(start_pfn, end_pfn))
init_memory_mapping(start_pfn

[PATCH v8 16/46] x86, mm: relocate initrd under all mem for 64bit

2012-11-16 Thread Yinghai Lu
instead of under 4g.

For 64bit, we can use any mapped mem instead of low mem.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 68dffec..94f922a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -324,7 +324,7 @@ static void __init relocate_initrd(void)
char *p, *q;
 
/* We need to move the initrd down into directly mapped mem */
-   ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped),
+   ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
 area_size, PAGE_SIZE);
 
if (!ramdisk_here)
@@ -392,7 +392,7 @@ static void __init reserve_initrd(void)
 
initrd_start = 0;
 
-   mapped_size = get_mem_size(max_low_pfn_mapped);
+   mapped_size = get_mem_size(max_pfn_mapped);
if (ramdisk_size >= (mapped_size>>1))
panic("initrd too large to handle, "
   "disabling initrd (%lld needed, %lld available)\n",
@@ -401,8 +401,7 @@ static void __init reserve_initrd(void)
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
 
-   if (ramdisk_end <= (max_low_pfn_mapped

[PATCH v8 18/46] x86, mm: Use big page size for small memory range

2012-11-16 Thread Yinghai Lu
We could map a small range in the middle of a big range at first, so we should
use a big page size from the start to avoid using a small page size and
breaking down the page table.

We can only set the big page bit when the surrounding range is ram as well.

-v2: fix 32bit boundary checking. We can not count ram above max_low_pfn
for 32 bit.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   37 +
 1 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bb44e9f..da591eb 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -88,6 +88,40 @@ static int __meminit save_mr(struct map_range *mr, int 
nr_range,
return nr_range;
 }
 
+/*
+ * adjust the page_size_mask for small range to go with
+ * big page size instead small one if nearby are ram too.
+ */
+static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
+int nr_range)
+{
+   int i;
+
+   for (i = 0; i < nr_range; i++) {
+   if ((page_size_mask & (1<> PAGE_SHIFT) > max_low_pfn)
+   continue;
+#endif
+
+   if (memblock_is_region_memory(start, end - start))
+   mr[i].page_size_mask |= 1

[PATCH v8 03/46] x86, mm: Move down find_early_table_space()

2012-11-16 Thread Yinghai Lu
It will need to call split_mem_range().
Move it down after that to avoid extra declaration.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |  117 ++--
 1 files changed, 59 insertions(+), 58 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6368b86..701abbc 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -36,64 +36,6 @@ struct map_range {
 };
 
 static int page_size_mask;
-/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
- * pages. Then find enough contiguous space for those page tables.
- */
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
-{
-   int i;
-   unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-   unsigned long start = 0, good_end;
-   phys_addr_t base;
-
-   for (i = 0; i < nr_range; i++) {
-   unsigned long range, extra;
-
-   range = mr[i].end - mr[i].start;
-   puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
-
-   if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-   extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-   pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-   } else {
-   pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
-   }
-
-   if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
-   extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
-#ifdef CONFIG_X86_32
-   extra += PMD_SIZE;
-#endif
-   ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-   } else {
-   ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
-   }
-   }
-
-   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-   tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-#ifdef CONFIG_X86_32
-   /* for fixmap */
-   tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-#endif
-   good_end = max_pfn_mapped << PAGE_SHIFT;
-
-   base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
-   if (!base)
-   panic("Cannot find space for the kernel page tables");
-
-   pgt_buf_start = base >> PAGE_SHIFT;
-   pgt_buf_end = pgt_buf_start;
-   pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
-
-   printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem 
%#010lx-%#010lx]\n",
-   mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT,
-   (pgt_buf_top << PAGE_SHIFT) - 1);
-}
 
 void probe_page_size_mask(void)
 {
@@ -250,6 +192,65 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
 }
 
 /*
+ * First calculate space needed for kernel direct mapping page tables to cover
+ * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
+ * pages. Then find enough contiguous space for those page tables.
+ */
+static void __init find_early_table_space(struct map_range *mr, int nr_range)
+{
+   int i;
+   unsigned long puds = 0, pmds = 0, ptes = 0, tables;
+   unsigned long start = 0, good_end;
+   phys_addr_t base;
+
+   for (i = 0; i < nr_range; i++) {
+   unsigned long range, extra;
+
+   range = mr[i].end - mr[i].start;
+   puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
+
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
+   extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
+   pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+   } else {
+   pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
+   }
+
+   if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
+   extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+   extra += PMD_SIZE;
+#endif
+   ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   } else {
+   ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   }
+   }
+
+   tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+   tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+   tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+   /* for fixmap */
+   tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+   good_end = max_pfn_mapped << PAGE_SHIFT;
+
+   base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
+   if (!base)
+   panic("Cannot find space for the kernel page tables");
+
+   pgt_buf_start = base >> PAGE_SHIFT;
+   pgt_buf_end = pgt_buf_start;
+   pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+
+   

[PATCH v8 04/46] x86, mm: Move init_memory_mapping calling out of setup.c

2012-11-16 Thread Yinghai Lu
Now init_memory_mapping is called two times; later it will be called for every
ram range.

Put all the related init_mem calls together and move them out of setup.c.

Actually, it reverts commit 1e7
x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct 
mapping.
We will address that later with a complete solution that includes handling holes under 4g.

Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/include/asm/init.h|1 -
 arch/x86/include/asm/pgtable.h |2 +-
 arch/x86/kernel/setup.c|   27 +--
 arch/x86/mm/init.c |   19 ++-
 4 files changed, 20 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index adcc0ae..4f13998 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -12,7 +12,6 @@ kernel_physical_mapping_init(unsigned long start,
 unsigned long end,
 unsigned long page_size_mask);
 
-
 extern unsigned long __initdata pgt_buf_start;
 extern unsigned long __meminitdata pgt_buf_end;
 extern unsigned long __meminitdata pgt_buf_top;
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 98ac76d..dd1a888 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -602,7 +602,7 @@ static inline int pgd_none(pgd_t pgd)
 #ifndef __ASSEMBLY__
 
 extern int direct_gbpages;
-void probe_page_size_mask(void);
+void init_mem_mapping(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 01fb5f9..23b079f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -913,34 +913,9 @@ void __init setup_arch(char **cmdline_p)
setup_real_mode();
 
init_gbpages();
-   probe_page_size_mask();
 
-   /* max_pfn_mapped is updated here */
-   max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) {
-   int i;
-   unsigned long start, end;
-   unsigned long start_pfn, end_pfn;
-
-   for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn,
-NULL) {
-
-   end = PFN_PHYS(end_pfn);
-   if (end <= (1UL<<32))
-   continue;
-
-   start = PFN_PHYS(start_pfn);
-   max_pfn_mapped = init_memory_mapping(
-   max((1UL<<32), start), end);
-   }
-
-   /* can we preseve max_low_pfn ?*/
-   max_low_pfn = max_pfn;
-   }
-#endif
memblock.current_limit = get_max_mapped();
dma_contiguous_reserve(0);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 701abbc..9e17f9e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -37,7 +37,7 @@ struct map_range {
 
 static int page_size_mask;
 
-void probe_page_size_mask(void)
+static void __init probe_page_size_mask(void)
 {
 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
/*
@@ -315,6 +315,23 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
return ret >> PAGE_SHIFT;
 }
 
+void __init init_mem_mapping(void)
+{
+   probe_page_size_mask();
+
+   /* max_pfn_mapped is updated here */
+   max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< max_low_pfn) {
+   max_pfn_mapped = init_memory_mapping(1UL<<32,
+max_pfn

[PATCH v8 11/46] x86, mm: Fixup code testing if a pfn is direct mapped

2012-11-16 Thread Yinghai Lu
From: Jacob Shin 

Update code that previously assumed pfns [ 0 - max_low_pfn_mapped ) and
[ 4GB - max_pfn_mapped ) were always direct mapped, to now look up
pfn_mapped ranges instead.

-v2: change applying sequence to keep git bisecting working.
 so add dummy pfn_range_is_mapped(). - Yinghai Lu

Signed-off-by: Jacob Shin 
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/page_types.h |8 
 arch/x86/kernel/cpu/amd.c |8 +++-
 arch/x86/platform/efi/efi.c   |7 +++
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index e21fdd1..45aae6e 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,6 +51,14 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
+static inline bool pfn_range_is_mapped(unsigned long start_pfn,
+   unsigned long end_pfn)
+{
+   return end_pfn <= max_low_pfn_mapped ||
+  (end_pfn > (1UL << (32 - PAGE_SHIFT)) &&
+   end_pfn <= max_pfn_mapped);
+}
+
 extern unsigned long init_memory_mapping(unsigned long start,
 unsigned long end);
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f7e98a2..9619ba6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -676,12 +676,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 * benefit in doing so.
 */
if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, )) {
+   unsigned long pfn = tseg >> PAGE_SHIFT;
+
printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-   if ((tseg>>PMD_SHIFT) <
-   (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-   ((tseg>>PMD_SHIFT) <
-   (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-   (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT
+   if (pfn_range_is_mapped(pfn, pfn + 1))
set_memory_4k((unsigned long)__va(tseg), 1);
}
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index ad44391..36e53f0 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -835,7 +835,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md, *prev_md = NULL;
efi_status_t status;
unsigned long size;
-   u64 end, systab, end_pfn;
+   u64 end, systab, start_pfn, end_pfn;
void *p, *va, *new_memmap = NULL;
int count = 0;
 
@@ -888,10 +888,9 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
 
+   start_pfn = PFN_DOWN(md->phys_addr);
end_pfn = PFN_UP(end);
-   if (end_pfn <= max_low_pfn_mapped
-   || (end_pfn > (1UL << (32 - PAGE_SHIFT))
-   && end_pfn <= max_pfn_mapped)) {
+   if (pfn_range_is_mapped(start_pfn, end_pfn)) {
va = __va(md->phys_addr);
 
if (!(md->attribute & EFI_MEMORY_WB))
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 15/46] x86, mm: Only direct map addresses that are marked as E820_RAM

2012-11-16 Thread Yinghai Lu
From: Jacob Shin 

Currently direct mappings are created for [ 0 to max_low_pfn<
Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/include/asm/page_types.h |8 +--
 arch/x86/kernel/setup.c   |8 ++-
 arch/x86/mm/init.c|  120 +
 arch/x86/mm/init_64.c |6 +-
 4 files changed, 117 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index 45aae6e..54c9787 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,13 +51,7 @@ static inline phys_addr_t get_max_mapped(void)
return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
 }
 
-static inline bool pfn_range_is_mapped(unsigned long start_pfn,
-   unsigned long end_pfn)
-{
-   return end_pfn <= max_low_pfn_mapped ||
-  (end_pfn > (1UL << (32 - PAGE_SHIFT)) &&
-   end_pfn <= max_pfn_mapped);
-}
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
 extern unsigned long init_memory_mapping(unsigned long start,
 unsigned long end);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bd52f9d..68dffec 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -116,9 +116,11 @@
 #include 
 
 /*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
+ * max_low_pfn_mapped: highest direct mapped pfn under 4GB
+ * max_pfn_mapped: highest direct mapped pfn over 4GB
+ *
+ * The direct mapping only covers E820_RAM regions, so the ranges and gaps are
+ * represented by pfn_mapped
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7b961d0..bb44e9f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -243,6 +243,38 @@ static unsigned long __init 
calculate_table_space_size(unsigned long start, unsi
return tables;
 }
 
+static unsigned long __init calculate_all_table_space_size(void)
+{
+   unsigned long start_pfn, end_pfn;
+   unsigned long tables;
+   int i;
+
+   /* the ISA range is always mapped regardless of memory holes */
+   tables = calculate_table_space_size(0, ISA_END_ADDRESS);
+
+   for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn, NULL) {
+   u64 start = start_pfn << PAGE_SHIFT;
+   u64 end = end_pfn << PAGE_SHIFT;
+
+   if (end <= ISA_END_ADDRESS)
+   continue;
+
+   if (start < ISA_END_ADDRESS)
+   start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+   /* on 32 bit, we only map up to max_low_pfn */
+   if ((start >> PAGE_SHIFT) >= max_low_pfn)
+   continue;
+
+   if ((end >> PAGE_SHIFT) > max_low_pfn)
+   end = max_low_pfn << PAGE_SHIFT;
+#endif
+   tables += calculate_table_space_size(start, end);
+   }
+
+   return tables;
+}
+
 static void __init find_early_table_space(unsigned long start,
  unsigned long good_end,
  unsigned long tables)
@@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long 
start,
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 }
 
+static struct range pfn_mapped[E820_X_MAX];
+static int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long 
end_pfn)
+{
+   nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+nr_pfn_mapped, start_pfn, end_pfn);
+   nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+   max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+   if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+   max_low_pfn_mapped = max(max_low_pfn_mapped,
+min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+   int i;
+
+   for (i = 0; i < nr_pfn_mapped; i++)
+   if ((start_pfn >= pfn_mapped[i].start) &&
+   (end_pfn <= pfn_mapped[i].end))
+   return true;
+
+   return false;
+}
+
 /*
  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  * This runs before bootmem is initialized and gets pages directly from
@@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 
__flush_tlb_all();
 
+   add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
return ret >> PAGE_SHIFT;
 }
 
+/*
+ * Iterate through E820 memory map and create direct mappings for only 

[PATCH v8 21/46] x86, mm: setup page table in top-down

2012-11-16 Thread Yinghai Lu
Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first.
Then use mapped pages to map more ranges below, and keep looping until
all pages get mapped.

alloc_low_page will use pages from BRK at first; after that buffer is used
up, it will use memblock to find and reserve pages for page table usage.

Introduce min_pfn_mapped to make sure we find new pages from already-mapped
ranges; it is updated as lower pages get mapped.

Also add step_size to make sure we don't try to map too big a range with the
limited mapped pages available initially, and increase step_size as more
mapped pages become available.

We don't need to call pagetable_reserve anymore, reserve work is done
in alloc_low_page() directly.

At last we can get rid of calculation and find early pgt related code.

-v2: update to after fix_xen change,
 also use MACRO for initial pgt_buf size and add comments with it.
-v3: skip big reserved range in memblock.reserved near end.
-v4: don't need fix_xen change now.
-v5: add changelog about moving about reserving pagetable to alloc_low_page.

Suggested-by: "H. Peter Anvin" 
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/page_types.h |1 +
 arch/x86/include/asm/pgtable.h|1 +
 arch/x86/kernel/setup.c   |3 +
 arch/x86/mm/init.c|  210 +++--
 arch/x86/mm/init_32.c |   17 +++-
 arch/x86/mm/init_64.c |   17 +++-
 6 files changed, 94 insertions(+), 155 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index 54c9787..9f6f3e6 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
+extern unsigned long min_pfn_mapped;
 
 static inline phys_addr_t get_max_mapped(void)
 {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index dd1a888..6991a3e 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
 
 extern int direct_gbpages;
 void init_mem_mapping(void);
+void early_alloc_pgt_buf(void);
 
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 94f922a..f7634092 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,6 +124,7 @@
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
+unsigned long min_pfn_mapped;
 
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
@@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p)
 
reserve_ibft_region();
 
+   early_alloc_pgt_buf();
+
/*
 * Need to conclude brk, before memblock_x86_fill()
 *  it could use memblock_find_in_range, could overlap with
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c688ea3..2393d00 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
+#define INIT_PGT_BUF_SIZE  (5 * PAGE_SIZE)
+RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
+void  __init early_alloc_pgt_buf(void)
+{
+   unsigned long tables = INIT_PGT_BUF_SIZE;
+   phys_addr_t base;
+
+   base = __pa(extend_brk(tables, PAGE_SIZE));
+
+   pgt_buf_start = base >> PAGE_SHIFT;
+   pgt_buf_end = pgt_buf_start;
+   pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
+}
+
 int after_bootmem;
 
 int direct_gbpages
@@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
return nr_range;
 }
 
-/*
- * First calculate space needed for kernel direct mapping page tables to cover
- * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
- * pages. Then find enough contiguous space for those page tables.
- */
-static unsigned long __init calculate_table_space_size(unsigned long start, 
unsigned long end)
-{
-   int i;
-   unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-   struct map_range mr[NR_RANGE_MR];
-   int nr_range;
-
-   memset(mr, 0, sizeof(mr));
-   nr_range = 0;
-   nr_range = split_mem_range(mr, nr_range, start, end);
-
-   for (i = 0; i < nr_range; i++) {
-   unsigned long range, extra;
-
-   range = mr[i].end - mr[i].start;
-   puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
-
-   if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
-   extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
-   pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-   } else {
-   pmds += (range + PMD_SIZE - 

[PATCH v8 23/46] x86, mm: Remove parameter in alloc_low_page for 64bit

2012-11-16 Thread Yinghai Lu
Now all page table buf are pre-mapped, and could use virtual address directly.
So don't need to remember physical address anymore.

Remove that phys pointer in alloc_low_page(), and that will allow us to merge
alloc_low_page between 64bit and 32bit.

Signed-off-by: Yinghai Lu 
Acked-by: Stefano Stabellini 
---
 arch/x86/mm/init_64.c |   19 +++
 1 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5ee9242..1960820 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -314,14 +314,13 @@ void __init cleanup_highmap(void)
}
 }
 
-static __ref void *alloc_low_page(unsigned long *phys)
+static __ref void *alloc_low_page(void)
 {
unsigned long pfn;
void *adr;
 
if (after_bootmem) {
adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
-   *phys = __pa(adr);
 
return adr;
}
@@ -342,7 +341,6 @@ static __ref void *alloc_low_page(unsigned long *phys)
 
adr = __va(pfn * PAGE_SIZE);
clear_page(adr);
-   *phys  = pfn * PAGE_SIZE;
return adr;
 }
 
@@ -401,7 +399,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
int i = pmd_index(address);
 
for (; i < PTRS_PER_PMD; i++, address = next) {
-   unsigned long pte_phys;
pmd_t *pmd = pmd_page + pmd_index(address);
pte_t *pte;
pgprot_t new_prot = prot;
@@ -456,11 +453,11 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
continue;
}
 
-   pte = alloc_low_page(_phys);
+   pte = alloc_low_page();
last_map_addr = phys_pte_init(pte, address, end, new_prot);
 
spin_lock(_mm.page_table_lock);
-   pmd_populate_kernel(_mm, pmd, __va(pte_phys));
+   pmd_populate_kernel(_mm, pmd, pte);
spin_unlock(_mm.page_table_lock);
}
update_page_count(PG_LEVEL_2M, pages);
@@ -476,7 +473,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned 
long end,
int i = pud_index(addr);
 
for (; i < PTRS_PER_PUD; i++, addr = next) {
-   unsigned long pmd_phys;
pud_t *pud = pud_page + pud_index(addr);
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
@@ -530,12 +526,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, 
unsigned long end,
continue;
}
 
-   pmd = alloc_low_page(_phys);
+   pmd = alloc_low_page();
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
  prot);
 
spin_lock(_mm.page_table_lock);
-   pud_populate(_mm, pud, __va(pmd_phys));
+   pud_populate(_mm, pud, pmd);
spin_unlock(_mm.page_table_lock);
}
__flush_tlb_all();
@@ -560,7 +556,6 @@ kernel_physical_mapping_init(unsigned long start,
 
for (; start < end; start = next) {
pgd_t *pgd = pgd_offset_k(start);
-   unsigned long pud_phys;
pud_t *pud;
 
next = (start + PGDIR_SIZE) & PGDIR_MASK;
@@ -574,12 +569,12 @@ kernel_physical_mapping_init(unsigned long start,
continue;
}
 
-   pud = alloc_low_page(_phys);
+   pud = alloc_low_page();
last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 page_size_mask);
 
spin_lock(_mm.page_table_lock);
-   pgd_populate(_mm, pgd, __va(pud_phys));
+   pgd_populate(_mm, pgd, pud);
spin_unlock(_mm.page_table_lock);
pgd_changed = true;
}
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 25/46] x86, mm: Move min_pfn_mapped back to mm/init.c

2012-11-16 Thread Yinghai Lu
Also change it to static.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/page_types.h |1 -
 arch/x86/kernel/setup.c   |1 -
 arch/x86/mm/init.c|2 ++
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index 9f6f3e6..54c9787 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -45,7 +45,6 @@ extern int devmem_is_allowed(unsigned long pagenr);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
-extern unsigned long min_pfn_mapped;
 
 static inline phys_addr_t get_max_mapped(void)
 {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index f7634092..2015194 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -124,7 +124,6 @@
  */
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
-unsigned long min_pfn_mapped;
 
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8481892..6392bf9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -23,6 +23,8 @@ unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+static unsigned long min_pfn_mapped;
+
 __ref void *alloc_low_page(void)
 {
unsigned long pfn;
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 22/46] x86, mm: Remove early_memremap workaround for page table accessing on 64bit

2012-11-16 Thread Yinghai Lu
We try to put the page table high to make room for kdump; at that time
those ranges are not mapped yet, so we have to use ioremap to access them.

Now, after the patch that pre-maps the page table top-down,
x86, mm: setup page table in top-down
we do not need that workaround anymore.

Just use __va to return directly mapping address.

Signed-off-by: Yinghai Lu 
Acked-by: Stefano Stabellini 
---
 arch/x86/mm/init_64.c |   38 --
 1 files changed, 4 insertions(+), 34 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index eefaea6..5ee9242 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -340,36 +340,12 @@ static __ref void *alloc_low_page(unsigned long *phys)
} else
pfn = pgt_buf_end++;
 
-   adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
+   adr = __va(pfn * PAGE_SIZE);
clear_page(adr);
*phys  = pfn * PAGE_SIZE;
return adr;
 }
 
-static __ref void *map_low_page(void *virt)
-{
-   void *adr;
-   unsigned long phys, left;
-
-   if (after_bootmem)
-   return virt;
-
-   phys = __pa(virt);
-   left = phys & (PAGE_SIZE - 1);
-   adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
-   adr = (void *)(((unsigned long)adr) | left);
-
-   return adr;
-}
-
-static __ref void unmap_low_page(void *adr)
-{
-   if (after_bootmem)
-   return;
-
-   early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
-}
-
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
  pgprot_t prot)
@@ -442,10 +418,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(_mm.page_table_lock);
-   pte = map_low_page((pte_t 
*)pmd_page_vaddr(*pmd));
+   pte = (pte_t *)pmd_page_vaddr(*pmd);
last_map_addr = phys_pte_init(pte, address,
end, prot);
-   unmap_low_page(pte);
spin_unlock(_mm.page_table_lock);
continue;
}
@@ -483,7 +458,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
 
pte = alloc_low_page(_phys);
last_map_addr = phys_pte_init(pte, address, end, new_prot);
-   unmap_low_page(pte);
 
spin_lock(_mm.page_table_lock);
pmd_populate_kernel(_mm, pmd, __va(pte_phys));
@@ -518,10 +492,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, 
unsigned long end,
 
if (pud_val(*pud)) {
if (!pud_large(*pud)) {
-   pmd = map_low_page(pmd_offset(pud, 0));
+   pmd = pmd_offset(pud, 0);
last_map_addr = phys_pmd_init(pmd, addr, end,
 page_size_mask, prot);
-   unmap_low_page(pmd);
__flush_tlb_all();
continue;
}
@@ -560,7 +533,6 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned 
long end,
pmd = alloc_low_page(_phys);
last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
  prot);
-   unmap_low_page(pmd);
 
spin_lock(_mm.page_table_lock);
pud_populate(_mm, pud, __va(pmd_phys));
@@ -596,17 +568,15 @@ kernel_physical_mapping_init(unsigned long start,
next = end;
 
if (pgd_val(*pgd)) {
-   pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+   pud = (pud_t *)pgd_page_vaddr(*pgd);
last_map_addr = phys_pud_init(pud, __pa(start),
 __pa(end), page_size_mask);
-   unmap_low_page(pud);
continue;
}
 
pud = alloc_low_page(_phys);
last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
 page_size_mask);
-   unmap_low_page(pud);
 
spin_lock(_mm.page_table_lock);
pgd_populate(_mm, pgd, __va(pud_phys));
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 20/46] x86, mm: Break down init_all_memory_mapping

2012-11-16 Thread Yinghai Lu
Will replace that with top-down page table initialization.
The new API needs to take a range: init_range_memory_mapping()

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   41 +++--
 1 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index da591eb..c688ea3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -398,40 +398,30 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
  * Depending on the alignment of E820 ranges, this may possibly result in using
  * smaller size (i.e. 4K instead of 2M or 1G) page tables.
  */
-static void __init init_all_memory_mapping(void)
+static void __init init_range_memory_mapping(unsigned long range_start,
+  unsigned long range_end)
 {
unsigned long start_pfn, end_pfn;
int i;
 
-   /* the ISA range is always mapped regardless of memory holes */
-   init_memory_mapping(0, ISA_END_ADDRESS);
-
for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn, NULL) {
u64 start = (u64)start_pfn << PAGE_SHIFT;
u64 end = (u64)end_pfn << PAGE_SHIFT;
 
-   if (end <= ISA_END_ADDRESS)
+   if (end <= range_start)
continue;
 
-   if (start < ISA_END_ADDRESS)
-   start = ISA_END_ADDRESS;
-#ifdef CONFIG_X86_32
-   /* on 32 bit, we only map up to max_low_pfn */
-   if ((start >> PAGE_SHIFT) >= max_low_pfn)
+   if (start < range_start)
+   start = range_start;
+
+   if (start >= range_end)
continue;
 
-   if ((end >> PAGE_SHIFT) > max_low_pfn)
-   end = max_low_pfn << PAGE_SHIFT;
-#endif
-   init_memory_mapping(start, end);
-   }
+   if (end > range_end)
+   end = range_end;
 
-#ifdef CONFIG_X86_64
-   if (max_pfn > max_low_pfn) {
-   /* can we preseve max_low_pfn ?*/
-   max_low_pfn = max_pfn;
+   init_memory_mapping(start, end);
}
-#endif
 }
 
 void __init init_mem_mapping(void)
@@ -461,8 +451,15 @@ void __init init_mem_mapping(void)
(pgt_buf_top << PAGE_SHIFT) - 1);
 
max_pfn_mapped = 0; /* will get exact value next */
-   init_all_memory_mapping();
-
+   /* the ISA range is always mapped regardless of memory holes */
+   init_memory_mapping(0, ISA_END_ADDRESS);
+   init_range_memory_mapping(ISA_END_ADDRESS, end);
+#ifdef CONFIG_X86_64
+   if (max_pfn > max_low_pfn) {
+   /* can we preseve max_low_pfn ?*/
+   max_low_pfn = max_pfn;
+   }
+#endif
/*
 * Reserve the kernel pagetable pages we used (pgt_buf_start -
 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 26/46] x86, mm, Xen: Remove mapping_pagetable_reserve()

2012-11-16 Thread Yinghai Lu
Page table area are pre-mapped now after
x86, mm: setup page table in top-down
x86, mm: Remove early_memremap workaround for page table accessing on 
64bit

mapping_pagetable_reserve is not used anymore, so remove it.

Also remove the operation in mask_rw_pte(), as the modified
alloc_low_page() always returns pages that are already mapped; moreover,
xen_alloc_pte_init, xen_alloc_pmd_init, etc. will mark the page RO
before hooking it into the pagetable automatically.

-v2: add changelog about mask_rw_pte() from Stefano.

Signed-off-by: Yinghai Lu 
Cc: Stefano Stabellini 
---
 arch/x86/include/asm/pgtable_types.h |1 -
 arch/x86/include/asm/x86_init.h  |   12 
 arch/x86/kernel/x86_init.c   |4 
 arch/x86/mm/init.c   |4 
 arch/x86/xen/mmu.c   |   28 
 5 files changed, 0 insertions(+), 49 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc..79738f2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -301,7 +301,6 @@ int phys_mem_access_prot_allowed(struct file *file, 
unsigned long pfn,
 /* Install a pte for a particular vaddr in kernel space. */
 void set_pte_vaddr(unsigned long vaddr, pte_t pte);
 
-extern void native_pagetable_reserve(u64 start, u64 end);
 #ifdef CONFIG_X86_32
 extern void native_pagetable_init(void);
 #else
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 5769349..3b2ce8f 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -69,17 +69,6 @@ struct x86_init_oem {
 };
 
 /**
- * struct x86_init_mapping - platform specific initial kernel pagetable setup
- * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
- *
- * For more details on the purpose of this hook, look in
- * init_memory_mapping and the commit that added it.
- */
-struct x86_init_mapping {
-   void (*pagetable_reserve)(u64 start, u64 end);
-};
-
-/**
  * struct x86_init_paging - platform specific paging functions
  * @pagetable_init:platform specific paging initialization call to setup
  * the kernel pagetables and prepare accessors functions.
@@ -136,7 +125,6 @@ struct x86_init_ops {
struct x86_init_mpparse mpparse;
struct x86_init_irqsirqs;
struct x86_init_oem oem;
-   struct x86_init_mapping mapping;
struct x86_init_paging  paging;
struct x86_init_timers  timers;
struct x86_init_iommu   iommu;
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075..50cf83e 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -62,10 +62,6 @@ struct x86_init_ops x86_init __initdata = {
.banner = default_banner,
},
 
-   .mapping = {
-   .pagetable_reserve  = native_pagetable_reserve,
-   },
-
.paging = {
.pagetable_init = native_pagetable_init,
},
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6392bf9..21173fc 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -112,10 +112,6 @@ static void __init probe_page_size_mask(void)
__supported_pte_mask |= _PAGE_GLOBAL;
}
 }
-void __init native_pagetable_reserve(u64 start, u64 end)
-{
-   memblock_reserve(start, end - start);
-}
 
 #ifdef CONFIG_X86_32
 #define NR_RANGE_MR 3
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dcf5f2d..bbb883f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1178,20 +1178,6 @@ static void xen_exit_mmap(struct mm_struct *mm)
 
 static void xen_post_allocator_init(void);
 
-static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
-{
-   /* reserve the range used */
-   native_pagetable_reserve(start, end);
-
-   /* set as RW the rest */
-   printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
-   PFN_PHYS(pgt_buf_top));
-   while (end < PFN_PHYS(pgt_buf_top)) {
-   make_lowmem_page_readwrite(__va(end));
-   end += PAGE_SIZE;
-   }
-}
-
 #ifdef CONFIG_X86_64
 static void __init xen_cleanhighmap(unsigned long vaddr,
unsigned long vaddr_end)
@@ -1503,19 +1489,6 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 #else /* CONFIG_X86_64 */
 static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-   unsigned long pfn = pte_pfn(pte);
-
-   /*
-* If the new pfn is within the range of the newly allocated
-* kernel pagetable, and it isn't being mapped into an
-* early_ioremap fixmap slot as a freshly allocated page, make sure
-* it is RO.
-*/
-   if (((!is_early_ioremap_ptep(ptep) &&
-   pfn >= 

[PATCH v8 29/46] x86, mm: only call early_ioremap_page_table_range_init() once

2012-11-16 Thread Yinghai Lu
On 32bit, before the patchset that only sets page tables for RAM, we
only called it one time.

Now, we are calling that during every init_memory_mapping if we have holes
under max_low_pfn.

We should only call it one time, after all ranges under max_low_pfn get
mapped, just like we did before.

Also that could avoid the risk to run out of pgt_buf in BRK.

Need to update page_table_range_init() to count the pages for kmap page table
at first, and use new added alloc_low_pages() to get pages in sequence.
That will conform to the requirement that pages need to be in low to high order.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c|   13 +
 arch/x86/mm/init_32.c |   47 +--
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index cb4f8ba..bed4888 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -343,14 +343,6 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
   mr[i].page_size_mask);
 
-#ifdef CONFIG_X86_32
-   early_ioremap_page_table_range_init();
-
-   load_cr3(swapper_pg_dir);
-#endif
-
-   __flush_tlb_all();
-
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
return ret >> PAGE_SHIFT;
@@ -447,7 +439,12 @@ void __init init_mem_mapping(void)
/* can we preseve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
+#else
+   early_ioremap_page_table_range_init();
+   load_cr3(swapper_pg_dir);
+   __flush_tlb_all();
 #endif
+
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
 }
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index a7f2df1..0ae1ba8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -135,8 +135,39 @@ pte_t * __init populate_extra_pte(unsigned long vaddr)
return one_page_table_init(pmd) + pte_idx;
 }
 
+static unsigned long __init
+page_table_range_init_count(unsigned long start, unsigned long end)
+{
+   unsigned long count = 0;
+#ifdef CONFIG_HIGHMEM
+   int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
+   int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
+   int pgd_idx, pmd_idx;
+   unsigned long vaddr;
+
+   if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
+   return 0;
+
+   vaddr = start;
+   pgd_idx = pgd_index(vaddr);
+
+   for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
+   for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
+   pmd_idx++) {
+   if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
+   (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
+   count++;
+   vaddr += PMD_SIZE;
+   }
+   pmd_idx = 0;
+   }
+#endif
+   return count;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
-  unsigned long vaddr, pte_t *lastpte)
+  unsigned long vaddr, pte_t *lastpte,
+  void **adr)
 {
 #ifdef CONFIG_HIGHMEM
/*
@@ -150,16 +181,15 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, 
pmd_t *pmd,
 
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
-   && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-   && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
-   || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
+   && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
 
BUG_ON(after_bootmem);
-   newpte = alloc_low_page();
+   newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
+   *adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
 
paravirt_alloc_pte(_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
@@ -193,6 +223,11 @@ page_table_range_init(unsigned long start, unsigned long 
end, pgd_t *pgd_base)
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
+   unsigned long count = page_table_range_init_count(start, end);
+   void *adr = NULL;
+
+   if (count)
+   adr = alloc_low_pages(count);
 
vaddr = start;
pgd_idx = pgd_index(vaddr);
@@ -205,7 +240,7 @@ page_table_range_init(unsigned long start, unsigned long 
end, pgd_t *pgd_base)
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = 

[PATCH v8 17/46] x86, mm: Align start address to correct big page size

2012-11-16 Thread Yinghai Lu
We are going to use a buffer in BRK to map a small range just under the
memory top, and use that newly mapped RAM to map the RAM range under it.

The ram range that will be mapped at first could be only page aligned,
but ranges around it are ram too, we could use bigger page to map it to
avoid small page size.

We will adjust page_size_mask in following patch:
x86, mm: Use big page size for small memory range
to use big page size for small ram range.

Before that patch, this patch makes sure the start address is
aligned down according to the bigger page size; otherwise the page
table entry will not have the correct value.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_32.c |1 +
 arch/x86/mm/init_64.c |5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 11a5800..27f7fc6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -310,6 +310,7 @@ repeat:
__pgprot(PTE_IDENT_ATTR |
 _PAGE_PSE);
 
+   pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 32c7e38..869372a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -464,7 +464,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
pages++;
spin_lock(_mm.page_table_lock);
set_pte((pte_t *)pmd,
-   pfn_pte(address >> PAGE_SHIFT,
+   pfn_pte((address & PMD_MASK) >> PAGE_SHIFT,
__pgprot(pgprot_val(prot) | 
_PAGE_PSE)));
spin_unlock(_mm.page_table_lock);
last_map_addr = next;
@@ -541,7 +541,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned 
long end,
pages++;
spin_lock(_mm.page_table_lock);
set_pte((pte_t *)pud,
-   pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+   pfn_pte((addr & PUD_MASK) >> PAGE_SHIFT,
+   PAGE_KERNEL_LARGE));
spin_unlock(_mm.page_table_lock);
last_map_addr = next;
continue;
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 09/46] x86, mm: Set memblock initial limit to 1M

2012-11-16 Thread Yinghai Lu
memblock_x86_fill() could double the memory array.
If we set memblock.current_limit to 512M, the memory array could be
around 512M, so kdump will not get a big range (like 512M) under 1024M.

Try to put it down under 1M, it would use about 4k or so, and that is limited.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 23b079f..4bd8921 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -890,7 +890,7 @@ void __init setup_arch(char **cmdline_p)
 
cleanup_highmap();
 
-   memblock.current_limit = get_max_mapped();
+   memblock.current_limit = ISA_END_ADDRESS;
memblock_x86_fill();
 
/*
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 14/46] x86, mm: use pfn_range_is_mapped() with reserve_initrd

2012-11-16 Thread Yinghai Lu
We are going to map RAM only, so being under max_low_pfn_mapped, or
between 4g and max_pfn_mapped, does not mean mapped at all.

Use pfn_range_is_mapped() to find out if range is mapped for initrd.

That could happen when the bootloader puts the initrd in that range but
the user carves some of the range out with memmap.

Also during copying need to use early_memmap to map original initrd
for accessing.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |   52 +-
 1 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index d85cbd9..bd52f9d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -317,20 +317,19 @@ static void __init relocate_initrd(void)
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
u64 area_size = PAGE_ALIGN(ramdisk_size);
-   u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
char *p, *q;
 
-   /* We need to move the initrd down into lowmem */
-   ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
-PAGE_SIZE);
+   /* We need to move the initrd down into directly mapped mem */
+   ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_low_pfn_mapped),
+area_size, PAGE_SIZE);
 
if (!ramdisk_here)
panic("Cannot find place for new RAMDISK of size %lld\n",
 ramdisk_size);
 
-   /* Note: this includes all the lowmem currently occupied by
+   /* Note: this includes all the mem currently occupied by
   the initrd, we rely on that fact to keep the data intact. */
memblock_reserve(ramdisk_here, area_size);
initrd_start = ramdisk_here + PAGE_OFFSET;
@@ -340,17 +339,7 @@ static void __init relocate_initrd(void)
 
q = (char *)initrd_start;
 
-   /* Copy any lowmem portion of the initrd */
-   if (ramdisk_image < end_of_lowmem) {
-   clen = end_of_lowmem - ramdisk_image;
-   p = (char *)__va(ramdisk_image);
-   memcpy(q, p, clen);
-   q += clen;
-   ramdisk_image += clen;
-   ramdisk_size  -= clen;
-   }
-
-   /* Copy the highmem portion of the initrd */
+   /* Copy the initrd */
while (ramdisk_size) {
slop = ramdisk_image & ~PAGE_MASK;
clen = ramdisk_size;
@@ -364,7 +353,7 @@ static void __init relocate_initrd(void)
ramdisk_image += clen;
ramdisk_size  -= clen;
}
-   /* high pages is not converted by early_res_to_bootmem */
+
ramdisk_image = boot_params.hdr.ramdisk_image;
ramdisk_size  = boot_params.hdr.ramdisk_size;
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
@@ -373,13 +362,27 @@ static void __init relocate_initrd(void)
ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
 
+static u64 __init get_mem_size(unsigned long limit_pfn)
+{
+   int i;
+   u64 mapped_pages = 0;
+   unsigned long start_pfn, end_pfn;
+
+   for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn, NULL) {
+   start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
+   end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
+   mapped_pages += end_pfn - start_pfn;
+   }
+
+   return mapped_pages << PAGE_SHIFT;
+}
 static void __init reserve_initrd(void)
 {
/* Assume only end is not page aligned */
u64 ramdisk_image = boot_params.hdr.ramdisk_image;
u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-   u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
+   u64 mapped_size;
 
if (!boot_params.hdr.type_of_loader ||
!ramdisk_image || !ramdisk_size)
@@ -387,18 +390,19 @@ static void __init reserve_initrd(void)
 
initrd_start = 0;
 
-   if (ramdisk_size >= (end_of_lowmem>>1)) {
+   mapped_size = get_mem_size(max_low_pfn_mapped);
+   if (ramdisk_size >= (mapped_size>>1))
panic("initrd too large to handle, "
   "disabling initrd (%lld needed, %lld available)\n",
-  ramdisk_size, end_of_lowmem>>1);
-   }
+  ramdisk_size, mapped_size>>1);
 
printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image,
ramdisk_end - 1);
 
-
-   if (ramdisk_end <= end_of_lowmem) {
-   /* All in lowmem, easy case */
+   if (ramdisk_end <= (max_low_pfn_mapped

[PATCH v8 27/46] x86, mm: Add alloc_low_pages(num)

2012-11-16 Thread Yinghai Lu
32bit kmap mapping needs pages to be used from low to high.
At this point those pages are still from pgt_buf_* from BRK, so it is
ok now.
But we want to move early_ioremap_page_table_range_init() out of
init_memory_mapping() and only call it one time later; that will
make page_table_range_init/page_table_kmap_check/alloc_low_page
use memblock to get pages.

memblock allocation for pages is from high to low.
So we would get a panic from page_table_kmap_check(), which has a
BUG_ON to check the ordering.

This patch adds alloc_low_pages() to make it possible to allocate several
pages at first, and hand out pages one by one from low to high.

Signed-off-by: Yinghai Lu 
Cc: Andrew Morton 
---
 arch/x86/mm/init.c|   33 +
 arch/x86/mm/mm_internal.h |6 +-
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 21173fc..02cea14 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -25,36 +25,45 @@ unsigned long __meminitdata pgt_buf_top;
 
 static unsigned long min_pfn_mapped;
 
-__ref void *alloc_low_page(void)
+__ref void *alloc_low_pages(unsigned int num)
 {
unsigned long pfn;
-   void *adr;
+   int i;
 
 #ifdef CONFIG_X86_64
if (after_bootmem) {
-   adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+   unsigned int order;
 
-   return adr;
+   order = get_order((unsigned long)num << PAGE_SHIFT);
+   return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
+   __GFP_ZERO, order);
}
 #endif
 
-   if ((pgt_buf_end + 1) >= pgt_buf_top) {
+   if ((pgt_buf_end + num) >= pgt_buf_top) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
panic("alloc_low_page: ran out of memory");
ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
-   PAGE_SIZE, PAGE_SIZE);
+   PAGE_SIZE * num , PAGE_SIZE);
if (!ret)
panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE);
+   memblock_reserve(ret, PAGE_SIZE * num);
pfn = ret >> PAGE_SHIFT;
-   } else
-   pfn = pgt_buf_end++;
+   } else {
+   pfn = pgt_buf_end;
+   pgt_buf_end += num;
+   }
+
+   for (i = 0; i < num; i++) {
+   void *adr;
+
+   adr = __va((pfn + i) << PAGE_SHIFT);
+   clear_page(adr);
+   }
 
-   adr = __va(pfn * PAGE_SIZE);
-   clear_page(adr);
-   return adr;
+   return __va(pfn << PAGE_SHIFT);
 }
 
 /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index b3f993a..7e3b88e 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -1,6 +1,10 @@
 #ifndef __X86_MM_INTERNAL_H
 #define __X86_MM_INTERNAL_H
 
-void *alloc_low_page(void);
+void *alloc_low_pages(unsigned int num);
+static inline void *alloc_low_page(void)
+{
+   return alloc_low_pages(1);
+}
 
 #endif /* __X86_MM_INTERNAL_H */
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 31/46] x86, mm: Move init_gbpages() out of setup.c

2012-11-16 Thread Yinghai Lu
Put it in mm/init.c, and call it from probe_page_mask().
init_mem_mapping is calling probe_page_mask at first.
So calling sequence is not changed.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |   15 +--
 arch/x86/mm/init.c  |   12 
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2015194..85b62f1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -282,18 +282,7 @@ void * __init extend_brk(size_t size, size_t align)
return ret;
 }
 
-#ifdef CONFIG_X86_64
-static void __init init_gbpages(void)
-{
-   if (direct_gbpages && cpu_has_gbpages)
-   printk(KERN_INFO "Using GB pages for direct mapping\n");
-   else
-   direct_gbpages = 0;
-}
-#else
-static inline void init_gbpages(void)
-{
-}
+#ifdef CONFIG_X86_32
 static void __init cleanup_highmap(void)
 {
 }
@@ -933,8 +922,6 @@ void __init setup_arch(char **cmdline_p)
 
setup_real_mode();
 
-   init_gbpages();
-
init_mem_mapping();
 
memblock.current_limit = get_max_mapped();
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3cadf10..8168bf8 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -98,6 +98,16 @@ int direct_gbpages
 #endif
 ;
 
+static void __init init_gbpages(void)
+{
+#ifdef CONFIG_X86_64
+   if (direct_gbpages && cpu_has_gbpages)
+   printk(KERN_INFO "Using GB pages for direct mapping\n");
+   else
+   direct_gbpages = 0;
+#endif
+}
+
 struct map_range {
unsigned long start;
unsigned long end;
@@ -108,6 +118,8 @@ static int page_size_mask;
 
 static void __init probe_page_size_mask(void)
 {
+   init_gbpages();
+
 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
/*
 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 24/46] x86, mm: Merge alloc_low_page between 64bit and 32bit

2012-11-16 Thread Yinghai Lu
They are almost the same, except that 64 bit needs to handle the after_bootmem case.

Add mm_internal.h to make that alloc_low_page() only to be accessible
from arch/x86/mm/init*.c

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c|   34 ++
 arch/x86/mm/init_32.c |   26 ++
 arch/x86/mm/init_64.c |   32 ++--
 arch/x86/mm/mm_internal.h |6 ++
 4 files changed, 44 insertions(+), 54 deletions(-)
 create mode 100644 arch/x86/mm/mm_internal.h

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2393d00..8481892 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -17,10 +17,44 @@
 #include 
 #include/* for MAX_DMA_PFN */
 
+#include "mm_internal.h"
+
 unsigned long __initdata pgt_buf_start;
 unsigned long __meminitdata pgt_buf_end;
 unsigned long __meminitdata pgt_buf_top;
 
+__ref void *alloc_low_page(void)
+{
+   unsigned long pfn;
+   void *adr;
+
+#ifdef CONFIG_X86_64
+   if (after_bootmem) {
+   adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+
+   return adr;
+   }
+#endif
+
+   if ((pgt_buf_end + 1) >= pgt_buf_top) {
+   unsigned long ret;
+   if (min_pfn_mapped >= max_pfn_mapped)
+   panic("alloc_low_page: ran out of memory");
+   ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+   max_pfn_mapped << PAGE_SHIFT,
+   PAGE_SIZE, PAGE_SIZE);
+   if (!ret)
+   panic("alloc_low_page: can not alloc memory");
+   memblock_reserve(ret, PAGE_SIZE);
+   pfn = ret >> PAGE_SHIFT;
+   } else
+   pfn = pgt_buf_end++;
+
+   adr = __va(pfn * PAGE_SIZE);
+   clear_page(adr);
+   return adr;
+}
+
 /* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
 #define INIT_PGT_BUF_SIZE  (5 * PAGE_SIZE)
 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 7bb1106..a7f2df1 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -53,36 +53,14 @@
 #include 
 #include 
 
+#include "mm_internal.h"
+
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
 bool __read_mostly __vmalloc_start_set = false;
 
-static __init void *alloc_low_page(void)
-{
-   unsigned long pfn;
-   void *adr;
-
-   if ((pgt_buf_end + 1) >= pgt_buf_top) {
-   unsigned long ret;
-   if (min_pfn_mapped >= max_pfn_mapped)
-   panic("alloc_low_page: ran out of memory");
-   ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
-   max_pfn_mapped << PAGE_SHIFT,
-   PAGE_SIZE, PAGE_SIZE);
-   if (!ret)
-   panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE);
-   pfn = ret >> PAGE_SHIFT;
-   } else
-   pfn = pgt_buf_end++;
-
-   adr = __va(pfn * PAGE_SIZE);
-   clear_page(adr);
-   return adr;
-}
-
 /*
  * Creates a middle page table and puts a pointer to it in the
  * given global directory entry. This only returns the gd entry
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1960820..1d53def 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -54,6 +54,8 @@
 #include 
 #include 
 
+#include "mm_internal.h"
+
 static int __init parse_direct_gbpages_off(char *arg)
 {
direct_gbpages = 0;
@@ -314,36 +316,6 @@ void __init cleanup_highmap(void)
}
 }
 
-static __ref void *alloc_low_page(void)
-{
-   unsigned long pfn;
-   void *adr;
-
-   if (after_bootmem) {
-   adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
-
-   return adr;
-   }
-
-   if ((pgt_buf_end + 1) >= pgt_buf_top) {
-   unsigned long ret;
-   if (min_pfn_mapped >= max_pfn_mapped)
-   panic("alloc_low_page: ran out of memory");
-   ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
-   max_pfn_mapped << PAGE_SHIFT,
-   PAGE_SIZE, PAGE_SIZE);
-   if (!ret)
-   panic("alloc_low_page: can not alloc memory");
-   memblock_reserve(ret, PAGE_SIZE);
-   pfn = ret >> PAGE_SHIFT;
-   } else
-   pfn = pgt_buf_end++;
-
-   adr = __va(pfn * PAGE_SIZE);
-   clear_page(adr);
-   return adr;
-}
-
 static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
  pgprot_t prot)
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
new file mode 100644
index 000..b3f993a
--- /dev/null

[PATCH v8 28/46] x86, mm: Add pointer about Xen mmu requirement for alloc_low_pages

2012-11-16 Thread Yinghai Lu
From: Stefano Stabellini 

Add link for more information
279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve

-v2: updated per comments from hpa to include the commit name.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 02cea14..cb4f8ba 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -25,6 +25,15 @@ unsigned long __meminitdata pgt_buf_top;
 
 static unsigned long min_pfn_mapped;
 
+/*
+ * Pages returned are already directly mapped.
+ *
+ * Changing that is likely to break Xen, see commit:
+ *
+ *279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
+ *
+ * for detailed information.
+ */
 __ref void *alloc_low_pages(unsigned int num)
 {
unsigned long pfn;
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 33/46] x86, mm: Move function declaration into mm_internal.h

2012-11-16 Thread Yinghai Lu
They are only for mm/init*.c.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/init.h |   16 +++-
 arch/x86/mm/mm_internal.h   |7 +++
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 626ea8d..bac770b 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -1,15 +1,5 @@
-#ifndef _ASM_X86_INIT_32_H
-#define _ASM_X86_INIT_32_H
+#ifndef _ASM_X86_INIT_H
+#define _ASM_X86_INIT_H
 
-#ifdef CONFIG_X86_32
-extern void __init early_ioremap_page_table_range_init(void);
-#endif
 
-extern void __init zone_sizes_init(void);
-
-extern unsigned long __init
-kernel_physical_mapping_init(unsigned long start,
-unsigned long end,
-unsigned long page_size_mask);
-
-#endif /* _ASM_X86_INIT_32_H */
+#endif /* _ASM_X86_INIT_H */
diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 7e3b88e..dc79ac1 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -7,4 +7,11 @@ static inline void *alloc_low_page(void)
return alloc_low_pages(1);
 }
 
+void early_ioremap_page_table_range_init(void);
+
+unsigned long kernel_physical_mapping_init(unsigned long start,
+unsigned long end,
+unsigned long page_size_mask);
+void zone_sizes_init(void);
+
 #endif /* __X86_MM_INTERNAL_H */
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 39/46] x86, mm: Unifying after_bootmem for 32bit and 64bit

2012-11-16 Thread Yinghai Lu
after_bootmem has different meaning in 32bit and 64bit.
32bit: after bootmem is ready
64bit: after bootmem is destroyed
Let's merge them and make 32bit the same as 64bit.

For 32bit, it mixes alloc_bootmem_pages and alloc_low_page depending on
whether after_bootmem is set or not.

alloc_bootmem is just wrapper for memblock for x86.

Now we have alloc_low_page() with memblock too. We can drop the bootmem
path now, and use alloc_low_page() only.

At the same time, we make alloc_low_page could handle real after_bootmem
for 32bit, because alloc_bootmem_pages could fallback to use slab too.

At last move after_bootmem set position for 32bit the same as 64bit.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c|2 --
 arch/x86/mm/init_32.c |   21 -
 2 files changed, 4 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f410dc6..2a27e5a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -39,7 +39,6 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long pfn;
int i;
 
-#ifdef CONFIG_X86_64
if (after_bootmem) {
unsigned int order;
 
@@ -47,7 +46,6 @@ __ref void *alloc_low_pages(unsigned int num)
return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
__GFP_ZERO, order);
}
-#endif
 
if ((pgt_buf_end + num) >= pgt_buf_top) {
unsigned long ret;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 19ef9f0..f4fc4a2 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -73,10 +73,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-   if (after_bootmem)
-   pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE);
-   else
-   pmd_table = (pmd_t *)alloc_low_page();
+   pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
@@ -98,17 +95,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
-   pte_t *page_table = NULL;
-
-   if (after_bootmem) {
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-   page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
-#endif
-   if (!page_table)
-   page_table =
-   (pte_t *)alloc_bootmem_pages(PAGE_SIZE);
-   } else
-   page_table = (pte_t *)alloc_low_page();
+   pte_t *page_table = (pte_t *)alloc_low_page();
 
paravirt_alloc_pte(_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
@@ -708,8 +695,6 @@ void __init setup_bootmem_allocator(void)
printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 max_pfn_mapped

[PATCH v8 36/46] x86, mm: use PFN_DOWN in split_mem_range()

2012-11-16 Thread Yinghai Lu
to replace our own inline version for shifting.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   44 ++--
 1 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0e625e6..1cca052 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -208,8 +208,8 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
int i;
 
/* head if not big page alignment ? */
-   start_pfn = start >> PAGE_SHIFT;
-   pos = start_pfn << PAGE_SHIFT;
+   start_pfn = PFN_DOWN(start);
+   pos = PFN_PHYS(start_pfn);
 #ifdef CONFIG_X86_32
/*
 * Don't use a large page for the first 2/4MB of memory
@@ -218,59 +218,59 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
 * slowdowns.
 */
if (pos == 0)
-   end_pfn = PMD_SIZE >> PAGE_SHIFT;
+   end_pfn = PFN_DOWN(PMD_SIZE);
else
-   end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
+   end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
 #else /* CONFIG_X86_64 */
-   end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
+   end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
 #endif
-   if (end_pfn > (end >> PAGE_SHIFT))
-   end_pfn = end >> PAGE_SHIFT;
+   if (end_pfn > PFN_DOWN(end))
+   end_pfn = PFN_DOWN(end);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-   pos = end_pfn << PAGE_SHIFT;
+   pos = PFN_PHYS(end_pfn);
}
 
/* big page (2M) range */
-   start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
+   start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
 #ifdef CONFIG_X86_32
-   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
+   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
 #else /* CONFIG_X86_64 */
-   end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT;
-   if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT))
-   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
+   end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE));
+   if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE)))
+   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
 #endif
 
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<> PAGE_SHIFT;
-   end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT;
+   start_pfn = PFN_DOWN(round_up(pos, PUD_SIZE));
+   end_pfn = PFN_DOWN(round_down(end, PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
 ((1<> PAGE_SHIFT;
-   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
+   start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
+   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<>PAGE_SHIFT;
-   end_pfn = end>>PAGE_SHIFT;
+   start_pfn = PFN_DOWN(pos);
+   end_pfn = PFN_DOWN(end);
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
 
/* try to merge same page size and continuous */
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 32/46] x86, mm: change low/hignmem_pfn_init to static on 32bit

2012-11-16 Thread Yinghai Lu
Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_32.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0ae1ba8..322ee56 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -575,7 +575,7 @@ early_param("highmem", parse_highmem);
  * artificially via the highmem=x boot parameter then create
  * it:
  */
-void __init lowmem_pfn_init(void)
+static void __init lowmem_pfn_init(void)
 {
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
@@ -611,7 +611,7 @@ void __init lowmem_pfn_init(void)
  * We have more RAM than fits into lowmem - we try to put it into
  * highmem, also taking the highmem=x boot parameter into account:
  */
-void __init highmem_pfn_init(void)
+static void __init highmem_pfn_init(void)
 {
max_low_pfn = MAXMEM_PFN;
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 34/46] x86, mm: Add check before clear pte above max_low_pfn on 32bit

2012-11-16 Thread Yinghai Lu
During test patch that adjust page_size_mask to map small range ram with
big page size, found page table is setup wrongly for 32bit. And
native_pagetable_init wrong clear pte for pmd with large page support.

1. add more comments about why we are expecting pte.

2. add BUG checking, so next time we could find problem earlier
   when we mess up page table setup again.

3. max_low_pfn is not an inclusive boundary for low memory mapping.
   We should check from max_low_pfn instead of max_low_pfn + 1.

4. add print out when some pte really get cleared, or we should use
   WARN() to find out why above max_low_pfn get mapped? so we could
   fix it.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_32.c |   18 --
 1 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 322ee56..19ef9f0 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -480,9 +480,14 @@ void __init native_pagetable_init(void)
 
/*
 * Remove any mappings which extend past the end of physical
-* memory from the boot time page table:
+* memory from the boot time page table.
+* In virtual address space, we should have at least two pages
+* from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
+* definition. And max_low_pfn is set to VMALLOC_END physical
+* address. If initial memory mapping is doing right job, we
+* should have pte used near max_low_pfn or one pmd is not present.
 */
-   for (pfn = max_low_pfn + 1; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
+   for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<> PAGE_SHIFT);
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 19/46] x86, mm: Don't clear page table if range is ram

2012-11-16 Thread Yinghai Lu
After we add code use buffer in BRK to pre-map buf for page table in
following patch:
x86, mm: setup page table in top-down
it should be safe to remove early_memmap for page table accessing.
Instead, we get a panic with that.

It turns out that we clear the initial page table wrongly for next range
that is separated by holes.
And it only happens when we are trying to map ram range one by one.

We need to check if the range is ram before clearing page table.

We change the loop structure to remove the extra little loop and use
one loop only; in that loop we will calculate next first, and check if
[addr,next) is covered by E820_RAM.

-v2: E820_RESERVED_KERN is treated as E820_RAM. EFI one change some E820_RAM
 to that, so next kernel by kexec will know that range is used already.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_64.c |   40 +++-
 1 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 869372a..fa28e3e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -363,20 +363,20 @@ static unsigned long __meminit
 phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
  pgprot_t prot)
 {
-   unsigned pages = 0;
+   unsigned long pages = 0, next;
unsigned long last_map_addr = end;
int i;
 
pte_t *pte = pte_page + pte_index(addr);
 
-   for(i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, 
pte++) {
-
+   for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr = next, pte++) {
+   next = (addr & PAGE_MASK) + PAGE_SIZE;
if (addr >= end) {
-   if (!after_bootmem) {
-   for(; i < PTRS_PER_PTE; i++, pte++)
-   set_pte(pte, __pte(0));
-   }
-   break;
+   if (!after_bootmem &&
+   !e820_any_mapped(addr & PAGE_MASK, next, E820_RAM) 
&&
+   !e820_any_mapped(addr & PAGE_MASK, next, 
E820_RESERVED_KERN))
+   set_pte(pte, __pte(0));
+   continue;
}
 
/*
@@ -419,16 +419,15 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, 
unsigned long end,
pte_t *pte;
pgprot_t new_prot = prot;
 
+   next = (address & PMD_MASK) + PMD_SIZE;
if (address >= end) {
-   if (!after_bootmem) {
-   for (; i < PTRS_PER_PMD; i++, pmd++)
-   set_pmd(pmd, __pmd(0));
-   }
-   break;
+   if (!after_bootmem &&
+   !e820_any_mapped(address & PMD_MASK, next, 
E820_RAM) &&
+   !e820_any_mapped(address & PMD_MASK, next, 
E820_RESERVED_KERN))
+   set_pmd(pmd, __pmd(0));
+   continue;
}
 
-   next = (address & PMD_MASK) + PMD_SIZE;
-
if (pmd_val(*pmd)) {
if (!pmd_large(*pmd)) {
spin_lock(_mm.page_table_lock);
@@ -497,13 +496,12 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, 
unsigned long end,
pmd_t *pmd;
pgprot_t prot = PAGE_KERNEL;
 
-   if (addr >= end)
-   break;
-
next = (addr & PUD_MASK) + PUD_SIZE;
-
-   if (!after_bootmem && !e820_any_mapped(addr, next, 0)) {
-   set_pud(pud, __pud(0));
+   if (addr >= end) {
+   if (!after_bootmem &&
+   !e820_any_mapped(addr & PUD_MASK, next, E820_RAM) &&
+   !e820_any_mapped(addr & PUD_MASK, next, 
E820_RESERVED_KERN))
+   set_pud(pud, __pud(0));
continue;
}
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 42/46] x86, mm: kill numa_free_all_bootmem()

2012-11-16 Thread Yinghai Lu
Now NO_BOOTMEM version free_all_bootmem_node() does not really
do free_bootmem at all, and it only calls register_page_bootmem_info_node
instead.

That is confusing, try to kill that free_all_bootmem_node().

Before that, this patch will remove numa_free_all_bootmem().

That function could be replaced with register_page_bootmem_info() and
free_all_bootmem();

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/numa_64.h |2 --
 arch/x86/mm/init_64.c  |   15 +++
 arch/x86/mm/numa_64.c  |   13 -
 3 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 0c05f7a..fe4d2d4 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,6 +1,4 @@
 #ifndef _ASM_X86_NUMA_64_H
 #define _ASM_X86_NUMA_64_H
 
-extern unsigned long numa_free_all_bootmem(void);
-
 #endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1d53def..4178530 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -629,6 +629,16 @@ EXPORT_SYMBOL_GPL(arch_add_memory);
 
 static struct kcore_list kcore_vsyscall;
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NUMA
+   int i;
+
+   for_each_online_node(i)
+   register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
+
 void __init mem_init(void)
 {
long codesize, reservedpages, datasize, initsize;
@@ -641,11 +651,8 @@ void __init mem_init(void)
reservedpages = 0;
 
/* this will put all low memory onto the freelists */
-#ifdef CONFIG_NUMA
-   totalram_pages = numa_free_all_bootmem();
-#else
+   register_page_bootmem_info();
totalram_pages = free_all_bootmem();
-#endif
 
absent_pages = absent_pages_in_range(0, max_pfn);
reservedpages = max_pfn - totalram_pages - absent_pages;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 92e2711..9405ffc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -10,16 +10,3 @@ void __init initmem_init(void)
 {
x86_numa_init();
 }
-
-unsigned long __init numa_free_all_bootmem(void)
-{
-   unsigned long pages = 0;
-   int i;
-
-   for_each_online_node(i)
-   pages += free_all_bootmem_node(NODE_DATA(i));
-
-   pages += free_low_memory_core_early(MAX_NUMNODES);
-
-   return pages;
-}
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 44/46] sparc, mm: Remove calling of free_all_bootmem_node()

2012-11-16 Thread Yinghai Lu
Now NO_BOOTMEM version free_all_bootmem_node() does not really
do free_bootmem at all, and it only calls
register_page_bootmem_info_node instead.

That is confusing, try to kill that free_all_bootmem_node().

Before that, this patch will remove calling of free_all_bootmem_node()

We add register_page_bootmem_info() to call register_page_bootmem_info_node
directly.

Also could use free_all_bootmem() for numa case, and it is just
the same as free_low_memory_core_early().

Signed-off-by: Yinghai Lu 
Cc: "David S. Miller" 
Cc: Andrew Morton 
Cc: sparcli...@vger.kernel.org
Acked-by: "David S. Miller" 
---
 arch/sparc/mm/init_64.c |   24 +++-
 1 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 9e28a11..b24bac2 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2021,6 +2021,16 @@ static void __init patch_tlb_miss_handler_bitmap(void)
flushi(_addr_bitmap_insn[0]);
 }
 
+static void __init register_page_bootmem_info(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+   int i;
+
+   for_each_online_node(i)
+   if (NODE_DATA(i)->node_spanned_pages)
+   register_page_bootmem_info_node(NODE_DATA(i));
+#endif
+}
 void __init mem_init(void)
 {
unsigned long codepages, datapages, initpages;
@@ -2038,20 +2048,8 @@ void __init mem_init(void)
 
high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-   {
-   int i;
-   for_each_online_node(i) {
-   if (NODE_DATA(i)->node_spanned_pages != 0) {
-   totalram_pages +=
-   free_all_bootmem_node(NODE_DATA(i));
-   }
-   }
-   totalram_pages += free_low_memory_core_early(MAX_NUMNODES);
-   }
-#else
+   register_page_bootmem_info();
totalram_pages = free_all_bootmem();
-#endif
 
/* We subtract one to account for the mem_map_zero page
 * allocated below.
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 40/46] x86, mm: Move after_bootmem to mm_internal.h

2012-11-16 Thread Yinghai Lu
it is only used in arch/x86/mm/init*.c

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/mm_internal.h |2 ++
 include/linux/mm.h|1 -
 2 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index dc79ac1..6b563a1 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -14,4 +14,6 @@ unsigned long kernel_physical_mapping_init(unsigned long 
start,
 unsigned long page_size_mask);
 void zone_sizes_init(void);
 
+extern int after_bootmem;
+
 #endif /* __X86_MM_INTERNAL_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa06804..390bd14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1355,7 +1355,6 @@ extern void __init mmap_init(void);
 extern void show_mem(unsigned int flags);
 extern void si_meminfo(struct sysinfo * val);
 extern void si_meminfo_node(struct sysinfo *val, int nid);
-extern int after_bootmem;
 
 extern __printf(3, 4)
 void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 43/46] x86, mm: kill numa_64.h

2012-11-16 Thread Yinghai Lu
Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/numa.h|2 --
 arch/x86/include/asm/numa_64.h |4 
 arch/x86/kernel/acpi/boot.c|1 -
 arch/x86/kernel/cpu/amd.c  |1 -
 arch/x86/kernel/cpu/intel.c|1 -
 arch/x86/kernel/setup.c|3 ---
 6 files changed, 0 insertions(+), 12 deletions(-)
 delete mode 100644 arch/x86/include/asm/numa_64.h

diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 49119fc..52560a2 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -54,8 +54,6 @@ static inline int numa_cpu_node(int cpu)
 
 #ifdef CONFIG_X86_32
 # include 
-#else
-# include 
 #endif
 
 #ifdef CONFIG_NUMA
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
deleted file mode 100644
index fe4d2d4..000
--- a/arch/x86/include/asm/numa_64.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _ASM_X86_NUMA_64_H
-#define _ASM_X86_NUMA_64_H
-
-#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e651f7a..4b23aa1 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -51,7 +51,6 @@ EXPORT_SYMBOL(acpi_disabled);
 
 #ifdef CONFIG_X86_64
 # include 
-# include 
 #endif /* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (   \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9619ba6..913f94f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,6 @@
 #include 
 
 #ifdef CONFIG_X86_64
-# include 
 # include 
 # include 
 #endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 198e019..3b547cc 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -17,7 +17,6 @@
 
 #ifdef CONFIG_X86_64
 #include 
-#include 
 #endif
 
 #include "cpu.h"
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 85b62f1..6d29d1f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -108,9 +108,6 @@
 #include 
 #include 
 #include 
-#ifdef CONFIG_X86_64
-#include 
-#endif
 #include 
 #include 
 #include 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 46/46] x86, mm: Let "memmap=" take more entries one time

2012-11-16 Thread Yinghai Lu
Currently, "memmap=" can only take one entry at a time.
When we have more entries, we have to use a separate memmap= for each of them.

For pxe booting, we have command line length limitation, those extra
"memmap=" would waste too much space.

This patch makes memmap= able to take several entries at one time,
with the entries separated by ','.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/e820.c |   16 +++-
 1 files changed, 15 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade..d32abea 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -835,7 +835,7 @@ static int __init parse_memopt(char *p)
 }
 early_param("mem", parse_memopt);
 
-static int __init parse_memmap_opt(char *p)
+static int __init parse_memmap_one(char *p)
 {
char *oldp;
u64 start_at, mem_size;
@@ -877,6 +877,20 @@ static int __init parse_memmap_opt(char *p)
 
return *p == '\0' ? 0 : -EINVAL;
 }
+static int __init parse_memmap_opt(char *str)
+{
+   while (str) {
+   char *k = strchr(str, ',');
+
+   if (k)
+   *k++ = 0;
+
+   parse_memmap_one(str);
+   str = k;
+   }
+
+   return 0;
+}
 early_param("memmap", parse_memmap_opt);
 
 void __init finish_e820_parsing(void)
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 45/46] mm: Kill NO_BOOTMEM version free_all_bootmem_node()

2012-11-16 Thread Yinghai Lu
Now NO_BOOTMEM version free_all_bootmem_node() does not really
do free_bootmem at all, and it only calls register_page_bootmem_info_node
for online nodes instead.

That is confusing.

We can kill that free_all_bootmem_node(), after we kill two callings
in x86 and sparc.

Signed-off-by: Yinghai Lu 
---
 mm/nobootmem.c |   14 --
 1 files changed, 0 insertions(+), 14 deletions(-)

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 714d5d6..f22c228 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -141,20 +141,6 @@ unsigned long __init free_low_memory_core_early(int nodeid)
 }
 
 /**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
-   register_page_bootmem_info_node(pgdat);
-
-   /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
-   return 0;
-}
-
-/**
  * free_all_bootmem - release free pages to the buddy allocator
  *
  * Returns the number of pages actually released.
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 37/46] x86, mm: use pfn instead of pos in split_mem_range

2012-11-16 Thread Yinghai Lu
could save some bit shifting operations.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   29 ++---
 1 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1cca052..4bf1c53 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -204,12 +204,11 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
 unsigned long end)
 {
unsigned long start_pfn, end_pfn;
-   unsigned long pos;
+   unsigned long pfn;
int i;
 
/* head if not big page alignment ? */
-   start_pfn = PFN_DOWN(start);
-   pos = PFN_PHYS(start_pfn);
+   pfn = start_pfn = PFN_DOWN(start);
 #ifdef CONFIG_X86_32
/*
 * Don't use a large page for the first 2/4MB of memory
@@ -217,26 +216,26 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
 * and overlapping MTRRs into large pages can cause
 * slowdowns.
 */
-   if (pos == 0)
+   if (pfn == 0)
end_pfn = PFN_DOWN(PMD_SIZE);
else
-   end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
+   end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
-   end_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
+   end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #endif
if (end_pfn > PFN_DOWN(end))
end_pfn = PFN_DOWN(end);
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-   pos = PFN_PHYS(end_pfn);
+   pfn = end_pfn;
}
 
/* big page (2M) range */
-   start_pfn = PFN_DOWN(round_up(pos, PMD_SIZE));
+   start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #ifdef CONFIG_X86_32
end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
 #else /* CONFIG_X86_64 */
-   end_pfn = PFN_DOWN(round_up(pos, PUD_SIZE));
+   end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE)))
end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
 #endif
@@ -244,32 +243,32 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1

[PATCH v8 38/46] x86, mm: use limit_pfn for end pfn

2012-11-16 Thread Yinghai Lu
instead of shifting end to get that.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   20 +++-
 1 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 4bf1c53..f410dc6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -203,10 +203,12 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
 unsigned long start,
 unsigned long end)
 {
-   unsigned long start_pfn, end_pfn;
+   unsigned long start_pfn, end_pfn, limit_pfn;
unsigned long pfn;
int i;
 
+   limit_pfn = PFN_DOWN(end);
+
/* head if not big page alignment ? */
pfn = start_pfn = PFN_DOWN(start);
 #ifdef CONFIG_X86_32
@@ -223,8 +225,8 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
 #else /* CONFIG_X86_64 */
end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #endif
-   if (end_pfn > PFN_DOWN(end))
-   end_pfn = PFN_DOWN(end);
+   if (end_pfn > limit_pfn)
+   end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
pfn = end_pfn;
@@ -233,11 +235,11 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
/* big page (2M) range */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
 #ifdef CONFIG_X86_32
-   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
+   end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #else /* CONFIG_X86_64 */
end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
-   if (end_pfn > PFN_DOWN(round_down(end, PMD_SIZE)))
-   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
+   if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
+   end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
 #endif
 
if (start_pfn < end_pfn) {
@@ -249,7 +251,7 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
 #ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
-   end_pfn = PFN_DOWN(round_down(end, PUD_SIZE));
+   end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
@@ -259,7 +261,7 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
 
/* tail is not big page (1G) alignment */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
-   end_pfn = PFN_DOWN(round_down(end, PMD_SIZE));
+   end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1

[PATCH v8 41/46] x86, mm: Use clamp_t() in init_range_memory_mapping

2012-11-16 Thread Yinghai Lu
save some lines, and make code more readable.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   21 +
 1 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2a27e5a..6f85de8 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -357,31 +357,20 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
  * would have hole in the middle or ends, and only ram parts will be mapped.
  */
 static unsigned long __init init_range_memory_mapping(
-  unsigned long range_start,
-  unsigned long range_end)
+  unsigned long r_start,
+  unsigned long r_end)
 {
unsigned long start_pfn, end_pfn;
unsigned long mapped_ram_size = 0;
int i;
 
	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
-   u64 start = (u64)start_pfn << PAGE_SHIFT;
-   u64 end = (u64)end_pfn << PAGE_SHIFT;
-
-   if (end <= range_start)
-   continue;
-
-   if (start < range_start)
-   start = range_start;
-
-   if (start >= range_end)
+   u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
+   u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
+   if (start >= end)
continue;
 
-   if (end > range_end)
-   end = range_end;
-
init_memory_mapping(start, end);
-
mapped_ram_size += end - start;
}
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 35/46] x86, mm: use round_up/down in split_mem_range()

2012-11-16 Thread Yinghai Lu
to replace own inline version for those roundup and rounddown.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   30 --
 1 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8168bf8..0e625e6 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -218,13 +218,11 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
 * slowdowns.
 */
if (pos == 0)
-   end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+   end_pfn = PMD_SIZE >> PAGE_SHIFT;
else
-   end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-<< (PMD_SHIFT - PAGE_SHIFT);
+   end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
 #else /* CONFIG_X86_64 */
-   end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-   << (PMD_SHIFT - PAGE_SHIFT);
+   end_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
 #endif
if (end_pfn > (end >> PAGE_SHIFT))
end_pfn = end >> PAGE_SHIFT;
@@ -234,15 +232,13 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
}
 
/* big page (2M) range */
-   start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-<< (PMD_SHIFT - PAGE_SHIFT);
+   start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
 #ifdef CONFIG_X86_32
-   end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
 #else /* CONFIG_X86_64 */
-   end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-<< (PUD_SHIFT - PAGE_SHIFT);
-   if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-   end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+   end_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT;
+   if (end_pfn > (round_down(end, PMD_SIZE) >> PAGE_SHIFT))
+   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
 #endif
 
if (start_pfn < end_pfn) {
@@ -253,9 +249,8 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
 
 #ifdef CONFIG_X86_64
/* big page (1G) range */
-   start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-<< (PUD_SHIFT - PAGE_SHIFT);
-   end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+   start_pfn = round_up(pos, PUD_SIZE) >> PAGE_SHIFT;
+   end_pfn = round_down(end, PUD_SIZE) >> PAGE_SHIFT;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
@@ -264,9 +259,8 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
}
 
/* tail is not big page (1G) alignment */
-   start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-<< (PMD_SHIFT - PAGE_SHIFT);
-   end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+   start_pfn = round_up(pos, PMD_SIZE) >> PAGE_SHIFT;
+   end_pfn = round_down(end, PMD_SIZE) >> PAGE_SHIFT;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1

[PATCH v8 06/46] x86, mm: Change find_early_table_space() paramters

2012-11-16 Thread Yinghai Lu
call split_mem_range inside the function.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |   12 +---
 1 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index dbef4ff..51f919f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -196,12 +196,18 @@ static int __meminit split_mem_range(struct map_range 
*mr, int nr_range,
  * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 
1GB
  * pages. Then find enough contiguous space for those page tables.
  */
-static void __init find_early_table_space(struct map_range *mr, int nr_range)
+static void __init find_early_table_space(unsigned long start, unsigned long 
end)
 {
int i;
unsigned long puds = 0, pmds = 0, ptes = 0, tables;
-   unsigned long start = 0, good_end;
+   unsigned long good_end;
phys_addr_t base;
+   struct map_range mr[NR_RANGE_MR];
+   int nr_range;
+
+   memset(mr, 0, sizeof(mr));
+   nr_range = 0;
+   nr_range = split_mem_range(mr, nr_range, start, end);
 
for (i = 0; i < nr_range; i++) {
unsigned long range, extra;
@@ -276,7 +282,7 @@ unsigned long __init_refok init_memory_mapping(unsigned 
long start,
 * nodes are discovered.
 */
if (!after_bootmem)
-   find_early_table_space(mr, nr_range);
+   find_early_table_space(start, end);
 
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 30/46] x86, mm: Move back pgt_buf_* to mm/init.c

2012-11-16 Thread Yinghai Lu
Also change them to static.

Signed-off-by: Yinghai Lu 
---
 arch/x86/include/asm/init.h |4 
 arch/x86/mm/init.c  |6 +++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 4f13998..626ea8d 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -12,8 +12,4 @@ kernel_physical_mapping_init(unsigned long start,
 unsigned long end,
 unsigned long page_size_mask);
 
-extern unsigned long __initdata pgt_buf_start;
-extern unsigned long __meminitdata pgt_buf_end;
-extern unsigned long __meminitdata pgt_buf_top;
-
 #endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index bed4888..3cadf10 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -19,9 +19,9 @@
 
 #include "mm_internal.h"
 
-unsigned long __initdata pgt_buf_start;
-unsigned long __meminitdata pgt_buf_end;
-unsigned long __meminitdata pgt_buf_top;
+static unsigned long __initdata pgt_buf_start;
+static unsigned long __initdata pgt_buf_end;
+static unsigned long __initdata pgt_buf_top;
 
 static unsigned long min_pfn_mapped;
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v8 00/46] x86, mm: map ram from top-down with BRK and memblock.

2012-11-16 Thread Yinghai Lu
rebase patchset together tip/x86/mm2 on top of linus v3.7-rc4

so this one include patchset : x86, mm: init_memory_mapping cleanup
in tip/x86/mm2
---
Current kernel init memory mapping between [0, TOML) and [4G, TOMH)
Some AMD systems have mem hole between 4G and TOMH, around 1T.
According to HPA, we should only mapping ram range.
1. Seperate calculate_table_space_size and find_early_page_table out with
   init_memory_mapping.
2. For all ranges, will allocate page table one time
3. init mapping for ram range one by one.
---

pre mapping page table patcheset includes:
1. use brk to mapping first PMD_SIZE range under end of ram.
2. top down to initialize page table range by range.
3. get rid of calculate_page_table, and find_early_page_table.
4. remove early_ioremap in page table accessing.
5. remove workaround in xen to mark page RO.

v2: changes, update xen interface about pagetable_reserve, so not
   use pgt_buf_* in xen code directly.
v3: use range top-down to initialize page table, so will not use
   calculating/find early table anymore.
   also reorder the patches sequence.
v4: add mapping_mark_page_ro to fix xen, also move pgt_buf_* to init.c
and merge alloc_low_page(), and for 32bit need to add alloc_low_pages
to fix 32bit kmap setting.
v5: remove mark_page_ro workaround  and add another 5 cleanup patches.
v6: rebase on v3.7-rc4 and add 4 cleanup patches.
v7: fix max_low_pfn_mapped for xen domu memmap that does not have hole under 4g
add pfn_range_is_mapped() calling for left over.
v8: updated some changelog and add some Acks from Stefano.
Put v8 on every patch's subject, so hpa would not check old version.
hope could catch window for v3.8

could be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-mm

Jacob Shin (3):
  x86, mm: if kernel .text .data .bss are not marked as E820_RAM, complain and 
fix
  x86, mm: Fixup code testing if a pfn is direct mapped
  x86, mm: Only direct map addresses that are marked as E820_RAM

Stefano Stabellini (1):
  x86, mm: Add pointer about Xen mmu requirement for alloc_low_pages

Yinghai Lu (42):
  x86, mm: Add global page_size_mask and probe one time only
  x86, mm: Split out split_mem_range from init_memory_mapping
  x86, mm: Move down find_early_table_space()
  x86, mm: Move init_memory_mapping calling out of setup.c
  x86, mm: Revert back good_end setting for 64bit
  x86, mm: Change find_early_table_space() paramters
  x86, mm: Find early page table buffer together
  x86, mm: Separate out calculate_table_space_size()
  x86, mm: Set memblock initial limit to 1M
  x86, mm: use pfn_range_is_mapped() with CPA
  x86, mm: use pfn_range_is_mapped() with gart
  x86, mm: use pfn_range_is_mapped() with reserve_initrd
  x86, mm: relocate initrd under all mem for 64bit
  x86, mm: Align start address to correct big page size
  x86, mm: Use big page size for small memory range
  x86, mm: Don't clear page table if range is ram
  x86, mm: Break down init_all_memory_mapping
  x86, mm: setup page table in top-down
  x86, mm: Remove early_memremap workaround for page table accessing on 64bit
  x86, mm: Remove parameter in alloc_low_page for 64bit
  x86, mm: Merge alloc_low_page between 64bit and 32bit
  x86, mm: Move min_pfn_mapped back to mm/init.c
  x86, mm, Xen: Remove mapping_pagetable_reserve()
  x86, mm: Add alloc_low_pages(num)
  x86, mm: only call early_ioremap_page_table_range_init() once
  x86, mm: Move back pgt_buf_* to mm/init.c
  x86, mm: Move init_gbpages() out of setup.c
  x86, mm: change low/hignmem_pfn_init to static on 32bit
  x86, mm: Move function declaration into mm_internal.h
  x86, mm: Add check before clear pte above max_low_pfn on 32bit
  x86, mm: use round_up/down in split_mem_range()
  x86, mm: use PFN_DOWN in split_mem_range()
  x86, mm: use pfn instead of pos in split_mem_range
  x86, mm: use limit_pfn for end pfn
  x86, mm: Unifying after_bootmem for 32bit and 64bit
  x86, mm: Move after_bootmem to mm_internel.h
  x86, mm: Use clamp_t() in init_range_memory_mapping
  x86, mm: kill numa_free_all_bootmem()
  x86, mm: kill numa_64.h
  sparc, mm: Remove calling of free_all_bootmem_node()
  mm: Kill NO_BOOTMEM version free_all_bootmem_node()
  x86, mm: Let "memmap=" take more entries one time

 arch/sparc/mm/init_64.c  |   24 +-
 arch/x86/include/asm/init.h  |   21 +--
 arch/x86/include/asm/numa.h  |2 -
 arch/x86/include/asm/numa_64.h   |6 -
 arch/x86/include/asm/page_types.h|2 +
 arch/x86/include/asm/pgtable.h   |2 +
 arch/x86/include/asm/pgtable_types.h |1 -
 arch/x86/include/asm/x86_init.h  |   12 -
 arch/x86/kernel/acpi/boot.c  |1 -
 arch/x86/kernel/amd_gart_64.c|5 +-
 arch/x86/kernel/cpu/amd.c|9 +-
 arch/x86/kernel/cpu/intel.c  |1 -
 arch/x86/kernel/e820.c   |   16 ++-
 arch/x86/kernel/setup.c  |  121 --
 arch/x86/kernel/x86_init.c 

[PATCH 2/3] netcls_cgroup: introduce cgroup_cls_state->is_local

2012-11-16 Thread Tejun Heo
cs->is_local will be used to indicate whether the cgroup has its own
configuration or inherited from the parent.  It's set when classid is
configured by writing a positive value to cgroup file
"net_cls.classid" and cleared when a negative value is written.

is_local is visible to userland via cgroup file "net_cls.is_local" so
that userland can know whether a cgroup has its config or not.

This patch doesn't yet change hierarchy behavior.  The next patch will
use is_local to implement proper hierarchy.

Signed-off-by: Tejun Heo 
---
 include/net/cls_cgroup.h |  1 +
 net/sched/cls_cgroup.c   | 23 ---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index b6a6eeb..5759d98 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -22,6 +22,7 @@ struct cgroup_cls_state
 {
struct cgroup_subsys_state css;
u32 classid;
+   bool is_local;  /* class id is explicitly configured for this cgroup */
 };
 
 extern void sock_update_classid(struct sock *sk);
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 80a80c4..6e3ef64 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -70,19 +70,36 @@ static u64 read_classid(struct cgroup *cgrp, struct cftype 
*cft)
return cgrp_cls_state(cgrp)->classid;
 }
 
-static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
+static int write_classid(struct cgroup *cgrp, struct cftype *cft,
+const char *buf)
 {
+   struct cgroup_cls_state *cs = cgrp_cls_state(cgrp);
+   s64 v;
+
+   if (sscanf(buf, "%lld", &v) != 1)
+   return -EINVAL;
+
	mutex_lock(&netcls_mutex);
-   cgrp_cls_state(cgrp)->classid = (u32) value;
+   cs->classid = clamp_val(v, 0, UINT_MAX);
+   cs->is_local = v >= 0;
	mutex_unlock(&netcls_mutex);
return 0;
 }
 
+static u64 read_is_local(struct cgroup *cgrp, struct cftype *cft)
+{
+   return cgrp_cls_state(cgrp)->is_local;
+}
+
 static struct cftype ss_files[] = {
{
.name = "classid",
.read_u64 = read_classid,
-   .write_u64 = write_classid,
+   .write_string = write_classid,
+   },
+   {
+   .name = "is_local",
+   .read_u64 = read_is_local,
},
{ } /* terminate */
 };
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHSET cgroup/for-3.8] netcls_cgroup: implement hierarchy support

2012-11-16 Thread Tejun Heo
Hello, guys.

This patchset implements proper hierarchy support for netcls_cgroup.
Pretty simliar to the netprio one[3].  Simpler as each cgroup has
single config value instead of array of them.

This patchset contains the following three patches.

 0001-netcls_cgroup-introduce-netcls_mutex.patch
 0002-netcls_cgroup-introduce-cgroup_cls_state-is_local.patch
 0003-netcls_cgroup-implement-proper-hierarchy-support.patch

This patchset is on top of

cgroup/for-3.8 ef9fe980c6 ("cgroup_freezer: implement proper hierarchy support")
+ [1] "[PATCHSET cgroup/for-3.8] cgroup: allow ->post_create() to fail"
+ [2] "[PATCH 1/2] cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/"
  "[PATCH 2/2] cgroup, cpuset: remove cgroup_subsys->post_clone()"
+ [3] "[PATCHSET cgroup/for-3.8] netprio_cgroup: implement hierarchy support"

and available in the following git branch.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
review-netcls_cgroup-hierarchy

diffstat follows.

 include/net/cls_cgroup.h |1
 net/sched/cls_cgroup.c   |  102 ---
 2 files changed, 88 insertions(+), 15 deletions(-)

Thanks.

--
tejun

[1] http://thread.gmane.org/gmane.linux.kernel.cgroups/5047
[2] http://thread.gmane.org/gmane.linux.kernel/1393151
[3] https://lkml.org/lkml/2012/11/16/514
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/3] netcls_cgroup: implement proper hierarchy support

2012-11-16 Thread Tejun Heo
netcls_cgroup implemented rather weird hierarchy behavior.  A new
cgroup would inherit configuration from its parent but once created it
operates independently from its parent - updates to a parent doesn't
affect its children.

Proper hierarchy behavior can easily be implemented using cgroup
descendant iterator and the is_local flag.  Writing a positive value
to "net_cls.classid" updates the cgroup's classid and propagates the
classid downwards.  Writing a negative value removes the local config
and makes it inherit the parent's classid, and the inherited classid
is propagated downwards.

This makes netcls_cgroup properly hierarchical and behave the same way
as netprio_cgroup.

Signed-off-by: Tejun Heo 
---
 net/sched/cls_cgroup.c | 62 +-
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 6e3ef64..e9e24ac 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -70,6 +70,44 @@ static u64 read_classid(struct cgroup *cgrp, struct cftype 
*cft)
return cgrp_cls_state(cgrp)->classid;
 }
 
+/**
+ * propagate_classid - proapgate classid configuration downwards
+ * @root: cgroup to propagate classid down from
+ *
+ * Propagate @root's classid to descendants of @root.  Each descendant of
+ * @root re-inherits from its parent in pre-order tree walk.  This should
+ * be called after the classid of @root is changed to keep the descendants
+ * up-to-date.
+ *
+ * This may race with a new cgroup coming online and propagation may happen
+ * before finishing ->css_online() or while being taken offline.  As a
+ * cgroup_cls_state is ready after ->css_alloc() and propagation doesn't
+ * affect the parent, this is safe.
+ *
+ * Should be called with netcls_mutex held.
+ */
+static void propagate_classid(struct cgroup *root)
+{
+   struct cgroup *pos;
+
+   lockdep_assert_held(&netcls_mutex);
+   rcu_read_lock();
+
+   cgroup_for_each_descendant_pre(pos, root) {
+   struct cgroup_cls_state *cs = cgrp_cls_state(pos);
+
+   /*
+* Don't propagate if @pos has local configuration.  We can
+* skip @pos's subtree but don't have to.  Just propagate
+* through for simplicity.
+*/
+   if (!cs->is_local)
+   cs->classid = cgrp_cls_state(pos->parent)->classid;
+   }
+
+   rcu_read_unlock();
+}
+
 static int write_classid(struct cgroup *cgrp, struct cftype *cft,
 const char *buf)
 {
@@ -80,8 +118,19 @@ static int write_classid(struct cgroup *cgrp, struct cftype 
*cft,
return -EINVAL;
 
	mutex_lock(&netcls_mutex);
-   cs->classid = clamp_val(v, 0, UINT_MAX);
-   cs->is_local = v >= 0;
+
+   if (v >= 0) {
+   cs->classid = clamp_val(v, 0, UINT_MAX);
+   cs->is_local = true;
+   } else {
+   if (cgrp->parent)
+   cs->classid = cgrp_cls_state(cgrp->parent)->classid;
+   else
+   cs->classid = 0;
+   cs->is_local = false;
+   }
+   propagate_classid(cgrp);
+
	mutex_unlock(&netcls_mutex);
return 0;
 }
@@ -112,15 +161,6 @@ struct cgroup_subsys net_cls_subsys = {
.subsys_id  = net_cls_subsys_id,
.base_cftypes   = ss_files,
.module = THIS_MODULE,
-
-   /*
-* While net_cls cgroup has the rudimentary hierarchy support of
-* inheriting the parent's classid on cgroup creation, it doesn't
-* properly propagates config changes in ancestors to their
-* descendents.  A child should follow the parent's configuration
-* but be allowed to override it.  Fix it and remove the following.
-*/
-   .broken_hierarchy = true,
 };
 
 struct cls_cgroup_head {
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/3] netcls_cgroup: introduce netcls_mutex

2012-11-16 Thread Tejun Heo
Introduce netcls_mutex to synchronize modifications to
cgroup_cls_state.  New cgrp now inherits classid from ->css_online()
and write_classid() updates classid while holdin netcls_mutex.

As write_classid() doesn't propagate new configuration downwards, this
currently doesn't make any userland-visible difference, but will help
implementing proper hierarchy support.

Signed-off-by: Tejun Heo 
---
 net/sched/cls_cgroup.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 8cdc18e..80a80c4 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -17,11 +17,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 
+static DEFINE_MUTEX(netcls_mutex);
+
 static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp)
 {
return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id),
@@ -42,12 +45,21 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct 
cgroup *cgrp)
if (!cs)
return ERR_PTR(-ENOMEM);
 
-   if (cgrp->parent)
-   cs->classid = cgrp_cls_state(cgrp->parent)->classid;
-
	return &cs->css;
 }
 
+/* @cgrp coming online, inherit the parent's classid */
+static int cgrp_css_online(struct cgroup *cgrp)
+{
+   if (!cgrp->parent)
+   return 0;
+
+   mutex_lock(&netcls_mutex);
+   cgrp_cls_state(cgrp)->classid = cgrp_cls_state(cgrp->parent)->classid;
+   mutex_unlock(&netcls_mutex);
+   return 0;
+}
+
 static void cgrp_css_free(struct cgroup *cgrp)
 {
kfree(cgrp_cls_state(cgrp));
@@ -60,7 +72,9 @@ static u64 read_classid(struct cgroup *cgrp, struct cftype 
*cft)
 
 static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value)
 {
+   mutex_lock(&netcls_mutex);
cgrp_cls_state(cgrp)->classid = (u32) value;
+   mutex_unlock(&netcls_mutex);
return 0;
 }
 
@@ -76,6 +90,7 @@ static struct cftype ss_files[] = {
 struct cgroup_subsys net_cls_subsys = {
.name   = "net_cls",
.css_alloc  = cgrp_css_alloc,
+   .css_online = cgrp_css_online,
.css_free   = cgrp_css_free,
.subsys_id  = net_cls_subsys_id,
.base_cftypes   = ss_files,
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] dt/platform: Use cell-index for device naming if available

2012-11-16 Thread Stepan Moskovchenko

On 11/15/2012 8:10 AM, Grant Likely wrote:

On Mon, 12 Nov 2012 18:48:43 -0800, Stepan Moskovchenko 
 wrote:

On 11/11/2012 5:45 PM, Stepan Moskovchenko wrote:



On Sun, Nov 11, 2012 at 2:32 AM, Rob Herring 
wrote:

On 11/09/2012 06:48 PM, Stepan Moskovchenko wrote:

Use the cell-index property to construct names for platform
devices, falling back on the existing scheme of using the
device register address if cell-index is not specified.

The cell-index property is a more useful device identifier,
especially in systems containing several numbered instances
of a particular hardware block, since it more easily
illustrates how devices relate to each other.

Additionally, userspace software may rely on the classic
. naming scheme to access device attributes in
sysfs, without having to know the physical addresses of
that device on every platform the userspace software may
support. Using cell-index for device naming allows the
device addresses to be hidden from userspace and to be
exposed by logical device number without having to rely on
auxdata to perform name overrides. This allows userspace to
make assumptions about which sysfs nodes map to which
logical instance of a specific hardware block.

Signed-off-by: Stepan Moskovchenko 
---
I had also considered using something like the linux,label property to
allow
custom names for platform devices without resorting to auxdata, but the
cell-index approach seems more in line with what cell-index was
intended for
and with what the pre-DT platform device naming scheme used to be.
Please let
me know if you think there is a better way to accomplish this.

This is just being sent out as an RFC for now. If there are no
objections, I
will send this out as an official patch, along with (or combined with)
a patch
to fix up the device names in things like clock tables of any affected
platforms.


cell-index is basically deprecated. This has been discussed multiple
times in the past. You can use auxdata if you really need to have the
old name.


Actually, I think it would be fine to use an /aliases entry to set the
device name. That's the place to put global namespace information.

g.



Ah, thank you. I would prefer to stay away from auxdata, since it involves
placing more platform-specific data into the kernel, and it is my
understanding that auxdata is intended as a temporary measure. The
/aliases approach looks interesting, and I'll see what I can do with it -
hopefully I can have an RFC / patch soon. It looks like we would want an
"inverse" alias lookup- that is, we would need to know which alias
corresponds to a given node. Is it possible for a node to have multiple
aliases?


yes


If so, which shall we use to create the device name? Anyway, I
will further look into how these aliases work.


Well, why exactly do you want to control the names of devices? Is it so
that devices match up with what they are, or is it to make things match
up with things like clocks and regulators. If it is the latter, then no,
don't do this. Use auxdata. When the kernel requires a specific name for
a device it is very much a kernel *internal* detail. It does not make
sense to encode that into the device tree when it isn't something part
of the binding.




Steve


Hi Grant,

Looking through the alias code, I see that the stem and the alias ID are
stored and parsed separately. For the current way of using aliases, this
makes sense. However, can you please clarify what you meant by using an
/aliases entry to set the device name?

The first and most straightforward approach would be to use the entire
alias name as the device name, making no distinction between the alias
stem and ID. However, since it is possible to have multiple aliases to
the same device, which of the aliases shall we use to construct the
device name? Additionally, this may cause possible problems for legacy
software that expects names in the format of <name>.<id>, since '.' is
not a valid character for alias names as defined by the DT spec,
although strictly speaking this approach would successfully solve the
problem of giving devices predictable and controllable names.

Another way an /aliases entry could be used to set the device name is to
have a <name>.<id> naming scheme, where the name comes from node->name
(as is done in of_device_make_bus_id) and the ID gets queried using
of_alias_get_id(). We would need to create a new alias stem for this
purpose, and suppose that something like "platform" would work. The
name-setting code would then roughly look as follows:

+   alias_id = of_alias_get_id(node, "platform");
+   if (alias_id != -ENODEV) {
+   dev_set_name(dev, "%s.%d", node->name, alias_id);
+   return;
+   }

The downside to this approach is that it imposes the restriction that
device ID numbers now have to be unique throughout the system, whereas
before only the <name>.<id> combinations had to be unique. This is the
result of only the ID number being present in the alias table, with each
such ID number 

Re: [PATCH v2] pstore/ram: no timekeeping calls when unavailable

2012-11-16 Thread John Stultz

On 11/16/2012 06:53 PM, Anton Vorontsov wrote:

On Fri, Nov 09, 2012 at 05:26:53PM -0800, Kees Cook wrote:
[]

@@ -171,7 +171,13 @@ static size_t ramoops_write_kmsg_hdr(struct
persistent_ram_zone *prz)
 struct timeval timestamp;
 size_t len;

-   do_gettimeofday();
+   /* Handle dumping before timekeeping has resumed. */
+   if (unlikely(timekeeping_suspended)) {
+   timestamp.tv_sec = 0;
+   timestamp.tv_usec = 0;
+   } else
+   do_gettimeofday();
+

Would nulling out the timestamp be better done in do_gettimeofday()?  That
way we don't have to export timekeeping internals and users would get
something more sane for this corner case.

Well... I'm not sure. If we don't want to expose the
timekeeping_suspended variable, maybe we need a function to check
this? I think it's probably better to find the users of timekeeping
that could call it when suspended. That's why I figured the BUG was
there. Very very few things should be attempting to call gettimeofday
in a place where it might be suspended. As such, it seems like those
things should be able to determine how to handle it. Maybe not
everything would be sensible to get back 0s.

In this particular case, I'm fine with removing the BUG and returning
0 instead, since that's fine for ramoops. :)

In the lack of agreement on kernel/time/timekeeping.c change, I can't
apply the patch. And personally I tend to agree that doing this workaround
in the pstore code is odd. How about introducing ___do_gettimeofday() that
is safe to call when suspened, and the func would have good kernel doc
comments explaining the purpose of it?

Yea, I wanted to revisit this, because it is an odd case.

We don't want to call getnstimeofday() while the timekeeping code is 
suspended, since the clocksource cycle_last value may be invalid if the 
hardware was reset during suspend.  Kees is correct,  the WARN_ONs were 
there to make sure no one tries to use the timekeeping core before its 
resumed, so removing them is problematic.


Your suggestion of having the __do_gettimeofday() internal accessor 
that maybe returns an error if timekeeping has been suspended could work.


The other possibility is depending on the needs for accuracy with the 
timestamp, current_kernel_time() might be a better interface to use, 
since it will return the time at the last tick, and doesn't require 
accessing the clocksource hardware.  Might that be a simpler solution? 
Or is sub-tick granularity necessary?


thanks
-john







--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] pstore/ram: no timekeeping calls when unavailable

2012-11-16 Thread Anton Vorontsov
On Fri, Nov 09, 2012 at 05:26:53PM -0800, Kees Cook wrote:
[]
> >> @@ -171,7 +171,13 @@ static size_t ramoops_write_kmsg_hdr(struct
> >> persistent_ram_zone *prz)
> >> struct timeval timestamp;
> >> size_t len;
> >>
> >> -   do_gettimeofday();
> >> +   /* Handle dumping before timekeeping has resumed. */
> >> +   if (unlikely(timekeeping_suspended)) {
> >> +   timestamp.tv_sec = 0;
> >> +   timestamp.tv_usec = 0;
> >> +   } else
> >> +   do_gettimeofday();
> >> +
> >
> > Would nulling out the timestamp be better done in do_gettimeofday()?  That
> > way we don't have to export timekeeping internals and users would get
> > something more sane for this corner case.
> 
> Well... I'm not sure. If we don't want to expose the
> timekeeping_suspended variable, maybe we need a function to check
> this? I think it's probably better to find the users of timekeeping
> that could call it when suspended. That's why I figured the BUG was
> there. Very very few things should be attempting to call gettimeofday
> in a place where it might be suspended. As such, it seems like those
> things should be able to determine how to handle it. Maybe not
> everything would be sensible to get back 0s.
> 
> In this particular case, I'm fine with removing the BUG and returning
> 0 instead, since that's fine for ramoops. :)

In the lack of agreement on kernel/time/timekeeping.c change, I can't
apply the patch. And personally I tend to agree that doing this workaround
in the pstore code is odd. How about introducing ___do_gettimeofday() that
is safe to call when suspended, and the func would have good kernel doc
comments explaining the purpose of it?

Thanks,
Anton.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ramoops: Fixup section annotations

2012-11-16 Thread Anton Vorontsov
On Mon, Oct 22, 2012 at 11:53:33AM -0700, Kees Cook wrote:
> On Wed, Oct 17, 2012 at 12:39 AM, Hannes Reinecke  wrote:
> > The compiler complained about missing section annotations.
> > Fix it.
> >
> > Signed-off-by: Hannes Reinecke 
> > Cc: Anton Vorontsov 
> > Cc: Colin Cross 
> > Cc: Kees Cook 
> > Cc: Tony Luck 
> 
> Acked-by: Kees Cook 

Applied, thank you!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] power: battery: pointer math issue in gab_probe()

2012-11-16 Thread Anton Vorontsov
On Sat, Sep 29, 2012 at 10:13:46AM +0300, Dan Carpenter wrote:
> psy->properties is an enum (32 bit type) so adding sizeof() puts us
> four times further along than we intended.  It should be cast to a char
> pointer before doing the math.
> 
> Signed-off-by: Dan Carpenter 
> ---
> Casting to void * would also work on GCC, at least.

Applied, thanks a lot!

> 
> diff --git a/drivers/power/generic-adc-battery.c 
> b/drivers/power/generic-adc-battery.c
> index 9bdf444..776f118 100644
> --- a/drivers/power/generic-adc-battery.c
> +++ b/drivers/power/generic-adc-battery.c
> @@ -279,7 +279,8 @@ static int __devinit gab_probe(struct platform_device 
> *pdev)
>   }
>  
>   memcpy(psy->properties, gab_props, sizeof(gab_props));
> - properties = psy->properties + sizeof(gab_props);
> + properties = (enum power_supply_property *)
> + ((char *)psy->properties + sizeof(gab_props));
>  
>   /*
>* getting channel from iio and copying the battery properties
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] autofs4 - use simple_empty() for empty directory check

2012-11-16 Thread Ian Kent
On Fri, 2012-11-16 at 17:34 +, Al Viro wrote:
> On Fri, Nov 16, 2012 at 08:43:28AM -0800, Linus Torvalds wrote:
> > On Fri, Nov 16, 2012 at 8:36 AM, Ian Kent  wrote:
> > >
> > > Sure, are you recommending I alter the fs/libfs.c functions to add a
> > > function that doesn't have the outer lock, and have simple_empty() call
> > > that, then use it in autofs?
> > 
> > Yup. That's the standard pattern, although usually we *strive* to make
> > the unlocked versions be static to the internal code, and then use
> > them there for the various helpers. In your case that seems
> > impossible, since you do depend on holding the d_lock in the caller
> > after the tests. But at least we don't have to duplicate the code and
> > have it in two unrelated places.
> > 
> > Al? Comments?
> 
> The thing is, I'm not convinced we really need ->d_lock held downstream.
> E.g.  __autofs4_add_expiring() ought to be OK with just sbi->lookup_lock.
> Not sure about the situation in autofs4_d_automount() - the thing is messy
> as hell ;-/
> 
> Ian, do we really need that __simple_empty() variant in either caller?  What
> is getting protected by ->d_lock after it and do we really need ->d_lock
> continuously held for that?

Yeah, I've thought about that a few times now but haven't gone so far as
to change it.

I'll have another look.

Ian


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v6 2/2] net: Add support for NTB virtual ethernet device

2012-11-16 Thread Jon Mason
A virtual ethernet device that uses the NTB transport API to
send/receive data.

Signed-off-by: Jon Mason 
Reviewed-by: Nicholas Bellinger 
---
 MAINTAINERS  |1 +
 drivers/net/Kconfig  |4 +
 drivers/net/Makefile |1 +
 drivers/net/ntb_netdev.c |  419 ++
 4 files changed, 425 insertions(+)
 create mode 100644 drivers/net/ntb_netdev.c

diff --git a/MAINTAINERS b/MAINTAINERS
index b6139ad..e4e7652 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5197,6 +5197,7 @@ NTB DRIVER
 M: Jon Mason 
 S: Supported
 F: drivers/ntb/
+F: drivers/net/ntb_netdev.c
 F: include/linux/ntb.h
 
 NTFS FILESYSTEM
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 6a70184..5db9acb 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -189,6 +189,10 @@ config NETPOLL_TRAP
 config NET_POLL_CONTROLLER
def_bool NETPOLL
 
+config NTB_NETDEV
+   tristate "Virtual Ethernet over NTB"
+   depends on NTB
+
 config RIONET
tristate "RapidIO Ethernet over messaging driver support"
depends on RAPIDIO
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 335db78..ef3d090 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -71,3 +71,4 @@ obj-$(CONFIG_USB_IPHETH)+= usb/
 obj-$(CONFIG_USB_CDC_PHONET)   += usb/
 
 obj-$(CONFIG_HYPERV_NET) += hyperv/
+obj-$(CONFIG_NTB_NETDEV) += ntb_netdev.o
diff --git a/drivers/net/ntb_netdev.c b/drivers/net/ntb_netdev.c
new file mode 100644
index 000..af48a69
--- /dev/null
+++ b/drivers/net/ntb_netdev.c
@@ -0,0 +1,419 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ *   redistributing this file, you may do so under either license.
+ *
+ *   GPL LICENSE SUMMARY
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of version 2 of the GNU General Public License as
+ *   published by the Free Software Foundation.
+ *
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2012 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copy
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ *   contributors may be used to endorse or promote products derived
+ *   from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Intel PCIe NTB Network Linux driver
+ *
+ * Contact Information:
+ * Jon Mason 
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NTB_NETDEV_VER "0.6"
+
+MODULE_DESCRIPTION(KBUILD_MODNAME);
+MODULE_VERSION(NTB_NETDEV_VER);
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel Corporation");
+
+struct ntb_netdev {
+   struct list_head list;
+   struct pci_dev *pdev;
+   struct net_device *ndev;
+   struct ntb_transport_qp *qp;
+};
+
+#defineNTB_TX_TIMEOUT_MS   1000
+#defineNTB_RXQ_SIZE100
+
+static LIST_HEAD(dev_list);
+
+static void ntb_netdev_event_handler(void *data, int status)
+{
+   struct net_device *ndev = data;
+   struct ntb_netdev *dev = netdev_priv(ndev);
+
+   netdev_dbg(ndev, "Event %x, Link %x\n", status,
+  ntb_transport_link_query(dev->qp));
+
+   /* Currently, only link status event is supported */
+   if (status)
+   netif_carrier_on(ndev);
+   else
+   netif_carrier_off(ndev);
+}
+
+static void ntb_netdev_rx_handler(struct ntb_transport_qp *qp, void *qp_data,
+ void *data, int len)
+{
+   struct net_device *ndev = qp_data;
+   struct sk_buff 

[PATCH v6 0/2] PCI-Express Non-Transparent Bridge Support

2012-11-16 Thread Jon Mason
I am submitting version 6 of the PCI-Express Non-Transparent Bridge
patches for inclusion in 3.8 via Greg KH's char-misc-next tree.  All
outstanding issues have been addressed.

Version 6 corrects Greg KH's issues, most notably the improper usage of
the Linux device model.
http://thread.gmane.org/gmane.linux.kernel.pci/18599

Thanks,
Jon

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Linux 3.7-rc6

2012-11-16 Thread Linus Torvalds
Slightly less than a week, but since I'm leaving for vacation
tomorrow, here it is.

Things have continued to be pretty calm. We have a few more commits
here than there were in -rc5, but not enough to make me worry, and
most of the changes really tend to be tiny. And the few commits that
aren't one-liners (or "few-liners") tend to be reverts (eg
re-introducing /proc//oom_adj) or some pretty obscure stuff (the
MIPS irqflags functions).

So we've got some arch updates (mainly mips and unicore32, with a
smattering of arm[64] and s390) and driver changes (sound, net, usb).
Along with some networking and mm fixes.

The appended shortlog gives a flavor of the kinds of things that
happened, it's really not all that exciting, but it's short enough to
be easy to read through to get some kind of idea.

I'll have a laptop with me as I'm away, but if things calm down even
further, I'll be happy. I'll do an -rc7, but considering how calm
things have been, I suspect that's the last -rc. Unless something
dramatic happens.

Linus

---

Akinobu Mita (1):
  drm/ttm: remove unneeded preempt_disable/enable

Al Viro (2):
  unicore32: switch to generic kernel_thread()/kernel_execve()
  unicore32: switch to generic sys_execve()

Alex Deucher (1):
  drm/radeon: fix logic error in atombios_encoders.c

Alexander Duyck (2):
  vxlan: Fix error that was resulting in VXLAN MTU size being 10
bytes too large
  vxlan: Update hard_header_len based on lowerdev when instantiating VXLAN

Andrew Lunn (1):
  Regulator: core: Unregister when gpio request fails.

Andrew Morton (1):
  revert "mm: fix-up zone present pages"

Andrew Vagin (1):
  tcp: fix retransmission in repair mode

Antonio Quartulli (4):
  batman-adv: fix tt_global_entries flags update
  batman-adv: correctly pass the client flag on tt_response
  batman-adv: don't add TEMP clients belonging to other backbone nodes
  batman-adv: process broadcast packets in BLA earlier

Arik Nemtsov (1):
  mac80211: sync acccess to tx_filtered/ps_tx_buf queues

Aristeu Rozanski (1):
  device_cgroup: fix unchecked cgroup parent usage

Artem Bityutskiy (2):
  UBIFS: introduce categorized lprops counter
  UBIFS: fix mounting problems after power cuts

Axel Lin (2):
  irqchip: irq-bcm2835: Add terminating entry for of_device_id table
  tty: serial: max310x: Add terminating entry for spi_device_id table

Benjamin Poirier (1):
  menuconfig: Replace CIRCLEQ by list_head-style lists.

Bjørn Mork (2):
  USB: keyspan: fix typo causing GPF on open
  net: cdc_ncm: add Huawei devices

Catalin Marinas (1):
  arm64: Distinguish between user and kernel XN bits

Charles Keepax (2):
  ASoC: bells: Correct type in sub speaker DAI name for WM5102
  regulator: core: Avoid deadlock when regulator_register fails

Christoph Fritz (1):
  ARM: imx: ehci: fix host power mask bit

Colin Cross (1):
  Revert "Staging: Android alarm: IOCTL command encoding fix"

Colin Ian King (1):
  pstore: Fix NULL pointer dereference in console writes

Dan Carpenter (3):
  ALSA: es1968: precedence bug in snd_es1968_tea575x_get_pins()
  ALSA: fm801: precedence bug in snd_fm801_tea575x_get_pins()
  vmwgfx: return an -EFAULT if copy_to_user() fails

Dan Williams (2):
  USB: option: add Novatel E362 and Dell Wireless 5800 USB IDs
  USB: option: add Alcatel X220/X500D USB IDs

David Howells (1):
  UAPI: (Scripted) Disintegrate arch/unicore32/include/asm

David Rientjes (3):
  mm, oom: reintroduce /proc/pid/oom_adj
  mm: fix build warning for uninitialized value
  mips, arc: fix build failure

David S. Miller (1):
  Revert "drivers/net/phy/mdio-bitbang.c: Call mdiobus_unregister
before mdiobus_free"

David Spinadel (1):
  mac80211: init sched_scan_ies

Dimitris Papastamos (2):
  ASoC: bells: Add missing select of WM0010
  ASoC: bells: Select WM1250-EV1 Springbank audio I/O module

Eric Dumazet (2):
  tcp: tcp_replace_ts_recent() should not be called from
tcp_validate_incoming()
  tcp: handle tcp_net_metrics_init() order-5 memory allocation failures

Eric Millbrandt (1):
  ASoC: wm8978: pll incorrectly configured when codec is master

Fabio Estevam (3):
  ASoC: mxs-saif: Add MODULE_ALIAS
  ASoC: mxs-saif: Fix channel swap for 24-bit format
  ARM: boot: Fix usage of kecho

Felipe Balbi (1):
  Revert "usb: musb: use DMA mode 1 whenever possible"

Felix Fietkau (2):
  mac80211: do not call ieee80211_configure_filter if no interfaces are up
  mac80211: call skb_dequeue/ieee80211_free_txskb instead of
__skb_queue_purge

Greg Kroah-Hartman (1):
  Revert "USB/host: Cleanup unneccessary irq disable code"

Guan Xuetao (4):
  UniCore32 bugfix: add missed CONFIG_ZONE_DMA
  UniCore32-bugfix: fix mismatch return value of __xchg_bad_pointer
  UniCore32-bugfix: Remove definitions in asm/bug.h to solve
difference between native and 

[PATCH] acpi_system_write_wakeup_device(): fix error check for unsigned variable.

2012-11-16 Thread Cyril Roelandt
The LEN variable is unsigned, therefore checking whether it is less than 0 is
useless.

Signed-off-by: Cyril Roelandt 
---
 drivers/acpi/proc.c |2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/acpi/proc.c b/drivers/acpi/proc.c
index 27adb09..37871a7 100644
--- a/drivers/acpi/proc.c
+++ b/drivers/acpi/proc.c
@@ -366,8 +366,6 @@ acpi_system_write_wakeup_device(struct file *file,
 
if (len > 4)
len = 4;
-   if (len < 0)
-   return -EFAULT;
 
if (copy_from_user(strbuf, buffer, len))
return -EFAULT;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging/serqt_usb2: Refactor qt_status_change_check() in serqt_usb2.c

2012-11-16 Thread Joe Perches
On Sat, 2012-11-17 at 05:19 +0900, YAMANE Toshiaki wrote:
> Modify qt_status_change_check() and delete qt_status_change().
> 
> Signed-off-by: YAMANE Toshiaki 
> ---
>  drivers/staging/serqt_usb2/serqt_usb2.c |   53 
> +--
>  1 file changed, 22 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/staging/serqt_usb2/serqt_usb2.c 
> b/drivers/staging/serqt_usb2/serqt_usb2.c
[]
> @@ -334,11 +307,29 @@ static void qt_status_change_check(struct tty_struct 
> *tty,
>   flag = 0;
>   switch (data[i + 2]) {
>   case 0x00:
> + if (i > (RxCount - 4)) {
> + dev_dbg(>dev,
> + "Illegal escape seuences in 
> received data\n");

trivia: seuences/sequence

> + break;
> + }
> +
> + ProcessLineStatus(qt_port, data[i + 3]);
> +
> + i += 3;

you could move the i += 3 before the ProcessLineStatus
and use data[i]

> + flag = 1;
> + break;
> +
>   case 0x01:
> - flag = qt_status_change((RxCount - 4), data, i,
> - qt_port, port);
> - if (flag == 1)
> - i += 3;
> + if (i > (RxCount - 4)) {
> + dev_dbg(>dev,
> + "Illegal escape seuences in 
> received data\n");

typo here too

> + break;
> + }
> +
> + ProcessModemStatus(qt_port, data[i + 3]);
> +
> + i += 3;

same i += 3

> + flag = 1;
>   break;
>  
>   case 0xff:

What about something like:

case 0x0:
case 0x1:
if (i > (RxCount - 4)) {
dev_dbg(>dev,
"Illegal escape sequence in 
received data\n");
break;
}

if (data[i + 2] == 0x0)
ProcessLineStatus(qt_port, data[i + 3]);
else
ProcessModemStatus(qt_port, data[i + 
3]);

i += 3;
flag = 1;
break;



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v3 0/3] vmpressure_fd: Linux VM pressure notifications

2012-11-16 Thread Anton Vorontsov
On Fri, Nov 16, 2012 at 01:57:09PM -0800, David Rientjes wrote:
> > > I'm wondering if we should have more than three different levels.
> > > 
> > 
> > In the case I outlined below, for backwards compatibility. What I
> > actually mean is that memcg *currently* allows arbitrary notifications.
> > One way to merge those, while moving to a saner 3-point notification, is
> > to still allow the old writes and fit them in the closest bucket.
> 
> Yeah, but I'm wondering why three is the right answer.

You were not Cc'ed, so let me repeat why I ended up w/ the levels (not
necessary three levels), instead of relying on the 0..100 scale:

 The main change is that I decided to go with discrete levels of the
 pressure.

 When I started writing the man page, I had to describe the 'reclaimer
 inefficiency index', and while doing this I realized that I'm describing
 how the kernel is doing the memory management, which we try to avoid in
 the vmevent. And applications don't really care about these details:
 reclaimers, its inefficiency indexes, scanning window sizes, priority
 levels, etc. -- it's all "not interesting", and purely kernel's stuff. So
 I guess Mel Gorman was right, we need some sort of levels.

 What applications (well, activity managers) are really interested in is
 this:

 1. Do we we sacrifice resources for new memory allocations (e.g. files
cache)?
 2. Does the new memory allocations' cost becomes too high, and the system
hurts because of this?
 3. Are we about to OOM soon?

 And here are the answers:

 1. VMEVENT_PRESSURE_LOW
 2. VMEVENT_PRESSURE_MED
 3. VMEVENT_PRESSURE_OOM

 There is no "high" pressure, since I really don't see any definition of
 it, but it's possible to introduce new levels without breaking ABI.

Later I came up with the fourth level:

 Maybe it makes sense to implement something like PRESSURE_MILD/BALANCE
 with an additional nr_pages threshold, which basically hints the kernel
 about how many easily reclaimable pages userland has (that would be a
 part of our definition for the mild/balance pressure level).

I.e. the fourth level can serve as a two-way communication w/ the kernel.
But again, this would be just an extension, I don't want to introduce this
now.

> > > Umm, why do users of cpusets not want to be able to trigger memory 
> > > pressure notifications?
> > > 
> > Because cpusets only deal with memory placement, not memory usage.
> 
> The set of nodes that a thread is allowed to allocate from may face memory 
> pressure up to and including oom while the rest of the system may have a 
> ton of free memory.  Your solution is to compile and mount memcg if you 
> want notifications of memory pressure on those nodes.  Others in this 
> thread have already said they don't want to rely on memcg for any of this 
> and, as Anton showed, this can be tied directly into the VM without any 
> help from memcg as it sits today.  So why implement a simple and clean 

You meant 'why not'?

> mempressure cgroup that can be used alone or co-existing with either memcg 
> or cpusets?
> 
> > And it is not that moving a task to cpuset disallows you to do any of
> > this: you could, as long as the same set of tasks are mounted in a
> > corresponding memcg.
> > 
> 
> Same thing with a separate mempressure cgroup.  The point is that there 
> will be users of this cgroup that do not want the overhead imposed by 
> memcg (which is why it's disabled in defconfig) and there's no direct 
> dependency that causes it to be a part of memcg.

There's also an API "inconvenince issue" with memcg's usage_in_bytes
stuff: applications have a hard time resetting the threshold to 'emulate'
the pressure notifications, and they also have to count bytes (like 'total
- used = free') to set the threshold. While a separate 'pressure'
notifications shows exactly what apps actually want to know: the pressure.

Thanks,
Anton.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH RFT 1/2] pinctrl: dove: Fix dove_audio1_ctrl_set when BIT(0|1|2|3) of config is clear

2012-11-16 Thread Axel Lin
> pinctrl for Dove will not be enabled before DT clock support for Dove,
> which will hopefully make it into 3.8. If you agree, I will test and push it
> with the related patches for enabling pinctrl.
Ok. Thanks.

Axel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] OMAP2+: mux: Fixed gpio mux mode analysis.

2012-11-16 Thread Tony Lindgren
* oleg.matcovs...@ti.com  [121115 13:42]:
> From: Oleg Matcovschi 
> 
> OMAP_MODE_GPIO() macro verified only OMAP_MUX_MODE4.

Indeed, that needs to be fixed.

> It is not correct for following platforms:
> 2430 - gpio mux mode 3
> 44xx - gpio mux mode 3

Looks like these are set properly to GPIO_IN_MODE3 with
omap_mux_init().

> 54xx - gpio mux mode 6

This will be only using pinctrl-single.c, so we don't
really have to worry about this one. But I guess we might
as well fix that too while at it if somebody backports omap5
support to some older kernel..
 
> Patch reserves first 3 bits in partition flags for storing gpio mux
> mode in same format as stored in control pad register.
> Modified OMAP_MODE_GPIO() macro handles all possible cases of gpio mux mode.
> Modified omap_mux_init() flags of omap34xx to include OMAP_MUX_GPIO_IN_MODE4.

Why don't you just add int gpio to struct omap_mux_partition?

You're not saving many bytes as at most we have two partitions
so far per SoC.
 
> --- a/arch/arm/mach-omap2/mux34xx.c
> +++ b/arch/arm/mach-omap2/mux34xx.c
> @@ -2053,7 +2053,7 @@ int __init omap3_mux_init(struct omap_board_mux 
> *board_subset, int flags)
>   return -EINVAL;
>   }
>  
> - return omap_mux_init("core", 0,
> + return omap_mux_init("core", OMAP_MUX_GPIO_IN_MODE4,
>OMAP3_CONTROL_PADCONF_MUX_PBASE,
>OMAP3_CONTROL_PADCONF_MUX_SIZE,
>omap3_muxmodes, package_subset, board_subset,

The default is GPIO_IN_MODE4, but that seems to be a bad
default choice as only omap3 uses it. So yeah, might as well
initialize that too and not assume any defaults GPIO mode.

Regards,

Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:x86/mm] x86/acpi: Use __pa_symbol instead of __pa on C visible symbols

2012-11-16 Thread tip-bot for Alexander Duyck
Commit-ID:  afd51a0e32cd79261f0e823400886ed322a355ac
Gitweb: http://git.kernel.org/tip/afd51a0e32cd79261f0e823400886ed322a355ac
Author: Alexander Duyck 
AuthorDate: Fri, 16 Nov 2012 13:57:43 -0800
Committer:  H. Peter Anvin 
CommitDate: Fri, 16 Nov 2012 16:42:10 -0800

x86/acpi: Use __pa_symbol instead of __pa on C visible symbols

This change just updates one spot where __pa was being used when __pa_symbol
should have been used.  By using __pa_symbol we are able to drop a few extra
lines of code as we don't have to test to see if the virtual pointer is a
part of the kernel text or just standard virtual memory.

Cc: Len Brown 
Cc: Pavel Machek 
Acked-by: "Rafael J. Wysocki" 
Signed-off-by: Alexander Duyck 
Link: 
http://lkml.kernel.org/r/20121116215737.8521.51167.st...@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin 
---
 arch/x86/kernel/acpi/sleep.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 11676cf..f146a3c 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void)
 
 #ifndef CONFIG_64BIT
header->pmode_entry = (u32)_pmode_return;
-   header->pmode_cr3 = (u32)__pa(_page_table);
+   header->pmode_cr3 = (u32)__pa_symbol(initial_page_table);
saved_magic = 0x12345678;
 #else /* CONFIG_64BIT */
 #ifdef CONFIG_SMP
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:x86/mm] x86/ftrace: Use __pa_symbol instead of __pa on C visible symbols

2012-11-16 Thread tip-bot for Alexander Duyck
Commit-ID:  217f155e9fc68bf2a6c58a7b47e0d1ce68d78818
Gitweb: http://git.kernel.org/tip/217f155e9fc68bf2a6c58a7b47e0d1ce68d78818
Author: Alexander Duyck 
AuthorDate: Fri, 16 Nov 2012 13:57:32 -0800
Committer:  H. Peter Anvin 
CommitDate: Fri, 16 Nov 2012 16:42:09 -0800

x86/ftrace: Use __pa_symbol instead of __pa on C visible symbols

Instead of using __pa which is meant to be a general function for converting
virtual addresses to physical addresses we can use __pa_symbol which is the
preferred way of decoding kernel text virtual addresses to physical addresses.

In this case we are not directly converting C visible symbols however if we
know that the instruction pointer is somewhere between _text and _etext we
know that we are going to be translating an address form the kernel text
space.

Cc: Steven Rostedt 
Cc: Frederic Weisbecker 
Signed-off-by: Alexander Duyck 
Link: 
http://lkml.kernel.org/r/20121116215718.8521.24026.st...@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin 
---
 arch/x86/kernel/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d41402..42a392a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code)
 * kernel identity mapping to modify code.
 */
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
-   ip = (unsigned long)__va(__pa(ip));
+   ip = (unsigned long)__va(__pa_symbol(ip));
 
return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE);
 }
@@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, 
int size)
 * kernel identity mapping to modify code.
 */
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
-   ip = (unsigned long)__va(__pa(ip));
+   ip = (unsigned long)__va(__pa_symbol(ip));
 
return probe_kernel_write((void *)ip, val, size);
 }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:x86/mm] x86: Use __pa_symbol instead of __pa on C visible symbols

2012-11-16 Thread tip-bot for Alexander Duyck
Commit-ID:  fc8d782677f163dee76427fdd8a92bebd2b50b23
Gitweb: http://git.kernel.org/tip/fc8d782677f163dee76427fdd8a92bebd2b50b23
Author: Alexander Duyck 
AuthorDate: Fri, 16 Nov 2012 13:57:13 -0800
Committer:  H. Peter Anvin 
CommitDate: Fri, 16 Nov 2012 16:42:09 -0800

x86: Use __pa_symbol instead of __pa on C visible symbols

When I made an attempt at separating __pa_symbol and __pa I found that there
were a number of cases where __pa was used on an obvious symbol.

I also caught one non-obvious case as _brk_start and _brk_end are based on the
address of __brk_base which is a C visible symbol.

In mark_rodata_ro I was able to reduce the overhead of kernel symbol to
virtual memory translation by using a combination of __va(__pa_symbol())
instead of page_address(virt_to_page()).

Signed-off-by: Alexander Duyck 
Link: 
http://lkml.kernel.org/r/20121116215640.8521.80483.st...@ahduyck-cp1.jf.intel.com
Signed-off-by: H. Peter Anvin 
---
 arch/x86/kernel/cpu/intel.c |  2 +-
 arch/x86/kernel/setup.c | 16 
 arch/x86/mm/init_64.c   | 18 --
 arch/x86/mm/pageattr.c  |  8 
 arch/x86/platform/efi/efi.c |  4 ++--
 arch/x86/realmode/init.c|  8 
 6 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 198e019..2249e7e 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -168,7 +168,7 @@ int __cpuinit ppro_with_ram_bug(void)
 #ifdef CONFIG_X86_F00F_BUG
 static void __cpuinit trap_init_f00f_bug(void)
 {
-   __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
+   __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);
 
/*
 * Update the IDT descriptor and reload the IDT so that
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..2702c5d 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -300,8 +300,8 @@ static void __init cleanup_highmap(void)
 static void __init reserve_brk(void)
 {
if (_brk_end > _brk_start)
-   memblock_reserve(__pa(_brk_start),
-__pa(_brk_end) - __pa(_brk_start));
+   memblock_reserve(__pa_symbol(_brk_start),
+_brk_end - _brk_start);
 
/* Mark brk area as locked down and no longer taking any
   new allocations */
@@ -761,12 +761,12 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
 
-   code_resource.start = virt_to_phys(_text);
-   code_resource.end = virt_to_phys(_etext)-1;
-   data_resource.start = virt_to_phys(_etext);
-   data_resource.end = virt_to_phys(_edata)-1;
-   bss_resource.start = virt_to_phys(&__bss_start);
-   bss_resource.end = virt_to_phys(&__bss_stop)-1;
+   code_resource.start = __pa_symbol(_text);
+   code_resource.end = __pa_symbol(_etext)-1;
+   data_resource.start = __pa_symbol(_etext);
+   data_resource.end = __pa_symbol(_edata)-1;
+   bss_resource.start = __pa_symbol(__bss_start);
+   bss_resource.end = __pa_symbol(__bss_stop)-1;
 
 #ifdef CONFIG_CMDLINE_BOOL
 #ifdef CONFIG_CMDLINE_OVERRIDE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff25..0374a10 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -770,12 +770,10 @@ void set_kernel_text_ro(void)
 void mark_rodata_ro(void)
 {
unsigned long start = PFN_ALIGN(_text);
-   unsigned long rodata_start =
-   ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+   unsigned long rodata_start = PFN_ALIGN(__start_rodata);
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
-   unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
-   unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
-   unsigned long data_start = (unsigned long) &_sdata;
+   unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
+   unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
 
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
   (end - start) >> 10);
@@ -800,12 +798,12 @@ void mark_rodata_ro(void)
 #endif
 
free_init_pages("unused kernel memory",
-   (unsigned long) page_address(virt_to_page(text_end)),
-   (unsigned long)
-page_address(virt_to_page(rodata_start)));
+   (unsigned long) __va(__pa_symbol(text_end)),
+   (unsigned long) __va(__pa_symbol(rodata_start)));
+
free_init_pages("unused kernel memory",
-   (unsigned long) page_address(virt_to_page(rodata_end)),
-   (unsigned long) page_address(virt_to_page(data_start)));
+   (unsigned long) __va(__pa_symbol(rodata_end)),
+   

  1   2   3   4   5   6   7   8   9   10   >