[PATCH v4 3/3] mtd: nand: omap2: Support for hardware BCH error correction.

2013-01-03 Thread Philip Avinash
ELM module can be used for hardware error correction of BCH 4 & 8 bit.
ELM module functionality is verified by checking the availability of
handle for ELM module in device tree. Hence supporting
1. ELM module available, BCH error correction done by ELM module. Also
support read & write page in one shot by adding custom read_page and
write_page methods. This helps in optimizing code for NAND flashes with
page size less than 4 KB.
2. If ELM module not available fall back to software BCH error
correction support.

New structure member is added to omap_nand_info
1. "is_elm_used" to know the status of whether the ELM module is used for
   error correction or not.
2. "elm_dev" device pointer to elm device on detection of ELM module.

Also update the device tree documentation of gpmc-nand to add the
optional property elm_id.

Note:
ECC layout uses 1 extra byte per 512 bytes of data to handle erased
pages. The extra byte is programmed to zero for programmed pages. Also BCH8
requires 14 bytes of ecc to maintain compatibility with the RBL ECC layout.
This results in a common ecc layout across RBL, U-boot & Linux with BCH8.

Signed-off-by: Philip Avinash 
---
This patch depend on http://www.spinics.net/lists/linux-omap/msg83504.html
for GPMC DT binding.

Changes since v3:
- ELM module availability is found by checking the device tree handle
  for the ELM module. Hence remove support for elm_request and
  configure the ELM module directly from the NAND driver for the required
  BCH ecc algorithm.
- device tree binding document update for gpmc nand.
Changes since v2:
- Threshold for erased bit flip in erased page set to minimum
  of 4 or half of ecc.strength. So max bit flips allowed
  at fixed offset is 4.
- Add dependency on runtime detection of ELM module
  instead of platform data. Dependency on is_elm_used
  flag in platform data removed.
- Return number of bit flips from read_page().
- Add bit correction support in case of bit flip in OOB
  as BCH scheme can correct bit flips in ECC.
Changes since v1:
- Incorporated GPMC modification to nand driver
- Erased page detects by checking OOB byte at fixed offset.

 .../devicetree/bindings/mtd/gpmc-nand.txt  |4 +
 drivers/mtd/nand/omap2.c   |  573 ++--
 include/linux/platform_data/elm.h  |3 +-
 3 files changed, 540 insertions(+), 40 deletions(-)

diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt 
b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
index 9f464f9..e7f8d7e 100644
--- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
@@ -29,6 +29,9 @@ Optional properties:
"bch4"  4-bit BCH ecc code
"bch8"  8-bit BCH ecc code
 
+ - elm_id: Specifies elm device node. This is required to support BCH
+   error correction using ELM module.
+
 For inline partiton table parsing (optional):
 
  - #address-cells: should be set to 1
@@ -46,6 +49,7 @@ Example for an AM33xx board:
#address-cells = <2>;
#size-cells = <1>;
ranges = <0 0 0x0800 0x2000>;   /* CS0: NAND */
+   elm_id = <&elm>;
 
nand@0,0 {
reg = <0 0 0>; /* CS0, offset 0 */
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 7d907b7..8e820dd 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -22,9 +22,12 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #ifdef CONFIG_MTD_NAND_OMAP_BCH
 #include 
+#include 
 #endif
 
 #include 
@@ -120,6 +123,30 @@
 #define BCH8_MAX_ERROR 8   /* upto 8 bit correctable */
 #define BCH4_MAX_ERROR 4   /* upto 4 bit correctable */
 
+#define SECTOR_BYTES   512
+/* 4 bit padding to make byte aligned, 56 = 52 + 4 */
+#define BCH4_BIT_PAD   4
+#define BCH8_ECC_MAX   ((SECTOR_BYTES + BCH8_ECC_OOB_BYTES) * 8)
+#define BCH4_ECC_MAX   ((SECTOR_BYTES + BCH4_ECC_OOB_BYTES) * 8)
+
+/* GPMC ecc engine settings for read */
+#define BCH_WRAPMODE_1 1   /* BCH wrap mode 1 */
+#define BCH8R_ECC_SIZE00x1a/* ecc_size0 = 26 */
+#define BCH8R_ECC_SIZE10x2 /* ecc_size1 = 2 */
+#define BCH4R_ECC_SIZE00xd /* ecc_size0 = 13 */
+#define BCH4R_ECC_SIZE10x3 /* ecc_size1 = 3 */
+
+/* GPMC ecc engine settings for write */
+#define BCH_WRAPMODE_6 6   /* BCH wrap mode 6 */
+#define BCH_ECC_SIZE0  0x0 /* ecc_size0 = 0, no oob protection */
+#define BCH_ECC_SIZE1  0x20/* ecc_size1 = 32 */
+
+#ifdef CONFIG_MTD_NAND_OMAP_BCH
+static u_char bch8_vector[] = {0xf3, 0xdb, 0x14, 0x16, 0x8b, 0xd2, 0xbe, 0xcc,
+   0xac, 0x6b, 0xff, 0x99, 0x7b};
+static u_char bch4_vector[] = {0x00, 0x6b, 

[PATCH v4 2/3] mtd: devices: elm: Add support for ELM error correction

2013-01-03 Thread Philip Avinash
The ELM hardware module can be used to speedup BCH 4/8/16 ECC scheme
error correction.
For now only 4 & 8 bit support is added

Signed-off-by: Philip Avinash 
---
Changes since v3:
- Remove export symbol and API support for elm_request and
  provided export_symbol support for elm_config.
Changes since v2:
- Remove __devinit & __devexit annotations
Changes since v1:
- Change build attribute to CONFIG_MTD_NAND_OMAP_BCH
- Reduced indentation using by passing elm_info , offset
  to elm_read & elm_write
- Removed syndrome manipulation functions.

 Documentation/devicetree/bindings/mtd/elm.txt |   16 +
 drivers/mtd/devices/Makefile  |4 +-
 drivers/mtd/devices/elm.c |  404 +
 include/linux/platform_data/elm.h |   53 
 4 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/devicetree/bindings/mtd/elm.txt
 create mode 100644 drivers/mtd/devices/elm.c
 create mode 100644 include/linux/platform_data/elm.h

diff --git a/Documentation/devicetree/bindings/mtd/elm.txt 
b/Documentation/devicetree/bindings/mtd/elm.txt
new file mode 100644
index 000..e43c668
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/elm.txt
@@ -0,0 +1,16 @@
+Error location module
+
+Required properties:
+- compatible: Must be "ti,am33xx-elm"
+- reg: physical base address and size of the registers map.
+- interrupts: Interrupt number for the elm.
+
+Optional properties:
+- ti,hwmods: Name of the hwmod associated to the elm
+
+Example:
+elm: elm@0 {
+   compatible  = "ti,am33xx-elm";
+   reg = <0x4808 0x2000>;
+   interrupts = <4>;
+};
diff --git a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
index 395733a..369a194 100644
--- a/drivers/mtd/devices/Makefile
+++ b/drivers/mtd/devices/Makefile
@@ -17,8 +17,10 @@ obj-$(CONFIG_MTD_LART)   += lart.o
 obj-$(CONFIG_MTD_BLOCK2MTD)+= block2mtd.o
 obj-$(CONFIG_MTD_DATAFLASH)+= mtd_dataflash.o
 obj-$(CONFIG_MTD_M25P80)   += m25p80.o
+obj-$(CONFIG_MTD_NAND_OMAP_BCH)+= elm.o
 obj-$(CONFIG_MTD_SPEAR_SMI)+= spear_smi.o
 obj-$(CONFIG_MTD_SST25L)   += sst25l.o
 obj-$(CONFIG_MTD_BCM47XXSFLASH)+= bcm47xxsflash.o
 
-CFLAGS_docg3.o += -I$(src)
\ No newline at end of file
+
+CFLAGS_docg3.o += -I$(src)
diff --git a/drivers/mtd/devices/elm.c b/drivers/mtd/devices/elm.c
new file mode 100644
index 000..f78f43f
--- /dev/null
+++ b/drivers/mtd/devices/elm.c
@@ -0,0 +1,404 @@
+/*
+ * Error Location Module
+ *
+ * Copyright (C) 2012 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define ELM_IRQSTATUS  0x018
+#define ELM_IRQENABLE  0x01c
+#define ELM_LOCATION_CONFIG0x020
+#define ELM_PAGE_CTRL  0x080
+#define ELM_SYNDROME_FRAGMENT_00x400
+#define ELM_SYNDROME_FRAGMENT_60x418
+#define ELM_LOCATION_STATUS0x800
+#define ELM_ERROR_LOCATION_0   0x880
+
+/* ELM Interrupt Status Register */
+#define INTR_STATUS_PAGE_VALID BIT(8)
+
+/* ELM Interrupt Enable Register */
+#define INTR_EN_PAGE_MASK  BIT(8)
+
+/* ELM Location Configuration Register */
+#define ECC_BCH_LEVEL_MASK 0x3
+
+/* ELM syndrome */
+#define ELM_SYNDROME_VALID BIT(16)
+
+/* ELM_LOCATION_STATUS Register */
+#define ECC_CORRECTABLE_MASK   BIT(8)
+#define ECC_NB_ERRORS_MASK 0x1f
+
+/* ELM_ERROR_LOCATION_0-15 Registers */
+#define ECC_ERROR_LOCATION_MASK0x1fff
+
+#define ELM_ECC_SIZE   0x7ff
+
+#define SYNDROME_FRAGMENT_REG_SIZE 0x40
+#define ERROR_LOCATION_SIZE0x100
+
+struct elm_info {
+   struct device *dev;
+   void __iomem *elm_base;
+   struct completion elm_completion;
+   struct list_head list;
+   enum bch_ecc bch_type;
+};
+
+static LIST_HEAD(elm_devices);
+
+static void elm_write_reg(struct elm_info *info, int offset, u32 val)
+{
+   writel(val, info->elm_base + offset);
+}
+
+static u32 elm_read_reg(struct elm_info *info, int offset)
+{
+   return readl(info->elm_base + offset);
+}
+
+/**
+ * elm_config - Configure ELM module
+ * @dev:   ELM device
+ * @bch_type:  Type of BCH ecc
+ */
+void elm_config(struct device *dev, enum 

[PATCH v4 1/3] mtd: nand: omap2: Update nerrors using ecc.strength

2013-01-03 Thread Philip Avinash
Remove the check of ecc bytes against 13; the number of errors can be taken
directly from the nand ecc strength. This will increase re-usability of the
code. Also add macro definitions BCH8_MAX_ERROR & BCH4_MAX_ERROR for better
readability and cleaner code.

Signed-off-by: Philip Avinash 
---
Changes since v3:
- Update commit message.

 drivers/mtd/nand/omap2.c |   12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index 1d333497c..7d907b7 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -117,6 +117,9 @@
 
 #define OMAP24XX_DMA_GPMC  4
 
+#define BCH8_MAX_ERROR 8   /* upto 8 bit correctable */
+#define BCH4_MAX_ERROR 4   /* upto 4 bit correctable */
+
 /* oob info generated runtime depending on ecc algorithm and layout selected */
 static struct nand_ecclayout omap_oobinfo;
 /* Define some generic bad / good block scan pattern which are used
@@ -1041,7 +1044,7 @@ static void omap3_enable_hwecc_bch(struct mtd_info *mtd, 
int mode)
struct nand_chip *chip = mtd->priv;
u32 val;
 
-   nerrors = (info->nand.ecc.bytes == 13) ? 8 : 4;
+   nerrors = info->nand.ecc.strength;
dev_width = (chip->options & NAND_BUSWIDTH_16) ? 1 : 0;
nsectors = 1;
/*
@@ -1218,13 +1221,14 @@ static int omap3_init_bch(struct mtd_info *mtd, int 
ecc_opt)
struct omap_nand_info *info = container_of(mtd, struct omap_nand_info,
   mtd);
 #ifdef CONFIG_MTD_NAND_OMAP_BCH8
-   const int hw_errors = 8;
+   const int hw_errors = BCH8_MAX_ERROR;
 #else
-   const int hw_errors = 4;
+   const int hw_errors = BCH4_MAX_ERROR;
 #endif
info->bch = NULL;
 
-   max_errors = (ecc_opt == OMAP_ECC_BCH8_CODE_HW) ? 8 : 4;
+   max_errors = (ecc_opt == OMAP_ECC_BCH8_CODE_HW) ?
+   BCH8_MAX_ERROR : BCH4_MAX_ERROR;
if (max_errors != hw_errors) {
pr_err("cannot configure %d-bit BCH ecc, only %d-bit supported",
   max_errors, hw_errors);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v4 0/3] mtd: nand: OMAP: ELM error correction support for BCH ecc

2013-01-03 Thread Philip Avinash
Support to use ELM as BCH 4 & 8 bit error correction module. Also performance
enhancement by adding single shot read_page and write_page functions for the
nand flashes with page size less than 4 KB.

ELM module can be used to correct errors reported by BCH 4, 8 & 16 bit
ECC scheme. For now only 4 & 8 bit support is added.

BCH 4 & 8 bit error detection support is already available in mainline
kernel and works with software error correction.

This series is based on linux 3.8-rc2 and tested with [1].
Also this patch series depend on [1] for NAND flash device
tree data and gpmc nand device tree binding documentation updates.

1. [PATCH v7 0/5] OMAP GPMC DT bindings
http://www.spinics.net/lists/linux-omap/msg83505.html

Tested on am335x-evm for BCH 4 and 8 bit error correction.

Changes since v1:
- Erased page is identified by checking byte [13/7] in read
  ecc. To filter out bit flips in OOB area, check 0 bits in
  the byte greater than 4.
- GPMC ecc engine configuration moves to omap2.c NAND driver.

Changes since v2:
- Added runtime detection of elm module, instead of depending
  on platform data.
- Added bit flip correction in OOB ecc data if bit flip happen
  OOB data.

Changes since v3:
- Availability and usability of ELM module is detected from device
  tree nodes by checking availability of ELM node in device tree.

Philip Avinash (3):
  mtd: nand: omap2: Update nerrors using ecc.strength
  mtd: devices: elm: Add support for ELM error correction
  mtd: nand: omap2: Support for hardware BCH error correction.

 Documentation/devicetree/bindings/mtd/elm.txt  |   16 +
 .../devicetree/bindings/mtd/gpmc-nand.txt  |4 +
 drivers/mtd/devices/Makefile   |4 +-
 drivers/mtd/devices/elm.c  |  405 ++
 drivers/mtd/nand/omap2.c   |  583 ++--
 include/linux/platform_data/elm.h  |   54 ++
 6 files changed, 1023 insertions(+), 43 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/mtd/elm.txt
 create mode 100644 drivers/mtd/devices/elm.c
 create mode 100644 include/linux/platform_data/elm.h

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 5/5] KVM: x86: improve reexecute_instruction

2013-01-03 Thread Xiao Guangrong
Hi Gleb,

Thanks for your review, and sorry for the delayed reply; I was on vacation.

On 12/23/2012 11:02 PM, Gleb Natapov wrote:
> On Sat, Dec 15, 2012 at 03:01:12PM +0800, Xiao Guangrong wrote:

>>
>> +is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, addr,
>> +   , user_fault);
>> +
> is_self_change_mapping() has a subtle side-effect by setting
> vcpu->arch.target_gfn_is_pt. From reading the page_fault() function
> you cannot guess why is_self_change_mapping() is not called inside "if
> (walker.level >= PT_DIRECTORY_LEVEL)" since this is the only place where
> its output is used. May be pass it pointer to target_gfn_is_pt as a
> parameter to make it clear that return value is not the only output of
> the function.

Yes, it is clearer, will do it in the next version.

> 
>>  if (walker.level >= PT_DIRECTORY_LEVEL)
>>  force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
>> -   || FNAME(is_self_change_mapping)(vcpu, , user_fault);
>> +   || is_self_change_mapping;
>>  else
>>  force_pt_level = 1;
>>  if (!force_pt_level) {
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index bf66169..fc33563 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4756,29 +4756,25 @@ static int handle_emulation_failure(struct kvm_vcpu 
>> *vcpu)
>>  static bool reexecute_instruction(struct kvm_vcpu *vcpu, unsigned long cr2)
>>  {
>>  gpa_t gpa = cr2;
>> +gfn_t gfn;
>>  pfn_t pfn;
>> -unsigned int indirect_shadow_pages;
>> -
>> -spin_lock(>kvm->mmu_lock);
>> -indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
>> -spin_unlock(>kvm->mmu_lock);
>> -
>> -if (!indirect_shadow_pages)
>> -return false;
>>
>>  if (!vcpu->arch.mmu.direct_map) {
>> -gpa = kvm_mmu_gva_to_gpa_read(vcpu, cr2, NULL);
>> +/*
>> + * Write permission should be allowed since only
>> + * write access need to be emulated.
>> + */
>> +gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
>> +
>> +/*
>> + * If the mapping is invalid in guest, let cpu retry
>> + * it to generate fault.
>> + */
>>  if (gpa == UNMAPPED_GVA)
>> -return true; /* let cpu generate fault */
>> +return true;
>>  }
> Why not fold this change to if (!vcpu->arch.mmu.direct_map) into
> previous patch where it was introduced. This looks independent of
> what you are doing in this patch.

Fine to me.

> 
>>
>> -/*
>> - * if emulation was due to access to shadowed page table
>> - * and it failed try to unshadow page and re-enter the
>> - * guest to let CPU execute the instruction.
>> - */
>> -if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
>> -return true;
>> +gfn = gpa_to_gfn(gpa);
>>
>>  /*
>>   * Do not retry the unhandleable instruction if it faults on the
>> @@ -4786,13 +4782,33 @@ static bool reexecute_instruction(struct kvm_vcpu 
>> *vcpu, unsigned long cr2)
>>   * retry instruction -> write #PF -> emulation fail -> retry
>>   * instruction -> ...
>>   */
>> -pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
>> -if (!is_error_noslot_pfn(pfn)) {
>> -kvm_release_pfn_clean(pfn);
>> +pfn = gfn_to_pfn(vcpu->kvm, gfn);
>> +
>> +/*
>> + * If the instruction failed on the error pfn, it can not be fixed,
>> + * report the error to userspace.
>> + */
>> +if (is_error_noslot_pfn(pfn))
>> +return false;
>> +
>> +kvm_release_pfn_clean(pfn);
>> +
>> +/* The instructions are well-emulated on direct mmu. */
>> +if (vcpu->arch.mmu.direct_map) {
>> +unsigned int indirect_shadow_pages;
>> +
>> +spin_lock(>kvm->mmu_lock);
>> +indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
>> +spin_unlock(>kvm->mmu_lock);
>> +
>> +if (indirect_shadow_pages)
>> +kvm_mmu_unprotect_page(vcpu->kvm, gfn);
>> +
>>  return true;
>>  }
>>
>> -return false;
>> +kvm_mmu_unprotect_page(vcpu->kvm, gfn);
>> +return !(vcpu->arch.fault_addr == cr2 && vcpu->arch.target_gfn_is_pt);
> Do you store fault_addr only to avoid using stale target_gfn_is_pt? If
> yes why not reset target_gfn_is_pt to false at the beginning of a page
> fault and get rid of fault_addr?

Good suggestion, will do. :)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 net-next] softirq: reduce latencies

2013-01-03 Thread Eric Dumazet
From: Eric Dumazet 

In various network workloads, __do_softirq() latencies can be up
to 20 ms if HZ=1000, and 200 ms if HZ=100.

This is because we iterate 10 times in the softirq dispatcher,
and some actions can consume a lot of cycles.

This patch changes the fallback to ksoftirqd condition to :

- A time limit of 2 ms.
- need_resched() being set on current task

When one of these conditions is met, we wake up ksoftirqd for further
softirq processing if we still have pending softirqs.

Using need_resched() as the only condition can trigger RCU stalls,
as we can keep BH disabled for too long.

I ran several benchmarks and got no significant difference in
throughput, but a very significant reduction of latencies (one order
of magnitude) :

In following bench, 200 antagonist "netperf -t TCP_RR" are started in
background, using all available cpus.

Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC
IRQ (hard+soft)

Before patch :

# netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
RT_LATENCY=550110.424
MIN_LATENCY=146858
MAX_LATENCY=997109
P50_LATENCY=305000
P90_LATENCY=550000
P99_LATENCY=710000
MEAN_LATENCY=376989.12
STDDEV_LATENCY=184046.92

After patch :

# netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
RT_LATENCY=40545.492
MIN_LATENCY=9834
MAX_LATENCY=78366
P50_LATENCY=33583
P90_LATENCY=59000
P99_LATENCY=69000
MEAN_LATENCY=38364.67
STDDEV_LATENCY=12865.26

Signed-off-by: Eric Dumazet 
Cc: David Miller 
Cc: Tom Herbert 
Cc: Ben Hutchings 
---
v2: min(1, (2*HZ/1000)) -> max(1, (2*HZ/1000)), as spotted by Ben

 kernel/softirq.c |   17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index ed567ba..8d5e4be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,21 +195,21 @@ void local_bh_enable_ip(unsigned long ip)
 EXPORT_SYMBOL(local_bh_enable_ip);
 
 /*
- * We restart softirq processing MAX_SOFTIRQ_RESTART times,
- * and we fall back to softirqd after that.
+ * We restart softirq processing for at most 2 ms,
+ * and if need_resched() is not set.
  *
- * This number has been established via experimentation.
+ * These limits have been established via experimentation.
  * The two things to balance is latency against fairness -
  * we want to handle softirqs as soon as possible, but they
  * should not be able to lock up the box.
  */
-#define MAX_SOFTIRQ_RESTART 10
+#define MAX_SOFTIRQ_TIME  max(1, (2*HZ/1000))
 
 asmlinkage void __do_softirq(void)
 {
struct softirq_action *h;
__u32 pending;
-   int max_restart = MAX_SOFTIRQ_RESTART;
+   unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
unsigned long old_flags = current->flags;
 
@@ -264,11 +264,12 @@ restart:
local_irq_disable();
 
pending = local_softirq_pending();
-   if (pending && --max_restart)
-   goto restart;
+   if (pending) {
+   if (time_before(jiffies, end) && !need_resched())
+   goto restart;
 
-   if (pending)
wakeup_softirqd();
+   }
 
lockdep_softirq_exit();
 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Eric Dumazet
On Fri, 2013-01-04 at 11:14 +0400, Oleg A.Arkhangelsky wrote:

> It leads to many context switches when softirqs processing deffered to
> ksoftirqd kthreads which can be very expensive. Here is some evidence
> of ksoftirqd activation effects:
> 
> http://marc.info/?l=linux-netdev=124116262916969=2
> 
> Look for "magic threshold". Yes, I know there was another bug in scheduler
> discovered that time, but this bug was only about tick accounting.
> 

This thread is 3 years old : 

- It was a router workload. Forwarded packets should not wakeup a task.
- The measure of how cpus spent their cycles was completely wrong.
- A lot of things have changed, both in network stack and scheduler.

In fact, under moderate load, my patch is able to loop more than 10
times before deferring to ksoftirqd.

Under stress, ksoftirqd will be started anyway, and it's a good thing,
because it enables process migration.

500 "context switches" [1] per second instead of 50 on behalf of
ksoftirqd is absolutely not measurable. It also permits smoother RCU
cleanups.

I did a lot of benchmarks, and didn't see any regression yet, only the
usual noise.

[1] Under load, __do_softirq() would be called 500 times per second,
instead of ~50 times per second.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-03 Thread Namjae Jeon
2013/1/4, Simon Jeons :
> On Thu, 2013-01-03 at 13:35 +0900, Namjae Jeon wrote:
>> 2013/1/2, Jan Kara :
>> > On Tue 01-01-13 08:51:04, Wanpeng Li wrote:
>> >> On Mon, Dec 31, 2012 at 12:30:54PM +0100, Jan Kara wrote:
>> >> >On Sun 30-12-12 14:59:50, Namjae Jeon wrote:
>> >> >> From: Namjae Jeon 
>> >> >>
>> >> >> Consider Process A: huge I/O on sda
>> >> >> doing heavy write operation - dirty memory becomes more
>> >> >> than dirty_background_ratio
>> >> >> on HDD - flusher thread flush-8:0
>> >> >>
>> >> >> Consider Process B: small I/O on sdb
>> >> >> doing while [1]; read 1024K + rewrite 1024K + sleep 2sec
>> >> >> on Flash device - flusher thread flush-8:16
>> >> >>
>> >> >> As Process A is a heavy dirtier, dirty memory becomes more
>> >> >> than dirty_background_thresh. Due to this, below check becomes
>> >> >> true(checking global_page_state in over_bground_thresh)
>> >> >> for all bdi devices(even for very small dirtied bdi - sdb):
>> >> >>
>> >> >> In this case, even small cached data on 'sdb' is forced to flush
>> >> >> and writeback cache thrashing happens.
>> >> >>
>> >> >> When we added debug prints inside above 'if' condition and ran
>> >> >> above Process A(heavy dirtier on bdi with flush-8:0) and
>> >> >> Process B(1024K frequent read/rewrite on bdi with flush-8:16)
>> >> >> we got below prints:
>> >> >>
>> >> >> [Test setup: ARM dual core CPU, 512 MB RAM]
>> >> >>
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84720
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 94720
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   384
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   960
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92160
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   768
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   320
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92032
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 91968
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =  1024
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   576
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84352
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   512
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92608
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92544
>> >> >> KB
>> >> >>
>> >> >> As mentioned in above log, when global dirty memory > global
>> >> >> background_thresh
>> >> >> small cached data is also forced to flush by flush-8:16.
>> >> >>
>> >> >> If removing global background_thresh checking code, we can reduce
>> >> >> cache
>> >> >> thrashing of frequently used small data.
>> >> >  It's not completely clear to me:
>> >> >  Why is this a problem? Wearing of the flash? Power consumption? I'd
>> >> > like
>> >> >to understand this before changing the code...
>> Hi Jan.
>> Yes, it can reduce wearing and fragmentation of flash. And also from
>> one scenario - we
>> think it might reduce power consumption also.
>>
>> >> >
>> >> >> And It will be great if we can reserve a portion of writeback cache
>> >> >> using
>> >> >> min_ratio.
>> >> >>
>> >> >> After applying patch:
>> >> >> $ echo 5 > /sys/block/sdb/bdi/min_ratio
>> >> >> $ cat /sys/block/sdb/bdi/min_ratio
>> >> >> 5
>> >> >>
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064
>> >> >> KB
>> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704

Re: radeon 0000:02:00.0: GPU lockup CP stall for more than 10000msec

2013-01-03 Thread Borislav Petkov
On Wed, Jan 02, 2013 at 06:37:23PM -0500, Alex Deucher wrote:
> From: Alex Deucher 
> Date: Wed, 2 Jan 2013 18:30:21 -0500
> Subject: [PATCH] drm/radeon/r6xx: fix DMA engine for ttm bo transfers
> 
> count must be a multiple of 2.
> 
> Cc: Borislav Petkov 
> Cc: Markus Trippelsdorf 
> Signed-off-by: Alex Deucher 

Thanks, will run it on the box in question next week when I have access.

Btw, you could add the note about count needing to be a multiple of 2 as
a comment in the code below, for future reference.

> ---
>  drivers/gpu/drm/radeon/r600.c |4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
> index 2aaf147..9f4ce5e 100644
> --- a/drivers/gpu/drm/radeon/r600.c
> +++ b/drivers/gpu/drm/radeon/r600.c
> @@ -2636,8 +2636,8 @@ int r600_copy_dma(struct radeon_device *rdev,
>  
>   for (i = 0; i < num_loops; i++) {
>   cur_size_in_dw = size_in_dw;
> - if (cur_size_in_dw > 0x)
> - cur_size_in_dw = 0x;
> + if (cur_size_in_dw > 0xFFFE)
> + cur_size_in_dw = 0xFFFE;
>   size_in_dw -= cur_size_in_dw;
>   radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_COPY, 0, 0, 
> cur_size_in_dw));
>   radeon_ring_write(ring, dst_offset & 0xfffc);
> -- 
> 1.7.7.5

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: load/unload dccp module caused oops

2013-01-03 Thread CAI Qian
The bisecting pointed out this commit fixed the problem in
the mainline.

3c58346525d82625e68e24f071804c2dc057b6f4
slab: Simplify bootstrap

However, simply back-porting this single commit to the 3.7.1
stable wasn't enough to fix it. My guess is that there are
some other slab/slub commits required to fix this. Keep digging...

The kernel config used the SLUB,
http://people.redhat.com/qcai/stable/.config

CAI Qian

- Original Message -
> From: "CAI Qian" 
> To: net...@vger.kernel.org
> Cc: "Dave Miller" , sta...@vger.kernel.org
> Sent: Friday, January 4, 2013 9:57:43 AM
> Subject: Re: load/unload dccp module caused
> 
> Adding the netdev as Dave suggested.
> 
> - Original Message -
> > From: "CAI Qian" 
> > To: sta...@vger.kernel.org
> > Cc: "Dave Miller" 
> > Sent: Monday, December 31, 2012 5:42:59 PM
> > Subject: load/unload dccp module caused
> > 
> > Just a head up that load and then unload the dccp module
> > caused an oops below using the current stable kernel - v3.7.1.
> > Some additional data point here: the mainline v3.6 release has
> > no such problem, so this looks like a regression. The mainline
> > v3.8-rc1 also has no such problem, so it looks like it has
> > already been fixed there but looks like yet queued up for the
> > stable yet (tested a few commits in Greg's stable-queue and
> > Dave's net-stable queue did not find anything obvious to fix
> > this). I am in-process to bisect to figure out the one that
> > need to back-port right now.
> > 
> > [   93.809573]
> > =
> > [   93.809577] BUG kmalloc-16 (Tainted: GB   ): Objects
> > remaining in kmalloc-16 on kmem_cache_close()
> > [   93.809580]
> > -
> > [   93.809580]
> > ...
> > [  356.336244] INFO: Object 0xc000fa1f0aa0 @offset=2720
> > [  356.336247] INFO: Object 0xc000fa1f0ab0 @offset=2736
> > [  356.336249] INFO: Object 0xc000fa1f0ac0 @offset=2752
> > [  356.336254] INFO: Object 0xc000fa1f0ad0 @offset=2768
> > [  356.336257] INFO: Object 0xc000fa1f0ae0 @offset=2784
> > [  356.336259] INFO: Object 0xc000fa1f0af0 @offset=2800
> > [  356.336262] INFO: Object 0xc000fa1f0b80 @offset=2944
> > [  356.336264] INFO: Object 0xc000fa1f0bd0 @offset=3024
> > [  356.336271] INFO: Object 0xc000fa1f1870 @offset=6256
> > [  356.336274] INFO: Object 0xc000fa1f1880 @offset=6272
> > [  356.336276] INFO: Object 0xc000fa1f1890 @offset=6288
> > [  356.346976] INFO: Object 0xc000fa1f18a0 @offset=6304
> > [  356.346979] INFO: Object 0xc000fa1f18b0 @offset=6320
> > [  356.346981] INFO: Object 0xc000fa1f1950 @offset=6480
> > [  356.346986] INFO: Object 0xc000fa1f1960 @offset=6496
> > [  356.346989] INFO: Object 0xc000fa1f1970 @offset=6512
> > [  356.346991] INFO: Object 0xc000fa1f1980 @offset=6528
> > [  356.346994] INFO: Object 0xc000fa1f1990 @offset=6544
> > [  356.346997] INFO: Object 0xc000fa1f19a0 @offset=6560
> > [  356.346999] INFO: Object 0xc000fa1f19b0 @offset=6576
> > [  356.347005] INFO: Object 0xc000fa1f19c0 @offset=6592
> > [  356.347008] INFO: Object 0xc000fa1f19d0 @offset=6608
> > [  356.347010] INFO: Object 0xc000fa1f19e0 @offset=6624
> > [  356.347012] INFO: Object 0xc000fa1f19f0 @offset=6640
> > [  356.347081] kmem_cache_destroy kmalloc-16: Slab cache still has
> > objects
> > ...
> > [441283.322161] BUG: unable to handle kernel NULL pointer
> > dereference
> > at   (null)
> > [441283.331020] IP: []
> > __kmem_cache_shutdown+0xa9/0x2f0
> > [441283.338320] PGD 105568f067 PUD 104a086067 PMD 0
> > [441283.343600] Oops:  [#1] SMP
> > [441283.347318] Modules linked in: dccp(-) nf_tproxy_core deflate
> > zlib_deflate lzo nls_koi8_u nls_cp932 ts_kmp sctp libcrc32c
> > binfmt_misc des_generic md4 nls_utf8 cifs dns_resolver sg iTCO_wdt
> > kvm_intel igb iTCO_vendor_support coretemp kvm crc32c_intel lpc_ich
> > i7core_edac edac_core i2c_i801 i2c_core mfd_core pcspkr microcode
> > ioatdma dca sr_mod cdrom ata_generic sd_mod pata_acpi crc_t10dif
> > ata_piix libata megaraid_sas dm_mirror dm_region_hash dm_log dm_mod
> > [last unloaded: inet_diag]
> > [441283.395187] CPU 6
> > [441283.397337] Pid: 40979, comm: modprobe Tainted: GB
> >3.7.1+ #10 QCI QSSC-S4R/QSSC-S4R
> > [441283.407245] RIP: 0010:[]
> >  []
> > __kmem_cache_shutdown+0xa9/0x2f0
> > [441283.417256] RSP: 0018:88205247de08  EFLAGS: 00010292
> > [441283.423280] RAX: 881059780001 RBX: 88085acfa000 RCX:
> > 001c7d72
> > [441283.431336] RDX: 001c7d71 RSI: 0ff0 RDI:
> > 88085f802600
> > [441283.439394] RBP: 88205247de68 R08: 00016940 R09:
> > 88105fd36940
> > [441283.447451] R10: ea004165e000 R11: 81178721 R12:
> > ffe0
> > [441283.455508] R13: 88085acf9000 R14: 88085f802500 R15:
> > ea00216b3e40
> > 

Re: [PATCH] RFC: leds-pwm: don't disable pwm when setting brightness to 0

2013-01-03 Thread Thierry Reding
On Thu, Jan 03, 2013 at 09:55:14PM +0100, Uwe Kleine-König wrote:
> Hi Thierry,
> 
> On Thu, Jan 03, 2013 at 10:01:18AM +0100, Thierry Reding wrote:
> > On Thu, Oct 25, 2012 at 04:03:49PM +0800, Shawn Guo wrote:
> > > On Wed, Oct 24, 2012 at 03:52:46PM +0200, Uwe Kleine-König wrote:
> > > > This fixes disabling the LED on i.MX28. The PWM hardware delays using
> > > > the newly set pwm-config until the beginning of a new period.  It's very
> > > > likely that pwm_disable is called before the current period ends. In
> > > > > case the LED was at brightness=max before, the LED stays on because in
> > > > > the disabled PWM block the period never ends.
> > > > 
> > > > It's unclear if the mxs-pwm driver doesn't implement the API as expected
> > > > (i.e. it should block until the newly set config is effective) or if the
> > > > leds-pwm driver makes wrong assumptions. This patch assumes the latter.
> > > > 
> > > > Signed-off-by: Uwe Kleine-König 
> > > > ---
> > > > Hello,
> > > > 
> > > > I'm not sure this is correct, but this is the workaround I'm using until
> > > > I get some feed back.
> > > 
> > > I'm fine with it, since it fixes a real problem.  Let's see what
> > > Thierry says.
> > 
> > I lost track of this thread somehow, so sorry for not getting back to
> > you earlier. The root cause of this problem seems to be that it isn't
> > very well defined (actually not at all) what is supposed to happen in
> > the case when a PWM is disabled.
> > 
> > There really are only two ways forward: a) we need to write down what
> > the PWM subsystem expects to happen when a PWM is disabled or b) keep
> > the currently undefined behaviour. With the latter I expect this kind
> > of issue to keep popping up every once in a while with all sorts of
> > ad-hoc solutions being implemented to solve the problem.
> > 
> > I think the best option would be to have some definition about what the
> > PWM signal should look like after a call to pwm_disable(). However this
> > doesn't turn out to be as trivial as it sounds. For instance, the most
> > straightforward definition in my opinion would be to specify that a PWM
> > signal should be constantly low after the call to pwm_disable(). It is
> > what I think most people would assume is the natural disable state of a
> > PWM.
> > 
> > However, one case where a similar problem was encountered involved a
> > hardware design that used an external inverter to change the polarity of
> > a PWM signal that was used to drive a backlight. In such a case, if the
> > controller were programmed to keep the output low when disabling, the
> > display would in fact be fully lit. This is further complicated by the
> > fact that the controller allows the output level of the disabled PWM
> > signal to be configured. This is nice because it means that pretty much
> > any scenario is covered, but it also doesn't make it any easier to put
> > this into a generic framework.
> > 
> > Having said that, I'm tempted to go with a simple definition like the
> > above anyway and handle obscure cases with board-specific quirks. I
> I don't understand what you mean with "the above" here. I guess it's
> "PWM signal should be constantly low after the call to pwm_disable".

Yes, exactly.

> To cover this we could add a function pwm_disable_blurb() that accepts a
> parameter specifying the desired signal state: "high", "low" or (maybe)
> "don't care". pwm_disable would then (probably) mean
> pwm_disable_blurb("don't care"). But maybe this already contradicts your
> idea about being simple and clean?!

I'm wondering if that's really necessary. This really seems more of a
board-specific question. If you run pwm_disable() on a PWM device, it
should be turned "off" (whatever that means in the context of a board
design) after the call terminates.

Part of the problem is that we want to keep the board-specific
complexities out of client drivers. For instance in the case you
encountered, the leds-pwm driver shouldn't have to know any of the
details pertaining to the i.MX28. That is, leds-pwm should be able to
call pwm_disable() if it wants to turn off the PWM signal.

If we add pwm_disable_blurb() as you suggest, what is leds-pwm supposed
to pass in? Usually this would be "low", but on other hardware (with
additional inverter circuitry) it would be "high". We certainly don't
want to have leds-pwm handling that kind of logic. The PWM signal
polarity is entirely defined at the board-level and therefore should be
handled by board setup code (or encoded in DT).

> Also note that I had another/alternative issue with the API, i.e. when
> the pwm routines should return.

Right. All of the above would entail that pwm_config() should either
block until the configuration is active, or alternatively that when
pwm_disable() is called without the new configuration being active yet,
it is pwm_disable() that needs to wait until the configuration becomes
active.

Another alternative would be that leds-pwm wouldn't have to call

RE: [PATCH 2/2]linux-usb:optimize to match the Huawei USB storage devices and support new switch command

2013-01-03 Thread Fangxiaozhi (Franko)
Dear Matthew:


> -Original Message-
> From: Matthew Dharm [mailto:mdharm-...@one-eyed-alien.net]
> Sent: Wednesday, December 19, 2012 11:41 PM
> To: Sebastian Andrzej Siewior
> Cc: Fangxiaozhi (Franko); linux-...@vger.kernel.org;
> linux-kernel@vger.kernel.org; Xueguiying (Zihan); Linlei (Lei Lin);
> g...@kroah.com; Yili (Neil); Wangyuhua (Roger, Credit); Huqiao; ba...@ti.com
> Subject: Re: [PATCH 2/2]linux-usb:optimize to match the Huawei USB storage
> devices and support new switch command
> 
> On Wed, Dec 19, 2012 at 12:34 AM, Sebastian Andrzej Siewior
>  wrote:
> > On Wed, Dec 19, 2012 at 03:13:32AM +, Fangxiaozhi (Franko) wrote:
> >> > And shouldn't you read something from the us->recv_bulk_pipe after
> >> > that?
> >>   Well, because our device will re-connect to switch the ports if it
> receives the command.
> >>   So it is not necessary to read the response of the command.
> >
> > Hmm. I guess this for Matthew / Greg to decide, I don't insist on anything.
> > Maybe a comment would be nice because now it looks, atleast to me,
> > that something is missing.
> 
> I think an unusual situation like that deserves a comment that explains that 
> the
> device is about to disconnect / reconnect, so reading status is not necessary.
You mean that we have to add some comment in the source code, 
to explain why we don't read the response. Right?

> 
> I am also concerned about the error of using  instead of bcbw.  I doubt
> this code would have worked with that typo in place.  How was this patch
> tested?
> 
> Also, the dongles_pid function is really just a different implementation of 
> the
> unusual_devs.h table.  I think that it is much easier for people to add new
> entries to the table, rather than edit your code, when new dongles are 
> released.
> BUT, your code includes many more PIDs than the table did.  Again, how was
> this tested for the new PIDs covered?  
In the dongles_pid function, we have checked all the product IDs for our
dongles, which are assigned to all of the Mobile Broadband products in our
company. So the product IDs of our new dongles in the future will also be
included in this list.
In our lab, we can configure our dongle firmware to support all of
these product IDs. We have tested them (covering all the product IDs), and
this function works fine.

>At a minimum, some comment in
> dongles_pid is required to highlight this area of code for possible future
> expansion as new devices are released.
As far as I know, the product ID list in the dongles_pid function includes
all of them. We will not add any other product IDs for our dongles, so we will
not need to update the product ID list in the dongles_pid function in the future.
However, I will also add a comment to highlight this area of code, as
you advised.
> 
> Matt
> 
> --
> Matthew Dharm
> Maintainer, USB Mass Storage driver for Linux

Best Regards,
Franko Fang


Re: [PATCH] drivers/thermal/spear_thermal.c: use devm_clk_get

2013-01-03 Thread Zhang Rui
On Fri, 2012-12-07 at 11:29 +0100, Julia Lawall wrote:
> From: Julia Lawall 
> 
> devm_clk_get allocates a resource that is released when a driver detaches.
> This patch uses devm_clk_get for data that is allocated in the probe
> function of a platform device and is only released in the remove function.
> 
> Signed-off-by: Julia Lawall 
> 
applied to thermal-next.

thanks,
rui

> ---
> I was not able to compile this code.  At one point, devm_clk_get was not
> supported for all architectures.  If that is still the case, and the code
> doesn't compile, then just ignore the patch.
> 
>  drivers/thermal/spear_thermal.c |7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/thermal/spear_thermal.c b/drivers/thermal/spear_thermal.c
> index 6b2d8b2..3c5ee56 100644
> --- a/drivers/thermal/spear_thermal.c
> +++ b/drivers/thermal/spear_thermal.c
> @@ -131,7 +131,7 @@ static int spear_thermal_probe(struct platform_device 
> *pdev)
>   return -ENOMEM;
>   }
>  
> - stdev->clk = clk_get(>dev, NULL);
> + stdev->clk = devm_clk_get(>dev, NULL);
>   if (IS_ERR(stdev->clk)) {
>   dev_err(>dev, "Can't get clock\n");
>   return PTR_ERR(stdev->clk);
> @@ -140,7 +140,7 @@ static int spear_thermal_probe(struct platform_device 
> *pdev)
>   ret = clk_enable(stdev->clk);
>   if (ret) {
>   dev_err(>dev, "Can't enable clock\n");
> - goto put_clk;
> + return ret;
>   }
>  
>   stdev->flags = val;
> @@ -163,8 +163,6 @@ static int spear_thermal_probe(struct platform_device 
> *pdev)
>  
>  disable_clk:
>   clk_disable(stdev->clk);
> -put_clk:
> - clk_put(stdev->clk);
>  
>   return ret;
>  }
> @@ -183,7 +181,6 @@ static int spear_thermal_exit(struct platform_device 
> *pdev)
>   writel_relaxed(actual_mask & ~stdev->flags, stdev->thermal_base);
>  
>   clk_disable(stdev->clk);
> - clk_put(stdev->clk);
>  
>   return 0;
>  }
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7u1 01/31] x86, mm: Fix page table early allocation offset checking

2013-01-03 Thread Borislav Petkov
On Thu, Jan 03, 2013 at 04:48:21PM -0800, Yinghai Lu wrote:
> During debugging loading kernel above 4G, found one page if is not used
> in BRK with early page allocation.
> 
> pgt_buf_top is address that can not be used, so should check if that new
> end is above that top, otherwise last page will not be used.
> 
> Fix that checking and also add print out for every allocation from BRK.

This commit message still bothers the hell out of me. Please, fix it up
to something more readable like the below, for example:

"pgt_buf_top is an address which cannot be used so we should check
whether the new 'end' is above it. Otherwise, the last BRK page remains
unused.

Fix that check and add a debug printout of every BRK allocation."

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 9/9] ARM: tegra: Add initial support for Tegra 114 SoC.

2013-01-03 Thread Hiroshi Doyu
Arnd Bergmann  wrote @ Thu, 3 Jan 2013 17:28:19 +0100:

> On Thursday 20 December 2012, Hiroshi Doyu wrote:
> > +
> > +DT_MACHINE_START(TEGRA114_DT, "NVIDIA Tegra114 (Flattened Device Tree)")
> > +   .smp= smp_ops(tegra_smp_ops),
> > +   .map_io = tegra_map_common_io,
> > +   .init_early = tegra30_init_early,
> > +   .init_irq   = tegra_dt_init_irq,
> > +   .handle_irq = gic_handle_irq,
> > +   .init_time  = clocksource_of_init,
> > +   .init_machine   = tegra114_dt_init,
> > +   .init_late  = tegra_init_late,
> > +   .restart= tegra_assert_system_reset,
> > +   .dt_compat  = tegra114_dt_board_compat,
> > +MACHINE_END
> 
> This one is so similar to Tegra30 that I wonder if it's actually worth keeping
> them separate still. The only difference I see is the clock initialization.
> Maybe that can be factored out to keep this the same as Tegra30.
> 
> Or are you planning to add more SoC specific here that would make this harder?

That was the original plan, but I'll consider if your proposal works
or not once again. Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v7u1 00/31] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G

2013-01-03 Thread Borislav Petkov
On Thu, Jan 03, 2013 at 04:48:20PM -0800, Yinghai Lu wrote:

> 
> git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
> for-x86-boot
> 
> and it is on top of linus's tree 2013-01-03
> plus tip:x86/mm, tip:x86/mm2

This is causing a merge conflict when merging tip:x86/mm2
after having merged tip:x86/mm ontop of -rc2+ (today's Linus'
tree) in mm/nobootmem.c. free_all_bootmem_node has gained a
reset_node_lowmem_managed_pages() call which got added in
9feedc9d831e18ae6d0d15aa562e5e46ba53647b.

Now, you have a patch in tip:x86/mm2 which kills that
free_all_bootmem_node() function but the commit above adds that
reset_node_lowmem_managed_pages() call to it.

A proper merge conflict resolve would need to be added to the pull
request which sends tip:x86/mm2 upstream and then you'd need to rebase
your stuff ontop. Or something better which I'm not thinking of right
now...

Thanks.

-- 
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/7] clk: highbank: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function for clocks initialization.

Signed-off-by: Prashant Gaikwad 
---
 arch/arm/mach-highbank/core.h |1 -
 arch/arm/mach-highbank/highbank.c |3 ++-
 drivers/clk/clk-highbank.c|   18 --
 3 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/arch/arm/mach-highbank/core.h b/arch/arm/mach-highbank/core.h
index 80235b4..3f65206 100644
--- a/arch/arm/mach-highbank/core.h
+++ b/arch/arm/mach-highbank/core.h
@@ -2,7 +2,6 @@
 #define __HIGHBANK_CORE_H
 
 extern void highbank_set_cpu_jump(int cpu, void *jump_addr);
-extern void highbank_clocks_init(void);
 extern void highbank_restart(char, const char *);
 extern void __iomem *scu_base_addr;
 
diff --git a/arch/arm/mach-highbank/highbank.c 
b/arch/arm/mach-highbank/highbank.c
index f6ca285..fb148da 100644
--- a/arch/arm/mach-highbank/highbank.c
+++ b/arch/arm/mach-highbank/highbank.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -116,7 +117,7 @@ static void __init highbank_timer_init(void)
WARN_ON(!timer_base);
irq = irq_of_parse_and_map(np, 0);
 
-   highbank_clocks_init();
+   of_clk_init(NULL);
lookup.clk = of_clk_get(np, 0);
clkdev_add();
 
diff --git a/drivers/clk/clk-highbank.c b/drivers/clk/clk-highbank.c
index 52fecad..5d1de2e 100644
--- a/drivers/clk/clk-highbank.c
+++ b/drivers/clk/clk-highbank.c
@@ -314,33 +314,23 @@ static void __init hb_pll_init(struct device_node *node)
 {
hb_clk_init(node, _pll_ops);
 }
+CLK_OF_DECLARE(hb_pll, "calxeda,hb-pll-clock", hb_pll_init);
 
 static void __init hb_a9periph_init(struct device_node *node)
 {
hb_clk_init(node, _ops);
 }
+CLK_OF_DECLARE(hb_a9periph, "calxeda,hb-a9periph-clock", hb_a9periph_init);
 
 static void __init hb_a9bus_init(struct device_node *node)
 {
struct clk *clk = hb_clk_init(node, _ops);
clk_prepare_enable(clk);
 }
+CLK_OF_DECLARE(hb_a9bus, "calxeda,hb-a9bus-clock", hb_a9bus_init);
 
 static void __init hb_emmc_init(struct device_node *node)
 {
hb_clk_init(node, _ops);
 }
-
-static const __initconst struct of_device_id clk_match[] = {
-   { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
-   { .compatible = "calxeda,hb-pll-clock", .data = hb_pll_init, },
-   { .compatible = "calxeda,hb-a9periph-clock", .data = hb_a9periph_init, 
},
-   { .compatible = "calxeda,hb-a9bus-clock", .data = hb_a9bus_init, },
-   { .compatible = "calxeda,hb-emmc-clock", .data = hb_emmc_init, },
-   {}
-};
-
-void __init highbank_clocks_init(void)
-{
-   of_clk_init(clk_match);
-}
+CLK_OF_DECLARE(hb_emmc, "calxeda,hb-emmc-clock", hb_emmc_init);
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/7] clk: tegra: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function for clocks initialization.

Signed-off-by: Prashant Gaikwad 
---
 arch/arm/mach-tegra/common.c|4 ++--
 drivers/clk/tegra/clk-tegra20.c |3 ++-
 drivers/clk/tegra/clk-tegra30.c |3 ++-
 drivers/clk/tegra/clk.c |   11 ---
 drivers/clk/tegra/clk.h |   12 
 include/linux/clk/tegra.h   |1 -
 6 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/arch/arm/mach-tegra/common.c b/arch/arm/mach-tegra/common.c
index 3a7280d..4fde6a9 100644
--- a/arch/arm/mach-tegra/common.c
+++ b/arch/arm/mach-tegra/common.c
@@ -22,7 +22,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
 #include 
@@ -64,7 +64,7 @@ static const struct of_device_id tegra_dt_irq_match[] 
__initconst = {
 
 void __init tegra_dt_init_irq(void)
 {
-   tegra_clocks_init();
+   of_clk_init(NULL);
tegra_init_irq();
of_irq_init(tegra_dt_irq_match);
 }
diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c
index 32a3d81..3436ad9 100644
--- a/drivers/clk/tegra/clk-tegra20.c
+++ b/drivers/clk/tegra/clk-tegra20.c
@@ -1193,7 +1193,7 @@ static const struct of_device_id apb_match[] __initconst 
= {
{},
 };
 
-void __init tegra20_clock_init(struct device_node *np)
+static void __init tegra20_clock_init(struct device_node *np)
 {
int i;
struct device_node *node;
@@ -1253,3 +1253,4 @@ void __init tegra20_clock_init(struct device_node *np)
 
tegra_cpu_car_ops = _cpu_car_ops;
 }
+CLK_OF_DECLARE(tegra20, "nvidia,tegra20-car", tegra20_clock_init);
diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c
index 30fb743..f17e857 100644
--- a/drivers/clk/tegra/clk-tegra30.c
+++ b/drivers/clk/tegra/clk-tegra30.c
@@ -1971,7 +1971,7 @@ static const struct of_device_id apb_match[] __initconst 
= {
{},
 };
 
-void __init tegra30_clock_init(struct device_node *np)
+static void __init tegra30_clock_init(struct device_node *np)
 {
struct device_node *node;
int i;
@@ -2031,3 +2031,4 @@ void __init tegra30_clock_init(struct device_node *np)
 
tegra_cpu_car_ops = _cpu_car_ops;
 }
+CLK_OF_DECLARE(tegra30, "nvidia,tegra30-car", tegra30_clock_init);
diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c
index a603b9a..ce4441a 100644
--- a/drivers/clk/tegra/clk.c
+++ b/drivers/clk/tegra/clk.c
@@ -72,14 +72,3 @@ void __init tegra_init_from_table(struct 
tegra_clk_init_table *tbl,
}
}
 }
-
-static const struct of_device_id tegra_dt_clk_match[] = {
-   { .compatible = "nvidia,tegra20-car", .data = tegra20_clock_init },
-   { .compatible = "nvidia,tegra30-car", .data = tegra30_clock_init },
-   { }
-};
-
-void __init tegra_clocks_init(void)
-{
-   of_clk_init(tegra_dt_clk_match);
-}
diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h
index f1ed1d0..7fdf8e6 100644
--- a/drivers/clk/tegra/clk.h
+++ b/drivers/clk/tegra/clk.h
@@ -476,16 +476,4 @@ void tegra_init_from_table(struct tegra_clk_init_table 
*tbl,
 void tegra_init_dup_clks(struct tegra_clk_duplicate *dup_list,
struct clk *clks[], int clk_max);
 
-#ifdef CONFIG_ARCH_TEGRA_2x_SOC
-void tegra20_clock_init(struct device_node *np);
-#else
-static inline void tegra20_clock_init(struct device_node *np) {}
-#endif /* CONFIG_ARCH_TEGRA_2x_SOC */
-
-#ifdef CONFIG_ARCH_TEGRA_3x_SOC
-void tegra30_clock_init(struct device_node *np);
-#else
-static inline void tegra30_clock_init(struct device_node *np) {}
-#endif /* CONFIG_ARCH_TEGRA_3x_SOC */
-
 #endif /* TEGRA_CLK_H */
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 404d6f9..2e8b399 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -122,6 +122,5 @@ static inline void tegra_cpu_clock_resume(void)
 
 void tegra_periph_reset_deassert(struct clk *c);
 void tegra_periph_reset_assert(struct clk *c);
-void tegra_clocks_init(void);
 
 #endif /* __LINUX_CLK_TEGRA_H_ */
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 7/7] clk: vexpress: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function for clock initialization.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/versatile/clk-vexpress-osc.c |1 +
 drivers/clk/versatile/clk-vexpress.c |8 +---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/clk/versatile/clk-vexpress-osc.c 
b/drivers/clk/versatile/clk-vexpress-osc.c
index dcb6ae0..256c8be 100644
--- a/drivers/clk/versatile/clk-vexpress-osc.c
+++ b/drivers/clk/versatile/clk-vexpress-osc.c
@@ -144,3 +144,4 @@ error:
vexpress_config_func_put(osc->func);
kfree(osc);
 }
+CLK_OF_DECLARE(vexpress_soc, "arm,vexpress-osc", vexpress_osc_of_setup);
diff --git a/drivers/clk/versatile/clk-vexpress.c 
b/drivers/clk/versatile/clk-vexpress.c
index c742ac7..f889f2f 100644
--- a/drivers/clk/versatile/clk-vexpress.c
+++ b/drivers/clk/versatile/clk-vexpress.c
@@ -99,19 +99,13 @@ struct clk *vexpress_sp810_of_get(struct of_phandle_args 
*clkspec, void *data)
return vexpress_sp810_timerclken[clkspec->args[0]];
 }
 
-static const __initconst struct of_device_id vexpress_fixed_clk_match[] = {
-   { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
-   { .compatible = "arm,vexpress-osc", .data = vexpress_osc_of_setup, },
-   {}
-};
-
 void __init vexpress_clk_of_init(void)
 {
struct device_node *node;
struct clk *clk;
struct clk *refclk, *timclk;
 
-   of_clk_init(vexpress_fixed_clk_match);
+   of_clk_init(NULL);
 
node = of_find_compatible_node(NULL, NULL, "arm,sp810");
vexpress_sp810_init(of_iomap(node, 0));
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 5/7] clk: vt8500: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function for clock initialization.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/clk-vt8500.c |   15 ---
 1 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/drivers/clk/clk-vt8500.c b/drivers/clk/clk-vt8500.c
index fe25570..3ce1c3e 100644
--- a/drivers/clk/clk-vt8500.c
+++ b/drivers/clk/clk-vt8500.c
@@ -272,7 +272,7 @@ static __init void vtwm_device_clk_init(struct device_node 
*node)
rc = of_clk_add_provider(node, of_clk_src_simple_get, clk);
clk_register_clkdev(clk, clk_name, NULL);
 }
-
+CLK_OF_DECLARE(vt8500_device, "via,vt8500-device-clock", vtwm_device_clk_init);
 
 /* PLL clock related functions */
 
@@ -502,20 +502,13 @@ static void __init vt8500_pll_init(struct device_node 
*node)
 {
vtwm_pll_clk_init(node, PLL_TYPE_VT8500);
 }
+CLK_OF_DECLARE(vt8500_pll, "via,vt8500-pll-clock", vt8500_pll_init);
 
 static void __init wm8650_pll_init(struct device_node *node)
 {
vtwm_pll_clk_init(node, PLL_TYPE_WM8650);
 }
-
-static const __initconst struct of_device_id clk_match[] = {
-   { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
-   { .compatible = "via,vt8500-pll-clock", .data = vt8500_pll_init, },
-   { .compatible = "wm,wm8650-pll-clock", .data = wm8650_pll_init, },
-   { .compatible = "via,vt8500-device-clock",
-   .data = vtwm_device_clk_init, },
-   { /* sentinel */ }
-};
+CLK_OF_DECLARE(wm8650_pll, "wm,wm8650-pll-clock", wm8650_pll_init);
 
 void __init vtwm_clk_init(void __iomem *base)
 {
@@ -524,5 +517,5 @@ void __init vtwm_clk_init(void __iomem *base)
 
pmc_base = base;
 
-   of_clk_init(clk_match);
+   of_clk_init(NULL);
 }
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 6/7] clk: zynq: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function for clock initialization.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/clk-zynq.c |   14 --
 1 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/drivers/clk/clk-zynq.c b/drivers/clk/clk-zynq.c
index 37a3051..b14a25f 100644
--- a/drivers/clk/clk-zynq.c
+++ b/drivers/clk/clk-zynq.c
@@ -81,6 +81,7 @@ static void __init zynq_pll_clk_setup(struct device_node *np)
if (WARN_ON(ret))
return;
 }
+CLK_OF_DECLARE(zynq_pll, "xlnx,zynq-pll", zynq_pll_clk_setup);
 
 struct zynq_periph_clk {
struct clk_hw   hw;
@@ -187,6 +188,7 @@ static void __init zynq_periph_clk_setup(struct device_node 
*np)
if (WARN_ON(err))
return;
 }
+CLK_OF_DECLARE(zynq_periph, "xlnx,zynq-periph-clock", zynq_periph_clk_setup);
 
 /* CPU Clock domain is modelled as a mux with 4 children subclks, whose
  * derivative rates depend on CLK_621_TRUE
@@ -366,18 +368,10 @@ static void __init zynq_cpu_clk_setup(struct device_node 
*np)
if (WARN_ON(err))
return;
 }
-
-static const __initconst struct of_device_id zynq_clk_match[] = {
-   { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
-   { .compatible = "xlnx,zynq-pll", .data = zynq_pll_clk_setup, },
-   { .compatible = "xlnx,zynq-periph-clock",
-   .data = zynq_periph_clk_setup, },
-   { .compatible = "xlnx,zynq-cpu-clock", .data = zynq_cpu_clk_setup, },
-   {}
-};
+CLK_OF_DECLARE(zynq_cpu, "xlnx,zynq-cpu-clock", zynq_cpu_clk_setup);
 
 void __init xilinx_zynq_clocks_init(void __iomem *slcr)
 {
slcr_base = slcr;
-   of_clk_init(zynq_clk_match);
+   of_clk_init(NULL);
 }
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/7] clk: sunxi: Use common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Use common of_clk_init() function to initialize clocks.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/clk-sunxi.c   |   30 --
 drivers/clocksource/sunxi_timer.c |4 ++--
 include/linux/clk/sunxi.h |   22 --
 3 files changed, 2 insertions(+), 54 deletions(-)
 delete mode 100644 drivers/clk/clk-sunxi.c
 delete mode 100644 include/linux/clk/sunxi.h

diff --git a/drivers/clk/clk-sunxi.c b/drivers/clk/clk-sunxi.c
deleted file mode 100644
index 0e831b5..000
--- a/drivers/clk/clk-sunxi.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright 2012 Maxime Ripard
- *
- * Maxime Ripard 
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include 
-#include 
-#include 
-#include 
-
-static const __initconst struct of_device_id clk_match[] = {
-   { .compatible = "fixed-clock", .data = of_fixed_clk_setup, },
-   {}
-};
-
-void __init sunxi_init_clocks(void)
-{
-   of_clk_init(clk_match);
-}
diff --git a/drivers/clocksource/sunxi_timer.c 
b/drivers/clocksource/sunxi_timer.c
index 6c2ed56..08e1756 100644
--- a/drivers/clocksource/sunxi_timer.c
+++ b/drivers/clocksource/sunxi_timer.c
@@ -23,7 +23,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #define TIMER_CTL_REG  0x00
 #define TIMER_CTL_ENABLE   (1 << 0)
@@ -124,7 +124,7 @@ void __init sunxi_timer_init(void)
if (irq <= 0)
panic("Can't parse IRQ");
 
-   sunxi_init_clocks();
+   of_clk_init(NULL);
 
clk = of_clk_get(node, 0);
if (IS_ERR(clk))
diff --git a/include/linux/clk/sunxi.h b/include/linux/clk/sunxi.h
deleted file mode 100644
index e074fdd..000
--- a/include/linux/clk/sunxi.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright 2012 Maxime Ripard
- *
- * Maxime Ripard 
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#ifndef __LINUX_CLK_SUNXI_H_
-#define __LINUX_CLK_SUNXI_H_
-
-void __init sunxi_init_clocks(void);
-
-#endif
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/7] clk: add common of_clk_init() function

2013-01-03 Thread Prashant Gaikwad
Modify of_clk_init function so that it will determine which
driver to initialize based on device tree instead of each driver
registering to it.

Based on a similar patch for drivers/irqchip by Thomas Petazzoni and
drivers/clocksource by Stephen Warren.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/clk-fixed-rate.c  |1 +
 drivers/clk/clk.c |9 +
 include/asm-generic/vmlinux.lds.h |   10 ++
 include/linux/clk-provider.h  |6 ++
 4 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/drivers/clk/clk-fixed-rate.c b/drivers/clk/clk-fixed-rate.c
index af78ed6..f2104df 100644
--- a/drivers/clk/clk-fixed-rate.c
+++ b/drivers/clk/clk-fixed-rate.c
@@ -101,4 +101,5 @@ void __init of_fixed_clk_setup(struct device_node *node)
of_clk_add_provider(node, of_clk_src_simple_get, clk);
 }
 EXPORT_SYMBOL_GPL(of_fixed_clk_setup);
+CLK_OF_DECLARE(fixed_clk, "fixed-clock", of_fixed_clk_setup);
 #endif
diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c
index 037b48a..fb38dd8 100644
--- a/drivers/clk/clk.c
+++ b/drivers/clk/clk.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_SPINLOCK(enable_lock);
 static DEFINE_MUTEX(prepare_lock);
@@ -1805,6 +1806,11 @@ struct of_clk_provider {
void *data;
 };
 
+extern struct of_device_id __clk_of_table[];
+
+static const struct of_device_id __clk_of_table_sentinel
+   __used __section(__clk_of_table_end);
+
 static LIST_HEAD(of_clk_providers);
 static DEFINE_MUTEX(of_clk_lock);
 
@@ -1933,6 +1939,9 @@ void __init of_clk_init(const struct of_device_id 
*matches)
 {
struct device_node *np;
 
+   if (!matches)
+   matches = __clk_of_table;
+
for_each_matching_node(np, matches) {
const struct of_device_id *match = of_match_node(matches, np);
of_clk_init_cb_t clk_init_cb = match->data;
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index 1e744c5..8282f7c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -158,6 +158,15 @@
 #define CLKSRC_OF_TABLES()
 #endif
 
+#ifdef CONFIG_COMMON_CLK
+#define CLK_OF_TABLES() . = ALIGN(8);  \
+   VMLINUX_SYMBOL(__clk_of_table) = .; \
+   *(__clk_of_table)   \
+   *(__clk_of_table_end)
+#else
+#define CLK_OF_TABLES()
+#endif
+
 #define KERNEL_DTB()   \
STRUCT_ALIGN(); \
VMLINUX_SYMBOL(__dtb_start) = .;\
@@ -502,6 +511,7 @@
CPU_DISCARD(init.rodata)\
MEM_DISCARD(init.rodata)\
CLKSRC_OF_TABLES()  \
+   CLK_OF_TABLES() \
KERNEL_DTB()
 
 #define INIT_TEXT  \
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 4989b8a..7f197d7 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -379,7 +379,13 @@ struct clk_onecell_data {
 };
 struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void 
*data);
 const char *of_clk_get_parent_name(struct device_node *np, int index);
+
 void of_clk_init(const struct of_device_id *matches);
 
+#define CLK_OF_DECLARE(name, compat, fn)   \
+   static const struct of_device_id __clk_of_table_##name  \
+   __used __section(__clk_of_table)\
+   = { .compatible = compat, .data = fn };
+
 #endif /* CONFIG_COMMON_CLK */
 #endif /* CLK_PROVIDER_H */
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Eric Dumazet
On Fri, 2013-01-04 at 06:31 +0100, Sedat Dilek wrote:

> 
> Will you send a v2 with this change...?
> 
> -#define MAX_SOFTIRQ_TIME  min(1, (2*HZ/1000))
> +#define MAX_SOFTIRQ_TIME  max(1, (2*HZ/1000))

I will, I was planning to do this after waiting for other
comments/reviews.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Eric Dumazet
On Fri, 2013-01-04 at 14:16 +0900, Namhyung Kim wrote:

> Probably a silly question:
> 
> Why not using ktime rather than jiffies for this?

ktime is too expensive on some hardware.

Here we only want a safety belt, no need for high time resolution.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] cpuidle - fix lock contention in the idle path

2013-01-03 Thread Daniel Lezcano
On 01/02/2013 10:13 PM, Russ Anderson wrote:
> On Wed, Dec 26, 2012 at 11:01:48AM +0100, Daniel Lezcano wrote:
>> The commit bf4d1b5ddb78f86078ac6ae0415802d5f0c68f92 introduces
>> a lock in the cpuidle_get_cpu_driver function. This function
>> is used in the idle_call function.
>>
>> The problem is the contention with a large number of cpus because
>> they try to access the idle routine at the same time.
>>
>> The lock can be safely removed because of how the cpuidle api
>> is used. cpuidle_register_driver is called first, but until
>> cpuidle_register_device has been called we don't enter the
>> cpuidle idle call function because the device is not enabled.
>>
>> The cpuidle_unregister_driver function, which leads to a NULL driver,
>> is not called before cpuidle_unregister_device.
>>
>> This is how the cpuidle api is used by the different drivers.
>>
>> However, a cleanup around the lock and a proper refcounting
>> mechanism should be used to ensure the consistency of the api,
>> e.g. cpuidle_unregister_driver should fail if its refcount
>> is not 0.
>>
>> These modifications will need some code reorganization and rewrite
>> which does not fit with a fix.
> 
> I agree.
> 
>> The following patch is a hot fix by returning to the initial behavior
>> by removing the lock when getting the driver.
> 
> The patch fixes the problem.  Verified on a system with 1024 cpus.
> Thanks.
> 
>> Signed-off-by: Daniel Lezcano 
> 
> Reported-by: Russ Anderson 
> Acked-by: Russ Anderson 

Hi Rafael,

could you consider this patch for merging ?

Thanks
  -- Daniel


-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5] usb: phy: samsung: Add support to set pmu isolation

2013-01-03 Thread Vivek Gautam
Hi,


On Fri, Dec 28, 2012 at 2:43 PM, Vivek Gautam  wrote:
> Adding support to parse device node data in order to get
> required properties to set pmu isolation for usb-phy.
>
> Signed-off-by: Vivek Gautam 
> ---
>

Any further comments on this ? Or does this seem fine ?


-- 
Thanks & Regards
Vivek
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[git pull] radeon and minor nouveau fixes

2013-01-03 Thread Dave Airlie

Hi Linus,

Just a radeon pull from Alex, fixes a few regressions since 3.7 and 
reworks some of the reset handling, and two minor nouveau fixes I found on 
the list, Ben will be back next week to take care of the couple of larger 
nouveau patches that I see outstanding.

Dave.

The following changes since commit d5757dbe79870d825d0dec30074d48683e1d7e9a:

  Revert "drm: tegra: protect DC register access with mutex" (2012-12-30 
21:58:20 +1000)

are available in the git repository at:

  git://people.freedesktop.org/~airlied/linux drm-next

for you to fetch changes up to eda85d6ad490923152544fba0473798b6cc0edf6:

  drm/nouveau: fix init with agpgart-uninorth (2013-01-04 16:04:33 +1000)


Aaro Koskinen (1):
  drm/nouveau: fix init with agpgart-uninorth

Alex Deucher (8):
  drm/radeon: add connector table for Mac G4 Silver
  drm/radeon/r6xx: fix DMA engine for ttm bo transfers
  drm/radeon: fix typo in evergreen dma fence
  drm/radeon: add GPU reset flags
  drm/radeon: switch to a finer grained reset for r6xx/7xx
  drm/radeon: switch to a finer grained reset for evergreen
  drm/radeon: switch to a finer grained reset for cayman/TN
  drm/radeon: switch to a finer grained reset for SI (v2)

Dave Airlie (1):
  Merge branch 'drm-fixes-3.8' of git://people.freedesktop.org/~agd5f/linux 
into drm-next

Guenter Roeck (1):
  drm: nouveau: Fix build warning seen if HWMON is undefined

Jerome Glisse (4):
  drm/radeon: add debugfs file for dma rings
  drm/radeon: improve ring debugfs printing
  drm/radeon: print dma status reg on lockup (v2)
  drm/radeon: reset dma engine on gpu reset (v2)

Niels Ole Salscheider (1):
  drm/radeon: Properly handle DDC probe for DP bridges

 drivers/gpu/drm/nouveau/nouveau_bo.c   |   2 +-
 drivers/gpu/drm/nouveau/nouveau_pm.c   |   4 +-
 drivers/gpu/drm/radeon/evergreen.c |  86 +--
 drivers/gpu/drm/radeon/evergreend.h|  14 +++-
 drivers/gpu/drm/radeon/ni.c| 106 +
 drivers/gpu/drm/radeon/nid.h   |   3 +-
 drivers/gpu/drm/radeon/r600.c  |  89 ++--
 drivers/gpu/drm/radeon/radeon.h|   5 ++
 drivers/gpu/drm/radeon/radeon_combios.c|  51 ++
 drivers/gpu/drm/radeon/radeon_connectors.c |  10 +--
 drivers/gpu/drm/radeon/radeon_display.c|  13 ++--
 drivers/gpu/drm/radeon/radeon_i2c.c|  10 ++-
 drivers/gpu/drm/radeon/radeon_mode.h   |   5 +-
 drivers/gpu/drm/radeon/radeon_ring.c   |  24 +--
 drivers/gpu/drm/radeon/si.c|  78 ++---
 drivers/gpu/drm/radeon/sid.h   |  18 +
 16 files changed, 416 insertions(+), 102 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] adv7343: use devm_kzalloc() instead of kzalloc()

2013-01-03 Thread Laurent Pinchart
Hi Prabhakar,

Thank you for the patches.

For the whole set,

Acked-by: Laurent Pinchart 

On Friday 04 January 2013 10:41:15 Lad, Prabhakar wrote:
> I2C drivers can use devm_kzalloc() too in their .probe() methods. Doing so
> simplifies their clean up paths.
> 
> Signed-off-by: Lad, Prabhakar 
> Signed-off-by: Manjunath Hadli 
> ---
>  Changes for v2:
>  1: Fixed comments pointed out by Laurent.
> 
>  drivers/media/i2c/adv7343.c |9 +++--
>  1 files changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/media/i2c/adv7343.c b/drivers/media/i2c/adv7343.c
> index 2b5aa67..432eb5f 100644
> --- a/drivers/media/i2c/adv7343.c
> +++ b/drivers/media/i2c/adv7343.c
> @@ -397,7 +397,8 @@ static int adv7343_probe(struct i2c_client *client,
>   v4l_info(client, "chip found @ 0x%x (%s)\n",
>   client->addr << 1, client->adapter->name);
> 
> - state = kzalloc(sizeof(struct adv7343_state), GFP_KERNEL);
> + state = devm_kzalloc(&client->dev, sizeof(struct adv7343_state),
> +  GFP_KERNEL);
>   if (state == NULL)
>   return -ENOMEM;
> 
> @@ -431,16 +432,13 @@ static int adv7343_probe(struct i2c_client *client,
>   int err = state->hdl.error;
> 
>   v4l2_ctrl_handler_free(&state->hdl);
> - kfree(state);
>   return err;
>   }
>   v4l2_ctrl_handler_setup(&state->hdl);
> 
>   err = adv7343_initialize(&state->sd);
> - if (err) {
> + if (err)
>   v4l2_ctrl_handler_free(&state->hdl);
> - kfree(state);
> - }
>   return err;
>  }
> 
> @@ -451,7 +449,6 @@ static int adv7343_remove(struct i2c_client *client)
> 
>   v4l2_device_unregister_subdev(sd);
>   v4l2_ctrl_handler_free(&state->hdl);
> - kfree(state);
> 
>   return 0;
>  }
-- 
Regards,

Laurent Pinchart

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RESEND PATCH] drm/nouveau: fix init with agpgart-uninorth

2013-01-03 Thread Dave Airlie
On Tue, Jan 1, 2013 at 11:21 AM, Marcin Slusarz
 wrote:
> On Mon, Dec 31, 2012 at 03:34:59AM +0200, Aaro Koskinen wrote:
>> Check that the AGP aperture can be mapped. This follows a similar change
>> done for Radeon (commit 365048ff, drm/radeon: AGP memory is only I/O if
>> the aperture can be mapped by the CPU.).
>>
>> The patch fixes the following error seen on G5 iMac:
>>
>>   nouveau E[ DRM] failed to create kernel channel, -12
>>
>> Reviewed-by: Michel Dänzer 
>> Signed-off-by: Aaro Koskinen 
>> ---
>
> This patch fixes https://bugs.freedesktop.org/show_bug.cgi?id=58806.
> For some (weird) reason Nouveau worked on this configuration on 3.6 kernel,
> so cc'ing stable@vger seems to be appropriate.

Cool I've picked this up and applied it for stable into fixes now.

Dave.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] cpufreq: Don't use cpu removed during cpufreq_driver_unregister

2013-01-03 Thread Srivatsa S. Bhat
On 01/04/2013 10:49 AM, Viresh Kumar wrote:
> On 3 January 2013 19:55, Srivatsa S. Bhat
>  wrote:
>> I took a quick look at the problem you described above, and the cpufreq 
>> code..
>> If we cannot avoid calling cpufreq_add_dev() from cpufreq_remove_dev(), then 
>> I can't
>> think of anything better than what your patch does.
> 
> Good :)
> 
>> BTW, off-topic, while going through that path, I think I found a memory leak
>> in __cpufreq_remove_dev():
>>
>> if (unlikely(cpumask_weight(data->cpus) > 1)) {
>> for_each_cpu(j, data->cpus) {
>> if (j == cpu)
>> continue;
>> per_cpu(cpufreq_cpu_data, j) = NULL;
>> }
>> }
>>
>> We are assigning NULL without freeing that memory.
> 
> Not really. All cpus in affected_cpus (data->cpus), share the same
> policy structure.
> We have already taken backup of cpufreq_cpu_data for the first cpu in "data" 
> and
> are freeing it here:
> 
>   kfree(data);
> 

Ah, ok, got it. Thanks!

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] clk: tegra30: Convert clk out to composite clk

2013-01-03 Thread Prashant Gaikwad
Convert clk out to composite clock type which removes
the mux clock.

Signed-off-by: Prashant Gaikwad 
---
This patch is rebased on ccf-rework for Tegra patch series. It is just to show
how clk-composite can be used, not to be merged. If patch 1 is accepted then
I would like to merge this patch to ccf-rework series.
---
 drivers/clk/tegra/clk-tegra30.c |   51 +-
 drivers/clk/tegra/clk.h |   49 +
 2 files changed, 67 insertions(+), 33 deletions(-)

diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c
index 30fb743..4c16c11 100644
--- a/drivers/clk/tegra/clk-tegra30.c
+++ b/drivers/clk/tegra/clk-tegra30.c
@@ -1191,43 +1191,28 @@ static void __init tegra30_audio_clk_init(void)
clks[spdif_2x] = clk;
 }
 
+static struct tegra_clk_out_init_data tegra_clk_out_list[] = {
+   TEGRA_CLK_OUT_INIT_DATA("clk_out_1", "extern1", "clk_out_1", 
clk_out1_parents, PMC_CLK_OUT_CNTRL, 6, 3, 0, 2, 0, &clk_out_lock, clk_out_1),
+   TEGRA_CLK_OUT_INIT_DATA("clk_out_2", "extern2", "clk_out_2", 
clk_out2_parents, PMC_CLK_OUT_CNTRL, 14, 3, 0, 10, 0, &clk_out_lock, clk_out_2),
+   TEGRA_CLK_OUT_INIT_DATA("clk_out_3", "extern3", "clk_out_3", 
clk_out3_parents, PMC_CLK_OUT_CNTRL, 22, 3, 0, 18, 0, &clk_out_lock, clk_out_3),
+};
+
 static void __init tegra30_pmc_clk_init(void)
 {
struct clk *clk;
+   int i;
 
-   /* clk_out_1 */
-   clk = clk_register_mux(NULL, "clk_out_1_mux", clk_out1_parents,
-  ARRAY_SIZE(clk_out1_parents), 0,
-  pmc_base + PMC_CLK_OUT_CNTRL, 6, 3, 0,
-  &clk_out_lock);
-   clks[clk_out_1_mux] = clk;
-   clk = clk_register_gate(NULL, "clk_out_1", "clk_out_1_mux", 0,
-   pmc_base + PMC_CLK_OUT_CNTRL, 2, 0,
-   &clk_out_lock);
-   clk_register_clkdev(clk, "extern1", "clk_out_1");
-   clks[clk_out_1] = clk;
-
-   /* clk_out_2 */
-   clk = clk_register_mux(NULL, "clk_out_2_mux", clk_out2_parents,
-  ARRAY_SIZE(clk_out1_parents), 0,
-  pmc_base + PMC_CLK_OUT_CNTRL, 14, 3, 0,
-  &clk_out_lock);
-   clk = clk_register_gate(NULL, "clk_out_2", "clk_out_2_mux", 0,
-   pmc_base + PMC_CLK_OUT_CNTRL, 10, 0,
-   &clk_out_lock);
-   clk_register_clkdev(clk, "extern2", "clk_out_2");
-   clks[clk_out_2] = clk;
-
-   /* clk_out_3 */
-   clk = clk_register_mux(NULL, "clk_out_3_mux", clk_out3_parents,
-  ARRAY_SIZE(clk_out1_parents), 0,
-  pmc_base + PMC_CLK_OUT_CNTRL, 22, 3, 0,
-  &clk_out_lock);
-   clk = clk_register_gate(NULL, "clk_out_3", "clk_out_3_mux", 0,
-   pmc_base + PMC_CLK_OUT_CNTRL, 18, 0,
-   &clk_out_lock);
-   clk_register_clkdev(clk, "extern3", "clk_out_3");
-   clks[clk_out_3] = clk;
+   for (i = 0; i < ARRAY_SIZE(tegra_clk_out_list); i++) {
+   struct tegra_clk_out_init_data *out = &tegra_clk_out_list[i];
+
+   out->out.mux.reg = pmc_base + out->offset;
+   out->out.gate.reg = pmc_base + out->offset;
+
+   clk = clk_register_composite(NULL, out->name, out->parent_names,
+   out->num_parents, &out->out.mux.hw, &clk_mux_ops,
+   NULL, NULL, &out->out.gate.hw, &clk_gate_ops, 0);
+   clks[out->clk_id] = clk;
+   }
 
/* blink */
clk = clk_register_gate(NULL, "blink_override", "clk_32k", 0,
diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h
index f1ed1d0..47c536d 100644
--- a/drivers/clk/tegra/clk.h
+++ b/drivers/clk/tegra/clk.h
@@ -437,6 +437,55 @@ struct clk *tegra_clk_super_mux(const char *name, const 
char **parent_names,
u8 width, u8 pllx_index, u8 div2_index,
spinlock_t *lock);
 
+struct tegra_clk_out {
+   struct clk_hw   hw;
+   struct clk_mux  mux;
+   struct clk_gate gate;
+};
+
+#define TEGRA_CLK_OUT(_mux_shift, _mux_width, _mux_flags,  \
+   _gate_bit_idx, _gate_flags, _lock)  \
+   {   \
+   .mux = {\
+   .shift = _mux_shift,\
+   .width = _mux_width,\
+   .flags = _mux_flags,\
+   .lock = _lock,  \
+   },  \
+   .gate = {   \
+   .bit_idx = _gate_bit_idx,   \
+   .flags = _gate_flags,   

[PATCH 1/2] clk: Add composite clock type

2013-01-03 Thread Prashant Gaikwad
Not all clocks need to be decomposed into basic clock types, but
at the same time they want to use the functionality provided by
these basic clock types instead of duplicating it.

For example, Tegra SoC has ~100 clocks which can be decomposed
into Mux -> Div -> Gate clock types, bringing the clock count to
~300. Also, a parent change operation cannot be performed on a gate
clock, which forces a driver to use the mux clock if it wants to
change the parent.

Instead, aggregate the functionality of the basic clock types into
one clock and just use this clock for all operations. This clock
type re-uses the functionality of the basic clock types, and is not
limited to basic clock types: it can use any hardware-specific
implementation.

Signed-off-by: Prashant Gaikwad 
---
 drivers/clk/Makefile |3 +-
 drivers/clk/clk-composite.c  |  208 ++
 include/linux/clk-provider.h |   30 ++
 3 files changed, 240 insertions(+), 1 deletions(-)
 create mode 100644 drivers/clk/clk-composite.c

diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index f0b269a..baf7608 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -2,7 +2,8 @@
 obj-$(CONFIG_HAVE_CLK) += clk-devres.o
 obj-$(CONFIG_CLKDEV_LOOKUP)+= clkdev.o
 obj-$(CONFIG_COMMON_CLK)   += clk.o clk-fixed-rate.o clk-gate.o \
-  clk-mux.o clk-divider.o clk-fixed-factor.o
+  clk-mux.o clk-divider.o clk-fixed-factor.o \
+  clk-composite.o
 # SoCs specific
 obj-$(CONFIG_ARCH_BCM2835) += clk-bcm2835.o
 obj-$(CONFIG_ARCH_NOMADIK) += clk-nomadik.o
diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
new file mode 100644
index 000..8634dbf
--- /dev/null
+++ b/drivers/clk/clk-composite.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#define to_clk_composite(_hw) container_of(_hw, struct clk_composite, hw)
+
+static u8 clk_composite_get_parent(struct clk_hw *hw)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *mux_ops = composite->mux_ops;
+   struct clk_hw *mux_hw = composite->mux_hw;
+
+   mux_hw->clk = hw->clk;
+
+   return mux_ops->get_parent(mux_hw);
+}
+
+static int clk_composite_set_parent(struct clk_hw *hw, u8 index)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *mux_ops = composite->mux_ops;
+   struct clk_hw *mux_hw = composite->mux_hw;
+
+   mux_hw->clk = hw->clk;
+
+   return mux_ops->set_parent(mux_hw, index);
+}
+
+static unsigned long clk_composite_recalc_rate(struct clk_hw *hw,
+   unsigned long parent_rate)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *div_ops = composite->div_ops;
+   struct clk_hw *div_hw = composite->div_hw;
+
+   div_hw->clk = hw->clk;
+
+   return div_ops->recalc_rate(div_hw, parent_rate);
+}
+
+static long clk_composite_round_rate(struct clk_hw *hw, unsigned long rate,
+ unsigned long *prate)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *div_ops = composite->div_ops;
+   struct clk_hw *div_hw = composite->div_hw;
+
+   div_hw->clk = hw->clk;
+
+   return div_ops->round_rate(div_hw, rate, prate);
+}
+
+static int clk_composite_set_rate(struct clk_hw *hw, unsigned long rate,
+  unsigned long parent_rate)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *div_ops = composite->div_ops;
+   struct clk_hw *div_hw = composite->div_hw;
+
+   div_hw->clk = hw->clk;
+
+   return div_ops->set_rate(div_hw, rate, parent_rate);
+}
+
+static int clk_composite_is_enabled(struct clk_hw *hw)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *gate_ops = composite->gate_ops;
+   struct clk_hw *gate_hw = composite->gate_hw;
+
+   gate_hw->clk = hw->clk;
+
+   return gate_ops->is_enabled(gate_hw);
+}
+
+static int clk_composite_enable(struct clk_hw *hw)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops 

Re: [PATCH v3 3/8] MFD:rtsx: Declare that the DMA address limitation is 32bit explicitly

2013-01-03 Thread Dan Carpenter
On Fri, Jan 04, 2013 at 10:46:22AM +0800, wwang wrote:
> 于 2013年01月03日 20:25, Dan Carpenter 写道:
> > On Fri, Dec 28, 2012 at 10:41:28AM +0800, wei_w...@realsil.com.cn wrote:
> >> From: Wei WANG 
> >>
> >> Realtek PCIe card reader only supports 32bit DMA
> >>
> > Is this a bugfix?  If so what does the bug look like?
> >
> > regards,
> > dan carpenter
> >
> Hi Dan:
> 
> No, this is not a bugfix. In default, the kernel will allocate 32bit
> address for DMA. This declaration can improve the readability.
> 

Ah ok.  Really none of the things I mentioned were stuff that
prevented the patch from being merged or couldn't be fixed in follow
on patches.

The thing is that the commit message should always say why a patch
is needed.

regards,
dan carpenter

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Prashant Gaikwad

On Friday 04 January 2013 10:51 AM, Stephen Warren wrote:

On 01/03/2013 09:26 PM, Prashant Gaikwad wrote:

On Friday 04 January 2013 09:30 AM, Stephen Warren wrote:

On 01/03/2013 08:23 PM, Prashant Gaikwad wrote:

On Friday 04 January 2013 08:35 AM, Stephen Warren wrote:

On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:

On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:

...

OK. It sounds like we need a true APB MISC driver then, to
abstract the
differences; the clock driver really shouldn't be touching the APB
MISC
registers in all likelihood, unless a subset of the sections you
mention
above are truly dedicated to clock functionality.

I don't think it is a good idea to create a driver for APB MISC, all
registers are used by different drivers.

Well, it's even worse to have a bunch of other drivers randomly trample
on a set of registers they don't own.


Only chip id revision registers are used in clock driver.

There are already global variables exposed by the Tegra fuse driver;
can
you just read those?

It is not about variables or some value, we have to read some apb
register to flush the write operation in apb bus before we disable
peripheral clock.
We are using chip id revision register for this purpose.

Ah. That's definitely not something the clock driver should be doing
directly. It's probably OK to add a custom Tegra-specific function to
some file in arch/arm/mach-tegra to implement this. Even better would be
a full bus driver for the APB bus, but that's probably too much bloat
for now.

tegra_init_fuse in arch/arm/mach-tegra/fuse.c is already reading chip id
revision register, so I can implement one function to read this register
in fuse.c, which will be used by clock driver and tegra_init_fuse.
But then we need to add it to some header file in include/mach or
include/linux, where? any suggestion?

Somewhere other than arch/arm/mach-tegra/include/mach/ would be good, so
we don't have to move it later when we enable multi-platform zImage for
Tegra. Perhaps include/linux/tegra-soc.h? I guess we could move the
existing mach/powergate.h contents into that file later too.


include/linux/tegra-soc.h seems fine, I will send updated patch series.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Sedat Dilek
On Fri, Jan 4, 2013 at 5:41 AM, Eric Dumazet  wrote:
> On Thu, 2013-01-03 at 11:41 -0800, Rick Jones wrote:
>
>> In terms of netperf overhead, once you specify P99_LATENCY, you are
>> already in for the pound of cost but only getting the penny of output
>> (so to speak).  While it would clutter the output, one could go ahead
>> and ask for the other latency stats and it won't "cost" anything more:
>>
>> ... -- -k
>> RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
>>
>> Additional information about how the omni output selectors work can be
>> found at
>> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Omni-Output-Selection
>>
>> happy benchmarking,
>>
>> rick jones
>>
>> BTW - you will likely see some differences between RT_LATENCY, which is
>> calculated from the average transactions per second, and MEAN_LATENCY,
>> which is calculated from the histogram of individual latencies
>> maintained when any of the _LATENCY outputs other than RT_LATENCY is
>> requested.  Kudos to the folks at Google who did the extensions to the
>> then-existing histogram code to enable it to be used for more reasonably
>> accurate statistics.
>>
>
> Yeah ;)
>
> Here are the before/after_patch results, cpu 2 handling the NIC irqs :
>
>
> Before patch :
>
> # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
> RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
> RT_LATENCY=550110.424
> MIN_LATENCY=146858
> MAX_LATENCY=997109
> P50_LATENCY=305000
> P90_LATENCY=55
> P99_LATENCY=71
> MEAN_LATENCY=376989.12
> STDDEV_LATENCY=184046.92
>
> After patch :
>
> # netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
> RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
> RT_LATENCY=40545.492
> MIN_LATENCY=9834
> MAX_LATENCY=78366
> P50_LATENCY=33583
> P90_LATENCY=59000
> P99_LATENCY=69000
> MEAN_LATENCY=38364.67
> STDDEV_LATENCY=12865.26
>

Will you send a v2 with this change...?

-#define MAX_SOFTIRQ_TIME  min(1, (2*HZ/1000))
+#define MAX_SOFTIRQ_TIME  max(1, (2*HZ/1000))

- Sedat -
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Stephen Warren
On 01/03/2013 09:26 PM, Prashant Gaikwad wrote:
> On Friday 04 January 2013 09:30 AM, Stephen Warren wrote:
>> On 01/03/2013 08:23 PM, Prashant Gaikwad wrote:
>>> On Friday 04 January 2013 08:35 AM, Stephen Warren wrote:
>>>> On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:
>>>>> On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:
>>>>>> ...
>>>>>> OK. It sounds like we need a true APB MISC driver then, to
>>>>>> abstract the
>>>>>> differences; the clock driver really shouldn't be touching the APB
>>>>>> MISC
>>>>>> registers in all likelihood, unless a subset of the sections you
>>>>>> mention
>>>>>> above are truly dedicated to clock functionality.
>>>>> I don't think it is a good idea to create a driver for APB MISC, all
>>>>> registers are used by different drivers.
>>>> Well, it's even worse to have a bunch of other drivers randomly trample
>>>> on a set of registers they don't own.
>>>>
>>>>> Only chip id revision registers are used in clock driver.
>>>> There are already global variables exposed by the Tegra fuse driver;
>>>> can
>>>> you just read those?
>>> It is not about variables or some value, we have to read some apb
>>> register to flush the write operation in apb bus before we disable
>>> peripheral clock.
>>> We are using chip id revision register for this purpose.
>> Ah. That's definitely not something the clock driver should be doing
>> directly. It's probably OK to add a custom Tegra-specific function to
>> some file in arch/arm/mach-tegra to implement this. Even better would be
>> a full bus driver for the APB bus, but that's probably too much bloat
>> for now.
> 
> tegra_init_fuse in arch/arm/mach-tegra/fuse.c is already reading chip id
> revision register, so I can implement one function to read this register
> in fuse.c, which will be used by clock driver and tegra_init_fuse.
> But then we need to add it to some header file in include/mach or
> include/linux, where? any suggestion?

Somewhere other than arch/arm/mach-tegra/include/mach/ would be good, so
we don't have to move it later when we enable multi-platform zImage for
Tegra. Perhaps include/linux/tegra-soc.h? I guess we could move the
existing mach/powergate.h contents into that file later too.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/3] cpufreq: Don't use cpu removed during cpufreq_driver_unregister

2013-01-03 Thread Viresh Kumar
On 3 January 2013 19:55, Srivatsa S. Bhat
 wrote:
> I took a quick look at the problem you described above, and the cpufreq code..
> If we cannot avoid calling cpufreq_add_dev() from cpufreq_remove_dev(), then 
> I can't
> think of anything better than what your patch does.

Good :)

> BTW, off-topic, while going through that path, I think I found a memory leak
> in __cpufreq_remove_dev():
>
> if (unlikely(cpumask_weight(data->cpus) > 1)) {
> for_each_cpu(j, data->cpus) {
> if (j == cpu)
> continue;
> per_cpu(cpufreq_cpu_data, j) = NULL;
> }
> }
>
> We are assigning NULL without freeing that memory.

Not really. All cpus in affected_cpus (data->cpus), share the same
policy structure.
We have already taken backup of cpufreq_cpu_data for the first cpu in "data" and
are freeing it here:

kfree(data);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Namhyung Kim
Hi,

On Thu, 03 Jan 2013 14:41:15 -0800, Eric Dumazet wrote:
> On Thu, 2013-01-03 at 12:46 -0800, Andrew Morton wrote:
>> Can this change cause worsened latencies in some situations?  Say there
>> are a large number of short-running actions queued.  Presently we'll
>> dispatch ten of them and return.  With this change we'll dispatch many
>> more of them - however many consume 2ms.  So worst-case latency
>> increases from "10 * not-much" to "2 ms".
>
> I tried to reproduce such workload but couldnt. 2 ms (or more exactly 1
> to 2 ms given the jiffies/HZ granularity) is about the time needed to
> process 1000 frames on current hardware.

Probably a silly question:

Why not using ktime rather than jiffies for this?

Thanks,
Namhyung
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v5 0/8] Support volatile for anonymous range

2013-01-03 Thread Minchan Kim
Hello,

On Thu, Jan 03, 2013 at 09:19:08AM -0800, Sanjay Ghemawat wrote:
> On Wed, Jan 2, 2013 at 8:27 PM, Minchan Kim  wrote:
> > This is still RFC because we need more input from user-space
> > people, more stress test, design discussion about interface/reclaim
> 
> Speaking as one of the authors of tcmalloc, I don't see any particular
> need for this new system call for tcmalloc.  We are fine using
> madvise(MADV_DONTNEED) and don't notice any significant
> performance issues caused by it.  Background: we throttle how
> quickly we release memory back to the system (1-10MB/s), so
> we do not call madvise() very much, and we don't end up reusing
> madvise-ed away pages at a fast rate. My guess is that we won't

It means TCmalloc controls madvise's rate dynamically without
user's intervention? Smart TCmalloc!

Let me ask some questions.
What is your policy for control of throttling of madvise?
I guess policy is following as.

The madvise's frequent calling is bad because pte zap overhead of
madvise + next page fault/memset + page access bit emulation
page fault in some architecture like ARM when reused the range.
So we should call it fast rate only when memory pressure happens
very carefully. Is it similar to your throttling logic?

If my assumption isn't totally wrong, how could a process know
the memory pressure at the moment by just per-process view, NOT
system view?

If your logic takes some mistake, (for instance, memory pressure
is severe but it doesn't call madvise) working set could be reclaimed
like file-backed pages, which could minimize your benefit via madvise
throttling. I guess it's very fragile. It's more severe in embedded
world because they don't use swap so system encounters OOM instead of
swapout.

In this point, mvolatile's concept is light weight system call by
just mark the flag in the vma and auto free when system suffers from
memory pressure(about this, my plan is zap all pages if kswapd is active
when mvolatile system call is called) by reclaimer with preventing
working set page eviction, otherwise enhance application's speed with
removing (minor fault + page allocation + memset). Also, it would make
allocator simple through removing control logic, which is less error-prone
and even might make smart TCmalloc better than now although it doesn't have
any significant performance issue.

> see large enough application-level performance improvements to
> cause us to change tcmalloc to use this system call.
> 
> > - What's different with madvise(DONTNEED)?
> >
> >   System call semantic
> >
> >   DONTNEED makes sure user always can see zero-fill pages after
> >   he calls madvise while mvolatile can see old data or encounter
> >   SIGBUS.
> 
> Do you need a new system call for this?  Why not just a new flag to madvise
> with weaker guarantees than zero-filling?  All of the implementation changes
> you point out below could be triggered from that flag.

Agreed and actually, I tried it but changed my mind because it required
adding many hacky codes in madvise due to return value and error's semantic
is totally different with normal madvise and needs three flags at least at
the moment but not sure we need more flags during discussion.

MADV_VOLATILE, MADV_NOVOLATILE, MADV_[NO]VOLATILE|MADV_PARTIAL_DISCARD

I don't want to make madvise dirty and consume lots of new flags of madvise
for a volatile feature. But if everybody want to fold into madvise,
I can do it, too.

Thanks for the feedback, Sanjay!

> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/3] cpufreq: Manage only online cpus

2013-01-03 Thread Viresh Kumar
On 3 January 2013 17:32, Rafael J. Wysocki  wrote:
> True, but have those bugs been introduced recently (ie. in v3.8-rc1 or later)?

Don't know... I feel they were always there, its just that nobody
tested it that way :)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] tvp7002: use devm_kzalloc() instead of kzalloc()

2013-01-03 Thread Lad, Prabhakar
I2C drivers can use devm_kzalloc() too in their .probe() methods. Doing so
simplifies their clean up paths.

Signed-off-by: Lad, Prabhakar 
Signed-off-by: Manjunath Hadli 
---
 Changes for v2:
 1: Fixed comments pointed out by Laurent.

 drivers/media/i2c/tvp7002.c |   18 ++
 1 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/media/i2c/tvp7002.c b/drivers/media/i2c/tvp7002.c
index fb6a5b5..537f6b4 100644
--- a/drivers/media/i2c/tvp7002.c
+++ b/drivers/media/i2c/tvp7002.c
@@ -1036,7 +1036,7 @@ static int tvp7002_probe(struct i2c_client *c, const 
struct i2c_device_id *id)
return -ENODEV;
}
 
-   device = kzalloc(sizeof(struct tvp7002), GFP_KERNEL);
+   device = devm_kzalloc(&c->dev, sizeof(struct tvp7002), GFP_KERNEL);
 
if (!device)
return -ENOMEM;
@@ -1052,7 +1052,7 @@ static int tvp7002_probe(struct i2c_client *c, const 
struct i2c_device_id *id)
 
error = tvp7002_read(sd, TVP7002_CHIP_REV, &revision);
if (error < 0)
-   goto found_error;
+   return error;
 
/* Get revision number */
v4l2_info(sd, "Rev. %02x detected.\n", revision);
@@ -1063,21 +1063,21 @@ static int tvp7002_probe(struct i2c_client *c, const 
struct i2c_device_id *id)
error = tvp7002_write_inittab(sd, tvp7002_init_default);
 
if (error < 0)
-   goto found_error;
+   return error;
 
/* Set polarity information after registers have been set */
polarity_a = 0x20 | device->pdata->hs_polarity << 5
| device->pdata->vs_polarity << 2;
error = tvp7002_write(sd, TVP7002_SYNC_CTL_1, polarity_a);
if (error < 0)
-   goto found_error;
+   return error;
 
polarity_b = 0x01  | device->pdata->fid_polarity << 2
| device->pdata->sog_polarity << 1
| device->pdata->clk_polarity;
error = tvp7002_write(sd, TVP7002_MISC_CTL_3, polarity_b);
if (error < 0)
-   goto found_error;
+   return error;
 
/* Set registers according to default video mode */
preset.preset = device->current_preset->preset;
@@ -1091,16 +1091,11 @@ static int tvp7002_probe(struct i2c_client *c, const 
struct i2c_device_id *id)
int err = device->hdl.error;
 
v4l2_ctrl_handler_free(&device->hdl);
-   kfree(device);
return err;
}
v4l2_ctrl_handler_setup(&device->hdl);
 
-found_error:
-   if (error < 0)
-   kfree(device);
-
-   return error;
+   return 0;
 }
 
 /*
@@ -1120,7 +1115,6 @@ static int tvp7002_remove(struct i2c_client *c)
 
v4l2_device_unregister_subdev(sd);
v4l2_ctrl_handler_free(&device->hdl);
-   kfree(device);
return 0;
 }
 
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] tvp514x: use devm_kzalloc() instead of kzalloc()

2013-01-03 Thread Lad, Prabhakar
I2C drivers can use devm_kzalloc() too in their .probe() methods. Doing so
simplifies their clean up paths.

Signed-off-by: Lad, Prabhakar 
Signed-off-by: Manjunath Hadli 
---
 Changes for v2:
 1: Fixed comments pointed out by Laurent.

 drivers/media/i2c/tvp514x.c |4 +---
 1 files changed, 1 insertions(+), 3 deletions(-)

diff --git a/drivers/media/i2c/tvp514x.c b/drivers/media/i2c/tvp514x.c
index d5e1021..aa94ebc 100644
--- a/drivers/media/i2c/tvp514x.c
+++ b/drivers/media/i2c/tvp514x.c
@@ -951,7 +951,7 @@ tvp514x_probe(struct i2c_client *client, const struct 
i2c_device_id *id)
return -ENODEV;
}
 
-   decoder = kzalloc(sizeof(*decoder), GFP_KERNEL);
+   decoder = devm_kzalloc(&client->dev, sizeof(*decoder), GFP_KERNEL);
if (!decoder)
return -ENOMEM;
 
@@ -998,7 +998,6 @@ tvp514x_probe(struct i2c_client *client, const struct 
i2c_device_id *id)
int err = decoder->hdl.error;
 
v4l2_ctrl_handler_free(&decoder->hdl);
-   kfree(decoder);
return err;
}
v4l2_ctrl_handler_setup(&decoder->hdl);
@@ -1023,7 +1022,6 @@ static int tvp514x_remove(struct i2c_client *client)
 
v4l2_device_unregister_subdev(sd);
v4l2_ctrl_handler_free(&decoder->hdl);
-   kfree(decoder);
return 0;
 }
 /* TVP5146 Init/Power on Sequence */
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] adv7343: use devm_kzalloc() instead of kzalloc()

2013-01-03 Thread Lad, Prabhakar
I2C drivers can use devm_kzalloc() too in their .probe() methods. Doing so
simplifies their clean up paths.

Signed-off-by: Lad, Prabhakar 
Signed-off-by: Manjunath Hadli 
---
 Changes for v2:
 1: Fixed comments pointed out by Laurent.

 drivers/media/i2c/adv7343.c |9 +++--
 1 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/media/i2c/adv7343.c b/drivers/media/i2c/adv7343.c
index 2b5aa67..432eb5f 100644
--- a/drivers/media/i2c/adv7343.c
+++ b/drivers/media/i2c/adv7343.c
@@ -397,7 +397,8 @@ static int adv7343_probe(struct i2c_client *client,
v4l_info(client, "chip found @ 0x%x (%s)\n",
client->addr << 1, client->adapter->name);
 
-   state = kzalloc(sizeof(struct adv7343_state), GFP_KERNEL);
+   state = devm_kzalloc(&client->dev, sizeof(struct adv7343_state),
+GFP_KERNEL);
if (state == NULL)
return -ENOMEM;
 
@@ -431,16 +432,13 @@ static int adv7343_probe(struct i2c_client *client,
int err = state->hdl.error;
 
v4l2_ctrl_handler_free(&state->hdl);
-   kfree(state);
return err;
}
v4l2_ctrl_handler_setup(&state->hdl);
 
err = adv7343_initialize(&state->sd);
-   if (err) {
+   if (err)
v4l2_ctrl_handler_free(&state->hdl);
-   kfree(state);
-   }
return err;
 }
 
@@ -451,7 +449,6 @@ static int adv7343_remove(struct i2c_client *client)
 
v4l2_device_unregister_subdev(sd);
v4l2_ctrl_handler_free(&state->hdl);
-   kfree(state);
 
return 0;
 }
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/4] kprobes/powerpc: Do not disable External interrupts during single step

2013-01-03 Thread Benjamin Herrenschmidt
On Tue, 2012-12-11 at 11:18 +0530, Suzuki K. Poulose wrote:
> On 12/03/2012 08:37 PM, Suzuki K. Poulose wrote:
> > From: Suzuki K. Poulose 
> >
> > External/Decrement exceptions have lower priority than the Debug Exception.
> > So, we don't have to disable the External interrupts before a single step.
> > However, on BookE, Critical Input Exception(CE) has higher priority than a
> > Debug Exception. Hence we mask them.

I'm not sure about that one ...

From memory, 4xx has that interesting issue which is that if you have
single step enabled and an interrupt (of *any kind* occurs), the
processor *will* step into the first instruction of the interrupt
handler. (In fact, some silicons have a bug where it can even be the
*second* instruction of the handler, which can be problematic when the
first one is a branch).

This is why you may notice that whole business we have in the handling
of debug/crit interrupts where we try to figure out if that happened,
and return with DE off if it did.

Now, the above mentioned workaround means we might not need to disable
EE indeed.

However, in any case, I don't see what your patch fixes or improves, nor
do I understand what you mean by "it is possible we'd get the single
step reported for CE". Please explain in more details and describe the
problematic scenario.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next] softirq: reduce latencies

2013-01-03 Thread Eric Dumazet
On Thu, 2013-01-03 at 11:41 -0800, Rick Jones wrote:

> In terms of netperf overhead, once you specify P99_LATENCY, you are 
> already in for the pound of cost but only getting the penny of output 
> (so to speak).  While it would clutter the output, one could go ahead 
> and ask for the other latency stats and it won't "cost" anything more:
> 
> ... -- -k 
> RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
> 
> Additional information about how the omni output selectors work can be 
> found at 
> http://www.netperf.org/svn/netperf2/trunk/doc/netperf.html#Omni-Output-Selection
> 
> happy benchmarking,
> 
> rick jones
> 
> BTW - you will likely see some differences between RT_LATENCY, which is 
> calculated from the average transactions per second, and MEAN_LATENCY, 
> which is calculated from the histogram of individual latencies 
> maintained when any of the _LATENCY outputs other than RT_LATENCY is 
> requested.  Kudos to the folks at Google who did the extensions to the 
> then-existing histogram code to enable it to be used for more reasonably 
> accurate statistics.
> 

Yeah ;)

Here are the before/after_patch results, cpu 2 handling the NIC irqs :


Before patch :

# netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
RT_LATENCY=550110.424
MIN_LATENCY=146858
MAX_LATENCY=997109
P50_LATENCY=305000
P90_LATENCY=55
P99_LATENCY=71
MEAN_LATENCY=376989.12
STDDEV_LATENCY=184046.92

After patch :

# netperf -H 7.7.7.84 -t TCP_RR -T2,2 -- -k
RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
RT_LATENCY=40545.492
MIN_LATENCY=9834
MAX_LATENCY=78366
P50_LATENCY=33583
P90_LATENCY=59000
P99_LATENCY=69000
MEAN_LATENCY=38364.67
STDDEV_LATENCY=12865.26



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [REGRESSION][3.8.-rc1][ INFO: possible circular locking dependency detected ]

2013-01-03 Thread Christian Kujau
On Sun, 23 Dec 2012 at 13:34, Christian Kujau wrote:

> On Sat, 22 Dec 2012 at 16:28, Maciej Rutecki wrote:
> > Got during suspend to disk:
> 
> I got a similar message on a powerpc G4 system, right after bootup (no 
> suspend involved):
> 
> http://nerdbynature.de/bits/3.8.0-rc1/

FWIW, this is still present with 3.8.0-rc2.

C.

> [   97.803049] ==
> [   97.803051] [ INFO: possible circular locking dependency detected ]
> [   97.803059] 3.8.0-rc1-dirty #2 Not tainted
> [   97.803060] ---
> [   97.803066] kworker/0:1/235 is trying to acquire lock:
> [   97.803097]  ((fb_notifier_list).rwsem){.+.+.+}, at: [] 
> __blocking_notifier_call_chain+0x44/0x88
> [   97.803099] 
> [   97.803099] but task is already holding lock:
> [   97.803110]  (console_lock){+.+.+.}, at: [] 
> console_callback+0x20/0x194
> [   97.803112] 
> [   97.803112] which lock already depends on the new lock.
> 
> ...and on it goes. Please see the URL above for the whole dmesg and 
> .config.
> 
> @Li Zhong: I have applied your fix for the "MAX_STACK_TRACE_ENTRIES too 
>low" warning[0] to 3.8-rc1 (hence the -dirty flag), but in the 
>backtrace "ret_from_kernel_thread" shows up again. FWIW, your
>patch helped to make the "MAX_STACK_TRACE_ENTRIES too low" 
>warning go away in 3.7.0-rc7 and it did not re-appear ever 
>since.
> 
> Thanks,
> Christian.
> 
> [0] http://lkml.indiana.edu/hypermail/linux/kernel/1211.3/01917.html
> 
> > [  269.784867] [ INFO: possible circular locking dependency detected ]
> > [  269.784869] 3.8.0-rc1 #1 Not tainted
> > [  269.784870] ---
> > [  269.784871] kworker/u:3/56 is trying to acquire lock:
> > [  269.784878]  ((fb_notifier_list).rwsem){.+.+.+}, at: 
> > [] 
> > __blocking_notifier_call_chain+0x49/0x80
> > [  269.784879] 
> > [  269.784879] but task is already holding lock:
> > [  269.784884]  (console_lock){+.+.+.}, at: [] 
> > i915_drm_freeze+0x9e/0xbb
> > [  269.784884] 
> > [  269.784884] which lock already depends on the new lock.
> > [  269.784884] 
> > [  269.784885] 
> > [  269.784885] the existing dependency chain (in reverse order) is:
> > [  269.784887] 
> > [  269.784887] -> #1 (console_lock){+.+.+.}:
> > [  269.784890][] lock_acquire+0x95/0x105
> > [  269.784893][] console_lock+0x59/0x5b
> > [  269.784897][] register_con_driver+0x36/0x128
> > [  269.784899][] take_over_console+0x1e/0x45
> > [  269.784903][] fbcon_takeover+0x56/0x98
> > [  269.784906][] fbcon_event_notify+0x2c1/0x5ea
> > [  269.784909][] notifier_call_chain+0x67/0x92
> > [  269.784911][] 
> > __blocking_notifier_call_chain+0x5f/0x80
> > [  269.784912][] 
> > blocking_notifier_call_chain+0xf/0x11
> > [  269.784915][] fb_notifier_call_chain+0x16/0x18
> > [  269.784917][] register_framebuffer+0x20a/0x26e
> > [  269.784920][] 
> > drm_fb_helper_single_fb_probe+0x1ce/0x297
> > [  269.784922][] 
> > drm_fb_helper_initial_config+0x1d7/0x1ef
> > [  269.784924][] intel_fbdev_init+0x6f/0x82
> > [  269.784927][] i915_driver_load+0xa9e/0xc78
> > [  269.784929][] drm_get_pci_dev+0x165/0x26d
> > [  269.784931][] i915_pci_probe+0x60/0x69
> > [  269.784933][] local_pci_probe+0x39/0x61
> > [  269.784935][] pci_device_probe+0xba/0xe0
> > [  269.784938][] driver_probe_device+0x99/0x1c4
> > [  269.784940][] __driver_attach+0x4e/0x6f
> > [  269.784942][] bus_for_each_dev+0x52/0x84
> > [  269.784944][] driver_attach+0x19/0x1b
> > [  269.784946][] bus_add_driver+0xdf/0x203
> > [  269.784948][] driver_register+0x8e/0x114
> > [  269.784952][] __pci_register_driver+0x5d/0x62
> > [  269.784953][] drm_pci_init+0x81/0xe6
> > [  269.784957][] i915_init+0x66/0x68
> > [  269.784959][] do_one_initcall+0x7a/0x136
> > [  269.784962][] kernel_init+0x141/0x296
> > [  269.784964][] ret_from_fork+0x7c/0xb0
> > [  269.784966] 
> > [  269.784966] -> #0 ((fb_notifier_list).rwsem){.+.+.+}:
> > [  269.784967][] __lock_acquire+0xa7e/0xddd
> > [  269.784969][] lock_acquire+0x95/0x105
> > [  269.784971][] down_read+0x34/0x43
> > [  269.784973][] 
> > __blocking_notifier_call_chain+0x49/0x80
> > [  269.784975][] 
> > blocking_notifier_call_chain+0xf/0x11
> > [  269.784977][] fb_notifier_call_chain+0x16/0x18
> > [  269.784979][] fb_set_suspend+0x22/0x4d
> > [  269.784981][] intel_fbdev_set_suspend+0x20/0x22
> > [  269.784983][] i915_drm_freeze+0xab/0xbb
> > [  269.784985][] i915_pm_freeze+0x3d/0x41
> > [  269.784987][] pci_pm_freeze+0x65/0x8d
> > [  269.784990][] dpm_run_callback.isra.3+0x27/0x56
> > [  269.784993][] 

Re: [PATCH 3/3] tracing: Verify target file before registering a uprobe event

2013-01-03 Thread Namhyung Kim
Hi, Steve.

On Wed, 02 Jan 2013 17:50:38 -0500, Steven Rostedt wrote:
> From: Jovi Zhang 
>
> Without this patch, we can register a uprobe event for a directory.
> Enabling such a uprobe event would fail anyway .
>
> Example:
> $ echo 'p /bin:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events
>
> However directories cannot be valid targets for uprobe.
> Hence verify if the target is a regular file during the probe
> registration.
>
> Signed-off-by: Jovi Zhang 
> Acked-by: Srikar Dronamraju 
> Signed-off-by: Steven Rostedt 
> ---
>  kernel/trace/trace_uprobe.c |6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
> index 03003cd..0815f25 100644
> --- a/kernel/trace/trace_uprobe.c
> +++ b/kernel/trace/trace_uprobe.c
> @@ -257,6 +257,10 @@ static int create_trace_uprobe(int argc, char **argv)
>   goto fail_address_parse;
>  
>   inode = igrab(path.dentry->d_inode);
> +  if (!S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) {

Doesn't !S_ISREG() include S_ISDIR() case too?

Anyway I can see an additional whitespace in front of the "if".

Thanks,
Namhyung


> + ret = -EINVAL;
> + goto fail_address_parse;
> + }
>  
>   argc -= 2;
>   argv += 2;
> @@ -356,7 +360,7 @@ fail_address_parse:
>   if (inode)
>   iput(inode);
>  
> - pr_info("Failed to parse address.\n");
> + pr_info("Failed to parse address or file.\n");
>  
>   return ret;
>  }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: Tree for Jan 4

2013-01-03 Thread Stephen Rothwell
Hi all,

Changes since 20130103:

New tree: pekey
Dropped tree: pekey (build failure)

The driver-core.current tree lost lots of conflicts.

The net-next tree lost its build failure.

The pekey tree gained a build failure, so I dropped it for today.

The akpm tree gained a conflict against Linus' tree for which I dropped a
patch.



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" as mentioned in the FAQ on the wiki
(see below).

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64. After the
final fixups (if any), it is also built with powerpc allnoconfig (32 and
64 bit), ppc44x_defconfig and allyesconfig (minus
CONFIG_PROFILE_ALL_BRANCHES - this fails its final link) and i386, sparc,
sparc64 and arm defconfig. These builds also have
CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and
CONFIG_DEBUG_INFO disabled when necessary.

Below is a summary of the state of the merge.

We are up to 213 trees (counting Linus' and 28 trees of patches pending
for Linus' tree), more are welcome (even if they are currently empty).
Thanks to those who have contributed, and to those who haven't, please do.

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

There is a wiki covering stuff to do with linux-next at
http://linux.f-seidel.de/linux-next/pmwiki/ .  Thanks to Frank Seidel.

-- 
Cheers,
Stephen Rothwell    s...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (5f73896 Merge tag 'pinctrl-fixes' of 
git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-pinctrl)
Merging fixes/master (3b095f2 disable the SB105X driver)
Merging kbuild-current/rc-fixes (bad9955 menuconfig: Replace CIRCLEQ by 
list_head-style lists.)
Merging arm-current/fixes (5ced33b ARM: 7611/1: VIC: fix bug in VIC irqdomain 
code)
Merging m68k-current/for-linus (e7e29b4 m68k: Wire up finit_module)
Merging powerpc-merge/merge (e6449c9 powerpc: Add missing NULL terminator to 
avoid boot panic on PPC40x)
Merging sparc/master (4e4d78f sparc: Hook up finit_module syscall.)
Merging net/master (6602d00 vxlan: allow live mac address change)
Merging sound-current/for-linus (8f7f3ab1 ALSA: usb-audio: Add support for 
Creative BT-D1 via usb sound quirks)
Merging pci-current/for-linus (812089e PCI: Reduce Ricoh 0xe822 SD card reader 
base clock frequency to 50MHz)
Merging wireless/master (619c5a9 brcmfmac: fix parsing rsn ie for ap mode.)
Merging driver-core.current/driver-core-linus (9661103 Merge 3.8-rc2 into 
driver-core-linus)
CONFLICT (content): Merge conflict in drivers/pinctrl/pinctrl-sirf.c
Applying: driver-core: remove inadvertently added file
Merging tty.current/tty-linus (a49f0d1 Linux 3.8-rc1)
Merging usb.current/usb-linus (a49f0d1 Linux 3.8-rc1)
Merging staging.current/staging-linus (a49f0d1 Linux 3.8-rc1)
Merging char-misc.current/char-misc-linus (a49f0d1 Linux 3.8-rc1)
Merging input-current/for-linus (e324ce6 Input: gpio_keys - defer probing if 
GPIO probing is deferred)
Merging md-current/for-linus (a9add5d md/raid5: add blktrace calls)
Merging audit-current/for-linus (c158a35 audit: no leading space in 
audit_log_d_path prefix)
Merging crypto-current/master (a2c0911 crypto: caam - Updated SEC-4.0 device 
tree binding for ERA information.)
Merging ide/master (9974e43 ide: fix generic_ide_suspend/resume Oops)
Merging dwmw2/master (084a0ec x86: add CONFIG_X86_MOVBE option)
CONFLICT (content): Merge conflict in arch/x86/Kconfig
Merging sh-current/sh-fixes-for-linus (4403310 SH: Convert out[bwl] macros to 
inline functions)
Merging irqdomain-current/irqdomain/merge (a0d271c Linux 3.6)
Merging devicetree-current/devicetree/merge (ab28698 of: define struct device 
in of_platform.h if !OF_DEVICE and !OF_ADDRESS)
Merging spi-current/spi/merge (d3601e5 spi/sh-hspi: fix return value check in 
hspi_probe().)
Merging gpio-current/gpio/merge (bc1008c gpio/mvebu-gpio: Make mvebu-gpio 
depend on OF_CONFIG)
Merging rr-fixes/fixes (52441fa module: prevent warning when finit_module a 0 
sized file)
Merging asm-generic/master (fb9de7e xtensa: Use generic asm/mmu.h for nommu)
Merging arm/for-next (5ced33b ARM: 7611/1: VIC: fix bug i

Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Prashant Gaikwad

On Friday 04 January 2013 09:30 AM, Stephen Warren wrote:

On 01/03/2013 08:23 PM, Prashant Gaikwad wrote:

On Friday 04 January 2013 08:35 AM, Stephen Warren wrote:

On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:

On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:

...

OK. It sounds like we need a true APB MISC driver then, to abstract the
differences; the clock driver really shouldn't be touching the APB MISC
registers in all likelihood, unless a subset of the sections you
mention
above are truly dedicated to clock functionality.

I don't think it is a good idea to create a driver for APB MISC, all
registers are used by different drivers.

Well, it's even worse to have a bunch of other drivers randomly trample
on a set of registers they don't own.


Only chip id revision registers are used in clock driver.

There are already global variables exposed by the Tegra fuse driver; can
you just read those?

It is not about variables or some value, we have to read some apb
register to flush the write operation in apb bus before we disable
peripheral clock.
We are using chip id revision register for this purpose.

Ah. That's definitely not something the clock driver should be doing
directly. It's probably OK to add a custom Tegra-specific function to
some file in arch/arm/mach-tegra to implement this. Even better would be
a full bus driver for the APB bus, but that's probably too much bloat
for now.


tegra_init_fuse in arch/arm/mach-tegra/fuse.c is already reading chip id 
revision register, so I can implement one function to read this register 
in fuse.c, which will be used by clock driver and tegra_init_fuse.
But then we need to add it to some header file in include/mach or 
include/linux, where? any suggestion?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 1/8] Introduce new system call mvolatile

2013-01-03 Thread Minchan Kim
Hi,

On Thu, Jan 03, 2013 at 10:35:24AM -0800, Taras Glek wrote:
> On 1/2/2013 8:27 PM, Minchan Kim wrote:
> >This patch adds new system call m[no]volatile.
> >If someone asks is_volatile system call, it could be added, too.
> >
> >The reason why I introduced new system call instead of madvise is
> >m[no]volatile vma handling is totally different with madvise's vma
> >handling.
> >
> >1) The m[no]volatile should be successful although the range includes
> >unmapped or non-volatile range. It just skips such range
> >without stopping with returning error although it encounters
> >invalid range. It makes user convenient without calling several
> >system call of small range - Suggested by John Stultz
> >
> >2) The purged state of volatile range should be propagated out to user
> >although the range is merged with adjacent non-volatile range when
> >user calls mnovolatile.
> >
> >3) mvolatile's interface could be changed with madvise
> >in future discussion.  For example, I feel needs
> >movlatile(start, len, mode).
> >'mode' means FULL_VOLATILE or PARTIAL_VOLATILE.
> >FULL volatile means that if VM decide to reclaim the range, it would
> >reclaim all of pages in the range but in case of PARTIAL_VOLATILE,
> >VM could reclaim just a few number of pages in the range.
> >In case of tmpfs-volatile, user may regenerate all images data once
> >one of page in the range is discarded so there is pointless that
> >VM discard a page in the range when memory pressure is severe.
> >In case of anon-volatile, too excess discarding cause too many minor
> >fault for the allocator so it would be better to discard part of
> >the range.
> I don't understand point 3).
> Are you saying that using mvolatile in conjunction with madvise could
> allow mvolatile behavior to be tweaked in the future? Or are you
> suggesting adding an extra parameter in the future(what would that
> have to do with madvise)?

I meant I might want to expand mvolatile's interface like below
during discussion.

int mvolatile(start, len, mode);

> 
> 4) Having a new system call makes it easier for userspace apps to
> detect kernels without this functionality.

I couldn't understand your claim.
Now mvolatile just return EINVAL on !CONFIG_VOLATILE_PAGE system.
Why is it easy compared to returning EINVAL when we call madvise(VOLATILE)
on !CONFIG_VOLATILE_PAGE?

> 
> I really like the proposed interface. I like the suggestion of

Thanks.

> having explicit FULL|PARTIAL_VOLATILE. Why not include
> PARTIAL_VOLATILE as a required 3rd param in first version with
> expectation that FULL_VOLATILE will be added later(and returning
> some not-supported error in meantime)?

I just wanted to discuss about needs of it.
The reason I need PARTIAL_VOLATILE is that avoids many minor fault
for allocator. Is it useful for tmpfs-volatile, too?

Thanks for the feedback, Taras.

> >
> >3) The mvolatile system call's return value is quite different with
> >madvise. Look at below semantic explanation.
> >
> >So I want to separate mvolatile from madvise.
> >
> >mvolatile(start, len)'s semantics
> >
> >1) It makes range(start, len) as volatile although the range includes
> >unmapped area, special mapping and mlocked area which are just skipped.
> >
> >Return -EINVAL if range doesn't include a right vma at all.
> >Return -ENOMEM with interrupting range operation if memory is not
> >enough to merge/split vmas. In this case, some ranges would be
> >volatile and others not so user may recall mvolatile after he
> >cancel all range by mnovolatile.
> >Return 0 if range consists of only proper vmas.
> >Return 1 if part of range includes hole/huge/ksm/mlock/special area.
> >
> >2) If user calls mvolatile to the range which was already volatile VMA and
> >even purged state, VOLATILE attributes still remains but purged state
> >is reset. I expect some user want to split volatile vma into smaller
> >ranges. Although he can do it with mnovolatile(whole range) and several
> >calls to mvolatile(smaller range), this function can avoid mnovolatile if he
> >doesn't care about purged state. I'm not sure we really need this function so
> >I hope to hear opinions. Unfortunately, the current implementation doesn't split
> >volatile VMA with new range in this case. I forgot implementing it
> >in this version but decide to send it to listen opinions because
> >implementing is rather trivial if we decided.
> >
> >mnovolatile(start, len)'s semantics is following as.
> >
> >1) It makes range(start, len) as non-volatile although the range
> >includes unmapped area, special mapping and non-volatile range
> >which are just skipped.
> >
> >2) If the range is purged, it will return 1 regardless of including
> >invalid range.
> If I understand this correctly:
> mvolatile(0, 10);
> //then range [9,10] is purged by kernel
> mnovolatile(0,4) will fail?
> that seems counterintuitive.
> 
> One of the uses for mnovolatile is to atomicly lock the 

set_page_dirty_lock + migrate_pages

2013-01-03 Thread Kyle Hubert
I have an interesting hang on a kernel I am working on. I am working
with an out of tree driver that does get_user_pages and programs an
IOMMU with the physical pages. It also listens for MMU notifier
callbacks so that it may invalidate the IOMMU PTEs. After the
invalidate, it then calls set_page_dirty_lock and page_cache_release.

However, if memory compaction is initiated during a running job,
migrate_pages will try_to_unmap the page. When it gets down to
try_to_unmap_one, the MMU notifier callback will be issued while the
page is locked. Of course, once the MMU notifier callback is executing
the kernel deadlocks as set_page_dirty_lock will never complete. This
appears to be the only location the page is locked when calling
mmu_notifier_invalidate_page.

So, I would love to switch to calling set_page_dirty unconditionally.
I am worried about the mapping changes to the page though. I thought
set_page_dirty_lock is supposed to protect against remappings and
HWPoisoning. I can't distinguish when the page would be locked or not
inside the MMU notifier callback, so I would have to adopt a solution
that can work in both environments. I suppose I could call
TestSetPageLock, and if it fails then schedule a work queue to release
the page, but this would certainly have an impact on migrate pages
(and maybe fork). Also, wouldn't get_user_pages protect against
remappings as we hold a reference count on the page?

As an aside, if the page is anonymous, I don't even need
set_page_dirty_lock at all, right? I could just use set_page_dirty,
no? Could I get page->mapping and test for the PAGE_MAPPING_ANON bit
set? This wouldn't solve my problem, as we support file backed pages,
I am just querying to understand.

Thanks for the help,
-Kyle
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Stephen Warren
On 01/03/2013 08:23 PM, Prashant Gaikwad wrote:
> On Friday 04 January 2013 08:35 AM, Stephen Warren wrote:
>> On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:
>>> On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:
...
>>>> OK. It sounds like we need a true APB MISC driver then, to abstract the
>>>> differences; the clock driver really shouldn't be touching the APB MISC
>>>> registers in all likelihood, unless a subset of the sections you
>>>> mention
>>>> above are truly dedicated to clock functionality.
>>>
>>> I don't think it is a good idea to create a driver for APB MISC, all
>>> registers are used by different drivers.
>>
>> Well, it's even worse to have a bunch of other drivers randomly trample
>> on a set of registers they don't own.
>>
>>> Only chip id revision registers are used in clock driver.
>> There are already global variables exposed by the Tegra fuse driver; can
>> you just read those?
> 
> It is not about variables or some value, we have to read some apb
> register to flush the write operation in apb bus before we disable
> peripheral clock.
> We are using chip id revision register for this purpose.

Ah. That's definitely not something the clock driver should be doing
directly. It's probably OK to add a custom Tegra-specific function to
some file in arch/arm/mach-tegra to implement this. Even better would be
a full bus driver for the APB bus, but that's probably too much bloat
for now.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: ppoll() stuck on POLLIN while TCP peer is sending

2013-01-03 Thread Eric Wong
Eric Wong  wrote:
> Eric Wong  wrote:
> > I think this requires frequent dirtying/cycling of pages to reproduce.
> > (from copying large files around) to interact with compaction.
> > I'll see if I can reproduce the issue with read-only FS activity.
> 
> Still successfully running the read-only test on my main machine, will
> provide another update in a few hours or so if it's still successful
> (it usually takes <1 hour to hit).

The read-only test is still going on my main machine.
I think writes/dirty data is required to reproduce the issue...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v5 3/3] PM: Introduce Intel PowerClamp Driver

2013-01-03 Thread Joe Perches
On Thu, 2013-01-03 at 07:10 -0800, Jacob Pan wrote:
> Intel PowerClamp driver performs synchronized idle injection across
> all online CPUs. The goal is to maintain a given package level C-state
> ratio.

[]

> +static int window_size_set(const char *arg, const struct kernel_param *kp)
> +{
> + int ret = 0;
> + unsigned long new_window_size;
> +
> + ret = kstrtoul(arg, 10, &new_window_size);
> + if (ret)
> + goto exit_win;
> + if (new_window_size > 10 || new_window_size < 2) {
> + pr_err("Invalid window size %lu, between 2-10\n",
> + new_window_size);
> + ret = -EINVAL;
> + }
> +
> + window_size = new_window_size;

Possible assignment of known invalid windows size?
Maybe you should add
goto exit;
after
ret = -EINVAL;

or add
new_window_size = clamp(new_window_size, 2ul, 10ul);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 7/7] cgroup: remove css_get_next

2013-01-03 Thread Li Zefan
On 2013/1/4 1:54, Michal Hocko wrote:
> Now that we have generic and well ordered cgroup tree walkers there is
> no need to keep css_get_next in the place.
> 
> Signed-off-by: Michal Hocko 
> Acked-by: KAMEZAWA Hiroyuki 

Acked-by: Li Zefan 

> ---
>  include/linux/cgroup.h |7 ---
>  kernel/cgroup.c|   49 
> 
>  2 files changed, 56 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: manual merge of the akpm tree with Linus' tree

2013-01-03 Thread Stephen Rothwell
Hi Andrew,

Today's linux-next merge of the akpm tree got conflicts in
include/linux/mempolicy.h and mm/mempolicy.c between commit 42288fe366c4
("mm: mempolicy: Convert shared_policy mutex to spinlock") from Linus'
tree and commit "mm, mempolicy: introduce spinlock to read shared policy
tree" from the akpm tree.

These two commits seem to be fixing the same problem, so I dropped the
akpm tree commit.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpUpyal0kLmk.pgp
Description: PGP signature


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Prashant Gaikwad

On Friday 04 January 2013 08:35 AM, Stephen Warren wrote:

On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:

On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:

On 01/02/2013 11:11 PM, Prashant Gaikwad wrote:

On Thursday 03 January 2013 03:30 AM, Stephen Warren wrote:

On 12/27/2012 07:47 AM, Prashant Gaikwad wrote:

APB misc contains multiple registers required by different modules
such as CAR.

I don't see a DT binding document that describes what
nvidia,tegra30-apbmisc means. Also, the register range for this new
node
overlaps that for the pinmux node, so they can't both "request" their
register region. You may need multiple entries in the apbmisc reg
property to avoid this.

apbmisc reg for Tegra30 can be divided into following entries:

strap registers
jtag configuration registers
pull_up/pull_down control registers
vclk control registers
tvdac registers
chip id revision registers
pad control registers

This list is not same for Tegra20 and Tegra30.

OK. It sounds like we need a true APB MISC driver then, to abstract the
differences; the clock driver really shouldn't be touching the APB MISC
registers in all likelihood, unless a subset of the sections you mention
above are truly dedicated to clock functionality.

I don't think it is a good idea to create a driver for APB MISC, all
registers are used by different drivers.

Well, it's even worse to have a bunch of other drivers randomly trample
on a set of registers they don't own.


Only chip id revision registers are used in clock driver.

There are already global variables exposed by the Tegra fuse driver; can
you just read those?


It is not about variables or some value, we have to read some apb 
register to flush the write operation in apb bus before we disable 
peripheral clock.

We are using chip id revision register for this purpose.


OR

another way is to add chip id revision register region to CAR node as
done for pinmux node and remove apb misc node.

The pinmux controller doesn't have a reg entry for the chip ID register.
I don't understand what you mean here.

I mean as we have separate entry for PAD control registers region in
pinmux node we can have also have separate entry for chid id revision
register region in CAR node.

The pad control registers are part of the pinmux HW, so it makes perfect
sense for the pinmux driver to control them. The APB misc registers
aren't part of the clock register set, so it doesn't make sense to the
clock driver to touch them.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Stephen Warren
On 01/03/2013 06:48 PM, Prashant Gaikwad wrote:
> On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:
>> On 01/02/2013 11:11 PM, Prashant Gaikwad wrote:
>>> On Thursday 03 January 2013 03:30 AM, Stephen Warren wrote:
>>>> On 12/27/2012 07:47 AM, Prashant Gaikwad wrote:
>>>>> APB misc contains multiple registers required by different modules
>>>>> such as CAR.
>>>> I don't see a DT binding document that describes what
>>>> nvidia,tegra30-apbmisc means. Also, the register range for this new
>>>> node
>>>> overlaps that for the pinmux node, so they can't both "request" their
>>>> register region. You may need multiple entries in the apbmisc reg
>>>> property to avoid this.
>>> apbmisc reg for Tegra30 can be divided into following entries:
>>>
>>> strap registers
>>> jtag configuration registers
>>> pull_up/pull_down control registers
>>> vclk control registers
>>> tvdac registers
>>> chip id revision registers
>>> pad control registers
>>>
>>> This list is not same for Tegra20 and Tegra30.
>> OK. It sounds like we need a true APB MISC driver then, to abstract the
>> differences; the clock driver really shouldn't be touching the APB MISC
>> registers in all likelihood, unless a subset of the sections you mention
>> above are truly dedicated to clock functionality.
> 
> I don't think it is a good idea to create a driver for APB MISC, all
> registers are used by different drivers.

Well, it's even worse to have a bunch of other drivers randomly trample
on a set of registers they don't own.

> Only chip id revision registers are used in clock driver.

There are already global variables exposed by the Tegra fuse driver; can
you just read those?

>>> OR
>>>
>>> another way is to add chip id revision register region to CAR node as
>>> done for pinmux node and remove apb misc node.
>>
>> The pinmux controller doesn't have a reg entry for the chip ID register.
>> I don't understand what you mean here.
> 
> I mean as we have separate entry for PAD control registers region in
> pinmux node we can have also have separate entry for chid id revision
> register region in CAR node.

The pad control registers are part of the pinmux HW, so it makes perfect
sense for the pinmux driver to control them. The APB misc registers
aren't part of the clock register set, so it doesn't make sense to the
clock driver to touch them.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] Revert "nohz: Fix idle ticks in cpu summary line of /proc/stat" (commit 7386cdbf2f57ea8cff3c9fde93f206e58b9fe13f).

2013-01-03 Thread Srivatsa Vaddagiri
With offline cpus no longer being seen in nohz mode (ts->idle_active=0), we
don't need the check for cpu_online() introduced in commit 7386cdbf. Offline
cpu's idle time as last recorded in its ts->idle_sleeptime will be reported
(thus excluding its offline time as part of idle time statistics).

Cc: mho...@suse.cz
Cc: srivatsa.b...@linux.vnet.ibm.com
Signed-off-by: Srivatsa Vaddagiri 
---
 fs/proc/stat.c |   14 --
 1 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e296572..64c3b31 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -45,13 +45,10 @@ static cputime64_t get_iowait_time(int cpu)
 
 static u64 get_idle_time(int cpu)
 {
-   u64 idle, idle_time = -1ULL;
-
-   if (cpu_online(cpu))
-   idle_time = get_cpu_idle_time_us(cpu, NULL);
+   u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
 
if (idle_time == -1ULL)
-   /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
+   /* !NO_HZ so we can rely on cpustat.idle */
idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
else
idle = usecs_to_cputime64(idle_time);
@@ -61,13 +58,10 @@ static u64 get_idle_time(int cpu)
 
 static u64 get_iowait_time(int cpu)
 {
-   u64 iowait, iowait_time = -1ULL;
-
-   if (cpu_online(cpu))
-   iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+   u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL);
 
if (iowait_time == -1ULL)
-   /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
+   /* !NO_HZ so we can rely on cpustat.iowait */
iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
else
iowait = usecs_to_cputime64(iowait_time);
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] cpuhotplug/nohz: Remove offline cpus from nohz-idle state

2013-01-03 Thread Srivatsa Vaddagiri
Modify idle loop of arm, mips, s390, sh and x86 architectures to exit from nohz
state before dying upon hot-remove. This change is needed to avoid userspace
tools like top command from seeing a rollback in total idle time over some
sampling periods.

Additionally, modify idle loop on all architectures supporting cpu hotplug to
have idle thread of a dying cpu die immediately after scheduler returns control
to it. There is no point in wasting time via calls to *_enter()/*_exit() before
noticing the need to die and dying.

Additional ARM specific change:
Revert commit ff081e05 ("ARM: 7457/1: smp: Fix suspicious
RCU originating from cpu_die()"), which added a RCU_NONIDLE() wrapper
around call to complete(). That wrapper is no longer needed as cpu_die() is
now called outside of a rcu_idle_enter()/exit() section. I also think that the
wait_for_completion() based wait in ARM's __cpu_die() can be replaced with a
busy-loop based one, as the wait there in general should be terminated within
few cycles.

Cc: Russell King 
Cc: Paul E. McKenney 
Cc: Stephen Boyd 
Cc: linux-arm-ker...@lists.infradead.org
Cc: Mike Frysinger 
Cc: uclinux-dist-de...@blackfin.uclinux.org
Cc: Ralf Baechle 
Cc: linux-m...@linux-mips.org
Cc: Benjamin Herrenschmidt 
Cc: linuxppc-...@lists.ozlabs.org
Cc: Martin Schwidefsky 
Cc: linux-s...@vger.kernel.org
Cc: Paul Mundt 
Cc: linux...@vger.kernel.org
Cc: "David S. Miller" 
Cc: sparcli...@vger.kernel.org
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: mho...@suse.cz
Cc: srivatsa.b...@linux.vnet.ibm.com
Signed-off-by: Srivatsa Vaddagiri 
---
 arch/arm/kernel/process.c  |9 -
 arch/arm/kernel/smp.c  |2 +-
 arch/blackfin/kernel/process.c |8 
 arch/mips/kernel/process.c |6 +++---
 arch/powerpc/kernel/idle.c |2 +-
 arch/s390/kernel/process.c |4 ++--
 arch/sh/kernel/idle.c  |5 ++---
 arch/sparc/kernel/process_64.c |3 ++-
 arch/x86/kernel/process.c  |5 ++---
 9 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index c6dec5f..254099b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -191,11 +191,6 @@ void cpu_idle(void)
rcu_idle_enter();
ledtrig_cpu(CPU_LED_IDLE_START);
while (!need_resched()) {
-#ifdef CONFIG_HOTPLUG_CPU
-   if (cpu_is_offline(smp_processor_id()))
-   cpu_die();
-#endif
-
/*
 * We need to disable interrupts here
 * to ensure we don't miss a wakeup call.
@@ -224,6 +219,10 @@ void cpu_idle(void)
rcu_idle_exit();
tick_nohz_idle_exit();
schedule_preempt_disabled();
+#ifdef CONFIG_HOTPLUG_CPU
+   if (cpu_is_offline(smp_processor_id()))
+   cpu_die();
+#endif
}
 }
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 84f4cbf..a8e3b8a 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -251,7 +251,7 @@ void __ref cpu_die(void)
mb();
 
/* Tell __cpu_die() that this CPU is now safe to dispose of */
-   RCU_NONIDLE(complete(&cpu_died));
+   complete(&cpu_died);
 
/*
 * actual CPU shutdown procedure is at least platform (if not
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 3e16ad9..2bee1af 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -83,10 +83,6 @@ void cpu_idle(void)
while (1) {
void (*idle)(void) = pm_idle;
 
-#ifdef CONFIG_HOTPLUG_CPU
-   if (cpu_is_offline(smp_processor_id()))
-   cpu_die();
-#endif
if (!idle)
idle = default_idle;
tick_nohz_idle_enter();
@@ -98,6 +94,10 @@ void cpu_idle(void)
preempt_enable_no_resched();
schedule();
preempt_disable();
+#ifdef CONFIG_HOTPLUG_CPU
+   if (cpu_is_offline(smp_processor_id()))
+   cpu_die();
+#endif
}
 }
 
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index a11c6f9..41102a0 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -71,13 +71,13 @@ void __noreturn cpu_idle(void)
start_critical_timings();
}
}
+   rcu_idle_exit();
+   tick_nohz_idle_exit();
+   schedule_preempt_disabled();
 #ifdef CONFIG_HOTPLUG_CPU
if (!cpu_online(cpu) && !cpu_isset(cpu, cpu_callin_map))
play_dead();
 #endif
-   rcu_idle_exit();
-   tick_nohz_idle_exit();
-   schedule_preempt_disabled();
}
 }
 
diff --git a/arch/powerpc/kernel/idle.c 

[PATCH 0/2] cpuhotplug/nohz: Fix issue of "negative" idle time

2013-01-03 Thread Srivatsa Vaddagiri
On most architectures (arm, mips, s390, sh and x86) idle thread of a cpu does
not cleanly exit nohz state before dying upon hot-remove. As a result,
offline cpu is seen to be in nohz mode (ts->idle_active = 1) and its offline
time can potentially be included in total idle time reported via /proc/stat.
When the same cpu later comes online, its offline time however is not included
in its idle time statistics, thus causing a rollback in total idle time to be
observed by applications like top.

Example output from Android top command highlighting this issue is below:

User 232%, System 70%, IOW 46%, IRQ 1%
User 1322 + Nice 0 + Sys 399 + Idle -1423 + IOW 264 + IRQ 0 + SIRQ 7 = 569

top is reporting system to be idle for -1423 ticks over some sampling period.
This happens as total idle time reported in cpu line of /proc/stat *dropped*
from the last value observed (cached) by top command.

While this was originally seen on a ARM platform running 3.4 based kernel, I
could easily recreate it on my x86 desktop running latest tip/master kernel
(HEAD 3a7bfcad). Online/offline a cpu in a tight loop and in another loop read
/proc/stat and observe if total idle time drops from previously read value.

Although commit 7386cdbf (nohz: Fix idle ticks in cpu summary line of
/proc/stat) aims to avoid this bug, its not preemption proof. A
thread could get preempted after the cpu_online() check in get_idle_time(), thus
potentially leading to get_cpu_idle_time_us() being invoked on a offline cpu.

One potential fix is to serialize hotplug with /proc/stat read operation (via
use of get/put_online_cpus()), which I disliked in favor of the other
solution proposed in this series.

In this patch series:

- Patch 1/2 modifies idle loop on architectures arm, mips, s390, sh and x86 to
  exit nohz state before the associated idle thread dies upon hotremove. This
  fixes the idle time accounting bug.

  Patch 1/2 also modifies idle loop on all architectures supporting cpu hotplug
  to have idle thread of a dying cpu die immediately after schedule() returns
  control to it. I see no point in wasting time via calls to *_enter()/*_exit()
  before noticing the need to die and dying.

- Patch 2/2 reverts commit 7386cdbf (nohz: Fix idle ticks in cpu summary line of
  /proc/stat). The cpu_online() check introduced by it is no longer necessary
  with Patch 1/2 applied. Having fewer code sites worry about online status of
  cpus is a good thing!

---

 arch/arm/kernel/process.c  |9 -
 arch/arm/kernel/smp.c  |2 +-
 arch/blackfin/kernel/process.c |8 
 arch/mips/kernel/process.c |6 +++---
 arch/powerpc/kernel/idle.c |2 +-
 arch/s390/kernel/process.c |4 ++--
 arch/sh/kernel/idle.c  |5 ++---
 arch/sparc/kernel/process_64.c |3 ++-
 arch/x86/kernel/process.c  |5 ++---
 fs/proc/stat.c |   14 --
 10 files changed, 25 insertions(+), 33 deletions(-)
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
hosted by The Linux Foundation

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 2/8] Don't allow volatile attribute on THP and KSM

2013-01-03 Thread Minchan Kim
On Thu, Jan 03, 2013 at 08:27:31AM -0800, Dave Hansen wrote:
> On 01/02/2013 08:28 PM, Minchan Kim wrote:
> > VOLATILE implies that the pages in the range aren't in the working set any more
> > so it's pointless that make them to THP/KSM.
> 
> One of the points of this implementation is that it be able to preserve
> memory contents when there is no pressure.  If those contents happen to
> contain a THP/KSM page, and there's no pressure, it seems like the right
> thing to do is to leave that memory in place.

Indeed. I should have written more clearly,

Current implementation is following as

1. madvised-THP/KSM(1, 10) -> mvolatile(1, 10) -> fail
2. mvolatile(1, 10) -> madvised-THP/KSM(1, 10) -> fail
3. always-THP -> mvolatile -> success -> if memory pressure happens
   -> split_huge_page -> discard.

I think 2,3 makes sense to me but we need to fix 1 in further patches.

> 
> It might be a fair thing to do this in order to keep the implementation
> more sane at the moment.  But, we should make sure there's some good
> text on that in the changelog.

Absolutely, Thanks for pointing out, Dave.

> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: protect against concurrent vma expansion

2013-01-03 Thread Al Viro
On Thu, Jan 03, 2013 at 06:40:05PM -0600, Simon Jeons wrote:
> On Wed, 2012-12-19 at 19:01 -0800, Michel Lespinasse wrote:
> > Hi Simon,
> > 
> > On Wed, Dec 19, 2012 at 5:56 PM, Simon Jeons  wrote:
> > > One question.
> > >
> > > I found that mainly callsite of expand_stack() is #PF, but it holds
> > > mmap_sem each time before call expand_stack(), how can hold a *shared*
> > > mmap_sem happen?
> > 
> > the #PF handler calls down_read(&mm->mmap_sem) before calling expand_stack.
> > 
> > I think I'm just confusing you with my terminology; shared lock ==
> > read lock == several readers might hold it at once (I'd say they share
> > it)
> 
> Sorry for my late response. 
> 
> Since expand_stack() will modify vma, then why hold a read lock here?

To prevent that vma being ripped out.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 3/8] MFD:rtsx: Declare that the DMA address limitation is 32bit explicitly

2013-01-03 Thread wwang
于 2013年01月03日 20:25, Dan Carpenter 写道:
> On Fri, Dec 28, 2012 at 10:41:28AM +0800, wei_w...@realsil.com.cn wrote:
>> From: Wei WANG 
>>
>> Realtek PCIe card reader only supports 32bit DMA
>>
> Is this a bugfix?  If so what does the bug look like?
>
> regards,
> dan carpenter
>
Hi Dan:

No, this is not a bugfix. In default, the kernel will allocate 32bit
address for DMA. This declaration can improve the readability.

BR,
wei wang
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC v4 0/3] Support volatile for anonymous range

2013-01-03 Thread Minchan Kim
On Fri, Dec 28, 2012 at 09:24:53AM +0900, Kamezawa Hiroyuki wrote:
> (2012/12/26 12:46), Minchan Kim wrote:
> >Hi Kame,
> >
> >What are you doing these holiday season? :)
> >I can't believe you sit down in front of computer.
> >
> Honestly, my holiday starts tomorrow ;) (but until 1/5 in the next year.)
> 
> >>
> >>Hm, by the way, the user need to attach pages to the process by causing 
> >>page-fault
> >>(as you do by memset()) before calling mvolatile() ?
> >
> >For effectiveness, Yes.
> >
> 
> Isn't it better to make page-fault by get_user_pages() in mvolatile() ?
> Calling page fault in userland seems just to increase burden of apps.

It seems you misunderstood. Firstly, this patch's goal is to minimize
minor fault + page allocation + memset_zero if possible on anon pages.

If someone(like allocator) calls madvise(DONTNEED)/munmap on range
which has garbage collected memory, VM zaps all the pte so if user
try to reuse that range, we can't avoid above overheads.

The mvolatile avoids them with not zapping ptes when memory pressure isn't
severe while VM can discard pages without swapping out if memory pressure
happens.

So, GUP in mvolatile isn't necessary.

> 
> >>
> >>I think your approach is interesting, anyway.
> >
> >Thanks for your interest, Kame.
> >
> >あけましておめでとう.
> >
> 
> A happy new year.
> 
> Thanks,
> -Kame
> 
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 7/8] zswap: add to mm/

2013-01-03 Thread Dave Chinner
On Thu, Jan 03, 2013 at 02:37:01PM -0800, Dan Magenheimer wrote:
> > From: Dave Chinner [mailto:da...@fromorbit.com]
> > Subject: Re: [PATCH 7/8] zswap: add to mm/
> > 
> > 
> 
> OK, I have suitably proven how little I know about slab
> and have received some needed education from your
> response... Thanks for that Dave.
> 
> So let me ask some questions instead of making
> stupid assumptions.
> 
> > Thinking that there is a fixed amount of memory that you should
> > reserve for some subsystem is simply the wrong approach to take.
> > caches are dynamic and the correct system balance should result of
> > the natural behaviour of the reclaim algorithms.
> >
> > The shrinker infrastructure doesn't set any set size goals - it
> > simply tries to balance the reclaim across all the shrinkers and
> > relative to the page cache... 
> 
> First, it's important to note that zcache/zswap is not
> really a subsystem.  It's simply a way of increasing
> the number of anonymous pages (zswap and zcache) and
> pagecache pages (zcache only) in RAM by using compression.
> Because compressed pages can't be byte-addressed directly,
> pages enter zcache/zswap through a "transformation"
> process I've likened to a Fourier transform:  In
> their compressed state, they must be managed differently
> than normal whole pages.  Compressed anonymous pages must
> transition back to uncompressed before they can be used.
> Compressed pagecache pages (zcache only) can be either
> uncompressed when needed or gratuitously discarded (eventually)
> when not needed.
> 
> So I've been proceeding with the assumption that it is the
> sum of wholepages used by both compressed-anonymous pages
> and uncompressed-anonymous pages that must be managed/balanced,
> and that this sum should be managed similarly to the non-z
> case of the total number of anonymous pages in the system
> (and similarly for compressed+uncompressed pagecache pages).
> 
> Are you suggesting that slab can/should be used instead?

I'm not suggesting that any specific solution can/should be used.
What I'm trying to point out is that caches and shrinkers do not
need to be slab based. i.e. all that matters is that you have some
allocation method, some method of tracking the allocated objects,
and some method of reclaiming them, and all the details/policies can
be hidden within the subsystem via shrinker based reclaim...

> > And so the two subsystems need different reclaim implementations.
> > And, well, that's exactly what we have shrinkers for - implementing
> > subsystem specific reclaim policy. The shrinker infrastructure is
> > responsible for them keeping balance between all the caches that
> > have shrinkers and the size of the page cache...
> 
> Given the above, do you think either compressed-anonymous-pages or
> compressed-pagecache-pages are suitable candidates for the shrinker
> infrastructure?

I don't know all the details of what you are trying to do, but you
seem to be describing a two-level hierarchy - a pool of compressed
data and a pool of uncompressed data, and under memory pressure are
migrating data from the uncompressed pool to the compressed pool. On
access, you are migrating back the other way.  Hence it seems to me
that you could implement the process of migration from the
uncompressed pool to the compressed pool as a shrinker so that it
only happens as a result of memory pressure

> Note that compressed anonymous pages are always dirty so
> cannot be "reclaimed" as such.  But the mechanism that Seth
> and I are working on causes compressed anonymous pages to
> be decompressed and then sent to backing store, which does
> (eventually, after I/O latency) free up pageframes.

The lack of knowledge I have about zcache/zswap means I might be
saying something stupid, but why wouldn't you simply write the
uncompressed page to the backing store and then compress it on IO
completion? If you have to uncompress it for the application to
either modify the page again or write it to the backing store,
doesn't it make things much simpler if the cache only holds clean
pages? And if it only holds clean pages, then another shrinker could
be used to keep the size of it in check

> Currently zcache does use the shrinker API for reclaiming
> pageframes-used-for-compressed-pagecache-pages.  Since
> these _are_ a form of pagecache pages, is the shrinker suitable?

Yes.

> > There are also cases where we've moved metadata caches out of the
> > page cache into shrinker controlled caches because the page cache
> > reclaim is too simplistic to handle the complex relationships
> > between filesystem metadata. We've done this in XFS, and IIRC btrfs
> > did this recently as well...
> 
> So although the objects in zswap/zcache are less than one page,
> they are still "data" not "metadata", true?

The page cache can be used to hold both filesystem metadata and user
data. As far as you're concerned, the page cache holds "information"
and you cannot make judgements about it's 

Re: [PATCH -v2 19/26] batman-adv: rename random32() to prandom_u32()

2013-01-03 Thread Antonio Quartulli
On Thu, Jan 03, 2013 at 09:19:15PM +0900, Akinobu Mita wrote:
> Use more preferable function name which implies using a pseudo-random
> number generator.
> 
> Signed-off-by: Akinobu Mita 
> Acked-by: Antonio Quartulli 
> Cc: Marek Lindner 
> Cc: Simon Wunderlich 
> Cc: Antonio Quartulli 
> Cc: b.a.t.m@lists.open-mesh.org
> Cc: "David S. Miller" 
> Cc: net...@vger.kernel.org
> ---

Hello Akinobu,

as you can see in <201301021952.49979.lindner_ma...@yahoo.de>, Marek Lindner
already applied this change onto our tree. You didn't need to resend this patch
to netdev, it will be sent by us through a future pull request.


Thanks a lot.
Cheers,

-- 
Antonio Quartulli

..each of us alone is worth nothing..
Ernesto "Che" Guevara


pgp__3FYamFN8.pgp
Description: PGP signature


Re: [PATCH v2 05/11] ARM: dt: tegra30: Add device node for APB MISC

2013-01-03 Thread Prashant Gaikwad

On Thursday 03 January 2013 09:41 PM, Stephen Warren wrote:

On 01/02/2013 11:11 PM, Prashant Gaikwad wrote:

On Thursday 03 January 2013 03:30 AM, Stephen Warren wrote:

On 12/27/2012 07:47 AM, Prashant Gaikwad wrote:

APB misc contains multiple registers required by different modules
such as CAR.

I don't see a DT binding document that describes what
nvidia,tegra30-apbmisc means. Also, the register range for this new node
overlaps that for the pinmux node, so they can't both "request" their
register region. You may need multiple entries in the apbmisc reg
property to avoid this.

apbmisc reg for Tegra30 can be divided into following entries:

strap registers
jtag configuration registers
pull_up/pull_down control registers
vclk control registers
tvdac registers
chip id revision registers
pad control registers

This list is not same for Tegra20 and Tegra30.

OK. It sounds like we need a true APB MISC driver then, to abstract the
differences; the clock driver really shouldn't be touching the APB MISC
registers in all likelihood, unless a subset of the sections you mention
above are truly dedicated to clock functionality.


I don't think it is a good idea to create a driver for APB MISC, all 
registers are used by different drivers.

Only chip id revision registers are used in clock driver.


OR

another way is to add chip id revision register region to CAR node as
done for pinmux node and remove apb misc node.

The pinmux controller doesn't have a reg entry for the chip ID register.
I don't understand what you mean here.


I mean that, just as we have a separate entry for the PAD control registers
region in the pinmux node, we can also have a separate entry for the chip id
revision register region in the CAR node.



--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: protect against concurrent vma expansion

2013-01-03 Thread Simon Jeons
On Thu, 2013-01-03 at 16:50 -0800, Michel Lespinasse wrote:
> On Thu, Jan 3, 2013 at 4:40 PM, Simon Jeons  wrote:
> > On Wed, 2012-12-19 at 19:01 -0800, Michel Lespinasse wrote:
> >> Hi Simon,
> >>
> >> On Wed, Dec 19, 2012 at 5:56 PM, Simon Jeons  wrote:
> >> > One question.
> >> >
> >> > I found that mainly callsite of expand_stack() is #PF, but it holds
> >> > mmap_sem each time before call expand_stack(), how can hold a *shared*
> >> > mmap_sem happen?
> >>
> >> the #PF handler calls down_read(&mm->mmap_sem) before calling expand_stack.
> >>
> >> I think I'm just confusing you with my terminology; shared lock ==
> >> read lock == several readers might hold it at once (I'd say they share
> >> it)
> >
> > Sorry for my late response.
> >
> > Since expand_stack() will modify vma, then why hold a read lock here?
> 
> Well, it'd be much nicer if we had a write lock, I think. But, we
> didn't know when taking the lock that we'd end up having to expand
> stacks.
> 
> What happens is that page faults don't generally modify vmas, so they
> get a read lock (just to know what vma the fault is happening in) and
> then fault in the page.
> 

Thanks for your quick explanation. 

> expand_stack() is the one exception to that - after getting the read
> lock as usual, we notice that the fault is not in any vma right now,
> but it's close enough to an expandable vma.

Does this scenario only occur for the userspace stack?

> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Linux 3.2.36

2013-01-03 Thread Ben Hutchings
I'm announcing the release of the 3.2.36 kernel.

All users of the 3.2 kernel series should upgrade.

The updated 3.2.y git tree can be found at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git 
linux-3.2.y
and can be browsed at the normal kernel.org git web browser:
http://git.kernel.org/?p=linux/kernel/git/stable/linux-stable.git

Ben.



 Documentation/hwmon/coretemp   |   12 +-
 Makefile   |2 +-
 arch/arm/include/asm/hwcap.h   |3 +-
 arch/arm/include/asm/vfpmacros.h   |   12 +-
 arch/arm/kernel/swp_emulate.c  |2 +
 arch/arm/mm/mmu.c  |2 +-
 arch/arm/vfp/vfpmodule.c   |9 +-
 arch/cris/include/asm/io.h |   39 ++-
 arch/mips/kernel/process.c |4 +-
 arch/powerpc/kernel/head_64.S  |2 +-
 arch/powerpc/kvm/44x_emulate.c |2 +
 arch/powerpc/platforms/embedded6xx/wii.c   |6 +-
 arch/s390/kvm/kvm-s390.c   |2 +-
 arch/x86/kernel/hpet.c |4 +-
 arch/x86/kvm/x86.c |6 +
 drivers/acpi/battery.c |   77 +
 drivers/acpi/processor_driver.c|1 +
 drivers/acpi/sleep.c   |  340 ++--
 drivers/acpi/video.c   |   14 +
 drivers/acpi/video_detect.c|   37 +++
 drivers/ata/libata-core.c  |1 +
 drivers/ata/libata-eh.c|1 +
 drivers/ata/libata-scsi.c  |6 +-
 drivers/ata/sata_promise.c |   15 +-
 drivers/ata/sata_svw.c |   35 ++
 drivers/atm/solos-pci.c|5 +-
 drivers/base/regmap/regmap-debugfs.c   |2 +-
 drivers/bcma/driver_mips.c |2 +-
 drivers/bluetooth/ath3k.c  |1 +
 drivers/bluetooth/btusb.c  |3 +
 drivers/char/agp/intel-agp.h   |1 +
 drivers/char/agp/intel-gtt.c   |2 +
 drivers/char/ramoops.c |4 +-
 drivers/edac/i7300_edac.c  |8 +-
 drivers/edac/i82975x_edac.c|   11 +-
 drivers/firewire/net.c |   13 +-
 drivers/gpu/drm/drm_crtc_helper.c  |4 +-
 drivers/gpu/drm/drm_edid.c |5 +
 drivers/gpu/drm/i915/i915_debugfs.c|4 +-
 drivers/gpu/drm/i915/i915_drv.c|1 +
 drivers/gpu/drm/i915/i915_gem.c|5 +
 drivers/gpu/drm/i915/i915_irq.c|4 +-
 drivers/gpu/drm/i915/i915_reg.h|2 +
 drivers/gpu/drm/i915/intel_bios.c  |   11 +-
 drivers/gpu/drm/i915/intel_display.c   |   95 +++---
 drivers/gpu/drm/i915/intel_drv.h   |5 +-
 drivers/gpu/drm/i915/intel_lvds.c  |   16 +
 drivers/gpu/drm/radeon/atombios_crtc.c |5 +
 drivers/gpu/drm/radeon/atombios_encoders.c |2 +-
 drivers/gpu/drm/radeon/evergreen.c |4 +-
 drivers/hid/hid-apple.c|3 +
 drivers/hid/hid-core.c |1 +
 drivers/hid/hid-ids.h  |1 +
 drivers/hid/hid-magicmouse.c   |2 +
 drivers/hwmon/coretemp.c   |   34 +-
 drivers/hwmon/fam15h_power.c   |4 +
 drivers/input/joystick/walkera0701.c   |7 +-
 drivers/input/serio/i8042-x86ia64io.h  |9 +
 drivers/iommu/intel-iommu.c|   11 +-
 drivers/leds/leds-lp5521.c |   16 +-
 drivers/md/dm-ioctl.c  |8 +
 drivers/md/persistent-data/dm-btree-internal.h |   16 +-
 drivers/md/persistent-data/dm-btree-remove.c   |   50 +--
 drivers/md/persistent-data/dm-btree-spine.c|6 +-
 drivers/md/persistent-data/dm-btree.c  |   22 +-
 drivers/mfd/mfd-core.c |   15 +-
 drivers/misc/hpilo.c   |   11 +-
 drivers/misc/sgi-xp/xpc_main.c |   34 +-
 drivers/mmc/host/sh_mmcif.c|4 -
 drivers/mtd/nand/gpmi-nand/gpmi-lib.c  |9 +
 drivers/net/bonding/bond_main.c|7 +
 drivers/net/bonding/bond_sysfs.c   |2 +
 drivers/net/can/dev.c  |3 +-
 drivers/net/ethernet/8390/ne.c |1 +
 

linux-next: build failure after merge of the pekey tree

2013-01-03 Thread Stephen Rothwell
Hi David,

After merging the pekey tree, today's linux-next build (x86_64
allmodconfig) failed like this:

ERROR: "modsign_keyring" [crypto/asymmetric_keys/pefile_key_parser.ko] 
undefined!
ERROR: "x509_key_preparse" [crypto/asymmetric_keys/pefile_key_parser.ko] 
undefined!

I have dropped the pekey tree for today.

-- 
Cheers,
Stephen Rothwell                    s...@canb.auug.org.au


pgpcipKWwClUP.pgp
Description: PGP signature


Re: 3.8-rc2: pciehp waitqueue hang...

2013-01-03 Thread Daniel J Blueman
On 3 January 2013 23:41, Jiang Liu  wrote:
> On 01/03/2013 11:11 PM, Daniel J Blueman wrote:
>> When the Apple thunderbolt ethernet adapter comes loose on my Macbook
>> Pro Retina (Intel DSL3510), we see pci_slot_name return
>> non-deterministic data (ie varying each boot), and we see pciehp_wp
>> remain armed with events causing the kthread to get stuck:
>>
>> tg3 :0a:00.0 eth0: Link is up at 1000 Mbps, full duplex
>> tg3 :0a:00.0 eth0: Flow control is on for TX and on for RX
>> 
>> pciehp :06:03.0:pcie24: Card not present on Slot(3)
>> tg3 :0a:00.0: tg3_abort_hw timed out, TX_MODE_ENABLE will not
>> clear MAC_TX_MODE=
>> tg3 :0a:00.0 eth0: No firmware running
>> tg3 :0a:00.0 eth0: Link is down
>> pcieport :00:01.1: System wakeup enabled by ACPI
>> pciehp :09:00.0:pcie24: unloading service driver pciehp
>> pciehp :09:00.0:pcie24: Latch open on
>> Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
>> pciehp :09:00.0:pcie24: Button pressed on
>> Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
>> pciehp :09:00.0:pcie24: Card present on
>> Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
>> pciehp :09:00.0:pcie24: Power fault on slot
>> \xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon
>> pciehp :09:00.0:pcie24: Power fault bit 0 set
>> pciehp :09:00.0:pcie24: PCI slot
>> #\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon
>> - powering on due to button press.
>> pciehp :09:00.0:pcie24: Link Training Error occurs
>> pciehp :09:00.0:pcie24: Failed to check link status
>> INFO: task kworker/0:1:52 blocked for more than 120 seconds.
[...]

> Hi Daniel,
> It seems like an issue caused by recursive PCIe HPC.
> Could you please help to try the patch from:
> http://www.spinics.net/lists/linux-pci/msg18625.html
> Thanks!
> Gerry

(adding Yijing)

Splendid; this fixes this failure nicely [1], finally releasing the bus.

If nothing else, I feel this should be queued for 3.8-rc3.

Many thanks,
  Daniel

--- [1]


pciehp :06:03.0:pcie24: Card not present on Slot(3)
tg3 :0a:00.0: tg3_abort_hw timed out, TX_MODE_ENABLE will not
clear MAC_TX_MODE=
tg3 :0a:00.0 eth0: No firmware running
tg3 :0a:00.0 eth0: Link is down
[sched_delayed] sched: RT throttling activated
pcieport :00:01.1: System wakeup enabled by ACPI
pciehp :09:00.0:pcie24: unloading service driver pciehp
pciehp :09:00.0:pcie24: Latch open on
Slot(\xffb0\x04Pd\x02\xff88\x\x\xff98\x04Pd\x02\xff88\x\xfbcon)
pciehp :09:00.0:pcie24: Button pressed on
Slot(\xffb0\x04Pd\x02\xff88\x\x\xff98\x04Pd\x02\xff88\x\xfbcon)
pciehp :09:00.0:pcie24: Card present on
Slot(\xffb0\x04Pd\x02\xff88\x\x\xff98\x04Pd\x02\xff88\x\xfbcon)
pciehp :09:00.0:pcie24: Power fault on slot
\xffb0\x04Pd\x02\xff88\x\x\xff98\x04Pd\x02\xff88\x\xfbcon
pciehp :09:00.0:pcie24: Power fault bit 0 set
pciehp :09:00.0:pcie24: PCI slot
#\xffb0\x04Pd\x02\xff88\x\x\xff98\x04Pd\x02\xff88\x\xfbcon
- powering on due to button press.
pciehp :09:00.0:pcie24: Link Training Error occurs
pciehp :09:00.0:pcie24: Failed to check link status
pci_bus :0a: busn_res: [bus 0a] is released
pci_bus :09: busn_res: [bus 09-0a] is released
-- 
Daniel J Blueman
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-03 Thread Bjorn Helgaas
On Thu, Jan 03, 2013 at 11:56:55PM +0100, Rafael J. Wysocki wrote:

> OK, I now have sent no less than three working versions of the patch that fixes
> the current code which _is_ insane.  You haven't even responded to the last
> one, but for the first two the reason why you didn't like them was something
> similar to "it may conflict with some future changes I'm planning".  Well,
> that might be used to reject pretty much any change and I'm not considering it
> as a good enough reason for blocking a fix.  Sorry about that.

I think your memory is faulty.  My response to the first
(https://lkml.org/lkml/2012/12/20/407) was "Thanks for cleaning this up, I
have an interface concern, here's an outline of a possible alternative."

My response to the second (https://lkml.org/lkml/2012/12/26/72) was "I like
this much better, Acked-by: Bjorn Helgaas."  Then Yinghai noticed the issue
with non-ACPI host bridges, and you abandoned that approach.

I took a few days of vacation, then spent the better part of yesterday
exploring the reasons why x86 and ia64 don't use the "parent" argument when
several other arches do, and worked up a patch
(https://lkml.org/lkml/2013/1/2/285).  It turned out to have a fatal flaw,
but was done in good faith.

It's true I haven't responded to the third one, posted about 12 hours ago.

I still like the approach of the second patch.  What would you think
of the following incremental change to it?  I did reproduce Yinghai's
issue with non-ACPI host bridges, and this change resolves it for me.

diff -u b/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
--- b/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -522,6 +522,7 @@
sd = &info->sd;
sd->domain = domain;
sd->node = node;
+   sd->acpi_handle = device->handle;
/*
 * Maybe the desired pci bus has been already scanned. In such case
 * it is unnecessary to scan the pci bus with the given domain,busnum.
@@ -596,9 +597,8 @@
 int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
 {
struct pci_sysdata *sd = bridge->bus->sysdata;
-   struct pci_root_info *info = container_of(sd, struct pci_root_info, sd);
 
-   ACPI_HANDLE_SET(&bridge->dev, info->bridge->handle);
+   ACPI_HANDLE_SET(&bridge->dev, sd->acpi_handle);
return 0;
 }
 
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -14,6 +14,7 @@
 struct pci_sysdata {
int domain; /* PCI domain */
int node;   /* NUMA node */
+   void*acpi_handle;
 #ifdef CONFIG_X86_64
void*iommu; /* IOMMU private data */
 #endif

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-03 Thread Simon Jeons
On Thu, 2013-01-03 at 13:35 +0900, Namjae Jeon wrote:
> 2013/1/2, Jan Kara :
> > On Tue 01-01-13 08:51:04, Wanpeng Li wrote:
> >> On Mon, Dec 31, 2012 at 12:30:54PM +0100, Jan Kara wrote:
> >> >On Sun 30-12-12 14:59:50, Namjae Jeon wrote:
> >> >> From: Namjae Jeon 
> >> >>
> >> >> Consider Process A: huge I/O on sda
> >> >> doing heavy write operation - dirty memory becomes more
> >> >> than dirty_background_ratio
> >> >> on HDD - flusher thread flush-8:0
> >> >>
> >> >> Consider Process B: small I/O on sdb
> >> >> doing while [1]; read 1024K + rewrite 1024K + sleep 2sec
> >> >> on Flash device - flusher thread flush-8:16
> >> >>
> >> >> As Process A is a heavy dirtier, dirty memory becomes more
> >> >> than dirty_background_thresh. Due to this, below check becomes
> >> >> true(checking global_page_state in over_bground_thresh)
> >> >> for all bdi devices(even for very small dirtied bdi - sdb):
> >> >>
> >> >> In this case, even small cached data on 'sdb' is forced to flush
> >> >> and writeback cache thrashing happens.
> >> >>
> >> >> When we added debug prints inside above 'if' condition and ran
> >> >> above Process A(heavy dirtier on bdi with flush-8:0) and
> >> >> Process B(1024K frequent read/rewrite on bdi with flush-8:16)
> >> >> we got below prints:
> >> >>
> >> >> [Test setup: ARM dual core CPU, 512 MB RAM]
> >> >>
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84720 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 94720 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   384 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   960 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92160 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   768 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   320 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92032 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 91968 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =  1024 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   576 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84352 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   512 KB
> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92608 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92544 KB
> >> >>
> >> >> As mentioned in above log, when global dirty memory > global
> >> >> background_thresh
> >> >> small cached data is also forced to flush by flush-8:16.
> >> >>
> >> >> If removing global background_thresh checking code, we can reduce
> >> >> cache
> >> >> thrashing of frequently used small data.
> >> >  It's not completely clear to me:
> >> >  Why is this a problem? Wearing of the flash? Power consumption? I'd
> >> > like
> >> >to understand this before changing the code...
> Hi Jan.
> Yes, it can reduce wearing and fragmentation of flash. And also from
> one scenario - we
> think it might reduce power consumption also.
> 
> >> >
> >> >> And It will be great if we can reserve a portion of writeback cache
> >> >> using
> >> >> min_ratio.
> >> >>
> >> >> After applying patch:
> >> >> $ echo 5 > /sys/block/sdb/bdi/min_ratio
> >> >> $ cat /sys/block/sdb/bdi/min_ratio
> >> >> 5
> >> >>
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  84160 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  96960 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  94080 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  93120 KB
> >> >> [over_bground_thresh]: wakeup flush-8:0 : 

[3.8-rc2] stuck at reading CIFS mounted directory

2013-01-03 Thread Jongman Heo
Hi, all,

In 3.8-rc2, access to CIFS-mounted directory (df, ls, or similar) got stuck 
with following message.

It's mounted with...
  mount -t cifs ///Share  /mnt/window -o 
user=jongman.heo,password=,sec=ntlm


[16655.288591] INFO: task bash:4042 blocked for more than 120 seconds.
[16655.318117] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[16655.318123] bashD dada9c5c 0  4042  1 0x0004
[16655.318132]  dada9cd0 0082 0282 dada9c5c c09022c6 dada9c7c c044d316 
c0c7c300
[16655.318139]  d6db3a7b 0f09 c0c7c300  0f09 f3b7b240 c04401ba 

[16655.318145]  c0b9e0d8 f598e960  0303 dada9c98 dada9c98 f598e960 
0006
[16655.318150] Call Trace:
[16655.342785]  [] ? _raw_spin_unlock_irqrestore+0xf/0x11
[16655.351554]  [] ? __wake_up+0x3b/0x42
[16655.358802]  [] ? call_usermodehelper_fns+0x148/0x152
[16655.358840]  [] ? __request_module+0x15e/0x1a1
[16655.358842]  [] ? call_usermodehelper_freeinfo+0x19/0x19
[16655.358845]  [] schedule+0x51/0x53
[16655.358847]  [] schedule_preempt_disabled+0x8/0xa
[16655.384345]  [] __mutex_lock_common+0xd6/0x123
[16655.384430]  [] __mutex_lock_slowpath+0x20/0x22
[16655.384436]  [] ? mutex_lock+0x18/0x25
[16655.384441]  [] mutex_lock+0x18/0x25
[16655.384892]  [] cifs_reconnect_tcon+0x170/0x252
[16655.384953]  [] ? should_resched+0x8/0x22
[16655.384963]  [] ? _cond_resched+0x8/0x1c
[16655.384969]  [] smb_init+0x1d/0x6d
[16655.385023]  [] CIFSSMBQPathInfo+0x4e/0x1e4
[16655.385071]  [] cifs_query_path_info+0x38/0x73
[16655.385080]  [] cifs_get_inode_info+0x122/0x3ac
[16655.385548]  [] ? walk_component+0x14a/0x17a
[16655.385570]  [] ? build_path_from_dentry+0xa3/0x19e
[16655.385585]  [] ? build_path_from_dentry+0xa3/0x19e
[16655.385596]  [] ? build_path_from_dentry+0xa3/0x19e
[16655.385601]  [] ? getname_flags+0x59/0xeb
[16655.385606]  [] ? _raw_spin_lock+0x8/0xa
[16655.385613]  [] cifs_revalidate_dentry_attr+0x120/0x168
[16655.385618]  [] cifs_getattr+0x5e/0xe3
[16655.385625]  [] vfs_getattr+0x37/0x4e
[16655.385631]  [] ? cifs_revalidate_dentry+0x20/0x20
[16655.385639]  [] vfs_fstatat+0x59/0x8a
[16655.385645]  [] vfs_stat+0x19/0x1b
[16655.385652]  [] sys_stat64+0x11/0x22
[16655.385659]  [] ? should_resched+0x8/0x22
[16655.385668]  [] ? _cond_resched+0x8/0x1c
[16655.385674]  [] ? task_work_run+0x6d/0x79
[16655.385825]  [] ? __do_page_fault+0x33b/0x33b
[16655.385834]  [] ? do_page_fault+0x8/0xa
[16655.385840]  [] sysenter_do_call+0x12/0x2c



[PATCH v7u1 21/31] x86, kexec: only set ident mapping for ram.

2013-01-03 Thread Yinghai Lu
We should not set mapping for all under max_pfn.
That causes same problem that is fixed by

x86, mm: Only direct map addresses that are marked as E820_RAM

This patch exposes the pfn_mapped array, and only sets ident mapping for ranges
in that array.

This patch relies on the new ident_mapping_init, which can handle existing
pgd/pud entries across different calls.

Signed-off-by: Yinghai Lu 
Cc: Alexander Duyck 
---
 arch/x86/include/asm/page.h|4 
 arch/x86/kernel/machine_kexec_64.c |   13 +
 arch/x86/mm/init.c |4 ++--
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 3698a6a..c878924 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -17,6 +17,10 @@
 
 struct page;
 
+#include <linux/range.h>
+extern struct range pfn_mapped[];
+extern int nr_pfn_mapped;
+
 static inline void clear_user_page(void *page, unsigned long vaddr,
   struct page *pg)
 {
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index d2d7e02..4eabc16 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -100,10 +100,15 @@ static int init_pgtable(struct kimage *image, unsigned 
long start_pgtable)
 
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
-   result = kernel_ident_mapping_init(&info, level4p,
-   0, max_pfn << PAGE_SHIFT);
-   if (result)
-   return result;
+   for (i = 0; i < nr_pfn_mapped; i++) {
+   mstart = pfn_mapped[i].start << PAGE_SHIFT;
+   mend   = pfn_mapped[i].end << PAGE_SHIFT;
+
+   result = kernel_ident_mapping_init(&info,
+level4p, mstart, mend);
+   if (result)
+   return result;
+   }
 
/*
 * segments's mem ranges could be outside 0 ~ max_pfn,
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ab26a15..d704b36 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -300,8 +300,8 @@ static int __meminit split_mem_range(struct map_range *mr, 
int nr_range,
return nr_range;
 }
 
-static struct range pfn_mapped[E820_X_MAX];
-static int nr_pfn_mapped;
+struct range pfn_mapped[E820_X_MAX];
+int nr_pfn_mapped;
 
 static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long 
end_pfn)
 {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 02/31] x86, 64bit, mm: make pgd next calculation consistent with pud/pmd

2013-01-03 Thread Yinghai Lu
Just like the way we calculate next for pud and pmd, aka
round down and add size.

Also, do not do boundary-checking with 'next', and just pass 'end' down
to phys_pud_init() instead. Because the loop in phys_pud_init() stops at
PTRS_PER_PUD and thus can handle a possibly bigger 'end' properly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_64.c |6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 167439c..b1178eb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -530,9 +530,7 @@ kernel_physical_mapping_init(unsigned long start,
pgd_t *pgd = pgd_offset_k(start);
pud_t *pud;
 
-   next = (start + PGDIR_SIZE) & PGDIR_MASK;
-   if (next > end)
-   next = end;
+   next = (start & PGDIR_MASK) + PGDIR_SIZE;
 
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
@@ -542,7 +540,7 @@ kernel_physical_mapping_init(unsigned long start,
}
 
pud = alloc_low_page();
-   last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
+   last_map_addr = phys_pud_init(pud, __pa(start), __pa(end),
 page_size_mask);
 
spin_lock(&init_mm.page_table_lock);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 20/31] x86, kexec: replace ident_mapping_init and init_level4_page

2013-01-03 Thread Yinghai Lu
Currently ident_mapping_init checks whether the pgd/pud is present for every
2M range, so when several 2M ranges fall in the same PUD, it keeps re-checking
whether that pud is there.

init_level4_page does not check existing pgd/pud.

We could use generic mapping_init to replace them all.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/machine_kexec_64.c |  161 ++--
 1 file changed, 26 insertions(+), 135 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index be14ee1..d2d7e02 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -16,144 +16,12 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
 #include 
 
-static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
-   unsigned long addr)
-{
-   pud_t *pud;
-   pmd_t *pmd;
-   struct page *page;
-   int result = -ENOMEM;
-
-   addr &= PMD_MASK;
-   pgd += pgd_index(addr);
-   if (!pgd_present(*pgd)) {
-   page = kimage_alloc_control_pages(image, 0);
-   if (!page)
-   goto out;
-   pud = (pud_t *)page_address(page);
-   clear_page(pud);
-   set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
-   }
-   pud = pud_offset(pgd, addr);
-   if (!pud_present(*pud)) {
-   page = kimage_alloc_control_pages(image, 0);
-   if (!page)
-   goto out;
-   pmd = (pmd_t *)page_address(page);
-   clear_page(pmd);
-   set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-   }
-   pmd = pmd_offset(pud, addr);
-   if (!pmd_present(*pmd))
-   set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
-   result = 0;
-out:
-   return result;
-}
-
-static int ident_mapping_init(struct kimage *image, pgd_t *level4p,
-   unsigned long mstart, unsigned long mend)
-{
-   int result;
-
-   mstart = round_down(mstart, PMD_SIZE);
-   mend   = round_up(mend - 1, PMD_SIZE);
-
-   while (mstart < mend) {
-   result = init_one_level2_page(image, level4p, mstart);
-   if (result)
-   return result;
-
-   mstart += PMD_SIZE;
-   }
-
-   return 0;
-}
-
-static void init_level2_page(pmd_t *level2p, unsigned long addr)
-{
-   unsigned long end_addr;
-
-   addr &= PAGE_MASK;
-   end_addr = addr + PUD_SIZE;
-   while (addr < end_addr) {
-   set_pmd(level2p++, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
-   addr += PMD_SIZE;
-   }
-}
-
-static int init_level3_page(struct kimage *image, pud_t *level3p,
-   unsigned long addr, unsigned long last_addr)
-{
-   unsigned long end_addr;
-   int result;
-
-   result = 0;
-   addr &= PAGE_MASK;
-   end_addr = addr + PGDIR_SIZE;
-   while ((addr < last_addr) && (addr < end_addr)) {
-   struct page *page;
-   pmd_t *level2p;
-
-   page = kimage_alloc_control_pages(image, 0);
-   if (!page) {
-   result = -ENOMEM;
-   goto out;
-   }
-   level2p = (pmd_t *)page_address(page);
-   init_level2_page(level2p, addr);
-   set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
-   addr += PUD_SIZE;
-   }
-   /* clear the unused entries */
-   while (addr < end_addr) {
-   pud_clear(level3p++);
-   addr += PUD_SIZE;
-   }
-out:
-   return result;
-}
-
-
-static int init_level4_page(struct kimage *image, pgd_t *level4p,
-   unsigned long addr, unsigned long last_addr)
-{
-   unsigned long end_addr;
-   int result;
-
-   result = 0;
-   addr &= PAGE_MASK;
-   end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);
-   while ((addr < last_addr) && (addr < end_addr)) {
-   struct page *page;
-   pud_t *level3p;
-
-   page = kimage_alloc_control_pages(image, 0);
-   if (!page) {
-   result = -ENOMEM;
-   goto out;
-   }
-   level3p = (pud_t *)page_address(page);
-   result = init_level3_page(image, level3p, addr, last_addr);
-   if (result)
-   goto out;
-   set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
-   addr += PGDIR_SIZE;
-   }
-   /* clear the unused entries */
-   while (addr < end_addr) {
-   pgd_clear(level4p++);
-   addr += PGDIR_SIZE;
-   }
-out:
-   return result;
-}
-
 static void free_transition_pgtable(struct kimage *image)
 {
free_page((unsigned long)image->arch.pud);
@@ -203,15 +71,37 @@ err:
return result;
 }
 
+static void 

[PATCH v7u1 00/31] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G

2013-01-03 Thread Yinghai Lu
Currently the kdump reserved region is limited to below 896M because kexec
has that limitation, and bzImage also needs to stay below 4G.

To let kexec/kdump use ranges above 4G, we need to make it possible to
load bzImage and ramdisk above 4G.
During boot, bzImage will be unpacked at the same position and stay high.

The patches add fields in setup_header and boot_params to
1. get info about ramdisk position info above 4g from bootloader/kexec
2. get info about cmd_line_ptr info above 4g from bootloader/kexec
3. set xloadflags bit0 in header for bzImage, so that bootloader/kexec load
   can check it to decide whether bzImage can be put high.
4. use sentinel to make sure ext_* fields in boot_params could be used.

This patchset was tested with kexec-tools with local changes; those changes
will be sent to the kexec list later.

could be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-boot

and it is on top of linus's tree 2013-01-03
plus tip:x86/mm, tip:x86/mm2

-v2: add ext_cmd_line_ptr support, and handle boot_param/cmd_line is above
 4G case.
-v3: according to hpa, use xloadflags instead code32_start_offset.
 0x200 will not be changed...
-v4: move ext_ramdisk_image/ext_ramdisk_size/ext_cmd_line_ptr to boot_params.
 add handling cross GB boundary case.
-v5: put spare pages in BRK,so could avoid wasting about 4 pages.
 add check for bit USE_EXT_BOOT_PARAMS in xloadflags
-v6: use sentinel according to HPA
 add kdump load high support.
-v7: move sentinel from 0x1f0 to 0x1ef... according to HPA.
 Use HPA's #PF handler version instead of ioremap.
-v7u1: update changelog and comments, so it could break KGDB...

H. Peter Anvin (1):
  x86, 64bit: early #PF handler set page table

Yinghai Lu (30):
  x86, mm: Fix page table early allocation offset checking
  x86, 64bit, mm: make pgd next calculation consistent with pud/pmd
  x86, realmode: set real_mode permissions early
  x86, 64bit, mm: add generic kernel/ident mapping helper
  x86, 64bit: copy zero-page early
  x86, 64bit, realmode: use init_level4_pgt to set trampoline_pgt directly
  x86, realmode: Separate real_mode reserve and setup
  x86, 64bit: #PF handler set page to cover 2M only
  x86, 64bit: Don't set max_pfn_mapped wrong value early on native path
  x86: Merge early_reserve_initrd for 32bit and 64bit
  x86: add get_ramdisk_image/size()
  x86, boot: add get_cmd_line_ptr()
  x86, boot: move checking of cmd_line_ptr out of common path
  x86, boot: pass cmd_line_ptr with unsigned long instead
  x86, boot: move verify_cpu.S and no_longmode down
  x86, boot: Move lldt/ltr out of 64bit code section
  x86, kexec: remove 1024G limitation for kexec buffer on 64bit
  x86, kexec: set ident mapping for kernel that is above max_pfn
  x86, kexec: replace ident_mapping_init and init_level4_page
  x86, kexec: only set ident mapping for ram.
  x86, boot: add fields to support load bzImage and ramdisk above 4G
  x86, boot: update comments about entries for 64bit image
  x86, boot: Not need to check setup_header version for setup_data
  memblock: add memblock_mem_size()
  x86: Don't enable swiotlb if there is not enough ram for it
  x86, kdump: remove crashkernel range find limit for 64bit
  x86: add Crash kernel low reservation
  x86: Merge early kernel reserve for 32bit and 64bit
  x86, 64bit, mm: Mark data/bss/brk to nx
  x86, 64bit, mm: hibernate use generic mapping_init

 Documentation/kernel-parameters.txt |3 +
 Documentation/x86/boot.txt  |   53 +++-
 Documentation/x86/zero-page.txt |4 +
 arch/x86/boot/boot.h|   18 ++-
 arch/x86/boot/cmdline.c |   12 +-
 arch/x86/boot/compressed/cmdline.c  |   12 +-
 arch/x86/boot/compressed/head_64.S  |   48 ---
 arch/x86/boot/compressed/misc.c |   12 ++
 arch/x86/boot/header.S  |   12 +-
 arch/x86/boot/setup.ld  |7 ++
 arch/x86/include/asm/init.h |   12 ++
 arch/x86/include/asm/kexec.h|6 +-
 arch/x86/include/asm/page.h |4 +
 arch/x86/include/asm/pgtable_64_types.h |4 +
 arch/x86/include/asm/processor.h|1 +
 arch/x86/include/asm/realmode.h |3 +-
 arch/x86/include/uapi/asm/bootparam.h   |   13 +-
 arch/x86/kernel/head32.c|   20 ---
 arch/x86/kernel/head64.c|  131 ++-
 arch/x86/kernel/head_64.S   |  210 +++
 arch/x86/kernel/machine_kexec_64.c  |  171 -
 arch/x86/kernel/pci-swiotlb.c   |   14 ++-
 arch/x86/kernel/setup.c |  128 ++-
 arch/x86/kernel/traps.c |9 ++
 arch/x86/mm/init.c  |   11 +-
 arch/x86/mm/init_64.c   |  109 ++--
 arch/x86/power/hibernate_64.c   |   66 --
 arch/x86/realmode/init.c|   42 ---
 include/linux/kexec.h 

[PATCH v7u1 01/31] x86, mm: Fix page table early allocation offset checking

2013-01-03 Thread Yinghai Lu
While debugging loading the kernel above 4G, found that one page in BRK
is not used by early page allocation.

pgt_buf_top is an address that cannot be used, so we should check whether
the new end is above that top; otherwise the last page will not be used.

Fix that check and also add a printout for every allocation from BRK.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6f85de8..c4293cf 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -47,7 +47,7 @@ __ref void *alloc_low_pages(unsigned int num)
__GFP_ZERO, order);
}
 
-   if ((pgt_buf_end + num) >= pgt_buf_top) {
+   if ((pgt_buf_end + num) > pgt_buf_top) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
panic("alloc_low_page: ran out of memory");
@@ -61,6 +61,8 @@ __ref void *alloc_low_pages(unsigned int num)
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
+   printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+   pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
 
for (i = 0; i < num; i++) {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: [PATCH] Bluetooth: fix the oops due to conn->hcon == NULL in shutdown case

2013-01-03 Thread Liu, Chuansheng


> -Original Message-
> From: Gustavo Padovan [mailto:gust...@padovan.org]
> Sent: Friday, January 04, 2013 6:03 AM
> To: Liu, Chuansheng
> Cc: mar...@holtmann.org; johan.hedb...@gmail.com;
> linux-blueto...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] Bluetooth: fix the oops due to conn->hcon == NULL in
> shutdown case
> 
> Hi Chuansheng,
> 
> * Chuansheng Liu  [2012-12-25 18:04:17 +0800]:
> 
> >
> > Meet one panic issue as below stack:


> > Disassemble the code:
> > base address of __sco_sock_close is 0xc184f410
> > 0xc184f4f8 <+232>:   lock decl 0x8(%ebx) < == crash here, ebx is 0x0,
> >
> > the related source code is:
> > (gdb) l *0xc184f4f8
> > 0xc184f4f8 is in __sco_sock_close (arch/x86/include/asm/atomic.h:123)
> > 119 static inline int atomic_dec_and_test(atomic_t *v)
> > 123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
> >
> > The whole call stack is:
> > sys_shutdown()
> >   sco_sock_shutdown()
> > __sco_sock_close()
> >   hci_conn_put()
> > atomic_dec_and_test()
> >
> > Due to the conn->hcon is NULL, and the member hcon->refcnt is at offset 0x8,
> > so "BUG: unable to handle kernel NULL pointer dereference at 0008"
> > appears.
Could you add the above crash info to indicate where it crashed? Thanks.

> >
> > Here fix it that adding the condition if conn->hcon is NULL, just like
> > in sco_chan_del().
> >
> > Signed-off-by: liu chuansheng 
> > ---
> >  net/bluetooth/sco.c |6 --
> >  1 files changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
> > index 531a93d..190f70c 100644
> > --- a/net/bluetooth/sco.c
> > +++ b/net/bluetooth/sco.c
> > @@ -355,8 +355,10 @@ static void __sco_sock_close(struct sock *sk)
> > if (sco_pi(sk)->conn) {
> > sk->sk_state = BT_DISCONN;
> > sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
> > -   hci_conn_put(sco_pi(sk)->conn->hcon);
> > -   sco_pi(sk)->conn->hcon = NULL;
> > +   if (sco_pi(sk)->conn->hcon) {
> > +   hci_conn_put(sco_pi(sk)->conn->hcon);
> > +   sco_pi(sk)->conn->hcon = NULL;
> > +   }
> > } else
> > sco_chan_del(sk, ECONNRESET);
> > break;
> 
> Please check if the following patch fixes the issue for you:
> 
> commit ae5668c1fc155d3034d0eedcdb52798390975a39 (HEAD, master)
> Author: Gustavo Padovan 
> Date:   Thu Jan 3 19:59:28 2013 -0200
> 
> Bluetooth: Check if the hci connection exists in SCO shutdown
> 
> Checking only for sco_conn seems to not be enough and lead to NULL
> dereferences in the code, check for hcon instead.
> 
> <1>[11340.226404] BUG: unable to handle kernel NULL pointer
> dereference at
> 000
> 8
> <4>[11340.226619] EIP is at __sco_sock_close+0xe8/0x1a0
> <4>[11340.226629] EAX: f063a740 EBX:  ECX: f58f4544 EDX:
> 
> <4>[11340.226640] ESI: dec83e00 EDI: 5f9a081f EBP: e0fdff38 ESP:
> e0fdff1c
> <0>[11340.226674] Stack:
> <4>[11340.226682]  c184db87 c1251028 dec83e00 e0fdff38 c1754aef
> dec83e00
> 
> e0fdff5c
> <4>[11340.226718]  c184f587 e0fdff64 e0fdff68 5f9a081f e0fdff5c
> c1751852
> d7813800
> 62262f10
> <4>[11340.226752]  e0fdff70 c1753c00  0001 000d
> e0fdffac
> c175425c
> 0041
> <0>[11340.226793] Call Trace:
> <4>[11340.226813]  [] ? sco_sock_clear_timer+0x27/0x60
> <4>[11340.226831]  [] ? local_bh_enable+0x68/0xd0
> <4>[11340.226846]  [] ? lock_sock_nested+0x4f/0x60
> <4>[11340.226862]  [] sco_sock_shutdown+0x67/0xb0
> <4>[11340.226879]  [] ? sockfd_lookup_light+0x22/0x80
> <4>[11340.226897]  [] sys_shutdown+0x30/0x60
> <4>[11340.226912]  [] sys_socketcall+0x1dc/0x2a0
> <4>[11340.226929]  [] ? trace_hardirqs_on_thunk+0xc/0x10
> <4>[11340.226944]  [] syscall_call+0x7/0xb
> <4>[11340.226960]  [] ? restore_cur+0x5e/0xd7
> <0>[11340.226969] Code:  ff 4b 08 0f 94 c0 84 c0 74 20 80 7b 19 01 74
> 2f b8 0a 00 00
> 
> Reported-by: Chuansheng Liu 
> Signed-off-by: Gustavo Padovan 
> 
> diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
> index 531a93d..57f250c 100644
> --- a/net/bluetooth/sco.c
> +++ b/net/bluetooth/sco.c
> @@ -352,7 +352,7 @@ static void __sco_sock_close(struct sock *sk)
> 
>   case BT_CONNECTED:
>   case BT_CONFIG:
> - if (sco_pi(sk)->conn) {
> + if (sco_pi(sk)->conn->hcon) {
Your fix is incomplete, at least it should be:
if ( (sco_pi(sk)->conn) && (sco_pi(sk)->conn->hcon)) {
Otherwise, it will introduce another crash case. So could you also add my
Signed-off-by?
It is not easy to reproduce, though. Thanks.
Signed-off-by: liu chuansheng 

>   sk->sk_state = BT_DISCONN;
>   sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
>

[PATCH v7u1 22/31] x86, boot: add fields to support load bzImage and ramdisk above 4G

2013-01-03 Thread Yinghai Lu
ext_ramdisk_image/size will record the high 32 bits of the ramdisk info.

xloadflags bit0 will be set if relocatable with 64bit.

Let get_ramdisk_image/size use ext_ramdisk_image/size to get the
right position of the ramdisk.

The bootloader will fill in ext_ramdisk_image/size when it loads the
ramdisk above 4G.

The bootloader will also check whether xloadflags bit0 is set to decide
if it can load the ramdisk high above 4G.

sentinel is used to make sure the kernel has valid ext_* values set.

Update header version to 2.12.

-v2: add ext_cmd_line_ptr for above 4G support.
-v3: update to xloadflags from HPA.
-v4: use fields from bootparam instead setup_header according to HPA.
-v5: add checking for USE_EXT_BOOT_PARAMS
-v6: use sentinel to check if ext_* are valid suggested by HPA.
 HPA said:
1. add a field in the uninitialized portion, call it "sentinel";
2. make sure the byte position corresponding to the "sentinel" field is
   nonzero in the bzImage file;
3. if the kernel boots up and sentinel is nonzero, erase those fields
   that you identified as uninitialized;
-v7: change to 0x1ef instead of 0x1f0, HPA said:
it is quite plausible that someone may (fairly sanely) start the
copy range at 0x1f0 instead of 0x1f1

Signed-off-by: Yinghai Lu 
Cc: Rob Landley 
Cc: Matt Fleming 
Cc: Gokul Caushik 
Cc: Josh Triplett 
Cc: Joe Millenbach 
---
 Documentation/x86/boot.txt|   15 ++-
 Documentation/x86/zero-page.txt   |4 
 arch/x86/boot/compressed/cmdline.c|2 ++
 arch/x86/boot/compressed/misc.c   |   12 
 arch/x86/boot/header.S|   12 ++--
 arch/x86/boot/setup.ld|7 +++
 arch/x86/include/uapi/asm/bootparam.h |   13 ++---
 arch/x86/kernel/head64.c  |2 ++
 arch/x86/kernel/setup.c   |4 
 9 files changed, 65 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 406d82d..18ca9fb 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -57,6 +57,9 @@ Protocol 2.10:(Kernel 2.6.31) Added a protocol for 
relaxed alignment
 Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover
protocol entry point.
 
+Protocol 2.12: (Kernel 3.9) Added three fields for loading bzImage and
+ramdisk above 4G with 64bit in bootparam.
+
  MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -182,7 +185,7 @@ Offset  Proto   NameMeaning
 0230/4 2.05+   kernel_alignment Physical addr alignment required for kernel
 0234/1 2.05+   relocatable_kernel Whether kernel is relocatable or not
 0235/1 2.10+   min_alignment   Minimum alignment, as a power of two
-0236/2 N/A pad3Unused
+0236/2 2.12+   xloadflags  Boot protocol option flags
 0238/4 2.06+   cmdline_sizeMaximum size of the kernel command line
 023C/4 2.07+   hardware_subarch Hardware subarchitecture
 0240/8 2.07+   hardware_subarch_data Subarchitecture-specific data
@@ -582,6 +585,16 @@ Protocol:  2.10+
   misaligned kernel.  Therefore, a loader should typically try each
   power-of-two alignment from kernel_alignment down to this alignment.
 
+Field name: xloadflags
+Type:   modify (obligatory)
+Offset/size:0x236/2
+Protocol:   2.12+
+
+  This field is a bitmask.
+
+  Bit 0 (read): CAN_BE_LOADED_ABOVE_4G
+- If 1, kernel/boot_params/cmdline/ramdisk can be above 4g,
+
 Field name:cmdline_size
 Type:  read
 Offset/size:   0x238/4
diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index cf5437d..1140e59 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -19,6 +19,9 @@ OffsetProto   NameMeaning
 090/010ALL hd1_infohd1 disk parameter, OBSOLETE!!
 0A0/010ALL sys_desc_table  System description table (struct 
sys_desc_table)
 0B0/010ALL olpc_ofw_header OLPC's OpenFirmware CIF and friends
+0C0/004ALL ext_ramdisk_image ramdisk_image high 32bits
+0C4/004ALL ext_ramdisk_size  ramdisk_size high 32bits
+0C8/004ALL ext_cmd_line_ptr  cmd_line_ptr high 32bits
 140/080ALL edid_info   Video mode setup (struct edid_info)
 1C0/020ALL efi_infoEFI 32 information (struct efi_info)
 1E0/004ALL alk_mem_k   Alternative mem check, in KB
@@ -27,6 +30,7 @@ OffsetProto   NameMeaning
 1E9/001ALL eddbuf_entries  Number of entries in eddbuf (below)
 1EA/001ALL edd_mbr_sig_buf_entries Number of entries in 
edd_mbr_sig_buffer
(below)
+1EF/001ALL sentinel0: states _ext_* fields are valid
 290/040ALL edd_mbr_sig_buffer EDD MBR signatures
 2D0/A00ALL e820_mapE820 memory map table
   

[PATCH v7u1 07/31] x86, realmode: Separate real_mode reserve and setup

2013-01-03 Thread Yinghai Lu
After we switch to using the #PF handler to help set up the page table,
init_level4_pgt will only have entries set after init_mem_mapping.
We need to move the copying of init_level4_pgt to trampoline_pgd after that.

So split reserve and setup, and move the setup after init_mem_mapping().

Signed-off-by: Yinghai Lu 
Cc: Jarkko Sakkinen 
---
 arch/x86/include/asm/realmode.h |3 ++-
 arch/x86/kernel/setup.c |4 +++-
 arch/x86/realmode/init.c|   30 +++---
 3 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index fe1ec5b..9c6b890 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,7 @@ extern unsigned char boot_gdt[];
 extern unsigned char secondary_startup_64[];
 #endif
 
-extern void __init setup_real_mode(void);
+void reserve_real_mode(void);
+void setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 81ea5a5..01b22d0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -913,10 +913,12 @@ void __init setup_arch(char **cmdline_p)
printk(KERN_DEBUG "initial memory mapped: [mem 0x-%#010lx]\n",
(max_pfn_mapped

[PATCH v7u1 09/31] x86, 64bit: #PF handler set page to cover 2M only

2013-01-03 Thread Yinghai Lu
Now the #PF handler could map 1G per #PF. That causes the same problem
that was fixed by
x86, mm: Only direct map addresses that are marked as E820_RAM

So only add one 2M mapping dynamically per #PF, instead of 1G at a time.

Signed-off-by: Yinghai Lu 
Cc: Alexander Duyck 
---
 arch/x86/kernel/head64.c |   42 +-
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 25591f9..a3fc233 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -52,15 +52,15 @@ int __init early_make_pgtable(unsigned long address)
unsigned long physaddr = address - __PAGE_OFFSET;
unsigned long i;
pgdval_t pgd, *pgd_p;
-   pudval_t *pud_p;
+   pudval_t pud, *pud_p;
pmdval_t pmd, *pmd_p;
 
/* Invalid address or early pgt is done ?  */
if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
return -1;
 
-   i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
-   pgd_p = _level4_pgt[i].pgd;
+again:
+   pgd_p = _level4_pgt[pgd_index(address)].pgd;
pgd = *pgd_p;
 
/*
@@ -68,29 +68,37 @@ int __init early_make_pgtable(unsigned long address)
 * critical -- __PAGE_OFFSET would point us back into the dynamic
 * range and we might end up looping forever...
 */
-   if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+   if (pgd)
pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map 
- phys_base);
-   } else {
-   if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+   else {
+   if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
reset_early_page_tables();
+   goto again;
+   }
 
pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
for (i = 0; i < PTRS_PER_PUD; i++)
pud_p[i] = 0;
-
*pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + 
_KERNPG_TABLE;
}
-   i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-   pud_p += i;
-
-   pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
-   pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
-   for (i = 0; i < PTRS_PER_PMD; i++) {
-   pmd_p[i] = pmd;
-   pmd += PMD_SIZE;
-   }
+   pud_p += pud_index(address);
+   pud = *pud_p;
 
-   *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + 
_KERNPG_TABLE;
+   if (pud)
+   pmd_p = (pmdval_t *)((pud & PTE_PFN_MASK) + __START_KERNEL_map 
- phys_base);
+   else {
+   if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
+   reset_early_page_tables();
+   goto again;
+   }
+
+   pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+   for (i = 0; i < PTRS_PER_PMD; i++)
+   pmd_p[i] = 0;
+   *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + 
_KERNPG_TABLE;
+   }
+   pmd = (physaddr & PMD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+   pmd_p[pmd_index(address)] = pmd;
 
return 0;
 }
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 05/31] x86, 64bit: copy zero-page early

2013-01-03 Thread Yinghai Lu
real_mode_data, aka zero-page, could be above 4G.
We will have the #PF handler to set up the page table for RAM that is not
accessible early, but we could restrict it to before
x86_64_start_reservations to limit the change to the native path.

Also, we will need the ramdisk info in zero-page to access the microcode
blob in the ramdisk in x86_64_start_kernel, so copying zero-page early
makes accessing the ramdisk info simple.

Signed-off-by: Yinghai Lu 
Cc: Alexander Duyck 
Cc: Fenghua Yu 
---
 arch/x86/kernel/head64.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7b215a5..c0a25e0 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -87,6 +87,8 @@ void __init x86_64_start_kernel(char * real_mode_data)
}
load_idt((const struct desc_ptr *)_descr);
 
+   copy_bootdata(__va(real_mode_data));
+
if (console_loglevel == 10)
early_printk("Kernel alive\n");
 
@@ -95,7 +97,9 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 void __init x86_64_start_reservations(char *real_mode_data)
 {
-   copy_bootdata(__va(real_mode_data));
+   /* version is always not zero if it is copied */
+   if (!boot_params.hdr.version)
+   copy_bootdata(__va(real_mode_data));
 
memblock_reserve(__pa_symbol(_text),
 (unsigned long)__bss_stop - (unsigned long)_text);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 11/31] x86: Merge early_reserve_initrd for 32bit and 64bit

2013-01-03 Thread Yinghai Lu
They are the same, so we can move them out of head32/64.c into setup.c.

We are using memblock, and it could handle overlapping properly, so
we don't need to reserve some at first to hold the location, and just
need to make sure we reserve them before we are using memblock to find
free mem to use.

Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/kernel/head32.c |   11 ---
 arch/x86/kernel/head64.c |   11 ---
 arch/x86/kernel/setup.c  |   22 ++
 3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index e175548..b071d41 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,17 +33,6 @@ void __init i386_start_kernel(void)
memblock_reserve(__pa_symbol(_text),
 (unsigned long)__bss_stop - (unsigned long)_text);
 
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Reserve INITRD */
-   if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-   /* Assume only end is not page aligned */
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-   u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-   }
-#endif
-
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_MRST:
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7061d8b..c463725 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -176,17 +176,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
memblock_reserve(__pa_symbol(_text),
 (unsigned long)__bss_stop - (unsigned long)_text);
 
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Reserve INITRD */
-   if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-   /* Assume only end is not page aligned */
-   unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-   unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-   unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + 
ramdisk_size);
-   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-   }
-#endif
-
reserve_ebda_region();
 
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 04797e78..1b8a8cc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -360,6 +360,19 @@ static u64 __init get_mem_size(unsigned long limit_pfn)
 
return mapped_pages << PAGE_SHIFT;
 }
+static void __init early_reserve_initrd(void)
+{
+   /* Assume only end is not page aligned */
+   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+   if (!boot_params.hdr.type_of_loader ||
+   !ramdisk_image || !ramdisk_size)
+   return; /* No initrd provided by bootloader */
+
+   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
 static void __init reserve_initrd(void)
 {
/* Assume only end is not page aligned */
@@ -386,10 +399,6 @@ static void __init reserve_initrd(void)
if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
PFN_DOWN(ramdisk_end))) {
/* All are mapped, easy case */
-   /*
-* don't need to reserve again, already reserved early
-* in i386_start_kernel
-*/
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -400,6 +409,9 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 #else
+static void __init early_reserve_initrd(void)
+{
+}
 static void __init reserve_initrd(void)
 {
 }
@@ -661,6 +673,8 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
+   early_reserve_initrd();
+
 #ifdef CONFIG_X86_32
memcpy(_cpu_data, _cpu_data, sizeof(new_cpu_data));
visws_early_detect();
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 10/31] x86, 64bit: Don't set max_pfn_mapped wrong value early on native path

2013-01-03 Thread Yinghai Lu
max_pfn_mapped is not set correctly until init_memory_mapping,

so don't print its initial value for 64bit.

Also need to use KERNEL_IMAGE_SIZE directly for highmap cleanup.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/head64.c |3 ---
 arch/x86/kernel/setup.c  |2 ++
 arch/x86/mm/init_64.c|6 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index a3fc233..7061d8b 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -146,9 +146,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
 
-   /* XXX - this is wrong... we need to build page tables from scratch */
-   max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, _idt_handlers[i]);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63160c6..04797e78 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -910,8 +910,10 @@ void __init setup_arch(char **cmdline_p)
setup_bios_corruption_check();
 #endif
 
+#ifdef CONFIG_X86_32
printk(KERN_DEBUG "initial memory mapped: [mem 0x-%#010lx]\n",
(max_pfn_mapped

[PATCH v7u1 31/31] x86, 64bit, mm: hibernate use generic mapping_init

2013-01-03 Thread Yinghai Lu
Make it map only the ranges in the pfn_mapped array.

It also has the kernel mapping with EXEC.

Signed-off-by: Yinghai Lu 
Cc: Pavel Machek 
Cc: Rafael J. Wysocki 
Cc: linux...@vger.kernel.org
---
 arch/x86/power/hibernate_64.c |   66 ++---
 1 file changed, 22 insertions(+), 44 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 460f314..a0fde91 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 #include 
+
+#include 
 #include 
 #include 
 #include 
@@ -39,41 +41,21 @@ pgd_t *temp_level4_pgt;
 
 void *relocated_restore_code;
 
-static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long 
end)
+static void *alloc_pgt_page(void *context)
 {
-   long i, j;
-
-   i = pud_index(address);
-   pud = pud + i;
-   for (; i < PTRS_PER_PUD; pud++, i++) {
-   unsigned long paddr;
-   pmd_t *pmd;
-
-   paddr = address + i*PUD_SIZE;
-   if (paddr >= end)
-   break;
-
-   pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
-   if (!pmd)
-   return -ENOMEM;
-   set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
-   for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-   unsigned long pe;
-
-   if (paddr >= end)
-   break;
-   pe = __PAGE_KERNEL_LARGE_EXEC | paddr;
-   pe &= __supported_pte_mask;
-   set_pmd(pmd, __pmd(pe));
-   }
-   }
-   return 0;
+   return (void *)get_safe_page(GFP_ATOMIC);
 }
 
 static int set_up_temporary_mappings(void)
 {
-   unsigned long start, end, next;
-   int error;
+   struct x86_mapping_info info = {
+   .alloc_pgt_page = alloc_pgt_page,
+   .pmd_flag   = __PAGE_KERNEL_LARGE_EXEC,
+   .kernel_mapping = true,
+   };
+   unsigned long mstart, mend;
+   int result;
+   int i;
 
temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
if (!temp_level4_pgt)
@@ -84,21 +66,17 @@ static int set_up_temporary_mappings(void)
init_level4_pgt[pgd_index(__START_KERNEL_map)]);
 
/* Set up the direct mapping from scratch */
-   start = (unsigned long)pfn_to_kaddr(0);
-   end = (unsigned long)pfn_to_kaddr(max_pfn);
-
-   for (; start < end; start = next) {
-   pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
-   if (!pud)
-   return -ENOMEM;
-   next = start + PGDIR_SIZE;
-   if (next > end)
-   next = end;
-   if ((error = res_phys_pud_init(pud, __pa(start), __pa(next
-   return error;
-   set_pgd(temp_level4_pgt + pgd_index(start),
-   mk_kernel_pgd(__pa(pud)));
+   for (i = 0; i < nr_pfn_mapped; i++) {
+   mstart = pfn_mapped[i].start << PAGE_SHIFT;
+   mend   = pfn_mapped[i].end << PAGE_SHIFT;
+
+   result = kernel_ident_mapping_init(, temp_level4_pgt,
+  mstart, mend);
+
+   if (result)
+   return result;
}
+
return 0;
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 16/31] x86, boot: move verify_cpu.S and no_longmode down

2013-01-03 Thread Yinghai Lu
We need to move some code to 32bit section in following patch:

   x86, boot: Move lldt/ltr out of 64bit code section

but that will push startup_64 down from 0x200.

According to hpa, we cannot change the position of startup_64 because
it is part of the ABI.

Instead, we can move verify_cpu and no_longmode down: verify_cpu is
reached via a function call and no_longmode never returns, so no extra
code is needed to jump back.

Signed-off-by: Yinghai Lu 
Cc: Matt Fleming 
---
 arch/x86/boot/compressed/head_64.S |   17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 2c4b171..fb984c0 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -176,14 +176,6 @@ ENTRY(startup_32)
lret
 ENDPROC(startup_32)
 
-no_longmode:
-   /* This isn't an x86-64 CPU so hang */
-1:
-   hlt
-   jmp 1b
-
-#include "../../kernel/verify_cpu.S"
-
/*
 * Be careful here startup_64 needs to be at a predictable
 * address so I can export it in an ELF header.  Bootloaders
@@ -349,6 +341,15 @@ relocated:
  */
jmp *%rbp
 
+   .code32
+no_longmode:
+   /* This isn't an x86-64 CPU so hang */
+1:
+   hlt
+   jmp 1b
+
+#include "../../kernel/verify_cpu.S"
+
.data
 gdt:
.word   gdt_end - gdt
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] sunrpc: Fix lockd sleeping until timeout

2013-01-03 Thread J. Bruce Fields
On Wed, Dec 26, 2012 at 05:09:07PM +0200, Andriy Skulysh wrote:
> There is a race in enqueueing thread to a pool and
> waking up a thread.
> lockd doesn't wake up on reception of lock granted callback
> if svc_wake_up() is called before lockd's thread is added
> to a pool.
> 
> Signed-off-by: Andriy Skulysh 
> ---
>  include/linux/sunrpc/svc.h |1 +
>  net/sunrpc/svc_xprt.c  |9 -
>  2 files changed, 9 insertions(+), 1 deletions(-)
> 
> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
> index 676ddf5..1f0216b 100644
> --- a/include/linux/sunrpc/svc.h
> +++ b/include/linux/sunrpc/svc.h
> @@ -50,6 +50,7 @@ struct svc_pool {
>   unsigned intsp_nrthreads;   /* # of threads in pool */
>   struct list_headsp_all_threads; /* all server threads */
>   struct svc_pool_stats   sp_stats;   /* statistics on pool operation 
> */
> + int sp_task_pending;/* has pending task */
>  } cacheline_aligned_in_smp;
> 
>  /*
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index b8e47fa..c7ab6f5 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -499,7 +499,8 @@ void svc_wake_up(struct svc_serv *serv)
>   rqstp->rq_xprt = NULL;
>*/
>   wake_up(>rq_wait);
> - }
> + } else
> + pool->sp_task_pending = 1;
>   spin_unlock_bh(>sp_lock);
>   }
>  }
> @@ -634,7 +635,13 @@ struct svc_xprt *svc_get_next_xprt(struct
> svc_rqst *rqstp, long timeout)
>* long for cache updates.
>*/
>   rqstp->rq_chandle.thread_wait = 1*HZ;
> + pool->sp_task_pending = 0;
>   } else {
> + if (pool->sp_task_pending) {
> + pool->sp_task_pending = 0;
> + spin_unlock_bh(>sp_lock);
> + return -EAGAIN;

That should be ERR_PTR(-EAGAIN).

Other than this, this looks right to me.

Out of curiosity: how did you run across this problem, and how did you
test the fix?

--b.

> + }
>   /* No data pending. Go to sleep */
>   svc_thread_enqueue(pool, rqstp);
> 
> -- 
> 1.7.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 27/31] x86, kdump: remove crashkernel range find limit for 64bit

2013-01-03 Thread Yinghai Lu
Now that the kexeced kernel/ramdisk can be above 4G, remove the 896 MiB
limit for 64bit.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c58497e..6adbc45 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -501,13 +501,11 @@ static void __init 
memblock_x86_reserve_range_setup_data(void)
 /*
  * Keep the crash kernel below this limit.  On 32 bits earlier kernels
  * would limit the kernel to the low 512 MiB due to mapping restrictions.
- * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
- * limit once kexec-tools are fixed.
  */
 #ifdef CONFIG_X86_32
 # define CRASH_KERNEL_ADDR_MAX (512 << 20)
 #else
-# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+# define CRASH_KERNEL_ADDR_MAX MAXMEM
 #endif
 
 static void __init reserve_crashkernel(void)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 19/31] x86, kexec: set ident mapping for kernel that is above max_pfn

2013-01-03 Thread Yinghai Lu
When the first kernel is booted with memmap= or mem= to limit max_pfn,
kexec can load the second kernel above that max_pfn.

In this case we need to set up the ident mapping for the whole image,
not just for the first 2M.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/machine_kexec_64.c |   43 +++-
 1 file changed, 37 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index b3ea9db..be14ee1 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -56,6 +56,25 @@ out:
return result;
 }
 
+static int ident_mapping_init(struct kimage *image, pgd_t *level4p,
+   unsigned long mstart, unsigned long mend)
+{
+   int result;
+
+   mstart = round_down(mstart, PMD_SIZE);
+   mend   = round_up(mend - 1, PMD_SIZE);
+
+   while (mstart < mend) {
+   result = init_one_level2_page(image, level4p, mstart);
+   if (result)
+   return result;
+
+   mstart += PMD_SIZE;
+   }
+
+   return 0;
+}
+
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
unsigned long end_addr;
@@ -184,22 +203,34 @@ err:
return result;
 }
 
-
 static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
+   unsigned long mstart, mend;
pgd_t *level4p;
int result;
+   int i;
+
level4p = (pgd_t *)__va(start_pgtable);
result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
if (result)
return result;
+
/*
-* image->start may be outside 0 ~ max_pfn, for example when
-* jump back to original kernel from kexeced kernel
+* segments's mem ranges could be outside 0 ~ max_pfn,
+* for example when jump back to original kernel from kexeced kernel.
+* or first kernel is booted with user mem map, and second kernel
+* could be loaded out of that range.
 */
-   result = init_one_level2_page(image, level4p, image->start);
-   if (result)
-   return result;
+   for (i = 0; i < image->nr_segments; i++) {
+   mstart = image->segment[i].mem;
+   mend   = mstart + image->segment[i].memsz;
+
+   result = ident_mapping_init(image, level4p, mstart, mend);
+
+   if (result)
+   return result;
+   }
+
return init_transition_pgtable(image, level4p);
 }
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 18/31] x86, kexec: remove 1024G limitation for kexec buffer on 64bit

2013-01-03 Thread Yinghai Lu
Now that the 64bit kernel supports more than 1T of RAM and kexec-tools
can find a buffer above 1T, remove that obsolete limitation and use
MAXMEM instead.

Tested on system with more than 1024G ram.

Signed-off-by: Yinghai Lu 
Cc: Eric W. Biederman 
---
 arch/x86/include/asm/kexec.h |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6080d26..17483a4 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
 # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
 #else
 /* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT  (0xFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT  (MAXMEM-1)
 /* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
 /* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
 
 /* Allocate one page for the pdp and the second for the code */
 # define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 14/31] x86, boot: move checking of cmd_line_ptr out of common path

2013-01-03 Thread Yinghai Lu
cmdline.c::__cmdline_find_option... are shared between 16-bit setup code
and 32/64 bit decompressor code.

For the 32/64-bit-only path via kexec, we should not check whether the
pointer is below 1M, as the cmdline could be placed above 1M, or even
above 4G.

Move the accessibility check out of __cmdline_find_option() so that the
decompressor in misc.c can parse the cmdline correctly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/boot.h|   14 --
 arch/x86/boot/cmdline.c |8 
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5..7fadf80 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -289,12 +289,22 @@ int __cmdline_find_option(u32 cmdline_ptr, const char 
*option, char *buffer, int
 int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
 static inline int cmdline_find_option(const char *option, char *buffer, int 
bufsize)
 {
-   return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, 
buffer, bufsize);
+   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+   if (cmd_line_ptr >= 0x10)
+   return -1;  /* inaccessible */
+
+   return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
 }
 
 static inline int cmdline_find_option_bool(const char *option)
 {
-   return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+   if (cmd_line_ptr >= 0x10)
+   return -1;  /* inaccessible */
+
+   return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
 
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f7..768f00f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char 
*option, char *buffer, int
st_bufcpy   /* Copying this to buffer */
} state = st_wordstart;
 
-   if (!cmdline_ptr || cmdline_ptr >= 0x10)
-   return -1;  /* No command line, or inaccessible */
+   if (!cmdline_ptr)
+   return -1;  /* No command line */
 
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char 
*option)
st_wordskip,/* Miscompare, skip */
} state = st_wordstart;
 
-   if (!cmdline_ptr || cmdline_ptr >= 0x10)
-   return -1;  /* No command line, or inaccessible */
+   if (!cmdline_ptr)
+   return -1;  /* No command line */
 
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 30/31] x86, 64bit, mm: Mark data/bss/brk to nx

2013-01-03 Thread Yinghai Lu
HPA said we should not have RW and +x set at the same time.

for kernel layout:
[0.00] Kernel Layout:
[0.00]   .text: [0x0100-0x021434f8]
[0.00] .rodata: [0x0220-0x02a13fff]
[0.00]   .data: [0x02c0-0x02dc763f]
[0.00]   .init: [0x02dc9000-0x0312cfff]
[0.00].bss: [0x0313b000-0x03dd6fff]
[0.00].brk: [0x03dd7000-0x03df]

before the patch, we have
---[ High Kernel Mapping ]---
0x8000-0x8100  16M   pmd
0x8100-0x8220  18M ro PSE GLB x  pmd
0x8220-0x82c0  10M ro PSE GLB NX pmd
0x82c0-0x82dc90001828K RW GLB x  pte
0x82dc9000-0x82e0 220K RW GLB NX pte
0x82e0-0x8300   2M RW PSE GLB NX pmd
0x8300-0x8313a0001256K RW GLB NX pte
0x8313a000-0x8320 792K RW GLB x  pte
0x8320-0x83e0  12M RW PSE GLB x  pmd
0x83e0-0xa000 450M   pmd

after the patch, we get
---[ High Kernel Mapping ]---
0x8000-0x8100  16M   pmd
0x8100-0x8220  18M ro PSE GLB x  pmd
0x8220-0x82c0  10M ro PSE GLB NX pmd
0x82c0-0x82e0   2M RW GLB NX pte
0x82e0-0x8300   2M RW PSE GLB NX pmd
0x8300-0x8320   2M RW GLB NX pte
0x8320-0x83e0  12M RW PSE GLB NX pmd
0x83e0-0xa000 450M   pmd

so data, bss, brk get NX ...

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init_64.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 98385a2..9653411 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -820,6 +820,7 @@ void mark_rodata_ro(void)
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
unsigned long text_end = PFN_ALIGN(&__stop___ex_table);
unsigned long rodata_end = PFN_ALIGN(&__end_rodata);
+   unsigned long all_end = PFN_ALIGN(&_end);
 
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
   (end - start) >> 10);
@@ -828,10 +829,10 @@ void mark_rodata_ro(void)
kernel_set_to_readonly = 1;
 
/*
-* The rodata section (but not the kernel text!) should also be
-* not-executable.
+* The rodata/data/bss/brk section (but not the kernel text!)
+* should also be not-executable.
 */
-   set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);
+   set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
 
rodata_test();
 
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-03 Thread Yinghai Lu
Normal boot path on a system with iommu support:
the swiotlb buffer is allocated early, and then the kernel tries to
initialize the iommu. If the intel or amd iommu is set up properly, the
swiotlb buffer is freed.

The early allocation uses bootmem, and it panics when we try to use
kdump with a buffer only above 4G while swiotlb is enabled, even though
the kernel could actually go on without swiotlb and use the intel iommu.

Try to disable swiotlb if there is not enough RAM for it.

This allows kdump to use a kernel above 4G.

Suggested-by: Eric W. Biederman 
Signed-off-by: Yinghai Lu 
Cc: Konrad Rzeszutek Wilk 
Cc: Joerg Roedel 
---
 arch/x86/kernel/pci-swiotlb.c |   14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba..949ebfe 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -50,6 +51,11 @@ static struct dma_map_ops swiotlb_dma_ops = {
.dma_supported = NULL,
 };
 
+static bool __init enough_mem_for_swiotlb(void)
+{
+   /* do we have less than 1M RAM under 4G ? */
+   return memblock_mem_size(1ULL<<(32-PAGE_SHIFT)) > (1ULL<<20);
+}
 /*
  * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
  *
@@ -58,12 +64,12 @@ static struct dma_map_ops swiotlb_dma_ops = {
  */
 int __init pci_swiotlb_detect_override(void)
 {
-   int use_swiotlb = swiotlb | swiotlb_force;
-
if (swiotlb_force)
swiotlb = 1;
+   else if (!enough_mem_for_swiotlb())
+   swiotlb = 0;
 
-   return use_swiotlb;
+   return swiotlb;
 }
 IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
  pci_xen_swiotlb_detect,
@@ -78,7 +84,7 @@ int __init pci_swiotlb_detect_4gb(void)
 {
/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-   if (!no_iommu && max_pfn > MAX_DMA32_PFN)
+   if (!no_iommu && max_pfn > MAX_DMA32_PFN && enough_mem_for_swiotlb())
swiotlb = 1;
 #endif
return swiotlb;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 29/31] x86: Merge early kernel reserve for 32bit and 64bit

2013-01-03 Thread Yinghai Lu
They are the same, and we could move them out from head32/64.c to setup.c.

We are using memblock, and it could handle overlapping properly, so
we don't need to reserve some at first to hold the location, and just
need to make sure we reserve them before we are using memblock to find
free mem to use.

Signed-off-by: Yinghai Lu 
Cc: Alexander Duyck 
---
 arch/x86/kernel/head32.c |9 -
 arch/x86/kernel/head64.c |9 -
 arch/x86/kernel/setup.c  |9 +
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b071d41..17f7792 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -30,9 +30,6 @@ static void __init i386_default_early_setup(void)
 
 void __init i386_start_kernel(void)
 {
-   memblock_reserve(__pa_symbol(_text),
-(unsigned long)__bss_stop - (unsigned long)_text);
-
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_MRST:
@@ -46,11 +43,5 @@ void __init i386_start_kernel(void)
break;
}
 
-   /*
-* At this point everything still needed from the boot loader
-* or BIOS or kernel text should be early reserved or marked not
-* RAM in e820. All other memory is free game.
-*/
-
start_kernel();
 }
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index e63d29a..d9d7c75 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -184,16 +184,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
if (!boot_params.hdr.version)
copy_bootdata(__va(real_mode_data));
 
-   memblock_reserve(__pa_symbol(_text),
-(unsigned long)__bss_stop - (unsigned long)_text);
-
reserve_ebda_region();
 
-   /*
-* At this point everything still needed from the boot loader
-* or BIOS or kernel text should be early reserved or marked not
-* RAM in e820. All other memory is free game.
-*/
-
start_kernel();
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2203dd6..3117515 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -706,8 +706,17 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
+   memblock_reserve(__pa_symbol(_text),
+(unsigned long)__bss_stop - (unsigned long)_text);
+
early_reserve_initrd();
 
+   /*
+* At this point everything still needed from the boot loader
+* or BIOS or kernel text should be early reserved or marked not
+* RAM in e820. All other memory is free game.
+*/
+
 #ifdef CONFIG_X86_32
memcpy(_cpu_data, _cpu_data, sizeof(new_cpu_data));
visws_early_detect();
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v7u1 28/31] x86: add Crash kernel low reservation

2013-01-03 Thread Yinghai Lu
During the kdump kernel's boot stage, it needs to find low RAM for the
swiotlb buffer when the system does not support intel iommu/dmar
remapping.

kexec-tools appends memmap=exactmap and the range from /proc/iomem
marked "Crash kernel", and that range is above 4G for 64bit after boot
protocol 2.12.

We need to add another range in /proc/iomem like "Crash kernel low",
so kexec-tools could find that info and append to kdump kernel
command line.

Try to reserve some under 4G if the normal "Crash kernel" is above 4G.

User could specify the size with crashkernel_low=XX[KMG].

-v2: fix warning that is found by Fengguang's test robot.
-v3: move out get_mem_size change to another patch, to solve compiling
 warning that is found by Borislav Petkov 
-v4: user must specify crashkernel_low if system does not support
 intel or amd iommu.

Signed-off-by: Yinghai Lu 
Cc: Eric Biederman 
Cc: Rob Landley 
---
 Documentation/kernel-parameters.txt |3 +++
 arch/x86/kernel/setup.c |   42 +--
 include/linux/kexec.h   |3 +++
 kernel/kexec.c  |   34 +++-
 4 files changed, 75 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 363e348..da0e077 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -594,6 +594,9 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
is selected automatically. Check
Documentation/kdump/kdump.txt for further details.
 
+   crashkernel_low=size[KMG]
+   [KNL, x86] parts under 4G.
+
crashkernel=range1:size1[,range2:size2,...][@offset]
[KNL] Same as above, but depends on the memory
in the running system. The syntax of range is
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6adbc45..2203dd6 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -508,8 +508,44 @@ static void __init 
memblock_x86_reserve_range_setup_data(void)
 # define CRASH_KERNEL_ADDR_MAX MAXMEM
 #endif
 
+static void __init reserve_crashkernel_low(void)
+{
+#ifdef CONFIG_X86_64
+   const unsigned long long alignment = 16<<20;/* 16M */
+   unsigned long long low_base = 0, low_size = 0;
+   unsigned long total_low_mem;
+   unsigned long long base;
+   int ret;
+
+   total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT));
+   ret = parse_crashkernel_low(boot_command_line, total_low_mem,
+   _size, );
+   if (ret != 0 || low_size <= 0)
+   return;
+
+   low_base = memblock_find_in_range(low_size, (1ULL<<32),
+   low_size, alignment);
+
+   if (!low_base) {
+   pr_info("crashkernel low reservation failed - No suitable area 
found.\n");
+
+   return;
+   }
+
+   memblock_reserve(low_base, low_size);
+   pr_info("Reserving %ldMB of low memory at %ldMB for crashkernel (System 
low RAM: %ldMB)\n",
+   (unsigned long)(low_size >> 20),
+   (unsigned long)(low_base >> 20),
+   (unsigned long)(total_low_mem >> 20));
+   crashk_low_res.start = low_base;
+   crashk_low_res.end   = low_base + low_size - 1;
+   insert_resource(_resource, _low_res);
+#endif
+}
+
 static void __init reserve_crashkernel(void)
 {
+   const unsigned long long alignment = 16<<20;/* 16M */
unsigned long long total_mem;
unsigned long long crash_size, crash_base;
int ret;
@@ -523,8 +559,6 @@ static void __init reserve_crashkernel(void)
 
/* 0 means: find the address automatically */
if (crash_base <= 0) {
-   const unsigned long long alignment = 16<<20;/* 16M */
-
/*
 *  kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
 */
@@ -535,6 +569,7 @@ static void __init reserve_crashkernel(void)
pr_info("crashkernel reservation failed - No suitable 
area found.\n");
return;
}
+
} else {
unsigned long long start;
 
@@ -556,6 +591,9 @@ static void __init reserve_crashkernel(void)
crashk_res.start = crash_base;
crashk_res.end   = crash_base + crash_size - 1;
insert_resource(_resource, _res);
+
+   if (crash_base >= (1ULL<<32))
+   reserve_crashkernel_low();
 }
 #else
 static void __init reserve_crashkernel(void)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index d0b8458..d2e6927 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -191,6 +191,7 @@ extern struct kimage *kexec_crash_image;
 /* Location of a reserved region to hold the crash kernel.
  */
 extern struct resource crashk_res;

[PATCH v7u1 25/31] memblock: add memblock_mem_size()

2013-01-03 Thread Yinghai Lu
Use it to get the memory size under limit_pfn, replacing the local
version (get_mem_size) used by x86 reserve_initrd.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c  |   16 +---
 include/linux/memblock.h |1 +
 mm/memblock.c|   17 +
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 15ce495..c58497e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -363,20 +363,6 @@ static void __init relocate_initrd(void)
ramdisk_here, ramdisk_here + ramdisk_size - 1);
 }
 
-static u64 __init get_mem_size(unsigned long limit_pfn)
-{
-   int i;
-   u64 mapped_pages = 0;
-   unsigned long start_pfn, end_pfn;
-
-   for_each_mem_pfn_range(i, MAX_NUMNODES, _pfn, _pfn, NULL) {
-   start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
-   end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
-   mapped_pages += end_pfn - start_pfn;
-   }
-
-   return mapped_pages << PAGE_SHIFT;
-}
 static void __init early_reserve_initrd(void)
 {
/* Assume only end is not page aligned */
@@ -404,7 +390,7 @@ static void __init reserve_initrd(void)
 
initrd_start = 0;
 
-   mapped_size = get_mem_size(max_pfn_mapped);
+   mapped_size = (u64)memblock_mem_size(max_pfn_mapped);
if (ramdisk_size >= (mapped_size>>1))
panic("initrd too large to handle, "
   "disabling initrd (%lld needed, %lld available)\n",
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d452ee1..f388203 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -155,6 +155,7 @@ phys_addr_t memblock_alloc_base(phys_addr_t size, 
phys_addr_t align,
 phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align,
  phys_addr_t max_addr);
 phys_addr_t memblock_phys_mem_size(void);
+phys_addr_t memblock_mem_size(unsigned long limit_pfn);
 phys_addr_t memblock_start_of_DRAM(void);
 phys_addr_t memblock_end_of_DRAM(void);
 void memblock_enforce_memory_limit(phys_addr_t memory_limit);
diff --git a/mm/memblock.c b/mm/memblock.c
index 6259055..4b3b8d2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -827,6 +827,23 @@ phys_addr_t __init memblock_phys_mem_size(void)
return memblock.memory.total_size;
 }
 
+phys_addr_t __init memblock_mem_size(unsigned long limit_pfn)
+{
+   unsigned long pages = 0;
+   struct memblock_region *r;
+   unsigned long start_pfn, end_pfn;
+
+   for_each_memblock(memory, r) {
+   start_pfn = memblock_region_memory_base_pfn(r);
+   end_pfn = memblock_region_memory_end_pfn(r);
+   start_pfn = min_t(unsigned long, start_pfn, limit_pfn);
+   end_pfn = min_t(unsigned long, end_pfn, limit_pfn);
+   pages += end_pfn - start_pfn;
+   }
+
+   return (phys_addr_t)pages << PAGE_SHIFT;
+}
+
 /* lowest address */
 phys_addr_t __init_memblock memblock_start_of_DRAM(void)
 {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   4   5   6   7   8   9   10   >