date:20121127

[PATCH v5 09/13] x86: use io_remap to access real_mode_data

2012-11-27 Thread Yinghai Lu

When a 64-bit bootloader puts the real mode data above 4G, we cannot
access the real mode data directly yet,

because arch/x86/kernel/head_64.S only sets up the ident mapping
for 0-1G and for the kernel code/data/bss.

So we need to move the early_ioremap_init() call earlier, from setup_arch()
to x86_64_start_kernel().

Also use rsi/rdi instead of esi/edi to pass the real_mode_data pointer
between the asm code and the C code.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/head64.c  |   17 ++---
 arch/x86/kernel/head_64.S |4 ++--
 arch/x86/kernel/setup.c   |2 ++
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 3ac6cad..735cd47 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -52,12 +52,21 @@ static void __init copy_bootdata(char *real_mode_data)
 {
char * command_line;
unsigned long cmd_line_ptr;
+   char *p;
 
-   memcpy(_params, real_mode_data, sizeof boot_params);
+   /*
+* for 64bit bootload path, those data could be above 4G,
+* and we do not set ident mapping for them in head_64.S.
+* So need to ioremap to access them.
+*/
+   p = early_memremap((unsigned long)real_mode_data, sizeof(boot_params));
+   memcpy(_params, p, sizeof(boot_params));
+   early_iounmap(p, sizeof(boot_params));
cmd_line_ptr = get_cmd_line_ptr();
if (cmd_line_ptr) {
-   command_line = __va(cmd_line_ptr);
+   command_line = early_memremap(cmd_line_ptr, COMMAND_LINE_SIZE);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
+   early_iounmap(command_line, COMMAND_LINE_SIZE);
}
 }
 
@@ -104,7 +113,9 @@ void __init x86_64_start_kernel(char * real_mode_data)
 
 void __init x86_64_start_reservations(char *real_mode_data)
 {
-   copy_bootdata(__va(real_mode_data));
+   early_ioremap_init();
+
+   copy_bootdata(real_mode_data);
 
memblock_reserve(__pa_symbol(&_text),
 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 338799a..9f6526a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -409,9 +409,9 @@ ENTRY(secondary_startup_64)
movlinitial_gs+4(%rip),%edx
wrmsr   
 
-   /* esi is pointer to real mode structure with interesting info.
+   /* rsi is pointer to real mode structure with interesting info.
   pass it to C */
-   movl%esi, %edi
+   movq%rsi, %rdi

/* Finally jump to run C code and to be on real kernel address
 * Since we are running on identity-mapped space we have to jump
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 194e151..573fa7d7 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -718,7 +718,9 @@ void __init setup_arch(char **cmdline_p)
 
early_trap_init();
early_cpu_init();
+#ifdef CONFIG_X86_32
early_ioremap_init();
+#endif
 
setup_olpc_ofw_pgd();
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 04/13] x86: Merge early_reserve_initrd for 32bit and 64bit

2012-11-27 Thread Yinghai Lu

They are the same, so we can move them out of head32.c/head64.c into setup.c.

We are using memblock, which handles overlapping regions properly, so
we don't need to reserve the initrd range up front just to hold its
location; we only need to make sure it is reserved before we use
memblock to find free memory.

Signed-off-by: Yinghai Lu 
Reviewed-by: Pekka Enberg 
---
 arch/x86/kernel/head32.c |   11 ---
 arch/x86/kernel/head64.c |   11 ---
 arch/x86/kernel/setup.c  |   22 ++
 3 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index c18f59d..4c52efc 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -33,17 +33,6 @@ void __init i386_start_kernel(void)
memblock_reserve(__pa_symbol(&_text),
 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Reserve INITRD */
-   if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-   /* Assume only end is not page aligned */
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
-   u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
-   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-   }
-#endif
-
/* Call the subarch specific early setup function */
switch (boot_params.hdr.hardware_subarch) {
case X86_SUBARCH_MRST:
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..00e612a 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -100,17 +100,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
memblock_reserve(__pa_symbol(&_text),
 __pa_symbol(&__bss_stop) - __pa_symbol(&_text));
 
-#ifdef CONFIG_BLK_DEV_INITRD
-   /* Reserve INITRD */
-   if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
-   /* Assume only end is not page aligned */
-   unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
-   unsigned long ramdisk_size  = boot_params.hdr.ramdisk_size;
-   unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + 
ramdisk_size);
-   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
-   }
-#endif
-
reserve_ebda_region();
 
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 6d29d1f..ee6d267 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -364,6 +364,19 @@ static u64 __init get_mem_size(unsigned long limit_pfn)
 
return mapped_pages << PAGE_SHIFT;
 }
+static void __init early_reserve_initrd(void)
+{
+   /* Assume only end is not page aligned */
+   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
+
+   if (!boot_params.hdr.type_of_loader ||
+   !ramdisk_image || !ramdisk_size)
+   return; /* No initrd provided by bootloader */
+
+   memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image);
+}
 static void __init reserve_initrd(void)
 {
/* Assume only end is not page aligned */
@@ -390,10 +403,6 @@ static void __init reserve_initrd(void)
if (pfn_range_is_mapped(PFN_DOWN(ramdisk_image),
PFN_DOWN(ramdisk_end))) {
/* All are mapped, easy case */
-   /*
-* don't need to reserve again, already reserved early
-* in i386_start_kernel
-*/
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start + ramdisk_size;
return;
@@ -404,6 +413,9 @@ static void __init reserve_initrd(void)
memblock_free(ramdisk_image, ramdisk_end - ramdisk_image);
 }
 #else
+static void __init early_reserve_initrd(void)
+{
+}
 static void __init reserve_initrd(void)
 {
 }
@@ -665,6 +677,8 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
+   early_reserve_initrd();
+
 #ifdef CONFIG_X86_32
memcpy(_cpu_data, _cpu_data, sizeof(new_cpu_data));
visws_early_detect();
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 07/13] x86, boot: move checking of cmd_line_ptr out of common path

2012-11-27 Thread Yinghai Lu

cmdline.c::__cmdline_find_option() and friends are shared between the
16-bit setup code and the 32/64-bit decompressor code.

For the 32/64-bit-only path via kexec, we should not check whether the
pointer is below 1M, as the command line could be placed above 1M, or
even above 4G.

Move the accessibility check out of __cmdline_find_option(), so that
the decompressor in misc.c can parse the command line correctly.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/boot.h|   14 --
 arch/x86/boot/cmdline.c |8 
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 18997e5..7fadf80 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -289,12 +289,22 @@ int __cmdline_find_option(u32 cmdline_ptr, const char 
*option, char *buffer, int
 int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
 static inline int cmdline_find_option(const char *option, char *buffer, int 
bufsize)
 {
-   return __cmdline_find_option(boot_params.hdr.cmd_line_ptr, option, 
buffer, bufsize);
+   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+   if (cmd_line_ptr >= 0x10)
+   return -1;  /* inaccessible */
+
+   return __cmdline_find_option(cmd_line_ptr, option, buffer, bufsize);
 }
 
 static inline int cmdline_find_option_bool(const char *option)
 {
-   return __cmdline_find_option_bool(boot_params.hdr.cmd_line_ptr, option);
+   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+   if (cmd_line_ptr >= 0x10)
+   return -1;  /* inaccessible */
+
+   return __cmdline_find_option_bool(cmd_line_ptr, option);
 }
 
 
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 6b3b6f7..768f00f 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -41,8 +41,8 @@ int __cmdline_find_option(u32 cmdline_ptr, const char 
*option, char *buffer, int
st_bufcpy   /* Copying this to buffer */
} state = st_wordstart;
 
-   if (!cmdline_ptr || cmdline_ptr >= 0x10)
-   return -1;  /* No command line, or inaccessible */
+   if (!cmdline_ptr)
+   return -1;  /* No command line */
 
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
@@ -111,8 +111,8 @@ int __cmdline_find_option_bool(u32 cmdline_ptr, const char 
*option)
st_wordskip,/* Miscompare, skip */
} state = st_wordstart;
 
-   if (!cmdline_ptr || cmdline_ptr >= 0x10)
-   return -1;  /* No command line, or inaccessible */
+   if (!cmdline_ptr)
+   return -1;  /* No command line */
 
cptr = cmdline_ptr & 0xf;
set_fs(cmdline_ptr >> 4);
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 03/13] x86, 64bit: Set extra ident mapping for whole kernel range

2012-11-27 Thread Yinghai Lu

Currently, when the kernel is loaded above 1G, only [_text, _text+2M] is
set up with an extra ident page table.
That is not enough: some variables that could be used early are outside
that range, like the BRK used for the early page tables.
We need to map [_text, _end], which includes text/data/bss/brk...

Also, the kernel currently is not allowed to be loaded above 512G; it
considers that address too big.
We need to add one extra spare page for level3 to point to that 512G range.
We need to check the _text range and set the level4 pg with that spare
level3 page, and set level3 with a level2 page, to cover [_text, _end]
with the extra mapping.

Finally, to handle crossing a 1GB boundary, we need to add another
level2 spare page. To handle crossing a 512GB boundary, we need to
add another level3 spare page for the next 512G range.

Tested with kexec-tools plus local test code that forces loading the
kernel across 1G, 5G, 512G, and 513G.

We need this to put relocatable 64bit bzImage high above 1g.

-v4: add crossing GB boundary handling.
-v5: use spare pages from BRK, so could save pages when kernel is not
loaded above 1GB.

Signed-off-by: Yinghai Lu 
Cc: "Eric W. Biederman" 
---
 arch/x86/kernel/head_64.S |  203 +
 1 files changed, 187 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..338799a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_PARAVIRT
 #include 
@@ -42,6 +43,13 @@ L3_PAGE_OFFSET = pud_index(__PAGE_OFFSET)
 L4_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
+/* two for level3, and two for level2 */
+SPARE_MAP_SIZE = (4 * PAGE_SIZE)
+RESERVE_BRK(spare_map, SPARE_MAP_SIZE)
+
+#define spare_page(x)  (__brk_base + (x) * PAGE_SIZE)
+#define add_one_spare_page addq $PAGE_SIZE, _brk_end(%rip)
+
.text
__HEAD
.code64
@@ -78,12 +86,6 @@ startup_64:
testl   %eax, %eax
jnz bad_address
 
-   /* Is the address too large? */
-   leaq_text(%rip), %rdx
-   movq$PGDIR_SIZE, %rax
-   cmpq%rax, %rdx
-   jae bad_address
-
/* Fixup the physical addresses in the page table
 */
addq%rbp, init_level4_pgt + 0(%rip)
@@ -97,25 +99,196 @@ startup_64:
 
addq%rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-   /* Add an Identity mapping if I am above 1G */
+   /* Add an Identity mapping if _end is above 1G */
+   leaq_end(%rip), %r9
+   decq%r9
+   cmp $PUD_SIZE, %r9
+   jl  ident_complete
+
+   /* Clear spare pages */
+   leaq__brk_base(%rip), %rdi
+   xorq%rax, %rax
+   movq$(SPARE_MAP_SIZE/8), %rcx
+1: decq%rcx
+   movq%rax, (%rdi)
+   leaq8(%rdi), %rdi
+   jnz 1b
+
+   /* get end */
+   andq$PMD_PAGE_MASK, %r9
+   /* round start to 1G if it is below 1G */
leaq_text(%rip), %rdi
andq$PMD_PAGE_MASK, %rdi
+   cmp $PUD_SIZE, %rdi
+   jg  1f
+   movq$PUD_SIZE, %rdi
+1:
+   /* get 512G index */
+   movq%r9, %r8
+   shrq$PGDIR_SHIFT, %r8
+   andq$(PTRS_PER_PGD - 1), %r8
+   movq%rdi, %rax
+   shrq$PGDIR_SHIFT, %rax
+   andq$(PTRS_PER_PGD - 1), %rax
+
+   /* cross two 512G ? */
+   cmp %r8, %rax
+   jne set_level3_other_512g
+
+   /* all in first 512G ? */
+   cmp $0, %rax
+   je  skip_level3_spare
+
+   /* same 512G other than first 512g */
+   /*
+* We need one level3, one or two level 2,
+* so use first one for level3.
+*/
+   leaq(spare_page(0) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+   leaqinit_level4_pgt(%rip), %rbx
+   movq%rdx, 0(%rbx, %rax, 8)
+   addq$L4_PAGE_OFFSET, %rax
+   movq%rdx, 0(%rbx, %rax, 8)
+   /* one level3 in BRK */
+   add_one_spare_page
+
+   /* get 1G index */
+   movq%r9, %r8
+   shrq$PUD_SHIFT, %r8
+   andq$(PTRS_PER_PUD - 1), %r8
+   movq%rdi, %rax
+   shrq$PUD_SHIFT, %rax
+   andq$(PTRS_PER_PUD - 1), %rax
+
+   /* same 1G ? */
+   cmp %r8, %rax
+   je  set_level2_start_only_not_first_512g
+
+   /* set level2 for end */
+   leaqspare_page(0)(%rip), %rbx
+   leaq(spare_page(2) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+   movq%rdx, 0(%rbx, %r8, 8)
+   /* second one level2 in BRK */
+   add_one_spare_page
+
+set_level2_start_only_not_first_512g:
+   leaqspare_page(0)(%rip), %rbx
+   leaq(spare_page(1) - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
+   movq%rdx, 0(%rbx, %rax, 8)
+   /* first one level2 in BRK */
+   add_one_spare_page
+
+   /* one spare level3 before level2*/
+   leaq

[PATCH v5 05/13] x86: add get_ramdisk_image/size()

2012-11-27 Thread Yinghai Lu

There are several places that look up the ramdisk information early,
for reserving and relocating.

Use functions to make code more readable and consistent.

Later we will add ext_ramdisk_image/size to those functions to support
loading the ramdisk above 4G.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/setup.c |   29 +
 1 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ee6d267..194e151 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -298,12 +298,25 @@ static void __init reserve_brk(void)
 
 #ifdef CONFIG_BLK_DEV_INITRD
 
+static u64 __init get_ramdisk_image(void)
+{
+   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
+
+   return ramdisk_image;
+}
+static u64 __init get_ramdisk_size(void)
+{
+   u64 ramdisk_size = boot_params.hdr.ramdisk_size;
+
+   return ramdisk_size;
+}
+
 #define MAX_MAP_CHUNK  (NR_FIX_BTMAPS << PAGE_SHIFT)
 static void __init relocate_initrd(void)
 {
/* Assume only end is not page aligned */
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_image = get_ramdisk_image();
+   u64 ramdisk_size  = get_ramdisk_size();
u64 area_size = PAGE_ALIGN(ramdisk_size);
u64 ramdisk_here;
unsigned long slop, clen, mapaddr;
@@ -342,8 +355,8 @@ static void __init relocate_initrd(void)
ramdisk_size  -= clen;
}
 
-   ramdisk_image = boot_params.hdr.ramdisk_image;
-   ramdisk_size  = boot_params.hdr.ramdisk_size;
+   ramdisk_image = get_ramdisk_image();
+   ramdisk_size  = get_ramdisk_size();
printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
" [mem %#010llx-%#010llx]\n",
ramdisk_image, ramdisk_image + ramdisk_size - 1,
@@ -367,8 +380,8 @@ static u64 __init get_mem_size(unsigned long limit_pfn)
 static void __init early_reserve_initrd(void)
 {
/* Assume only end is not page aligned */
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_image = get_ramdisk_image();
+   u64 ramdisk_size  = get_ramdisk_size();
u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
 
if (!boot_params.hdr.type_of_loader ||
@@ -380,8 +393,8 @@ static void __init early_reserve_initrd(void)
 static void __init reserve_initrd(void)
 {
/* Assume only end is not page aligned */
-   u64 ramdisk_image = boot_params.hdr.ramdisk_image;
-   u64 ramdisk_size  = boot_params.hdr.ramdisk_size;
+   u64 ramdisk_image = get_ramdisk_image();
+   u64 ramdisk_size  = get_ramdisk_size();
u64 ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);
u64 mapped_size;
 
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 08/13] x86, boot: update cmd_line_ptr to unsigned long

2012-11-27 Thread Yinghai Lu

boot/compressed/misc.c could be built as 64-bit, and cmd_line_ptr could
be above 4G.

So change to unsigned long instead, that will be 64bit in 64bit path
and 32bit in 32bit path.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/boot.h|8 
 arch/x86/boot/cmdline.c |4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7fadf80..5b75319 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -285,11 +285,11 @@ struct biosregs {
 void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
 
 /* cmdline.c */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, 
int bufsize);
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option);
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char 
*buffer, int bufsize);
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option);
 static inline int cmdline_find_option(const char *option, char *buffer, int 
bufsize)
 {
-   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+   unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
 
if (cmd_line_ptr >= 0x10)
return -1;  /* inaccessible */
@@ -299,7 +299,7 @@ static inline int cmdline_find_option(const char *option, 
char *buffer, int bufs
 
 static inline int cmdline_find_option_bool(const char *option)
 {
-   u32 cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+   unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
 
if (cmd_line_ptr >= 0x10)
return -1;  /* inaccessible */
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 768f00f..625d21b 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -27,7 +27,7 @@ static inline int myisspace(u8 c)
  * Returns the length of the argument (regardless of if it was
  * truncated to fit in the buffer), or -1 on not found.
  */
-int __cmdline_find_option(u32 cmdline_ptr, const char *option, char *buffer, 
int bufsize)
+int __cmdline_find_option(unsigned long cmdline_ptr, const char *option, char 
*buffer, int bufsize)
 {
addr_t cptr;
char c;
@@ -99,7 +99,7 @@ int __cmdline_find_option(u32 cmdline_ptr, const char 
*option, char *buffer, int
  * Returns the position of that option (starts counting with 1)
  * or 0 on not found
  */
-int __cmdline_find_option_bool(u32 cmdline_ptr, const char *option)
+int __cmdline_find_option_bool(unsigned long cmdline_ptr, const char *option)
 {
addr_t cptr;
char c;
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 02/13] x86, boot: Move lldt/ltr out of 64bit code section

2012-11-27 Thread Yinghai Lu

commit 08da5a2ca

x86_64: Early segment setup for VT

add lldt/ltr to clean more segments.

That code was put in the code64 section, but it uses a gdt that is only
loaded in the code32 path.

That breaks booting with a 64-bit bootloader that does not go through
the code32 path. It enters at startup_64 directly, and it has a different
gdt.

Move those lines into code32 after their gdt is loaded.

Signed-off-by: Yinghai Lu 
Cc: Zachary Amsden 
Cc: Matt Fleming 
---
 arch/x86/boot/compressed/head_64.S |9 ++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 2c3cee4..375af23 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -154,6 +154,12 @@ ENTRY(startup_32)
btsl$_EFER_LME, %eax
wrmsr
 
+   /* After gdt is loaded */
+   xorl%eax, %eax
+   lldt%ax
+   movl$0x20, %eax
+   ltr %ax
+
/*
 * Setup for the jump to 64bit mode
 *
@@ -245,9 +251,6 @@ preferred_addr:
movl%eax, %ss
movl%eax, %fs
movl%eax, %gs
-   lldt%ax
-   movl$0x20, %eax
-   ltr %ax
 
/*
 * Compute the decompressed kernel start address.  It is where
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 11/13] x86: remove 1024G limitation for kexec buffer on 64bit

2012-11-27 Thread Yinghai Lu

Now that the 64-bit kernel supports more than 1T of RAM and kexec tools
can find a buffer above 1T, remove that obsolete limitation
and use MAXMEM instead.

Tested on system more than 1024G ram.

Signed-off-by: Yinghai Lu 
Cc: "Eric W. Biederman" 
---
 arch/x86/include/asm/kexec.h |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff17..11bfdc5 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -48,11 +48,11 @@
 # define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
 #else
 /* Maximum physical address we can use pages from */
-# define KEXEC_SOURCE_MEMORY_LIMIT  (0xFFUL)
+# define KEXEC_SOURCE_MEMORY_LIMIT  (MAXMEM-1)
 /* Maximum address we can reach in physical address mode */
-# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFUL)
+# define KEXEC_DESTINATION_MEMORY_LIMIT (MAXMEM-1)
 /* Maximum address we can use for the control pages */
-# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFUL)
+# define KEXEC_CONTROL_MEMORY_LIMIT (MAXMEM-1)
 
 /* Allocate one page for the pdp and the second for the code */
 # define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 01/13] x86, boot: move verify_cpu.S after 0x200

2012-11-27 Thread Yinghai Lu

We are short of space before 0x200, which is the entry point for startup_64.

According to hpa, we cannot move startup_64 to another offset, because
that offset has become ABI now.

We can move the function verify_cpu down instead, which avoids the extra
code to jmp back and forth that would be needed if we moved other lines.

Signed-off-by: Yinghai Lu 
Cc: Matt Fleming 
---
 arch/x86/boot/compressed/head_64.S |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index 2c4b171..2c3cee4 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,8 +182,6 @@ no_longmode:
hlt
jmp 1b
 
-#include "../../kernel/verify_cpu.S"
-
/*
 * Be careful here startup_64 needs to be at a predictable
 * address so I can export it in an ELF header.  Bootloaders
@@ -349,6 +347,9 @@ relocated:
  */
jmp *%rbp
 
+   .code32
+#include "../../kernel/verify_cpu.S"
+
.data
 gdt:
.word   gdt_end - gdt
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 12/13] x86, 64bit: Print init kernel lowmap correctly

2012-11-27 Thread Yinghai Lu

When we get x86_64_start_kernel from arch/x86/kernel/head_64.S,

We have
1. kernel highmap 512M (KERNEL_IMAGE_SIZE) from kernel loaded address.
2. kernel lowmap: [0, 1024M), and size (_end - _text) from kernel
   loaded address.

for example, if the kernel bzImage is loaded high from 8G, will get:
1. kernel highmap:  [8G, 8G+512M)
2. kernel lowmap: [0, 1024M), and  [8G, 8G +_end - _text)

So max_pfn_mapped, which records the low-map pfn, is not simply
512M on 64-bit.

Try to print out two ranges, when kernel is loaded high.

Also need to use KERNEL_IMAGE_SIZE directly for highmap cleanup.

Signed-off-by: Yinghai Lu 
---
 arch/x86/kernel/head64.c |2 --
 arch/x86/kernel/setup.c  |   23 +--
 arch/x86/mm/init_64.c|6 +-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8ea1bc9..8d426b4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -97,8 +97,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
/* Make NULL pointers segfault */
zap_identity_mappings();
 
-   max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
-
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, _idt_handlers[i]);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2dbe2ce..87473fc 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -681,6 +681,26 @@ static int __init parse_reservelow(char *p)
 
 early_param("reservelow", parse_reservelow);
 
+static __init void print_init_mem_mapped(void)
+{
+#ifdef CONFIG_X86_32
+   printk(KERN_DEBUG "initial memory mapped: [mem 0x-%#010lx]\n",
+   (max_pfn_mapped

[PATCH v5 13/13] x86, mm: Fix page table early allocation offset checking

2012-11-27 Thread Yinghai Lu

While debugging loading the kernel above 4G, found that one page in BRK
is not used, although it should be usable for early page table allocation.

Fix that checking and also add print out for every allocation from BRK
page table allocation.

Signed-off-by: Yinghai Lu 
---
 arch/x86/mm/init.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6f85de8..c4293cf 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -47,7 +47,7 @@ __ref void *alloc_low_pages(unsigned int num)
__GFP_ZERO, order);
}
 
-   if ((pgt_buf_end + num) >= pgt_buf_top) {
+   if ((pgt_buf_end + num) > pgt_buf_top) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
panic("alloc_low_page: ran out of memory");
@@ -61,6 +61,8 @@ __ref void *alloc_low_pages(unsigned int num)
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
+   printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
+   pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
 
for (i = 0; i < num; i++) {
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 06/13] x86, boot: add get_cmd_line_ptr()

2012-11-27 Thread Yinghai Lu

later will check ext_cmd_line_ptr at the same time.

Signed-off-by: Yinghai Lu 
---
 arch/x86/boot/compressed/cmdline.c |   10 --
 arch/x86/kernel/head64.c   |   13 +++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/compressed/cmdline.c 
b/arch/x86/boot/compressed/cmdline.c
index 10f6b11..b4c913c 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -13,13 +13,19 @@ static inline char rdfs8(addr_t addr)
return *((char *)(fs + addr));
 }
 #include "../cmdline.c"
+static unsigned long get_cmd_line_ptr(void)
+{
+   unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
+
+   return cmd_line_ptr;
+}
 int cmdline_find_option(const char *option, char *buffer, int bufsize)
 {
-   return __cmdline_find_option(real_mode->hdr.cmd_line_ptr, option, 
buffer, bufsize);
+   return __cmdline_find_option(get_cmd_line_ptr(), option, buffer, 
bufsize);
 }
 int cmdline_find_option_bool(const char *option)
 {
-   return __cmdline_find_option_bool(real_mode->hdr.cmd_line_ptr, option);
+   return __cmdline_find_option_bool(get_cmd_line_ptr(), option);
 }
 
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 00e612a..3ac6cad 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -41,13 +41,22 @@ static void __init clear_bss(void)
   (unsigned long) __bss_stop - (unsigned long) __bss_start);
 }
 
+static unsigned long get_cmd_line_ptr(void)
+{
+   unsigned long cmd_line_ptr = boot_params.hdr.cmd_line_ptr;
+
+   return cmd_line_ptr;
+}
+
 static void __init copy_bootdata(char *real_mode_data)
 {
char * command_line;
+   unsigned long cmd_line_ptr;
 
memcpy(_params, real_mode_data, sizeof boot_params);
-   if (boot_params.hdr.cmd_line_ptr) {
-   command_line = __va(boot_params.hdr.cmd_line_ptr);
+   cmd_line_ptr = get_cmd_line_ptr();
+   if (cmd_line_ptr) {
+   command_line = __va(cmd_line_ptr);
memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE);
}
 }
-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v5 10/13] x86, boot: add fields to support load bzImage and ramdisk above 4G

2012-11-27 Thread Yinghai Lu

ext_ramdisk_image/size will record high 32bits for ramdisk info.

xloadflags bit0 will be set if relocatable with 64bit.

Let get_ramdisk_image/size use ext_ramdisk_image/size to get the
right position of the ramdisk.

The bootloader will fill in ext_ramdisk_image/size when it loads the
ramdisk above 4G.

Also, the bootloader will check whether xloadflags bit0 is set to decide
if it can load the ramdisk high above 4G.

xloadflags bit15 is used by the bootloader to notify the kernel whether
the newly added ext_* fields in boot_params can be used or not.

Update header version to 2.12.

-v2: add ext_cmd_line_ptr for above 4G support.
-v3: update to xloadflags from HPA.
-v4: use fields from bootparam instead of setup_header, according to HPA.
-v5: add checking for USE_EXT_BOOT_PARAMS

Signed-off-by: Yinghai Lu 
Cc: Rob Landley 
Cc: Matt Fleming 
---
 Documentation/x86/boot.txt |   19 ++-
 Documentation/x86/zero-page.txt|3 +++
 arch/x86/boot/compressed/cmdline.c |3 +++
 arch/x86/boot/header.S |   12 ++--
 arch/x86/include/asm/bootparam.h   |   10 --
 arch/x86/kernel/head64.c   |3 +++
 arch/x86/kernel/setup.c|6 ++
 7 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index 9efceff..51954d7 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -57,6 +57,9 @@ Protocol 2.10:(Kernel 2.6.31) Added a protocol for 
relaxed alignment
 Protocol 2.11: (Kernel 3.6) Added a field for offset of EFI handover
protocol entry point.
 
+Protocol 2.12: (Kernel 3.9) Added three fields for loading bzImage and
+ramdisk above 4G with 64bit in bootparam.
+
  MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -182,7 +185,7 @@ Offset  Proto   NameMeaning
 0230/4 2.05+   kernel_alignment Physical addr alignment required for kernel
 0234/1 2.05+   relocatable_kernel Whether kernel is relocatable or not
 0235/1 2.10+   min_alignment   Minimum alignment, as a power of two
-0236/2 N/A pad3Unused
+0236/2 2.12+   xloadflags  Boot protocol option flags
 0238/4 2.06+   cmdline_sizeMaximum size of the kernel command line
 023C/4 2.07+   hardware_subarch Hardware subarchitecture
 0240/8 2.07+   hardware_subarch_data Subarchitecture-specific data
@@ -581,6 +584,20 @@ Protocol:  2.10+
   misaligned kernel.  Therefore, a loader should typically try each
   power-of-two alignment from kernel_alignment down to this alignment.
 
+Field name: xloadflags
+Type:   modify (obligatory)
+Offset/size:0x236/2
+Protocol:   2.12+
+
+  This field is a bitmask.
+
+  Bit 0 (read): CAN_BE_LOADED_ABOVE_4G
+- If 1, kernel/boot_params/cmdline/ramdisk can be above 4g,
+
+  Bit 15 (write): USE_EXT_BOOT_PARAMS
+   - If 1, set by bootloader, and kernel could check new fields
+   in boot_params that are added from 2.12 safely.
+
 Field name:cmdline_size
 Type:  read
 Offset/size:   0x238/4
diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index cf5437d..0e19657 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -19,6 +19,9 @@ OffsetProto   NameMeaning
 090/010ALL hd1_infohd1 disk parameter, OBSOLETE!!
 0A0/010ALL sys_desc_table  System description table (struct 
sys_desc_table)
 0B0/010ALL olpc_ofw_header OLPC's OpenFirmware CIF and friends
+0C0/004 ALLext_ramdisk_image ramdisk_image high 32bits
+0C4/004 ALLext_ramdisk_size  ramdisk_size high 32bits
+0C8/004 ALLext_cmd_line_ptr  cmd_line_ptr high 32bits
 140/080ALL edid_info   Video mode setup (struct edid_info)
 1C0/020ALL efi_infoEFI 32 information (struct efi_info)
 1E0/004ALL alk_mem_k   Alternative mem check, in KB
diff --git a/arch/x86/boot/compressed/cmdline.c 
b/arch/x86/boot/compressed/cmdline.c
index b4c913c..43e4ec7 100644
--- a/arch/x86/boot/compressed/cmdline.c
+++ b/arch/x86/boot/compressed/cmdline.c
@@ -17,6 +17,9 @@ static unsigned long get_cmd_line_ptr(void)
 {
unsigned long cmd_line_ptr = real_mode->hdr.cmd_line_ptr;
 
+   if (real_mode->hdr.xloadflags & USE_EXT_BOOT_PARAMS)
+   cmd_line_ptr |= (u64)real_mode->ext_cmd_line_ptr << 32;
+
return cmd_line_ptr;
 }
 int cmdline_find_option(const char *option, char *buffer, int bufsize)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 2a01744..156f664 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -279,7 +279,7 @@ _start:
# Part 2 of the header, from the old setup.S
 
.ascii  "HdrS"  # header signature
-   .word   0x020b  # header version number (>= 0x0105)
+   .word   0x020c  # header version number (>= 0x0105)

[PATCH v5 00/13] x86, boot, 64bit: Add support for loading ramdisk and bzImage above 4G

2012-11-27 Thread Yinghai Lu

Now we have limit kdump reserved under 896M, because kexec has the limitation.
and also bzImage need to stay under 4g.

To make kexec/kdump could use range above 4g, we need to make bzImage and
ramdisk could be loaded above 4g.
During booting bzImage will be unpacked on same position and stay high.

The patches add fields in setup_header and boot_params to
1. get info about ramdisk position info above 4g from bootloader/kexec
2. get info about cmd_line_ptr info above 4g from bootloader/kexec
3. set xloadflags bit0 in header for bzImage and bootloader/kexec load
   could check that to decide if it could put bzImage high.
4. set xloadflags bit15 in header for bootloader to notify if new added
   ext_* fields in boot_params could be used.

These patches are tested with kexec tools with local changes and they will be
sent to kexec list later.

could be found at:

git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git 
for-x86-boot

and it is on top of for-x86-mm

-v2: add ext_cmd_line_ptr support, and handle boot_param/cmd_line is above
 4G case.
-v3: according to hpa, use xloadflags instead code32_start_offset.
 0x200 will not be changed...
-v4: move ext_ramdisk_image/ext_ramdisk_size/ext_cmd_line_ptr to boot_params.
 add handling cross GB boundary case.
-v5: put spare pages in BRK,so could avoid wasting about 4 pages.
 add check for bit USE_EXT_BOOT_PARAMS in xloadflags


Yinghai Lu (13):
  x86, boot: move verify_cpu.S after 0x200
  x86, boot: Move lldt/ltr out of 64bit code section
  x86, 64bit: Set extra ident mapping for whole kernel range
  x86: Merge early_reserve_initrd for 32bit and 64bit
  x86: add get_ramdisk_image/size()
  x86, boot: add get_cmd_line_ptr()
  x86, boot: move checking of cmd_line_ptr out of common path
  x86, boot: update cmd_line_ptr to unsigned long
  x86: use io_remap to access real_mode_data
  x86, boot: add fields to support load bzImage and ramdisk above 4G
  x86: remove 1024G limitation for kexec buffer on 64bit
  x86, 64bit: Print init kernel lowmap correctly
  x86, mm: Fix page table early allocation offset checking

 Documentation/x86/boot.txt |   19 +++-
 Documentation/x86/zero-page.txt|3 +
 arch/x86/boot/boot.h   |   18 +++-
 arch/x86/boot/cmdline.c|   12 +-
 arch/x86/boot/compressed/cmdline.c |   13 ++-
 arch/x86/boot/compressed/head_64.S |   14 ++-
 arch/x86/boot/header.S |   12 ++-
 arch/x86/include/asm/bootparam.h   |   10 ++-
 arch/x86/include/asm/kexec.h   |6 +-
 arch/x86/kernel/head32.c   |   11 --
 arch/x86/kernel/head64.c   |   44 +---
 arch/x86/kernel/head_64.S  |  207 ---
 arch/x86/kernel/setup.c|   78 --
 arch/x86/mm/init.c |4 +-
 arch/x86/mm/init_64.c  |6 +-
 15 files changed, 372 insertions(+), 85 deletions(-)

-- 
1.7.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] vfio powerpc: implemented IOMMU driver for VFIO

2012-11-27 Thread Alexey Kardashevskiy

VFIO implements platform independent stuff such as
a PCI driver, BAR access (via read/write on a file descriptor
or direct mapping when possible) and IRQ signaling.

The platform dependent part includes IOMMU initialization
and handling. This patch implements an IOMMU driver for VFIO
which does mapping/unmapping pages for the guest IO and
provides information about DMA window (required by a POWERPC
guest).

The counterpart in QEMU is required to support this functionality.

Cc: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 drivers/vfio/Kconfig|6 +
 drivers/vfio/Makefile   |1 +
 drivers/vfio/vfio_iommu_spapr_tce.c |  332 +++
 include/linux/vfio.h|   33 
 4 files changed, 372 insertions(+)
 create mode 100644 drivers/vfio/vfio_iommu_spapr_tce.c

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 7cd5dec..b464687 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -3,10 +3,16 @@ config VFIO_IOMMU_TYPE1
depends on VFIO
default n
 
+config VFIO_IOMMU_SPAPR_TCE
+   tristate
+   depends on VFIO && SPAPR_TCE_IOMMU
+   default n
+
 menuconfig VFIO
tristate "VFIO Non-Privileged userspace driver framework"
depends on IOMMU_API
select VFIO_IOMMU_TYPE1 if X86
+   select VFIO_IOMMU_SPAPR_TCE if PPC_POWERNV
help
  VFIO provides a framework for secure userspace device drivers.
  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 2398d4a..72bfabc 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
+obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
new file mode 100644
index 000..b98770e
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -0,0 +1,332 @@
+/*
+ * VFIO: IOMMU DMA mapping support for TCE on POWER
+ *
+ * Copyright (C) 2012 IBM Corp.  All rights reserved.
+ * Author: Alexey Kardashevskiy 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Derived from original vfio_iommu_type1.c:
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ * Author: Alex Williamson 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "a...@ozlabs.ru"
+#define DRIVER_DESC "VFIO IOMMU SPAPR TCE"
+
+static void tce_iommu_detach_group(void *iommu_data,
+   struct iommu_group *iommu_group);
+
+/*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ */
+
+/*
+ * This code handles mapping and unmapping of user data buffers
+ * into DMA'ble space using the IOMMU
+ */
+
+#define NPAGE_TO_SIZE(npage)   ((size_t)(npage) << PAGE_SHIFT)
+
+struct vwork {
+   struct mm_struct*mm;
+   longnpage;
+   struct work_struct  work;
+};
+
+/* delayed decrement/increment for locked_vm */
+static void lock_acct_bg(struct work_struct *work)
+{
+   struct vwork *vwork = container_of(work, struct vwork, work);
+   struct mm_struct *mm;
+
+   mm = vwork->mm;
+   down_write(&mm->mmap_sem);
+   mm->locked_vm += vwork->npage;
+   up_write(&mm->mmap_sem);
+   mmput(mm);
+   kfree(vwork);
+}
+
+static void lock_acct(long npage)
+{
+   struct vwork *vwork;
+   struct mm_struct *mm;
+
+   if (!current->mm)
+   return; /* process exited */
+
+   if (down_write_trylock(&current->mm->mmap_sem)) {
+   current->mm->locked_vm += npage;
+   up_write(&current->mm->mmap_sem);
+   return;
+   }
+
+   /*
+* Couldn't get mmap_sem lock, so must setup to update
+* mm->locked_vm later. If locked_vm were atomic, we
+* wouldn't need this silliness
+*/
+   vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
+   if (!vwork)
+   return;
+   mm = get_task_mm(current);
+   if (!mm) {
+   kfree(vwork);
+   return;
+   }
+   INIT_WORK(&vwork->work, lock_acct_bg);
+   vwork->mm = mm;
+   vwork->npage = npage;
+   schedule_work(&vwork->work);
+}
+
+/*
+ * The container descriptor supports only a single group per container.
+ * Required by the API as the container is not supplied with the IOMMU group
+ * at the moment of initialization.
+ */
+struct tce_container {
+   struct mutex lock;
+   struct iommu_table *tbl;
+};
+
+static void *tce_iommu_open(unsigned long arg)
+{
+   struct tce_container *container;
+
+   if (arg != VFIO_SPAPR_TCE_IOMMU) {
+   pr_err("tce_vfio: Wrong IOMMU

[PATCH] vfio powerpc: enabled on powernv platform

2012-11-27 Thread Alexey Kardashevskiy

This patch initializes IOMMU groups based on the IOMMU
configuration discovered during the PCI scan on POWERNV
(POWER non virtualized) platform. The IOMMU groups are
to be used later by VFIO driver (PCI pass through).

It also implements an API for mapping/unmapping pages for
guest PCI drivers and providing DMA window properties.
This API is going to be used later by QEMU-VFIO to handle
h_put_tce hypercalls from the KVM guest.

Although this driver has been tested only on the POWERNV
platform, it should work on any platform which supports
TCE tables.

To enable VFIO on POWER, enable SPAPR_TCE_IOMMU config
option and configure VFIO as required.

Cc: David Gibson 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/iommu.h |9 +++
 arch/powerpc/kernel/iommu.c  |  147 ++
 arch/powerpc/platforms/powernv/pci.c |  135 +++
 drivers/iommu/Kconfig|8 ++
 4 files changed, 299 insertions(+)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index cbfe678..5c7087a 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,9 @@ struct iommu_table {
struct iommu_pool large_pool;
struct iommu_pool pools[IOMMU_NR_POOLS];
unsigned long *it_map;   /* A simple allocation bitmap for now */
+#ifdef CONFIG_IOMMU_API
+   struct iommu_group *it_group;
+#endif
 };
 
 struct scatterlist;
@@ -147,5 +150,11 @@ static inline void iommu_restore(void)
 }
 #endif
 
+extern long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages);
+extern long iommu_put_tces(struct iommu_table *tbl, unsigned long entry,
+   uint64_t tce, enum dma_data_direction direction,
+   unsigned long pages);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index ff5a6ce..1456b6e 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DBG(...)
 
@@ -856,3 +857,149 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t 
size,
free_pages((unsigned long)vaddr, get_order(size));
}
 }
+
+#ifdef CONFIG_IOMMU_API
+/*
+ * SPAPR TCE API
+ */
+static void tce_flush(struct iommu_table *tbl)
+{
+   /* Flush/invalidate TLB caches if necessary */
+   if (ppc_md.tce_flush)
+   ppc_md.tce_flush(tbl);
+
+   /* Make sure updates are seen by hardware */
+   mb();
+}
+
+/*
+ * clear_tces_nolock clears tces and returns the number of pages
+ * which it called put_page() on.
+ */
+static long clear_tces_nolock(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages)
+{
+   int i, pages_put = 0;
+   unsigned long oldtce;
+   struct page *page;
+
+   for (i = 0; i < pages; ++i) {
+   oldtce = ppc_md.tce_get(tbl, entry + i);
+   ppc_md.tce_free(tbl, entry + i, 1);
+
+   if (!(oldtce & (TCE_PCI_WRITE | TCE_PCI_READ)))
+   continue;
+
+   page = pfn_to_page(oldtce >> PAGE_SHIFT);
+
+   WARN_ON(!page);
+   if (!page)
+   continue;
+
+   if (oldtce & TCE_PCI_WRITE)
+   SetPageDirty(page);
+
+   ++pages_put;
+   put_page(page);
+   }
+
+   return pages_put;
+}
+
+/*
+ * iommu_clear_tces clears tces and returned the number of released pages
+ */
+long iommu_clear_tces(struct iommu_table *tbl, unsigned long entry,
+   unsigned long pages)
+{
+   int ret;
+   struct iommu_pool *pool = get_pool(tbl, entry);
+
+   spin_lock(&(pool->lock));
+   ret = clear_tces_nolock(tbl, entry, pages);
+   tce_flush(tbl);
+   spin_unlock(&(pool->lock));
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_clear_tces);
+
+static int put_tce(struct iommu_table *tbl, unsigned long entry,
+   uint64_t tce, enum dma_data_direction direction)
+{
+   int ret;
+   struct page *page = NULL;
+   unsigned long kva, offset;
+
+   /* Map new TCE */
+   offset = (tce & IOMMU_PAGE_MASK) - (tce & PAGE_MASK);
+
+   ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+   direction != DMA_TO_DEVICE, &page);
+   if (ret < 1) {
+   printk(KERN_ERR "tce_vfio: get_user_pages_fast failed tce=%llx 
ioba=%lx ret=%d\n",
+   tce, entry << IOMMU_PAGE_SHIFT, ret);
+   if (!ret)
+   ret = -EFAULT;
+   return ret;
+   }
+
+   kva = (unsigned long) page_address(page);
+   kva += offset;
+
+   /* tce_build receives a virtual address */
+   entry += tbl->it_offset; /* Offset into real TCE table */
+   ret = ppc_md.tce_build(tbl, entry, 1, kva,

Re: [PATCH 1/4] Input RMI4 - rename rmi_function_container to rmi_function

2012-11-27 Thread Dmitry Torokhov

On Tue, Nov 27, 2012 at 03:43:05PM -0800, Christopher Heiny wrote:
> On 11/27/2012 01:21 AM, Dmitry Torokhov wrote:
> >To save my old fingers...
> >
> >Signed-off-by: Dmitry Torokhov
> >---
> >
> >It looks like this driver(s) need some love and I might have some time so I
> >will refresh my "synaptics" branch with the patches you have sent and start
> >working off it. If you have updates I would appreciate if you also make them
> >available relative to that branch. When we are ready we'll squash them all
> >together and apply to the official branch.
> 
> No problem - let me know which branch/tag to work with, and we'll be
> happy to patch against that for the next round.

synaptics-rmi4. It is based off 3.7-rc6.


> 
> >
> >Thanks.
> >
> >  drivers/input/rmi4/rmi_driver.c | 158 +++--
> >  drivers/input/rmi4/rmi_driver.h |   4 +-
> >  drivers/input/rmi4/rmi_f01.c| 298 
> > 
> >  drivers/input/rmi4/rmi_f11.c| 258 +-
> >  include/linux/rmi.h |  22 ++-
> >  5 files changed, 368 insertions(+), 372 deletions(-)
> >
> >diff --git a/drivers/input/rmi4/rmi_driver.c 
> >b/drivers/input/rmi4/rmi_driver.c
> >index 05a73ae..e8a4b52 100644
> >--- a/drivers/input/rmi4/rmi_driver.c
> >+++ b/drivers/input/rmi4/rmi_driver.c
> >@@ -594,7 +594,7 @@ static struct device_attribute bsr_attribute = 
> >__ATTR(bsr, RMI_RW_ATTR,
> >
> >  static void rmi_free_function_list(struct rmi_device *rmi_dev)
> >  {
> >-struct rmi_function_container *entry, *n;
> >+struct rmi_function *entry, *n;
> > struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
> >
> > if (!data) {
> >@@ -613,44 +613,44 @@ static void rmi_free_function_list(struct rmi_device 
> >*rmi_dev)
> > }
> >  }
> >
> >-static void release_fc_device(struct device *dev)
> >+static void release_function_device(struct device *dev)
> >  {
> > dev_dbg(dev, "REMOVING KOBJ!");
> > kobject_put(&dev->kobj);
> >  }
> 
> Hmmm.  Since rmi_function_container has evolved into a child device
> of the RMI4 module, maybe it would be better renamed
> rmi_function_device or rmi_function_dev?  I find this clearer, but
> can live with just rmi_function if you prefer that.

I just prefer something reasonably short so I still like rmi_function or
something of similar length.

> 
> 
> Similarly, rmi_function_handler has evolved into a driver for such
> devices, so perhaps it should be renamed rmi_function_driver?

No preference here.

Thanks.

-- 
Dmitry
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH] fs: configfs: programmatically create config groups

2012-11-27 Thread Nicholas A. Bellinger

Hi Sebastian & Co,

On Tue, 2012-11-27 at 16:20 +0100, Sebastian Andrzej Siewior wrote:
> On 11/26/2012 06:54 PM, Michal Nazarewicz wrote:
> > On Mon, Nov 26 2012, Sebastian Andrzej Siewior wrote:
> >> Wouldn't say that. It may adds complexity on another level. The target
> >> subsystem has the same problem with adding luns and there seems nothing
> >> wrong with having lun3 and 4 and leaving 0 and 1 unused.
> >
> > That's not what Wikipedia claims though (from
> > ):
> >
> > LUN 0: There is one LUN which is required to exist in every
> > target: zero. The logical unit with LUN zero is special in that
> > it must implement a few specific commands, most notably Report
> > LUNs, which is how an initiator can find out all the other LUNs
> > in the target. But LUN zero need not provide any other services,
> > such as a storage volume.
> >
> > That's why I proposed solution where one needs to have continuous
> > numbering of LUNs.  I'm not an expert on SCSI though.
> 
> Let me quote "4.6.4 Minimum LUN addressing requirements" of SAM4:
> 
> | All SCSI target devices shall support LUN 0 (i.e., 
> | h) or the REPORT LUNS well-known logical unit. For SCSI
> | target devices that support the hierarchical addressing model the LUN
> | 0 or the REPORT LUNS well-known logical unit shall be the logical
> | unit that an application client addresses to determine
> | information about the SCSI target device and the logical units
> | contained within the SCSI target device.
> 
> Nab, I think not having LUN0 configured as long as REPORT LUNS says
> which luns are available is fine. Target seems to work on linux without
> it and SAM4 does not claim otherwise unless I misinterpret it. Any
> opinion on this from your side?
> 

So we use a special RAMDISK-MCP @ target_core_device.c:g_lun0_dev along
with a se_lun (located @ se_portal_group->tpg_virt_lun0) to always
service REPORT_LUNS to LUN=0, regardless of LUN=0 configfs fabric
endpoint layout.

Note this happens within target_core_device.c:transport_lookup_cmd_lun()
once no active se_node_acl->device_list[unpacked_lun] entry can be
located.

> >
> >> With the tcm gadget I get:
> >>
> >> |scsi 0:0:0:2: Direct-Access LIO-ORG  RAMDISK-MCP  4.0  PQ: 0
> >> ANSI: 5
> >> |scsi 0:0:0:3: Direct-Access LIO-ORG  FILEIO   4.0  PQ: 0
> >> ANSI: 5
> >>
> >> You notice :2 and :3 instead :0 and :1. While should be there something
> >> wrong with this?
> >
> > It may be that it works on Linux but fails on some other systems (or
> > even older Linux kernels).  Like I've said, I'm not SCSI expert, so my
> > knowledge of it is (embarrassingly) minimal.
> 
> Sure but still. You limit the user to create lunX folders where X can
> be 0..255 for instance. If the user chooses not create lun0, why force
> him?
> 

It's certainly easier for the user if REPORT_LUNS always 'just works' to
LUN=0.

--nab


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 RFC 1/2] sched: Bail out of yield_to when source and target runqueue has one task

2012-11-27 Thread Raghavendra K T


On 11/27/2012 07:34 PM, Andrew Theurer wrote:

On Tue, 2012-11-27 at 16:00 +0530, Raghavendra K T wrote:

On 11/26/2012 07:05 PM, Andrew Jones wrote:

On Mon, Nov 26, 2012 at 05:37:54PM +0530, Raghavendra K T wrote:

From: Peter Zijlstra 

In case of undercommitted scenarios, especially in large guests
yield_to overhead is significantly high. when run queue length of
source and target is one, take an opportunity to bail out and return
-ESRCH. This return condition can be further exploited to quickly come
out of PLE handler.

(History: Raghavendra initially worked on break out of kvm ple handler upon
   seeing source runqueue length = 1, but it had to export rq length).
   Peter came up with the elegant idea of return -ESRCH in scheduler core.

Signed-off-by: Peter Zijlstra 
Raghavendra, Checking the rq length of target vcpu condition added.(thanks Avi)
Reviewed-by: Srikar Dronamraju 
Signed-off-by: Raghavendra K T 
---

   kernel/sched/core.c |   25 +++--
   1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927f..fc219a5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4289,7 +4289,10 @@ EXPORT_SYMBOL(yield);
* It's the caller's job to ensure that the target task struct
* can't go away on us before we can do any checks.
*
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
*/
   bool __sched yield_to(struct task_struct *p, bool preempt)
   {
@@ -4303,6 +4306,15 @@ bool __sched yield_to(struct task_struct *p, bool 
preempt)

   again:
p_rq = task_rq(p);
+   /*
+* If we're the only runnable task on the rq and target rq also
+* has only one task, there's absolutely no point in yielding.
+*/
+   if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+   yielded = -ESRCH;
+   goto out_irq;
+   }
+
double_rq_lock(rq, p_rq);
while (task_rq(p) != p_rq) {
double_rq_unlock(rq, p_rq);
@@ -4310,13 +4322,13 @@ again:
}

if (!curr->sched_class->yield_to_task)
-   goto out;
+   goto out_unlock;

if (curr->sched_class != p->sched_class)
-   goto out;
+   goto out_unlock;

if (task_running(p_rq, p) || p->state)
-   goto out;
+   goto out_unlock;

yielded = curr->sched_class->yield_to_task(rq, p, preempt);
if (yielded) {
@@ -4329,11 +4341,12 @@ again:
resched_task(p_rq->curr);
}

-out:
+out_unlock:
double_rq_unlock(rq, p_rq);
+out_irq:
local_irq_restore(flags);

-   if (yielded)
+   if (yielded > 0)
schedule();

return yielded;



Acked-by: Andrew Jones 



Thank you Drew.

Marcelo Gleb.. Please let me know if you have comments / concerns on the
patches..

Andrew, Vinod, IMO, the patch set looks good for undercommit scenarios
especially for large guests where we do have overhead of vcpu iteration
of ple handler..


I agree, looks fine for undercommit scenarios.  I do wonder what happens
with 1.5x overcommit, where we might see 1/2 the host cpus with runqueue
of 2 and 1/2 of the host cpus with a runqueue of 1.  Even with this
change that scenario still might be fine, but it would be nice to see a
comparison.



Hi Andrew, yes thanks for pointing out 1.5x case which should have
theoretical  worst case..
I tried with 2 24 vcpu guests and the same 32 core machine.. Here is
the result..

Ebizzy (rec/sec higher is better)
x base
+ patched
N   AvgStddev
x  10 2688.6 347.55917
+  10 2707.6 260.93728

improvement 0.706%

dbench (Throughput MB/sec higher is better)
x base
+ patched
N AvgStddev
x  103164.712 140.24468
+  103244.021 185.92434

Improvement 2.5%

So there is no significant improvement / degradation seen in
1.5x.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Jiang Liu

Hi Chen,

If a pageblock's migration type is movable, it may be converted to
reclaimable under memory pressure. CMA is introduced to guarantee
that pages of CMA won't be converted to other migratetypes.

And we are trying to avoid allocating kernel/DMA memory from specific
memory ranges, so we could easily reclaim pages when hot-removing
memory devices. 

I think the idea is not to directly reuse CMA for hotplug, but to 
reuse the mechanism to reserve specific memory ranges from bootmem
allocator. So CMA and hotplug could use the same code.
Basically we may try to reuse dma_declare_contiguous(), so that
we don't need to add special logic into bootmem allocator.

Regards!
Gerry

On 2012-11-28 14:16, Tang Chen wrote:
> Hi Bob, Liu Jiang,
> 
> About CMA, could you give me more info ?
> Thanks for your patience and nice advice. :)
> 
> 
> 1) I saw the following on http://lwn.net/Articles/447405/:
> 
> The "CMA" type is sticky; pages which are marked as being for CMA
> should never have their migration type changed by the kernel.
> 
> As Wen said, we now support a user interface to change movable memory
> into kernel memory. But seeing from above, the memory specified as
> CMA will not be able to be changed, right ?  If so, I don't think
> using CMA is a good idea.
> 
> 
> 2) Is CMA just implemented on ARM platform ?  I found the following in
> kernel-parameters.txt.
> 
> cma=nn[MG]  [ARM,KNL]
> Sets the size of kernel global memory area for contiguous
> memory allocations. For more information, see
> include/linux/dma-contiguous.h
> 
> We are developing on x86. Could we use it ?
> 
> 
> 3) Is CMA just used for DMA ? I am a little confused here. :)
> I found the main code of CMA is implemented in dma-contiguous.c.
> 
> 
> 4) The boot options cma=xxx and movablecore_map=xxx have different
> meanings for user. Reusing CMA could make user confused, I'm afraid.
> 
> And, even if we reuse "cma=" option, we still need to do the work
> in patch 3~5, right ?
> 
> 
> Thanks. :)
> 
> 
> 
> On 11/28/2012 12:08 PM, Jiang Liu wrote:
>> On 2012-11-28 11:24, Bob Liu wrote:
>>> On Tue, Nov 27, 2012 at 8:49 PM, Tang Chen  wrote:
 On 11/27/2012 08:09 PM, Bob Liu wrote:
>
> On Tue, Nov 27, 2012 at 4:29 PM, Tang Chen
> wrote:
>>
>> Hi Liu,
>>
>>
>> This feature is used in memory hotplug.
>>
>> In order to implement a whole node hotplug, we need to make sure the
>> node contains no kernel memory, because memory used by kernel could
>> not be migrated. (Since the kernel memory is directly mapped,
>> VA = PA + __PAGE_OFFSET. So the physical address could not be changed.)
>>
>> User could specify all the memory on a node to be movable, so that the
>> node could be hot-removed.
>>
>
> Thank you for your explanation. It's reasonable.
>
> But i think it's a bit duplicated with CMA, i'm not sure but maybe we
> can combine it with CMA which already in mainline?
>
 Hi Liu,

 Thanks for your advice. :)

 CMA is Contiguous Memory Allocator, right?  What I'm trying to do is
 controlling where is the start of ZONE_MOVABLE of each node. Could
 CMA do this job ?
>>>
>>> cma will not control the start of ZONE_MOVABLE of each node, but it
>>> can declare a memory that always movable
>>> and all non movable allocate request will not happen on that area.
>>>
>>> Currently cma use a boot parameter "cma=" to declare a memory size
>>> that always movable.
>>> I think it might fulfill your requirement if extending the boot
>>> parameter with a start address.
>>>
>>> more info at http://lwn.net/Articles/468044/

 And also, after a short investigation, CMA seems need to base on
 memblock. But we need to limit memblock not to allocate memory on
 ZONE_MOVABLE. As a result, we need to know the ranges before memblock
 could be used. I'm afraid we still need an approach to get the ranges,
 such as a boot option, or from static ACPI tables such as SRAT/MPST.

>>>
>>> Yes, it's based on memblock and with boot option.
>>> In setup_arch32()
>>>  dma_contiguous_reserve(0);   =>  will declare a cma area using
>>> memblock_reserve()
>>>
 I don't know much about CMA for now. So if you have any better idea,
 please share with us, thanks. :)
>>>
>>> My idea is reuse cma like below patch(even not compiled) and boot with
>>> "cma=size@start_address".
>>> I don't know whether it can work and whether suitable for your
>>> requirement, if not forgive me for this noise.
>>>
>>> diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
>>> index 612afcc..564962a 100644
>>> --- a/drivers/base/dma-contiguous.c
>>> +++ b/drivers/base/dma-contiguous.c
>>> @@ -59,11 +59,18 @@ struct cma *dma_contiguous_default_area;
>>>*/
>>>   static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
>>>   static long size_cmdline = -1;
>>> +static long cma_start_cmdline = -1;
>>>

Re: [PATCH 3/3] Input: bu21013_ts - Add support for Device Tree booting

2012-11-27 Thread Dmitry Torokhov

Hi Lee,

On Tue, Nov 27, 2012 at 01:13:10PM +, Lee Jones wrote:
> Now we can register the BU21013_ts touch screen when booting with
> Device Tree enabled. Here we parse all the necessary components
> previously expected to be passed from platform data.

I applied these 3 patches, but for DT we also need to specify compatible
ID and set up of_match_table pointer. Please send me a follow-up patches
doing that and also describing DT bindings for BU21013.

Thanks.

-- 
Dmitry
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: Tree for Nov 28

2012-11-27 Thread Stephen Rothwell

Hi all,

Changes since 20121127:

The ia64 tree lost its conflict.

The powerpc tree still had its build failure for which I applied a patch.

The modules tree still had its build failure so I used the version from
next-20121115.

The mfd tree gained a conflict against Linus' tree.

The pm tree gained a build failure so I used the version from
next-20121127.

The tty tree still had its build failure for which I disabled a driver.

The staging tree gained a conflict against the tty tree.

The arm-soc tree gained a conflict against the thermal tree.

The clk tree gained a conflict against the arm-soc tree.

The akpm tree lost a couple of patches that turned up elsewhere.



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" as mentioned in the FAQ on the wiki
(see below).

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log files
in the Next directory.  Between each merge, the tree was built with
a ppc64_defconfig for powerpc and an allmodconfig for x86_64. After the
final fixups (if any), it is also built with powerpc allnoconfig (32 and
64 bit), ppc44x_defconfig and allyesconfig (minus
CONFIG_PROFILE_ALL_BRANCHES - this fails its final link) and i386, sparc,
sparc64 and arm defconfig. These builds also have
CONFIG_ENABLE_WARN_DEPRECATED, CONFIG_ENABLE_MUST_CHECK and
CONFIG_DEBUG_INFO disabled when necessary.

Below is a summary of the state of the merge.

We are up to 211 trees (counting Linus' and 28 trees of patches pending
for Linus' tree), more are welcome (even if they are currently empty).
Thanks to those who have contributed, and to those who haven't, please do.

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

There is a wiki covering stuff to do with linux-next at
http://linux.f-seidel.de/linux-next/pmwiki/ .  Thanks to Frank Seidel.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (e23739b Merge branch 'v4l_for_linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-media)
Merging fixes/master (12250d8 Merge branch 'i2c-embedded/for-next' of 
git://git.pengutronix.de/git/wsa/linux)
Merging kbuild-current/rc-fixes (bad9955 menuconfig: Replace CIRCLEQ by 
list_head-style lists.)
Merging arm-current/fixes (5010192 ARM: 7583/1: decompressor: Enable unaligned 
memory access for v6 and above)
Merging m68k-current/for-linus (34fa78b m68k: fix sigset_t accessor functions)
Merging powerpc-merge/merge (e716e01 powerpc/eeh: Do not invalidate PE properly)
Merging sparc/master (194d983 Merge tag 'sound-3.7' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound)
Merging net/master (b49d3c1 net: ipmr: limit MRT_TABLE identifiers)
Merging sound-current/for-linus (d846b17 ALSA: hda - Fix build without 
CONFIG_PM)
Merging pci-current/for-linus (ff8e59b PCI/portdrv: Don't create hotplug slots 
unless port supports hotplug)
Merging wireless/master (6bdd253 mac80211: fix remain-on-channel 
(non-)cancelling)
Merging driver-core.current/driver-core-linus (77b6706 Linux 3.7-rc5)
Merging tty.current/tty-linus (b1a925f tty vt: Fix a regression in command line 
edition)
Merging usb.current/usb-linus (f4a75d2e Linux 3.7-rc6)
Merging staging.current/staging-linus (f4a75d2e Linux 3.7-rc6)
Merging char-misc.current/char-misc-linus (f4a75d2e Linux 3.7-rc6)
Merging input-current/for-linus (0a0d628 ARM - OMAP: ads7846: fix pendown 
debounce setting)
Merging md-current/for-linus (874807a md/raid1{,0}: fix deadlock in 
bitmap_unplug.)
Merging audit-current/for-linus (c158a35 audit: no leading space in 
audit_log_d_path prefix)
Merging crypto-current/master (9efade1 crypto: cryptd - disable softirqs in 
cryptd_queue_worker to prevent data corruption)
Merging ide/master (9974e43 ide: fix generic_ide_suspend/resume Oops)
Merging dwmw2/master (244dc4e Merge 
git://git.infradead.org/users/dwmw2/random-2.6)
Merging sh-current/sh-fixes-for-linus (4403310 SH: Convert out[bwl] macros to 
inline functions)
Merging irqdomain-current/irqdomain/merge (a0d271c Linux 3.6)
Merging devicetree-current/devicetree/merge (0e622d3 of/address: sparc: Declare 
of_iomap as an extern function for sparc again)
Merging spi-current/spi/merge (a0d271c Linux 3.6)
Merging gpio-current/gpio/merge (96b7064 gpi

[RESEND PATCH] RTC: MAX77686: Add Maxim 77686 driver

2012-11-27 Thread Jonghwa Lee

Add a driver to support the MAX77686 RTC.
The MAX77686 RTC supports SMPL and WTSR modes. It has two alarm registers
which can be used as alarms to wake the system up. This driver uses regmap
to access its registers.

Signed-off-by: Chiwoong Byun 
Signed-off-by: Jonghwa Lee 
Signed-off-by: Myugnjoo Ham 
Signed-off-by: Kyungmin Park 
---
 drivers/rtc/Kconfig|   10 +
 drivers/rtc/Makefile   |1 +
 drivers/rtc/rtc-max77686.c |  645 
 3 files changed, 656 insertions(+), 0 deletions(-)
 create mode 100644 drivers/rtc/rtc-max77686.c

diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 19c03ab..4848563 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -233,6 +233,16 @@ config RTC_DRV_MAX8998
  This driver can also be built as a module. If so, the module
  will be called rtc-max8998.
 
+config RTC_DRV_MAX77686
+   tristate "Maxim MAX77686"
+   depends on MFD_MAX77686
+   help
+ If you say yes here you will get support for the
+ RTC of Maxim MAX77686 PMIC.
+
+ This driver can also be built as a module. If so, the module
+ will be called rtc-max77686.
+
 config RTC_DRV_RS5C372
tristate "Ricoh R2025S/D, RS5C372A/B, RV5C386, RV5C387A"
help
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 56297f0..3cc94c9 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_RTC_DRV_MAX8907) += rtc-max8907.o
 obj-$(CONFIG_RTC_DRV_MAX8925)  += rtc-max8925.o
 obj-$(CONFIG_RTC_DRV_MAX8998)  += rtc-max8998.o
 obj-$(CONFIG_RTC_DRV_MAX6902)  += rtc-max6902.o
+obj-$(CONFIG_RTC_DRV_MAX77686) += rtc-max77686.o
 obj-$(CONFIG_RTC_DRV_MC13XXX)  += rtc-mc13xxx.o
 obj-$(CONFIG_RTC_DRV_MSM6242)  += rtc-msm6242.o
 obj-$(CONFIG_RTC_DRV_MPC5121)  += rtc-mpc5121.o
diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c
new file mode 100644
index 000..63da55a
--- /dev/null
+++ b/drivers/rtc/rtc-max77686.c
@@ -0,0 +1,645 @@
+/*
+ * RTC driver for Maxim MAX77686
+ *
+ * Copyright (C) 2012 Samsung Electronics Co.Ltd
+ *
+ *  based on rtc-max8997.c
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* RTC Control Register */
+#define BCD_EN_SHIFT   0
+#define BCD_EN_MASK(1 << BCD_EN_SHIFT)
+#define MODEL24_SHIFT  1
+#define MODEL24_MASK   (1 << MODEL24_SHIFT)
+/* RTC Update Register1 */
+#define RTC_UDR_SHIFT  0
+#define RTC_UDR_MASK   (1 << RTC_UDR_SHIFT)
+#define RTC_RBUDR_SHIFT4
+#define RTC_RBUDR_MASK (1 << RTC_RBUDR_SHIFT)
+/* WTSR and SMPL Register */
+#define WTSRT_SHIFT0
+#define SMPLT_SHIFT2
+#define WTSR_EN_SHIFT  6
+#define SMPL_EN_SHIFT  7
+#define WTSRT_MASK (3 << WTSRT_SHIFT)
+#define SMPLT_MASK (3 << SMPLT_SHIFT)
+#define WTSR_EN_MASK   (1 << WTSR_EN_SHIFT)
+#define SMPL_EN_MASK   (1 << SMPL_EN_SHIFT)
+/* RTC Hour register */
+#define HOUR_PM_SHIFT  6
+#define HOUR_PM_MASK   (1 << HOUR_PM_SHIFT)
+/* RTC Alarm Enable */
+#define ALARM_ENABLE_SHIFT 7
+#define ALARM_ENABLE_MASK  (1 << ALARM_ENABLE_SHIFT)
+
+#define MAX77686_RTC_UPDATE_DELAY  16
+#undef MAX77686_RTC_WTSR_SMPL
+
+enum {
+   RTC_SEC = 0,
+   RTC_MIN,
+   RTC_HOUR,
+   RTC_WEEKDAY,
+   RTC_MONTH,
+   RTC_YEAR,
+   RTC_DATE,
+   RTC_NR_TIME
+};
+
+struct max77686_rtc_info {
+   struct device   *dev;
+   struct max77686_dev *max77686;
+   struct i2c_client   *rtc;
+   struct rtc_device   *rtc_dev;
+   struct mutexlock;
+
+   struct regmap   *regmap;
+
+   int virq;
+   int rtc_24hr_mode;
+};
+
+enum MAX77686_RTC_OP {
+   MAX77686_RTC_WRITE,
+   MAX77686_RTC_READ,
+};
+
+static inline int max77686_rtc_calculate_wday(u8 shifted)
+{
+   int counter = -1;
+   while (shifted) {
+   shifted >>= 1;
+   counter++;
+   }
+   return counter;
+}
+
+static void max77686_rtc_data_to_tm(u8 *data, struct rtc_time *tm,
+  int rtc_24hr_mode)
+{
+   tm->tm_sec = data[RTC_SEC] & 0x7f;
+   tm->tm_min = data[RTC_MIN] & 0x7f;
+   if (rtc_24hr_mode)
+   tm->tm_hour = data[RTC_HOUR] & 0x1f;
+   else {
+   tm->tm_hour = data[RTC_HOUR] & 0x0f;
+   if (data[RTC_HOUR] &

[PATCH 3.7] dma: sh: Don't use ENODEV for failing slave lookup

2012-11-27 Thread Guennadi Liakhovetski

If dmaengine driver's .device_alloc_chan_resources() method returns -ENODEV,
dma_request_channel() will decide, that the driver has been removed and will
remove the device from its list. To prevent this use ENXIO if a slave lookup
fails.

Reported-by: Kuninori Morimoto 
Signed-off-by: Guennadi Liakhovetski 
---

Hi Vinod

Could you please push this patch to Linus for 3.7 ASAP? I think, it should 
also go to "stable."

Thanks
Guennadi

 drivers/dma/sh/shdma.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/dma/sh/shdma.c b/drivers/dma/sh/shdma.c
index f41bcc5..910d878 100644
--- a/drivers/dma/sh/shdma.c
+++ b/drivers/dma/sh/shdma.c
@@ -326,7 +326,7 @@ static int sh_dmae_set_slave(struct shdma_chan *schan,
shdma_chan);
const struct sh_dmae_slave_config *cfg = dmae_find_slave(sh_chan, 
slave_id);
if (!cfg)
-   return -ENODEV;
+   return -ENXIO;
 
if (!try)
sh_chan->config = cfg;
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [net-next RFC] pktgen: don't wait for the device who doesn't free skb immediately after sent

2012-11-27 Thread Jason Wang


On 11/28/2012 12:49 AM, Stephen Hemminger wrote:

On Tue, 27 Nov 2012 14:45:13 +0800
Jason Wang  wrote:


On 11/27/2012 01:37 AM, Stephen Hemminger wrote:

On Mon, 26 Nov 2012 15:56:52 +0800
Jason Wang  wrote:


Some devices do not free the old tx skbs immediately after they have been sent
(usually in tx interrupt). One such example is virtio-net which optimizes for
virt and only frees the possible old tx skbs during the next packet sending. This
would lead the pktgen to wait forever in the refcount of the skb if no other
packet will be sent afterwards.

Solving this issue by introducing a new flag IFF_TX_SKB_FREE_DELAY which could
notify the pktgen that the device does not free skb immediately after it has
been sent and let it not to wait for the refcount to be one.

Signed-off-by: Jason Wang 

Another alternative would be using skb_orphan() and skb->destructor.
There are other cases where skb's are not freed right away.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Hi Stephen:

Do you mean registering a skb->destructor for pktgen then set and check
bits in skb->tx_flag?

Yes. Register a destructor that does something like update a counter (number of 
packets pending),
then just spin while number of packets pending is over threshold.
--


Not sure this is the best method, since pktgen was used to test the tx
process of the device driver and NIC. If we use skb_orphan(), we would
miss the test of the tx completion part.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] ARM: mmp: select PINCTRL for ARCH_MMP

2012-11-27 Thread Axel Lin

This makes PINCTRL related config options visible.
Otherwise there is no way to build pinctrl drivers for MMP2, PXA168 and PXA910.

Signed-off-by: Axel Lin 
---
This patch was sent on https://lkml.org/lkml/2012/11/12/10
Resend to Haojian's correct email address.

 arch/arm/Kconfig |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1fd4208..2f8bfd4 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -591,6 +591,7 @@ config ARCH_MMP
select GPIO_PXA
select IRQ_DOMAIN
select NEED_MACH_GPIO_H
+   select PINCTRL
select PLAT_PXA
select SPARSE_IRQ
help
-- 
1.7.9.5



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch -mm 4/4] docs: Add documentation about /proc//fdinfo/ output

2012-11-27 Thread Cyrill Gorcunov

On Tue, Nov 27, 2012 at 02:47:54PM -0800, Andrew Morton wrote:
> On Fri, 23 Nov 2012 01:15:26 +0400
> Cyrill Gorcunov  wrote:
> 
> >  Documentation/filesystems/proc.txt |   81 
> > +
> 
> Looks good to me.  Here's a small tune-up:

Thanks a lot, Andrew!

Cyrill
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RESEND] pinctrl: Drop selecting PINCONF for MMP2, PXA168 and PXA910

2012-11-27 Thread Axel Lin

These drivers do not need to select PINCONF.

Signed-off-by: Axel Lin 
---
This patch was sent on https://lkml.org/lkml/2012/11/12/12.
Resend to Haojian's correct email address.

 drivers/pinctrl/Kconfig |3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/pinctrl/Kconfig b/drivers/pinctrl/Kconfig
index f06a073..c31aeb0 100644
--- a/drivers/pinctrl/Kconfig
+++ b/drivers/pinctrl/Kconfig
@@ -96,7 +96,6 @@ config PINCTRL_MMP2
bool "MMP2 pin controller driver"
depends on ARCH_MMP
select PINCTRL_PXA3xx
-   select PINCONF
 
 config PINCTRL_MXS
bool
@@ -133,13 +132,11 @@ config PINCTRL_PXA168
bool "PXA168 pin controller driver"
depends on ARCH_MMP
select PINCTRL_PXA3xx
-   select PINCONF
 
 config PINCTRL_PXA910
bool "PXA910 pin controller driver"
depends on ARCH_MMP
select PINCTRL_PXA3xx
-   select PINCONF
 
 config PINCTRL_SINGLE
tristate "One-register-per-pin type device tree based pinctrl driver"
-- 
1.7.9.5



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Tang Chen


Hi Bob, Liu Jiang,

About CMA, could you give me more info ?
Thanks for your patent and nice advice. :)


1) I saw the following on http://lwn.net/Articles/447405/:

The "CMA" type is sticky; pages which are marked as being for CMA
should never have their migration type changed by the kernel.

As Wen said, we now support a user interface to change movable memory
into kernel memory. But seeing from above, the memory specified as
CMA will not be able to be changed, right ?  If so, I don't think
using CMA is a good idea.


2) Is CMA just implemented on ARM platform ?  I found the following in
kernel-parameters.txt.

cma=nn[MG]  [ARM,KNL]
Sets the size of kernel global memory area for contiguous
memory allocations. For more information, see
include/linux/dma-contiguous.h

We are developing on x86. Could we use it ?


3) Is CMA just used for DMA ? I am a little confused here. :)
I found the main code of CMA is implemented in dma-contiguous.c.


4) The boot options cma=xxx and movablecore_map=xxx have different
meanings for user. Reusing CMA could make user confused, I'm afraid.

And, even if we reuse "cma=" option, we still need to do the work
in patch 3~5, right ?


Thanks. :)



On 11/28/2012 12:08 PM, Jiang Liu wrote:

On 2012-11-28 11:24, Bob Liu wrote:

On Tue, Nov 27, 2012 at 8:49 PM, Tang Chen  wrote:

On 11/27/2012 08:09 PM, Bob Liu wrote:


On Tue, Nov 27, 2012 at 4:29 PM, Tang Chen
wrote:


Hi Liu,


This feature is used in memory hotplug.

In order to implement a whole node hotplug, we need to make sure the
node contains no kernel memory, because memory used by kernel could
not be migrated. (Since the kernel memory is directly mapped,
VA = PA + __PAGE_OFFSET. So the physical address could not be changed.)

User could specify all the memory on a node to be movable, so that the
node could be hot-removed.



Thank you for your explanation. It's reasonable.

But i think it's a bit duplicated with CMA, i'm not sure but maybe we
can combine it with CMA which already in mainline?


Hi Liu,

Thanks for your advice. :)

CMA is Contiguous Memory Allocator, right?  What I'm trying to do is
controlling where is the start of ZONE_MOVABLE of each node. Could
CMA do this job ?


cma will not control the start of ZONE_MOVABLE of each node, but it
can declare a memory that always movable
and all non movable allocate request will not happen on that area.

Currently cma use a boot parameter "cma=" to declare a memory size
that always movable.
I think it might fulfill your requirement if extending the boot
parameter with a start address.

more info at http://lwn.net/Articles/468044/


And also, after a short investigation, CMA seems need to base on
memblock. But we need to limit memblock not to allocate memory on
ZONE_MOVABLE. As a result, we need to know the ranges before memblock
could be used. I'm afraid we still need an approach to get the ranges,
such as a boot option, or from static ACPI tables such as SRAT/MPST.



Yes, it's based on memblock and with boot option.
In setup_arch32()
 dma_contiguous_reserve(0);   =>  will declare a cma area using
memblock_reserve()


I don't know much about CMA for now. So if you have any better idea,
please share with us, thanks. :)


My idea is to reuse cma like the patch below (not even compiled) and boot with
"cma=size@start_address".
I don't know whether it can work and whether it is suitable for your
requirement; if not, forgive me for the noise.

diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 612afcc..564962a 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -59,11 +59,18 @@ struct cma *dma_contiguous_default_area;
   */
  static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
  static long size_cmdline = -1;
+static long cma_start_cmdline = -1;

  static int __init early_cma(char *p)
  {
+   char *oldp;
 pr_debug("%s(%s)\n", __func__, p);
+   oldp = p;
 size_cmdline = memparse(p,);
+
+   if (*p == '@')
+   cma_start_cmdline = memparse(p+1,);
+   printk("cma start:0x%x, size: 0x%x\n", size_cmdline, cma_start_cmdline);
 return 0;
  }
  early_param("cma", early_cma);
@@ -127,8 +134,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
 if (selected_size) {
 pr_debug("%s: reserving %ld MiB for global area\n", __func__,
  selected_size / SZ_1M);
-
-   dma_declare_contiguous(NULL, selected_size, 0, limit);
+   if (cma_size_cmdline != -1)
+   dma_declare_contiguous(NULL, selected_size,
cma_start_cmdline, limit);
+   else
+   dma_declare_contiguous(NULL, selected_size, 0, limit);
 }
  };

Seems a good idea to reserve memory by reusing CMA logic, though need more
investigation here. One of CMA goal is to ensure pages in CMA are really
movable, and this patchset tries to achieve the same goal at a

[patch] workqueue: exit rescuer_thread() as TASK_RUNNING

2012-11-27 Thread Mike Galbraith


A rescue thread exiting TASK_INTERRUPTIBLE can lead to a task scheduling
off, never to be seen again.  In the case where this occurred, an exiting
thread hit reiserfs homebrew conditional resched while holding a mutex,
bringing the box to its knees.

PID: 18105  TASK: 8807fd412180  CPU: 5   COMMAND: "kdmflush"
 #0 [8808157e7670] schedule at 8143f489
 #1 [8808157e77b8] reiserfs_get_block at a038ab2d [reiserfs]
 #2 [8808157e79a8] __block_write_begin at 8117fb14
 #3 [8808157e7a98] reiserfs_write_begin at a0388695 [reiserfs]
 #4 [8808157e7ad8] generic_perform_write at 810ee9e2
 #5 [8808157e7b58] generic_file_buffered_write at 810eeb41
 #6 [8808157e7ba8] __generic_file_aio_write at 810f1a3a
 #7 [8808157e7c58] generic_file_aio_write at 810f1c88
 #8 [8808157e7cc8] do_sync_write at 8114f850
 #9 [8808157e7dd8] do_acct_process at 810a268f
#10 [8808157e7e78] acct_process at 810a27ba
#11 [8808157e7e98] do_exit at 8105e29a
#12 [8808157e7ee8] kthread at 8107afee
#13 [8808157e7f48] kernel_thread_helper at 8144a5c4
[exception RIP: kernel_thread_helper]
RIP: 8144a5c0  RSP: 8808157e7f58  RFLAGS: 0202
RAX:   RBX:   RCX: 
RDX:   RSI: 8107af60  RDI: 8803ee491d18
RBP:    R8:    R9: 
R10:   R11:   R12: 
R13:   R14:   R15: 
ORIG_RAX:   CS: 0010  SS: 0018

Signed-off-by: Mike Galbraith 
Cc: sta...@vger.kernel.org

 kernel/workqueue.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 042d221..ac25db1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2407,8 +2407,10 @@ static int rescuer_thread(void *__wq)
 repeat:
set_current_state(TASK_INTERRUPTIBLE);
 
-   if (kthread_should_stop())
+   if (kthread_should_stop()) {
+   __set_current_state(TASK_RUNNING);
return 0;
+   }
 
/*
 * See whether any cpu is asking for help.  Unbounded


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the clk tree with the arm-soc tree

2012-11-27 Thread Stephen Rothwell

Hi Mike,

Today's linux-next merge of the clk tree got a conflict in
arch/arm/mach-ux500/cpu-db8500.c between commit e13316d60658 ("ARM:
ux500: Rename dbx500 cpufreq code to be more generic") from the arm-soc
tree and commit 50545e1d237b ("ARM: ux500: Remove cpufreq platform
device") from the clk tree.

I fixed it up (I used the clk tree version since that removed the code
that the arm-soc tree version modified) and can carry the fix as
necessary (no action is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpXimhBsoREF.pgp
Description: PGP signature

Re: [PATCH] ARM: EXYNOS: Avoid early use of of_machine_is_compatible()

2012-11-27 Thread Olof Johansson

On Wed, Nov 28, 2012 at 02:23:09PM +0900, Kukjin Kim wrote:
> Olof Johansson wrote:
> > 
> > On Tue, Nov 27, 2012 at 2:27 PM, Kukjin Kim  wrote:
> > > On 11/28/12 07:11, Olof Johansson wrote:
> > >>
> > >> On Tue, Nov 27, 2012 at 11:53 AM, Doug Anderson
> > >> wrote:
> > >>>
> > >>> The recent commit "ARM: EXYNOS: add support for EXYNOS5440 SoC" broke
> > >>> support for exynos5250 because of_machine_is_compatible() was used too
> > >>> early in the boot process.  It also probably meant that the exynos5440
> > >>> failed to use the proper iotable.  Switch to use
> > >>> of_flat_dt_is_compatible() in both of these cases.
> > >>>
> > >>> The failure I was seeing in exynos5250 because of this was:
> > >>>Division by zero in kernel.
> > >>>[<80015ed4>] (unwind_backtrace+0x0/0xec) from [<8045c7a4>]
> > >>> (dump_stack+0x20/0x24)
> > >>>[<8045c7a4>] (dump_stack+0x20/0x24) from [<80012990>]
> > >>> (__div0+0x20/0x28)
> > >>>[<80012990>] (__div0+0x20/0x28) from [<8021ab04>]
> (Ldiv0_64+0x8/0x18)
> > >>>[<8021ab04>] (Ldiv0_64+0x8/0x18) from [<80068560>]
> > >>> (__clocksource_updatefreq_scale+0x54/0x134)
> > >>>[<80068560>] (__clocksource_updatefreq_scale+0x54/0x134) from
> > >>> [<8006865c>] (__clocksource_register_scale+0x1c/0x54)
> > >>>[<8006865c>] (__clocksource_register_scale+0x1c/0x54) from
> > >>> [<80612a18>] (exynos_timer_init+0x100/0x1e8)
> > >>>[<80612a18>] (exynos_timer_init+0x100/0x1e8) from [<8060d184>]
> > >>> (time_init+0x28/0x38)
> > >>>[<8060d184>] (time_init+0x28/0x38) from [<8060a754>]
> > >>> (start_kernel+0x1e0/0x3c8)
> > >>>[<8060a754>] (start_kernel+0x1e0/0x3c8) from [<40008078>]
> > (0x40008078)
> > >>>
> > >>> Signed-off-by: Doug Anderson
> > >>
> > >>
> > >> Thanks Doug.
> > >>
> > >> Kukjin, I'll apply this directly on top of the previous branch in
> > >> arm-soc, if that's OK with you.
> > >>
> > > Sure, go ahead with my ack if you want,
> > >
> > > Acked-by: Kukjin Kim 
> > >
> > > Note, actually there was a fix which uses soc_is_exynos5440() in my
> > local
> > > :-) I'm not sure which one is better at this moment, but I'm OK on this.
> > 
> > Ok, applied. Thanks all.
> > 
> Olof, just note, happens build error with exynos4_defconfig because of
> non-DT.

Ick, thanks for catching that.

> 
> Following can resolve it or we should create null function for
> of_get_flat_dt_root() and of_flat_dt_is_compatible()...
> 
> 8<---
> From: Kukjin Kim 
> Subject: ARM: EXYNOS: fix a build error with non-DT for exynos4
> 
> This fixes following in case of non-DT:
> arch/arm/mach-exynos/common.c: In function 'exynos_init_io':
> arch/arm/mach-exynos/common.c:339: error: implicit declaration of function
> 'of_get_flat_dt_root'
> arch/arm/mach-exynos/common.c:342: error: implicit declaration of function
> 'of_flat_dt_is_compatible'
> make[1]: *** [arch/arm/mach-exynos/common.o] Error 1
> 
> Signed-off-by: Kukjin Kim 
> ---
> diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c
> index b919f5f..2110091 100644
> --- a/arch/arm/mach-exynos/common.c
> +++ b/arch/arm/mach-exynos/common.c
> @@ -336,12 +336,14 @@ void __init exynos_init_late(void)
> 
>  void __init exynos_init_io(struct map_desc *mach_desc, int size)
>  {
> +#ifdef CONFIG_OF
>   unsigned long root = of_get_flat_dt_root();
> 
>   /* initialize the io descriptors we need for initialization */
>   if (of_flat_dt_is_compatible(root, "samsung,exynos5440"))
>   iotable_init(exynos5440_iodesc,
> ARRAY_SIZE(exynos5440_iodesc));
>   else
> +#endif
>   iotable_init(exynos_iodesc, ARRAY_SIZE(exynos_iodesc));

I really don't like splitting an if/else with an ifdef like this, it's fragile
code and can be hard to follow.

There's also a second build error with exynos_defconfig in the
exynos5-dt.c board file due to a missing include. Teaches me to just apply
patches without trying to build. :(

I'll squash this into Doug's original patch, if that's OK?


diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c
index 796e0c9..77e7c5b 100644
--- a/arch/arm/mach-exynos/common.c
+++ b/arch/arm/mach-exynos/common.c
@@ -122,6 +122,7 @@ static struct map_desc exynos_iodesc[] __initdata = {
},
 };
 
+#ifdef CONFIG_ARCH_EXYNOS5
 static struct map_desc exynos5440_iodesc[] __initdata = {
{
.virtual= (unsigned long)S5P_VA_CHIPID,
@@ -130,6 +131,7 @@ static struct map_desc exynos5440_iodesc[] __initdata = {
.type   = MT_DEVICE,
},
 };
+#endif
 
 static struct map_desc exynos4_iodesc[] __initdata = {
{
@@ -347,13 +349,19 @@ void __init exynos_init_late(void)
 
 void __init exynos_init_io(struct map_desc *mach_desc, int size)
 {
+   struct map_desc *iodesc = exynos_iodesc;
+   int iodesc_sz = ARRAY_SIZE(exynos_iodesc);
+#ifdef CONFIG_OF
unsigned long root = of_get_flat_dt_root();
 
/* initialize the io descriptors

Re: BUG: scheduling while atomic: ifup-bonding/3711/0x00000002 -- V3.6.7

2012-11-27 Thread Cong Wang

Cc netdev...

On Wed, Nov 28, 2012 at 4:37 AM, Linda Walsh  wrote:
>
>
> Is this a known problem / bug, or should I file a bug on it?  It doesn't
> cause a complete failure, and it happens multiple times (~28 times
> in 2.5 days?... so maybe 10x/day?)  about 8 start with ifup, and the rest
> start @ kworker -- both happen upon enabling the bonding driver
> on a 10Gb dual port adapter (trying to get 1 20Gb adapter).
>
> The 2 traceback types (ifup-bonding + kworker) follow:


Does this quick fix help?

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5f5b69f..4a4d9eb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1785,7 +1785,9 @@ int bond_enslave(struct net_device *bond_dev,
struct net_device *slave_dev)
new_slave->link == BOND_LINK_DOWN ? "DOWN" :
(new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));

+   read_unlock(>lock);
bond_update_speed_duplex(new_slave);
+   read_lock(>lock);

if (USES_PRIMARY(bond->params.mode) && bond->params.primary[0]) {
/* if there is a primary slave, remember it */

Thanks!

>
>
> - ifup-bonding traceback:
>
> [  229.208603] bonding: bond0: Setting MII monitoring interval to 100.
> [  229.222336] bonding: bond0: Adding slave p2p1.
> [  229.685599] BUG: scheduling while atomic: ifup-bonding/3711/0x0002
> [  229.692166] 4 locks held by ifup-bonding/3711:
> [  229.696645]  #0:  (>mutex){..}, at: []
> sysfs_write_file+0x3f/0x150
> [  229.705721]  #1:  (s_active#75){..}, at: []
> sysfs_write_file+0xbb/0x150
> [  229.714538]  #2:  (rtnl_mutex){..}, at: []
> rtnl_trylock+0x10/0x20
> [  229.722772]  #3:  (>lock){..}, at: []
> bond_enslave+0x4df/0xb50 [bonding]
> [  229.732188] Modules linked in: bonding fan mousedev kvm_intel iTCO_wdt
> iTCO_vendor_support gpio_ich kvm acpi_cpufreq mperf tpm_tis tpm tpm_bios
> processor button
> [  229.747197] Pid: 3711, comm: ifup-bonding Not tainted 3.6.7-Isht-Van #1
> [  229.753843] Call Trace:
> [  229.756333]  [] __schedule_bug+0x5e/0x6c
> [  229.761863]  [] __schedule+0x77c/0x810
> [  229.767214]  [] schedule+0x24/0x70
> [  229.772214]  []
> schedule_hrtimeout_range_clock+0xfc/0x140
> [  229.779210]  [] ? update_rmtp+0x60/0x60
> [  229.784645]  [] ? hrtimer_start_range_ns+0xf/0x20
> [  229.790950]  [] schedule_hrtimeout_range+0xe/0x10
> [  229.797254]  [] usleep_range+0x3b/0x40
> [  229.802611]  [] ixgbe_acquire_swfw_sync_X540+0xbc/0x110
> [  229.809429]  [] ixgbe_read_phy_reg_generic+0x3d/0x120
> [  229.816078]  []
> ixgbe_get_copper_link_capabilities_generic+0x2c/0x60
> [  229.824022]  [] ? bond_enslave+0x4df/0xb50 [bonding]
> [  229.830581]  [] ixgbe_get_settings+0x34/0x2b0
> [  229.836534]  [] __ethtool_get_settings+0x85/0x140
> [  229.842837]  [] bond_update_speed_duplex+0x23/0x60
> [bonding]
> [  229.850092]  [] bond_enslave+0x548/0xb50 [bonding]
> [  229.856478]  [] bonding_store_slaves+0x13f/0x190
> [bonding]
> [  229.863556]  [] dev_attr_store+0x13/0x30
> [  229.869074]  [] sysfs_write_file+0xd4/0x150
> [  229.874856]  [] vfs_write+0xb1/0x180
> [  229.880034]  [] sys_write+0x48/0x90
> [  229.885125]  [] system_call_fastpath+0x16/0x1b
> [  229.891259] BUG: scheduling while atomic: ifup-bonding/3711/0x0002
> [  229.897839] 4 locks held by ifup-bonding/3711:
> [  229.902320]  #0:  (>mutex){..}, at: []
> sysfs_write_file+0x3f/0x150
> [  229.911395]  #1:  (s_active#75){..}, at: []
> sysfs_write_file+0xbb/0x150
> [  229.920212]  #2:  (rtnl_mutex){..}, at: []
> rtnl_trylock+0x10/0x20
> [  229.928449]  #3:  (>lock){..}, at: []
> bond_enslave+0x4df/0xb50 [bonding]
> [  229.937866] Modules linked in: bonding fan mousedev kvm_intel iTCO_wdt
> iTCO_vendor_support gpio_ich kvm acpi_cpufreq mperf tpm_tis tpm tpm_bios
> processor button
> [  229.952904] Pid: 3711, comm: ifup-bonding Tainted: GW
> 3.6.7-Isht-Van #1
> [  229.960507] Call Trace:
> [  229.962997]  [] __schedule_bug+0x5e/0x6c
> [  229.968526]  [] __schedule+0x77c/0x810
> [  229.973875]  [] schedule+0x24/0x70
> [  229.978876]  []
> schedule_hrtimeout_range_clock+0xfc/0x140
> [  229.985871]  [] ? update_rmtp+0x60/0x60
> [  229.991303]  [] ? update_rmtp+0x60/0x60
> [  229.996739]  [] ? hrtimer_start_range_ns+0xf/0x20
> [  230.003040]  [] schedule_hrtimeout_range+0xe/0x10
> [  230.009344]  [] usleep_range+0x3b/0x40
> [  230.014698]  [] ixgbe_release_swfw_sync_X540+0x4e/0x60
> [  230.021435]  [] ixgbe_read_phy_reg_generic+0x101/0x120
> [  230.028171]  []
> ixgbe_get_copper_link_capabilities_generic+0x2c/0x60
> [  230.036117]  [] ? bond_enslave+0x4df/0xb50 [bonding]
> [  230.042677]  [] ixgbe_get_settings+0x34/0x2b0
> [  230.048630]  [] __ethtool_get_settings+0x85/0x140
> [  230.054934]  [] bond_update_speed_duplex+0x23/0x60
> [bonding]
> [  230.062189]  [] bond_enslave+0x548/0xb50 [bonding]
> [  230.068580]  [] bonding_store_slaves+0x13f/0x190
> [bonding]
> [  230.075660]  []

[PATCH 1/2] Input: RMI4 - rework F11 sysfs and debugfs attributes

2012-11-27 Thread Dmitry Torokhov

Avoid rolling our own debugfs operations and use the standard ones instead.
Also switch to using attribute groups to create sysfs attributes.

The max_x and max_y can be retrieved via EVIOCGABS and so the attributes were
moved over to debugfs.

Signed-off-by: Dmitry Torokhov 
---
 drivers/input/rmi4/rmi_f11.c | 918 +--
 include/linux/rmi.h  |  20 +-
 2 files changed, 186 insertions(+), 752 deletions(-)

diff --git a/drivers/input/rmi4/rmi_f11.c b/drivers/input/rmi4/rmi_f11.c
index 8457ab4..717e2d8 100644
--- a/drivers/input/rmi4/rmi_f11.c
+++ b/drivers/input/rmi4/rmi_f11.c
@@ -20,6 +20,7 @@
 #define FUNCTION_DATA f11_data
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -29,12 +30,6 @@
 #include 
 #include "rmi_driver.h"
 
-#ifdef CONFIG_RMI4_DEBUG
-#include 
-#include 
-#include 
-#endif
-
 #define F11_MAX_NUM_OF_SENSORS 8
 #define F11_MAX_NUM_OF_FINGERS 10
 #define F11_MAX_NUM_OF_TOUCH_SHAPES16
@@ -787,19 +782,7 @@ struct f11_2d_data {
  * @input - input device for absolute pointing stream
  * @mouse_input - input device for relative pointing stream.
  * @input_phys - buffer for the absolute phys name for this sensor.
- * @input_mouse_phys - buffer for the relative phys name for this sensor.
- * @debugfs_flip - inverts one or both axes.  Useful in prototyping new
- * systems.
- * @debugfs_flip - coordinate clipping range for one or both axes.  Useful in
- * prototyping new systems.
- * @debugfs_delta_threshold - adjusts motion sensitivity for relative reports
- * and (in reduced reporting mode) absolute reports.  Useful in prototyping new
- * systems.
- * @debugfs_offset - offsets one or both axes.  Useful in prototyping new
- * systems.
- * @debugfs_swap - swaps X and Y axes.  Useful in prototyping new systems.
- * @debugfs_type_a - forces type A behavior.  Useful in bringing up old systems
- * when you're not sure if you've got a Type A or Type B sensor.
+ * @input_phys_mouse - buffer for the relative phys name for this sensor.
  */
 struct f11_2d_sensor {
struct rmi_f11_2d_axis_alignment axis_align;
@@ -811,22 +794,13 @@ struct f11_2d_sensor {
u8 *data_pkt;
int pkt_size;
u8 sensor_index;
-   bool type_a;
+   u32 type_a; /* boolean but debugfs API requires u32 */
enum rmi_f11_sensor_type sensor_type;
struct input_dev *input;
struct input_dev *mouse_input;
struct rmi_function *fn;
char input_phys[NAME_BUFFER_SIZE];
char input_phys_mouse[NAME_BUFFER_SIZE];
-
-#ifdef CONFIG_RMI4_DEBUG
-   struct dentry *debugfs_flip;
-   struct dentry *debugfs_clip;
-   struct dentry *debugfs_delta_threshold;
-   struct dentry *debugfs_offset;
-   struct dentry *debugfs_swap;
-   struct dentry *debugfs_type_a;
-#endif
 };
 
 /** Data pertaining to F11 in general.  For per-sensor data, see struct
@@ -849,10 +823,6 @@ struct f11_data {
struct mutex dev_controls_mutex;
u16 rezero_wait_ms;
struct f11_2d_sensor sensors[F11_MAX_NUM_OF_SENSORS];
-
-#ifdef CONFIG_RMI4_DEBUG
-   struct dentry *debugfs_rezero_wait;
-#endif
 };
 
 enum finger_state_values {
@@ -862,71 +832,56 @@ enum finger_state_values {
F11_RESERVED= 0x03
 };
 
-static ssize_t f11_maxPos_show(struct device *dev,
-struct device_attribute *attr,
-char *buf)
-{
-   struct rmi_function *fn;
-   struct f11_data *data;
-
-   fn = to_rmi_function(dev);
-   data = fn->data;
-
-   return snprintf(buf, PAGE_SIZE, "%u %u\n",
-   data->sensors[0].max_x, data->sensors[0].max_y);
-}
-
-static ssize_t f11_relreport_show(struct device *dev,
-   struct device_attribute *attr,
-   char *buf)
+static ssize_t rmi_f11_relreport_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
 {
-   struct rmi_function *fn;
-   struct f11_data *instance_data;
-
-   fn = to_rmi_function(dev);
-   instance_data = fn->data;
+   struct rmi_function *fn = to_rmi_function(dev);
+   struct f11_data *data = fn->data;
 
return snprintf(buf, PAGE_SIZE, "%u\n",
-   instance_data->
-   sensors[0].axis_align.rel_report_enabled);
+   data->sensors[0].axis_align.rel_report_enabled);
 }
 
-static ssize_t f11_relreport_store(struct device *dev,
-struct device_attribute *attr,
-const char *buf,
-size_t count)
+static ssize_t rmi_f11_relreport_store(struct device *dev,
+  struct device_attribute *attr,
+  const char *buf,
+

[PATCH 2/2] Input: RMI4 - F11 - remove relreport attribute

2012-11-27 Thread Dmitry Torokhov

This data item does not seem to be used anywhere.

Signed-off-by: Dmitry Torokhov 
---
 drivers/input/rmi4/rmi_f11.c | 37 -
 include/linux/rmi.h  |  1 -
 2 files changed, 38 deletions(-)

diff --git a/drivers/input/rmi4/rmi_f11.c b/drivers/input/rmi4/rmi_f11.c
index 717e2d8..da3fd2a 100644
--- a/drivers/input/rmi4/rmi_f11.c
+++ b/drivers/input/rmi4/rmi_f11.c
@@ -832,42 +832,6 @@ enum finger_state_values {
F11_RESERVED= 0x03
 };
 
-static ssize_t rmi_f11_relreport_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
-   struct rmi_function *fn = to_rmi_function(dev);
-   struct f11_data *data = fn->data;
-
-   return snprintf(buf, PAGE_SIZE, "%u\n",
-   data->sensors[0].axis_align.rel_report_enabled);
-}
-
-static ssize_t rmi_f11_relreport_store(struct device *dev,
-  struct device_attribute *attr,
-  const char *buf,
-  size_t count)
-{
-   struct rmi_function *fn = to_rmi_function(dev);
-   struct f11_data *data = fn->data;
-   unsigned int new_value;
-   int error;
-
-   error = kstrtouint(buf, 0, &new_value);
-   if (error)
-   return error;
-
-   if (new_value > 1)
-   return -ERANGE;
-
-   data->sensors[0].axis_align.rel_report_enabled = new_value;
-
-   return count;
-}
-
-static DEVICE_ATTR(relreport, RMI_RW_ATTR,
-  rmi_f11_relreport_show, rmi_f11_relreport_store);
-
 static ssize_t rmi_f11_rezero_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
@@ -906,7 +870,6 @@ static ssize_t rmi_f11_rezero_store(struct device *dev,
 static DEVICE_ATTR(rezero, RMI_WO_ATTR, NULL, rmi_f11_rezero_store);
 
 static struct attribute *rmi_f11_attrs[] = {
-   &dev_attr_relreport.attr,
&dev_attr_rezero.attr,
NULL
 };
diff --git a/include/linux/rmi.h b/include/linux/rmi.h
index d4e1438..3969e61 100644
--- a/include/linux/rmi.h
+++ b/include/linux/rmi.h
@@ -73,7 +73,6 @@ enum rmi_attn_polarity {
  *   automatically enabled for this sensor.
  */
 struct rmi_f11_2d_axis_alignment {
-   bool rel_report_enabled;
u32 swap_axes;  /* boolean, but u32 is needed by debugfs API */
u32 flip_x; /* boolean */
u32 flip_y; /* boolean */
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] KVM: MMU: lazily drop large spte

2012-11-27 Thread Xiao Guangrong

On 11/18/2012 11:00 AM, Marcelo Tosatti wrote:
map gfn 4?  See corrected step 7 above.
>>
>> Ah, this is a real bug, and unfortunately, it exists in current
>> code. I will make a separate patchset to fix it. Thank you, Marcelo!
> 
> Is it? Hum..
> 
> Anyway, it would be great if you can write a testcase (should be similar
> in size to rmap_chain).

Marcelo, is this patch acceptable?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH] ARM: EXYNOS: Avoid early use of of_machine_is_compatible()

2012-11-27 Thread Kukjin Kim

Olof Johansson wrote:
> 
> On Tue, Nov 27, 2012 at 2:27 PM, Kukjin Kim  wrote:
> > On 11/28/12 07:11, Olof Johansson wrote:
> >>
> >> On Tue, Nov 27, 2012 at 11:53 AM, Doug Anderson
> >> wrote:
> >>>
> >>> The recent commit "ARM: EXYNOS: add support for EXYNOS5440 SoC" broke
> >>> support for exynos5250 because of_machine_is_compatible() was used too
> >>> early in the boot process.  It also probably meant that the exynos5440
> >>> failed to use the proper iotable.  Switch to use
> >>> of_flat_dt_is_compatible() in both of these cases.
> >>>
> >>> The failure I was seeing in exynos5250 because of this was:
> >>>Division by zero in kernel.
> >>>[<80015ed4>] (unwind_backtrace+0x0/0xec) from [<8045c7a4>]
> >>> (dump_stack+0x20/0x24)
> >>>[<8045c7a4>] (dump_stack+0x20/0x24) from [<80012990>]
> >>> (__div0+0x20/0x28)
> >>>[<80012990>] (__div0+0x20/0x28) from [<8021ab04>]
(Ldiv0_64+0x8/0x18)
> >>>[<8021ab04>] (Ldiv0_64+0x8/0x18) from [<80068560>]
> >>> (__clocksource_updatefreq_scale+0x54/0x134)
> >>>[<80068560>] (__clocksource_updatefreq_scale+0x54/0x134) from
> >>> [<8006865c>] (__clocksource_register_scale+0x1c/0x54)
> >>>[<8006865c>] (__clocksource_register_scale+0x1c/0x54) from
> >>> [<80612a18>] (exynos_timer_init+0x100/0x1e8)
> >>>[<80612a18>] (exynos_timer_init+0x100/0x1e8) from [<8060d184>]
> >>> (time_init+0x28/0x38)
> >>>[<8060d184>] (time_init+0x28/0x38) from [<8060a754>]
> >>> (start_kernel+0x1e0/0x3c8)
> >>>[<8060a754>] (start_kernel+0x1e0/0x3c8) from [<40008078>]
> (0x40008078)
> >>>
> >>> Signed-off-by: Doug Anderson
> >>
> >>
> >> Thanks Doug.
> >>
> >> Kukjin, I'll apply this directly on top of the previous branch in
> >> arm-soc, if that's OK with you.
> >>
> > Sure, go ahead with my ack if you want,
> >
> > Acked-by: Kukjin Kim 
> >
> > Note, actually there was a fix which uses soc_is_exynos5440() in my
> local
> > :-) I'm not sure which one is better at this moment, but I'm OK on this.
> 
> Ok, applied. Thanks all.
> 
Olof, just a note: a build error happens with exynos4_defconfig because it is
non-DT.

The following can resolve it, or we should create null functions for
of_get_flat_dt_root() and of_flat_dt_is_compatible()...

8<---
From: Kukjin Kim 
Subject: ARM: EXYNOS: fix a build error with non-DT for exynos4

This fixes following in case of non-DT:
arch/arm/mach-exynos/common.c: In function 'exynos_init_io':
arch/arm/mach-exynos/common.c:339: error: implicit declaration of function
'of_get_flat_dt_root'
arch/arm/mach-exynos/common.c:342: error: implicit declaration of function
'of_flat_dt_is_compatible'
make[1]: *** [arch/arm/mach-exynos/common.o] Error 1

Signed-off-by: Kukjin Kim 
---
diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c
index b919f5f..2110091 100644
--- a/arch/arm/mach-exynos/common.c
+++ b/arch/arm/mach-exynos/common.c
@@ -336,12 +336,14 @@ void __init exynos_init_late(void)

 void __init exynos_init_io(struct map_desc *mach_desc, int size)
 {
+#ifdef CONFIG_OF
unsigned long root = of_get_flat_dt_root();

/* initialize the io descriptors we need for initialization */
if (of_flat_dt_is_compatible(root, "samsung,exynos5440"))
iotable_init(exynos5440_iodesc,
ARRAY_SIZE(exynos5440_iodesc));
else
+#endif
iotable_init(exynos_iodesc, ARRAY_SIZE(exynos_iodesc));

if (mach_desc)
8<---

Thanks.

Best regards,
Kgene.
--
Kukjin Kim , Senior Engineer,
SW Solution Development Team, Samsung Electronics Co., Ltd.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Lockdep complain for zram

2012-11-27 Thread Minchan Kim

On Fri, Nov 23, 2012 at 03:46:31PM +0100, Jerome Marchand wrote:
> On 11/23/2012 12:34 AM, Minchan Kim wrote:
> > On Thu, Nov 22, 2012 at 12:13:24PM +0100, Jerome Marchand wrote:
> >> On 11/21/2012 09:37 AM, Minchan Kim wrote:
> >>> Hi alls,
> >>>
> >>> Today, I saw below complain of lockdep.
> >>> As a matter of fact, I knew it long time ago but forgot that.
> >>> The reason lockdep complains is that now zram uses GFP_KERNEL
> >>> in reclaim path(ex, __zram_make_request) :(
> >>> I can fix it via replacing GFP_KERNEL with GFP_NOIO.
> >>> But more big problem is vzalloc in zram_init_device which calls 
> >>> GFP_KERNEL.
> >>> Of course, I can change it with __vmalloc which can receive gfp_t.
> >>> But still we have a problem. Although __vmalloc can handle gfp_t, it calls
> >>> allocation of GFP_KERNEL. That's why I sent the patch.
> >>> https://lkml.org/lkml/2012/4/23/77
> >>> Since then, I forgot it, saw the bug today and popped the question again.
> >>>
> >>> Yes. Fundamental problem is utter crap API vmalloc.
> >>> If we can fix it, everyone would be happy. But life isn't simple like 
> >>> seeing
> >>> my thread of the patch.
> >>>
> >>> So next option is to move zram_init_device into setting disksize time.
> >>> But it makes unnecessary metadata waste until zram is used really(That's 
> >>> why
> >>> Nitin move zram_init_device from disksize setting time to make_request) 
> >>> and
> >>> it makes user should set the disksize before using, which are behavior 
> >>> change.
> >>>
> >>> I would like to clean up this issue before promoting because it might 
> >>> change
> >>> usage behavior.
> >>>
> >>> Do you have any idea?
> >>
> >> This is a false positive due to the memory allocation in
> >> zram_init_device() called from zram_make_request(). It appears to
> >> lockdep that the allocation might trigger a request on the device that
> >> would try to take init_lock again, but in fact it doesn't. The device
> >> is not initialized yet, even less swapped on.
> > 
> > That's not a only swap case.
> > Let's think following usecase.
> > 
> > 1) Booting
> > 2) echo $((DISKSIZE)) > /sys/block/zram0/disksize
> > 3) dd if=/dev/zero of=/dev/zram0 bs=4K count=1
> > 4) Written 4K page(page-A) is still page cache and isn't submitted
> >to zram block device.
> > 5) Memory pressure happen by some memory hogger.
> > 6) VM start to reclaim and write page-A to zram0.
> > 7) zram_init_device is called at last.
> > 8) allocate GFP_KERNEL in zram_init_device
> > 9) goto reclaim path again.
> > 10) deadlock.
> > 
> > So I think it's not false positive.
> 
> I guess you're right. That's a scenario I haven't imagined. At any rate, my
> patch fixes that.
> 
> > Even if it is, I think lock split isn't a good idea to just avoid
> > lockdep warn. It makes code unnecessary complicated and it would be more
> > error-prone. Let's not add another lock without performance trouble report
> > by the lock.
> > 
> > As I discussed with Nitin in this thread, lazy initialization don't have
> > much point and disksize setting option isn't consistent for user behavior.
> > And I expect Nitin will send patch "diet of table" soonish.
> > 
> > So just moving the initialization part from reclaim context to process's one
> > is simple and clear solution, I believe.
> 
> Although that would avoid deadlocks (I guess, I'm not sure anymore...), it
> won't stop lockdep from complaining. It still makes an allocation while

Argh, I sent it by mistake anyway, It's false-positive by this patch now.
Anyway we need more patch to shut lockdep up. I just sent patchset.

> holding a lock that is also taken in a reclaim context.
> Anyway, I like the idea to removes the lazy initialization. It makes things
> more complicated without any actual advantage.

Thanks for the review, Jerome.

> 
> Jerome
> 
> > 
> >>
> >> The following (quickly tested) patch should prevent lockdep complain.  
> >>
> >> Jerome
> >>
> >> ---
> >> >From ebb3514c4ee18276da7c5ca08025991b493ac204 Mon Sep 17 00:00:00 2001
> >> From: Jerome Marchand 
> >> Date: Thu, 22 Nov 2012 09:07:40 +0100
> >> Subject: [PATCH] staging: zram: Avoid lockdep warning
> >>
> >> zram triggers a lockdep warning. The cause of it is the call to
> >> zram_init_device() from zram_make_request(). The memory allocation in
> >> zram_init_device() could start a memory reclaim which in turn could
> >> cause swapout and (as it appears to lockdep) a call to
> >> zram_make_request(). However this is a false positive: an
> >> uninitialized device can't be used as swap.
> >> A solution is to split init_lock in two lock. One mutex that protects
> >> init, reset and size setting and a rw_semaphore that protects requests
> >> and reset. Thus init and request would be protected by different locks
> >> and lockdep will be happy.
> >>
> >> Signed-off-by: Jerome Marchand 
> >> ---
> >>  drivers/staging/zram/zram_drv.c   |   41 
> >> +++-
> >>  drivers/staging/zram/zram_drv.h   |   16 ++---
>

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Jiang Liu

On 2012-11-28 13:21, Wen Congyang wrote:
> At 11/28/2012 12:01 PM, Jiang Liu Wrote:
>> On 2012-11-28 11:47, Tang Chen wrote:
>>> On 11/27/2012 11:10 AM, wujianguo wrote:

>>>> Hi Tang,
>>>> DMA address can't be set as movable, if some one boot kernel with
>>>> movablecore_map=4G@0xa0 or other memory region that contains DMA 
>>>> address,
>>>> system maybe boot failed. Should this case be handled or mentioned
>>>> in the change log and kernel-parameters.txt?
>>>
>>> Hi Wu,
>>>
>>> I think we can use MAX_DMA_PFN and MAX_DMA32_PFN to prevent setting DMA
>>> address as movable. Just ignore the address lower than them, and set
>>> the rest as movable. How do you think ?
>>>
>>> And, since we cannot figure out the minimum of memory kernel needs, I
>>> think for now, we can just add some warning into kernel-parameters.txt.
>>>
>>> Thanks. :)
>> On one other OS, there is a mechanism to dynamically convert pages from
>> movable zones into normal zones.
> 
> The OS auto does it? Or the user converts it?
> 
> We can convert pages from movable zones into normal zones by the following
> interface:
> echo online_kernel >/sys/devices/system/memory/memoryX/state
> 
> We have posted a patchset to implement it, and it is in mm tree now.
OS automatically converts it, no manual operations needed.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 RFC 2/2] kvm: Handle yield_to failure return code for potential undercommit case

2012-11-27 Thread Raghavendra K T


On 11/28/2012 06:42 AM, Marcelo Tosatti wrote:


Don't understand the reasoning behind why 3 is a good choice.


Here is where I came from. (explaining from scratch for completeness, 
forgive me :))

In moderate overcommits, we can falsely exit from ple handler even when
we have preempted task of same VM waiting on other cpus. To reduce this
problem, we try few times before exiting.
The problem boils down to:
what is the probability that we exit ple handler even when we have more
than 1 task in other cpus. Theoretical worst case should be around 1.5x
overcommit (As also pointed by Andrew Theurer). [But practical
worstcase may be around 2x,3x overcommits as indicated by the results
for the patch series]

So if p is the probability of finding rq length one on a particular cpu,
and if we do n tries, then probability of exiting ple handler is:

 p^(n+1) [ because we would have come across one source with rq length
1 and n target cpu rqs  with length 1 ]

so
num tries: probability of aborting ple handler (1.5x overcommit)
 1 1/4
 2 1/8
 3 1/16

We can reduce this probability with more tries, but the problem is
the overhead.
Also, If we have tried three times that means we would have iterated
over 3 good eligible vcpus along with many non-eligible candidates. In
worst case if we iterate all the vcpus, we reduce 1x performance and
overcommit performance get hit. [ as in results ].

I have tried num_tries = 1,2,3 and n already ( not 4 yet). So I
concluded 3 is enough.

Infact I have also run kernbench and hackbench which are giving 5-20%
improvement.

[ As a side note , I also thought how about having num_tries = f(n) =
ceil ( log(num_online_cpus)/2 ) But I thought calculation is too much
overhead and also there is no point in probably making it dependent on
online cpus ]

Please let me know if you are happy with this rationale/ or correct me
if you foresee some problem. (Infact Avi, Rik's concern about false
exiting made me arrive at 'try' logic which I did not have earlier).

I am currently trying out the result for 1.5x overcommit will post the
result.



On Mon, Nov 26, 2012 at 05:38:04PM +0530, Raghavendra K T wrote:

From: Raghavendra K T 

yield_to returns -ESRCH, When source and target of yield_to
run queue length is one. When we see three successive failures of
yield_to we assume we are in potential undercommit case and abort
from PLE handler.
The assumption is backed by low probability of wrong decision
for even worst case scenarios such as average runqueue length
between 1 and 2.

note that we do not update last boosted vcpu in failure cases.
Thank Avi for raising question on aborting after first fail from yield_to.

Reviewed-by: Srikar Dronamraju 
Signed-off-by: Raghavendra K T 

[...]

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] zram: Prevent use of unmapped buffer

2012-11-27 Thread Minchan Kim

Hi Nitin,

On Mon, Nov 26, 2012 at 11:26:07PM -0800, Nitin Gupta wrote:
> The commit c8f2f0db1 ("zram: Fix handling of incompressible pages")
> introduced a bug which caused a kunmap()'ed buffer to be used in case

I got confused by the description. :(
The description is not right.
The problem is accessing freed memory, not accessing a kunmap()'ed buffer.

partial I/O write.

1. uncmem = kmalloc
2. zram_decompress_page(uncmem)
3. memcpy(uncmem, user_mem)
4. lzo1x_1_compress(uncmem)
5. kfree(uncmem)
6. src = uncmem
7. memcpy(cmem, src, clen) <- HIT

> of partial writes where the data was found to be incompressible.
> 
> This fixes bug 50081:
> https://bugzilla.kernel.org/show_bug.cgi?id=50081
> 
> Signed-off-by: Nitin Gupta 
> Reported-by: Mihail Kasadjikov 
> Reported-by: Tomas M 

Good catch! Shame on me. I should have reviewed more carefully. :(
Please resend it with a revised description and title.
I will rebase my patchset just sent on top of this bug fix patch.

P.S) Sigh, Now code isn't clean due to partial read/write path handling.
 IMHO, sooner or later, we need refactoring.

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Wen Congyang

At 11/28/2012 12:01 PM, Jiang Liu Wrote:
> On 2012-11-28 11:47, Tang Chen wrote:
>> On 11/27/2012 11:10 AM, wujianguo wrote:
>>>
>>> Hi Tang,
>>> DMA address can't be set as movable, if some one boot kernel with
>>> movablecore_map=4G@0xa0 or other memory region that contains DMA 
>>> address,
>>> system maybe boot failed. Should this case be handled or mentioned
>>> in the change log and kernel-parameters.txt?
>>
>> Hi Wu,
>>
>> I think we can use MAX_DMA_PFN and MAX_DMA32_PFN to prevent setting DMA
>> address as movable. Just ignore the address lower than them, and set
>> the rest as movable. How do you think ?
>>
>> And, since we cannot figure out the minimum of memory kernel needs, I
>> think for now, we can just add some warning into kernel-parameters.txt.
>>
>> Thanks. :)
> On one other OS, there is a mechanism to dynamically convert pages from
> movable zones into normal zones.

The OS auto does it? Or the user converts it?

We can convert pages from movable zones into normal zones by the following
interface:
echo online_kernel >/sys/devices/system/memory/memoryX/state

We have posted a patchset to implement it, and it is in mm tree now.

Thanks
Wen Congyang

> 
> Regards!
> Gerry
> 
>>
>>>
>>> Thanks,
>>> Jianguo Wu
>>>
>>
>> .
>>
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the staging tree with the tty tree

2012-11-27 Thread Stephen Rothwell

Hi Greg,

Today's linux-next merge of the staging tree got a conflict in
drivers/staging/dgrp/dgrp_tty.c between commit 3ba89e96610b ("staging:
dgrp: dgrp_tty.c: Remove the TIOCSSOFTCAR ioctl handler from dgrp
driver") from the tty tree and commit 8fdefcb0ab74 ("staging: dgrp:
remove TIOCGSOFTCAR and TIOCSSOFTCAR handling") from the staging tree.

I fixed it up (using the staging tree version - since it is a superset of
the tty tree version) and can carry the fix as necessary (no action is
required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpFKTzQVxIbV.pgp
Description: PGP signature

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Jianguo Wu

On 2012/11/28 11:47, Tang Chen wrote:

> On 11/27/2012 11:10 AM, wujianguo wrote:
>>
>> Hi Tang,
>> DMA address can't be set as movable, if some one boot kernel with
>> movablecore_map=4G@0xa0 or other memory region that contains DMA address,
>> system maybe boot failed. Should this case be handled or mentioned
>> in the change log and kernel-parameters.txt?
> 
> Hi Wu,
> 
> I think we can use MAX_DMA_PFN and MAX_DMA32_PFN to prevent setting DMA
> address as movable. Just ignore the address lower than them, and set
> the rest as movable. How do you think ?
> 

I think it's OK for now.

> And, since we cannot figure out the minimum of memory kernel needs, I
> think for now, we can just add some warning into kernel-parameters.txt.
> 
> Thanks. :)
> 
>>
>> Thanks,
>> Jianguo Wu
>>
> 
> .
> 



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] percpu-rwsem: use synchronize_sched_expedited

2012-11-27 Thread Mikulas Patocka



On Tue, 27 Nov 2012, Jeff Chua wrote:

> On Tue, Nov 27, 2012 at 3:38 PM, Jens Axboe  wrote:
> > On 2012-11-27 06:57, Jeff Chua wrote:
> >> On Sun, Nov 25, 2012 at 7:23 AM, Jeff Chua  
> >> wrote:
> >>> On Sun, Nov 25, 2012 at 5:09 AM, Mikulas Patocka  
> >>> wrote:
>  So it's better to slow down mount.
> >>>
> >>> I am quite proud of the linux boot time pitting against other OS. Even
> >>> with 10 partitions. Linux can boot up in just a few seconds, but now
> >>> you're saying that we need to do this semaphore check at boot up. By
> >>> doing so, it's inducing additional 4 seconds during boot up.
> >>
> >> By the way, I'm using a pretty fast SSD (Samsung PM830) and fast CPU
> >> (2.8GHz). I wonder if those on slower hard disk or slower CPU, what
> >> kind of degradation would this cause or just the same?
> >
> > It'd likely be the same slow down time wise, but as a percentage it
> > would appear smaller on a slower disk.
> >
> > Could you please test Mikulas' suggestion of changing
> > synchronize_sched() in include/linux/percpu-rwsem.h to
> > synchronize_sched_expedited()?
> 
> Tested. It seems as fast as before, but may be a "tick" slower. Just
> perception. I was getting pretty much 0.012s with everything reverted.
> With synchronize_sched_expedited(), it seems to be 0.012s ~ 0.013s.
> So, it's good.
> 
> 
> > linux-next also has a re-write of the per-cpu rw sems, out of Andrews
> > tree. It would be a good data point it you could test that, too.
> 
> Tested. It's slower. 0.350s. But still faster than 0.500s without the patch.
> 
> # time mount /dev/sda1 /mnt; sync; sync; umount /mnt
> 
> 
> So, here's the comparison ...
> 
> 0.500s 3.7.0-rc7
> 0.168s 3.7.0-rc2
> 0.012s 3.6.0
> 0.013s 3.7.0-rc7 + synchronize_sched_expedited()
> 0.350s 3.7.0-rc7 + Oleg's patch.
> 
> 
> Thanks,
> Jeff.

OK, I'm sending two patches to reduce mount times. If it is possible to 
put them to 3.7.0, put them there.

Mikulas

---

percpu-rwsem: use synchronize_sched_expedited

Use synchronize_sched_expedited() instead of synchronize_sched()
to improve mount speed.

This patch improves mount time from 0.500s to 0.013s.

Note: if realtime people complain about the use
synchronize_sched_expedited() and synchronize_rcu_expedited(), I suggest
that they introduce an option CONFIG_REALTIME or
/proc/sys/kernel/realtime and turn off these *_expedited functions if
the option is enabled (i.e. turn synchronize_sched_expedited into
synchronize_sched and synchronize_rcu_expedited into synchronize_rcu).

Signed-off-by: Mikulas Patocka 

---
 include/linux/percpu-rwsem.h |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: linux-3.7-rc7/include/linux/percpu-rwsem.h
===
--- linux-3.7-rc7.orig/include/linux/percpu-rwsem.h 2012-11-28 
02:41:03.0 +0100
+++ linux-3.7-rc7/include/linux/percpu-rwsem.h  2012-11-28 02:41:15.0 
+0100
@@ -13,7 +13,7 @@ struct percpu_rw_semaphore {
 };
 
 #define light_mb() barrier()
-#define heavy_mb() synchronize_sched()
+#define heavy_mb() synchronize_sched_expedited()
 
 static inline void percpu_down_read(struct percpu_rw_semaphore *p)
 {
@@ -51,7 +51,7 @@ static inline void percpu_down_write(str
 {
mutex_lock(>mtx);
p->locked = true;
-   synchronize_sched(); /* make sure that all readers exit the 
rcu_read_lock_sched region */
+   synchronize_sched_expedited(); /* make sure that all readers exit the 
rcu_read_lock_sched region */
while (__percpu_count(p->counters))
msleep(1);
heavy_mb(); /* C, between read of p->counter and write to data, paired 
with B */
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 2/6] PM / Runtime: introduce pm_runtime_set_memalloc_noio()

2012-11-27 Thread Ming Lei

On Wed, Nov 28, 2012 at 5:19 AM, Rafael J. Wysocki  wrote:
>
> Please use counters instead of walking the whole path every time.  Ie. in
> addition to the flag add a counter to store the number of the device's
> children having that flag set.

Even if a counter is added, walking the whole path can't be avoided either;
it may be an explicit walk or a recursion, because pm_runtime_set_memalloc_noio
is required to set or clear the flag (or increase/decrease the counter) of
devices in the whole path.

Thanks,
--
Ming Lei
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: linux-next: build failure after merge of the powerpc tree

2012-11-27 Thread Benjamin Herrenschmidt

On Wed, 2012-11-28 at 14:03 +1100, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the powerpc tree, next-20121115's build (powerpc
> allmodconfig) failed like this:
> 
> ERROR: ".of_reconfig_notifier_register" [drivers/crypto/nx/nx-compress.ko] 
> undefined!
> ERROR: ".of_reconfig_notifier_unregister" [drivers/crypto/nx/nx-compress.ko] 
> undefined!
> 
> Caused by commit 1cf3d8b3d24c ("powerpc+of: Add of node/property
> notification chain for adds and removes").

That nx stuff should just have gone through the powerpc tree... that's
not the first time it trips on generic changes because we miss it
being in the crypto tree...

Ashley, Nathan, who owns that ? Can you come up with a fixup patch ?

Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] zram: allocate metadata when disksize is set up

2012-11-27 Thread Minchan Kim

On Tue, Nov 27, 2012 at 04:45:54PM +0100, Jerome Marchand wrote:
> On 11/27/2012 06:13 AM, Nitin Gupta wrote:
> > On 11/22/2012 06:42 PM, Minchan Kim wrote:
> >> Lockdep complains about recursive deadlock of zram->init_lock.
> >> Because zram_init_device could be called in reclaim context and
> >> it requires a page with GFP_KERNEL.
> >>
> >> We can fix it via replacing GFP_KERNEL with GFP_NOIO.
> >> But more big problem is vzalloc in zram_init_device which calls GFP_KERNEL.
> >> We can change it with __vmalloc which can receive gfp_t.
> >> But still we have a problem. Although __vmalloc can handle gfp_t, it calls
> >> allocation of GFP_KERNEL. That's why I sent the patch.
> >> https://lkml.org/lkml/2012/4/23/77
> >>
> >> Yes. Fundamental problem is utter crap API vmalloc.
> >> If we can fix it, everyone would be happy. But life isn't simple
> >> like seeing my thread of the patch.
> >>
> >> So next option is to give up lazy initialization and initialize it at the
> >> very disksize setting time. But it makes unnecessary metadata waste until
> >> zram is really used. But let's think about it.
> >>
> >> 1) User of zram normally do mkfs.xxx or mkswap before using
> >> the zram block device(ex, normally, do it at booting time)
> >> It ends up allocating such metadata of zram before real usage so
> >> benefit of lazy initialzation would be mitigated.
> >>
> >> 2) Some user want to use zram when memory pressure is high.(ie, load zram
> >> dynamically, NOT booting time). It does make sense because people don't
> >> want to waste memory until memory pressure is high(ie, where zram is 
> >> really
> >> helpful time). In this case, lazy initialzation could be failed easily
> >> because we will use GFP_NOIO instead of GFP_KERNEL for avoiding 
> >> deadlock.
> >> So the benefit of lazy initialzation would be mitigated, too.
> >>
> >> 3) Metadata overhead is not critical and Nitin has a plan to diet it.
> >> 4K : 12 byte(64bit machine) -> 64G : 192M so 0.3% isn't big overhead
> >> If insane user use such big zram device up to 20, it could consume 6% 
> >> of ram
> >> but efficieny of zram will cover the waste.
> >>
> >> So this patch gives up lazy initialization and instead we initialize 
> >> metadata
> >> at disksize setting time.
> >>
> >> Signed-off-by: Minchan Kim 
> >> ---
> >>   drivers/staging/zram/zram_drv.c   |   21 -
> >>   drivers/staging/zram/zram_sysfs.c |1 +
> >>   2 files changed, 5 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/drivers/staging/zram/zram_drv.c 
> >> b/drivers/staging/zram/zram_drv.c
> >> index 9ef1eca..f364fb5 100644
> >> --- a/drivers/staging/zram/zram_drv.c
> >> +++ b/drivers/staging/zram/zram_drv.c
> >> @@ -441,16 +441,13 @@ static void zram_make_request(struct request_queue 
> >> *queue, struct bio *bio)
> >>   {
> >>struct zram *zram = queue->queuedata;
> >>
> >> -  if (unlikely(!zram->init_done) && zram_init_device(zram))
> >> -  goto error;
> >> -
> >>down_read(>init_lock);
> >>if (unlikely(!zram->init_done))
> >> -  goto error_unlock;
> >> +  goto error;
> >>
> >>if (!valid_io_request(zram, bio)) {
> >>zram_stat64_inc(zram, >stats.invalid_io);
> >> -  goto error_unlock;
> >> +  goto error;
> >>}
> >>
> >>__zram_make_request(zram, bio, bio_data_dir(bio));
> >> @@ -458,9 +455,8 @@ static void zram_make_request(struct request_queue 
> >> *queue, struct bio *bio)
> >>
> >>return;
> >>
> >> -error_unlock:
> >> -  up_read(>init_lock);
> >>   error:
> >> +  up_read(>init_lock);
> >>bio_io_error(bio);
> >>   }
> >>
> >> @@ -509,19 +505,12 @@ void zram_reset_device(struct zram *zram)
> >>up_write(>init_lock);
> >>   }
> >>
> >> +/* zram->init_lock should be hold */
> > 
> > s/hold/held
> > 
> > btw, shouldn't we also change GFP_KERNEL to GFP_NOIO in is_partial_io() 
> > case in both read/write handlers?
> 
> Good point. Actually, the one in zram_bvec_read() should actually be
> GFP_ATOMIC because of the kmap_atomic() above (or be moved out of

Right.

> kmap_atomic/kunmap_atomic nest).
> Another solution would be to allocate some working buffer at device
> init as it's done for compress_buffer/workmem. It would make
> zram_bvec_read/write look simpler (no need to free memory or manage 
> kmalloc failure).

Fair enough.
I sent a patch which replaces GFP_KERNEL with GFP_ATOMIC but your suggestion
would be better. It could be a separate patch. I will send it.

Thanks.

> 
> Jerome
> 
> > 
> > Rest of the patch looks good.
> > 
> > 
> > Thanks,
> > Nitin
> > 
> >>   int zram_init_device(struct zram *zram)
> >>   {
> >>int ret;
> >>size_t num_pages;
> >>
> >> -  down_write(>init_lock);
> >> -  if (zram->init_done) {
> >> -  up_write(>init_lock);
> >> -  return 0;
> >> -  }
> >> -
> >> -  BUG_ON(!zram->disksize);
> >> -
> >>if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) {
> >>

Re: [PATCH 1/3] i2c: exynos5: add High Speed I2C controller driver

2012-11-27 Thread Naveen Krishna Ch

Hello Felipe, Thomas,

Thanks for your time and valuable comments,
I will post the next version with your comments addressed.

On 27 November 2012 19:04, Thomas Abraham  wrote:
>
> On 27 November 2012 18:30, Naveen Krishna Chatradhi
>  wrote:
> > Adds support for High Speed I2C driver found in Exynos5 and later
> > SoCs from Samsung. This driver currently supports Auto mode.
> >
> > Driver only supports Device Tree method.
> >
> > Signed-off-by: Taekgyun Ko 
> > Signed-off-by: Naveen Krishna Chatradhi 
> > ---
> >  drivers/i2c/busses/Kconfig   |6 +
> >  drivers/i2c/busses/Makefile  |1 +
> >  drivers/i2c/busses/i2c-exynos5.c |  758
> > ++
> >  drivers/i2c/busses/i2c-exynos5.h |   80 
> >  4 files changed, 845 insertions(+)
> >  create mode 100644 drivers/i2c/busses/i2c-exynos5.c
> >  create mode 100644 drivers/i2c/busses/i2c-exynos5.h
> >
> > diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
> > index 65dd599..88e8833 100644
> > --- a/drivers/i2c/busses/Kconfig
> > +++ b/drivers/i2c/busses/Kconfig
> > @@ -609,6 +609,12 @@ config I2C_S3C2410
> >   Say Y here to include support for I2C controller in the
> >   Samsung SoCs.
> >
> > +config I2C_EXYNOS5
> > +   tristate "Exynos5 HS-I2C Driver"
> > +   help
> > + Say Y here to include support for High Speed I2C controller in
> > the
> > + Exynos5 series SoCs from Samsung.
> > +
> >  config I2C_S6000
> > tristate "S6000 I2C support"
> > depends on XTENSA_VARIANT_S6000
> > diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
> > index 2d33d62..426b4fd 100644
> > --- a/drivers/i2c/busses/Makefile
> > +++ b/drivers/i2c/busses/Makefile
> > @@ -60,6 +60,7 @@ obj-$(CONFIG_I2C_PUV3)+= i2c-puv3.o
> >  obj-$(CONFIG_I2C_PXA)  += i2c-pxa.o
> >  obj-$(CONFIG_I2C_PXA_PCI)  += i2c-pxa-pci.o
> >  obj-$(CONFIG_I2C_S3C2410)  += i2c-s3c2410.o
> > +obj-$(CONFIG_I2C_EXYNOS5)  += i2c-exynos5.o
> >  obj-$(CONFIG_I2C_S6000)+= i2c-s6000.o
> >  obj-$(CONFIG_I2C_SH7760)   += i2c-sh7760.o
> >  obj-$(CONFIG_I2C_SH_MOBILE)+= i2c-sh_mobile.o
> > diff --git a/drivers/i2c/busses/i2c-exynos5.c
> > b/drivers/i2c/busses/i2c-exynos5.c
> > new file mode 100644
> > index 000..5983aa9
> > --- /dev/null
> > +++ b/drivers/i2c/busses/i2c-exynos5.c
> > @@ -0,0 +1,758 @@
> > +/* linux/drivers/i2c/busses/i2c-exynos5.c
> > + *
> > + * Copyright (C) 2012 Samsung Electronics Co., Ltd.
> > + *
> > + * Exynos5 series High Speed I2C controller driver
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > +*/
> > +
> > +#include 
> > +#include 
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include 
> > +#include "i2c-exynos5.h"
> > +
> > +#define HSI2C_POLLING 0
> > +#define HSI2C_FAST_SPD 0
> > +#define HSI2C_HIGH_SPD 1
> > +
> > +/* Max time to wait for bus to become idle after a xfer */
> > +#define EXYNOS5_I2C_TIMEOUT (msecs_to_jiffies(1000))
> > +
> > +struct exynos5_i2c {
> > +   unsigned intsuspended:1;
> > +
> > +   struct i2c_msg  *msg;
> > +   struct completion   msg_complete;
> > +   unsigned intmsg_byte_ptr;
> > +
> > +   unsigned intirq;
> > +
> > +   void __iomem*regs;
> > +   struct clk  *clk;
> > +   struct device   *dev;
> > +   struct resource *ioarea;
> > +   struct i2c_adapter  adap;
> > +   unsigned intbus_number;
> > +   unsigned intspeed_mode;
> > +   unsigned intfast_speed;
> > +   unsigned inthigh_speed;
> > +   int operation_mode;
> > +   int gpios[2];
> > +};
> > +
> > +static struct platform_device_id exynos5_driver_ids[] = {
> > +   {
> > +   .name   = "exynos5-hs-i2c",
> > +   .driver_data= 0,
> > +   }, { },
> > +};
> > +MODULE_DEVICE_TABLE(platform, exynos5_driver_ids);
> > +
> > +#ifdef CONFIG_OF
> > +static const struct of_device_id exynos5_i2c_match[] = {
> > +   { .compatible = "samsung,exynos5-hs-i2c", .data = (void *)0 },
> > +   {},
> > +};
> > +MODULE_DEVICE_TABLE(of, exynos5_i2c_match);
> > +#endif
> > +
> > +static inline void dump_i2c_register(struct exynos5_i2c *i2c)
> > +{
> > +   dev_dbg(i2c->dev, "Register dump(%d) :\n %x\n %x\n %x\n %x\n"
> > +   " %x\n %x\n %x\n %x\n %x\n"
> > +   " %x\n %x\n %x\n %x\n %x\n"
> > +   " %x\n %x\n %x\n %x\n %x\n"
> > +   " %x\n %x\n %x\n %x\n %x\n"
> > +

Re: [PATCH 2/2] zram: allocate metadata when disksize is set up

2012-11-27 Thread Minchan Kim

On Mon, Nov 26, 2012 at 09:13:24PM -0800, Nitin Gupta wrote:
> On 11/22/2012 06:42 PM, Minchan Kim wrote:
> >Lockdep complains about recursive deadlock of zram->init_lock.
> >Because zram_init_device could be called in reclaim context and
> >it requires a page with GFP_KERNEL.
> >
> >We can fix it via replacing GFP_KERNEL with GFP_NOIO.
> >But more big problem is vzalloc in zram_init_device which calls GFP_KERNEL.
> >We can change it with __vmalloc which can receive gfp_t.
> >But still we have a problem. Although __vmalloc can handle gfp_t, it calls
> >allocation of GFP_KERNEL. That's why I sent the patch.
> >https://lkml.org/lkml/2012/4/23/77
> >
> >Yes. Fundamental problem is utter crap API vmalloc.
> >If we can fix it, everyone would be happy. But life isn't simple
> >like seeing my thread of the patch.
> >
> >So next option is to give up lazy initialization and initialize it at the
> >very disksize setting time. But it makes unnecessary metadata waste until
> >zram is really used. But let's think about it.
> >
> >1) User of zram normally do mkfs.xxx or mkswap before using
> >the zram block device(ex, normally, do it at booting time)
> >It ends up allocating such metadata of zram before real usage so
> >benefit of lazy initialzation would be mitigated.
> >
> >2) Some user want to use zram when memory pressure is high.(ie, load zram
> >dynamically, NOT booting time). It does make sense because people don't
> >want to waste memory until memory pressure is high(ie, where zram is 
> > really
> >helpful time). In this case, lazy initialzation could be failed easily
> >because we will use GFP_NOIO instead of GFP_KERNEL for avoiding deadlock.
> >So the benefit of lazy initialzation would be mitigated, too.
> >
> >3) Metadata overhead is not critical and Nitin has a plan to diet it.
> >4K : 12 byte(64bit machine) -> 64G : 192M so 0.3% isn't big overhead
> >If insane user use such big zram device up to 20, it could consume 6% of 
> > ram
> >but efficieny of zram will cover the waste.
> >
> >So this patch gives up lazy initialization and instead we initialize metadata
> >at disksize setting time.
> >
> >Signed-off-by: Minchan Kim 
> >---
> >  drivers/staging/zram/zram_drv.c   |   21 -
> >  drivers/staging/zram/zram_sysfs.c |1 +
> >  2 files changed, 5 insertions(+), 17 deletions(-)
> >
> >diff --git a/drivers/staging/zram/zram_drv.c 
> >b/drivers/staging/zram/zram_drv.c
> >index 9ef1eca..f364fb5 100644
> >--- a/drivers/staging/zram/zram_drv.c
> >+++ b/drivers/staging/zram/zram_drv.c
> >@@ -441,16 +441,13 @@ static void zram_make_request(struct request_queue 
> >*queue, struct bio *bio)
> >  {
> > struct zram *zram = queue->queuedata;
> >
> >-if (unlikely(!zram->init_done) && zram_init_device(zram))
> >-goto error;
> >-
> > down_read(>init_lock);
> > if (unlikely(!zram->init_done))
> >-goto error_unlock;
> >+goto error;
> >
> > if (!valid_io_request(zram, bio)) {
> > zram_stat64_inc(zram, >stats.invalid_io);
> >-goto error_unlock;
> >+goto error;
> > }
> >
> > __zram_make_request(zram, bio, bio_data_dir(bio));
> >@@ -458,9 +455,8 @@ static void zram_make_request(struct request_queue 
> >*queue, struct bio *bio)
> >
> > return;
> >
> >-error_unlock:
> >-up_read(>init_lock);
> >  error:
> >+up_read(>init_lock);
> > bio_io_error(bio);
> >  }
> >
> >@@ -509,19 +505,12 @@ void zram_reset_device(struct zram *zram)
> > up_write(>init_lock);
> >  }
> >
> >+/* zram->init_lock should be hold */
> 
> s/hold/held

Done.

> 
> btw, shouldn't we also change GFP_KERNEL to GFP_NOIO in
> is_partial_io() case in both read/write handlers?

Absolutely. The previous patch wasn't complete and was sent by mistake.
Sorry for the noise.
I just sent new patch.

Thanks.

-- 
Kind regards,
Minchan Kim
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2] zram: force disksize setting before using zram

2012-11-27 Thread Minchan Kim

On Mon, Nov 26, 2012 at 09:04:47PM -0800, Nitin Gupta wrote:
> On 11/22/2012 06:42 PM, Minchan Kim wrote:
> >Now zram document syas "set disksize is optional"
> >but partly it's wrong. When you try to use zram firstly after
> >booting, you must set disksize, otherwise zram can't work because
> >zram gendisk's size is 0. But once you do it, you can use zram freely
> >after reset because reset doesn't reset to zero paradoxically.
> >So in this time, disksize setting is optional.:(
> >It's inconsitent for user behavior and not straightforward.
> >
> >This patch forces always setting disksize firstly before using zram.
> >Yes. It changes current behavior so someone could complain when
> >he upgrades zram. Apparently it could be a problem if zram is mainline
> >but it still lives in staging so behavior could be changed for right
> >way to go. Let them excuse.
> >
> >Signed-off-by: Minchan Kim 
> >---
> >  drivers/staging/zram/zram.txt |7 +++--
> >  drivers/staging/zram/zram_drv.c   |   57 
> > ++---
> >  drivers/staging/zram/zram_drv.h   |5 +---
> >  drivers/staging/zram/zram_sysfs.c |6 +---
> >  4 files changed, 27 insertions(+), 48 deletions(-)
> >
> >diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt
> >index 5f75d29..00ae66b 100644
> >--- a/drivers/staging/zram/zram.txt
> >+++ b/drivers/staging/zram/zram.txt
> >@@ -23,10 +23,9 @@ Following shows a typical sequence of steps for using 
> >zram.
> > This creates 4 devices: /dev/zram{0,1,2,3}
> > (num_devices parameter is optional. Default: 1)
> >
> >-2) Set Disksize (Optional):
> >+2) Set Disksize
> > Set disk size by writing the value to sysfs node 'disksize'
> >-(in bytes). If disksize is not given, default value of 25%
> >-of RAM is used.
> >+(in bytes).
> >
> 
> Disksize can now be set using K/M/G suffixes also (see Sergey's
> change: handle mem suffixes in disk size ...). So, this should be
> documented as:
> 
> 2) Set Disksize
>   Set disk size by writing the value to sysfs node 'disksize'.
>   The value can be either in bytes or you can use mem suffixes.
>   Examples:
>   # Initialize /dev/zram0 with 50MB disksize
>   echo $((50*1024*1024)) > /sys/block/zram0/disksize
> 
>   # Using mem suffixes
>   echo 256K > /sys/block/zram0/disksize
>   echo 512M > /sys/block/zram0/disksize
>   echo 1G > /sys/block/zram0/disksize
> 

Done.

> 
> > # Initialize /dev/zram0 with 50MB disksize
> > echo $((50*1024*1024)) > /sys/block/zram0/disksize
> >@@ -67,6 +66,8 @@ Following shows a typical sequence of steps for using zram.
> >
> > (This frees all the memory allocated for the given device).
> >
> >+If you want to use zram again, you should set disksize first
> >+due to reset zram.
> 
> 
> This frees all the memory allocated for the given device and resets
> the disksize to zero. You must set the disksize again before reusing
> the device.

Done.

> 
> >
> >  Please report any problems at:
> >   - Mailing list: linux-mm-cc at laptop dot org
> >diff --git a/drivers/staging/zram/zram_drv.c 
> >b/drivers/staging/zram/zram_drv.c
> >index fb4a7c9..9ef1eca 100644
> >--- a/drivers/staging/zram/zram_drv.c
> >+++ b/drivers/staging/zram/zram_drv.c
> >@@ -104,35 +104,6 @@ static int page_zero_filled(void *ptr)
> > return 1;
> >  }
> >
> >-static void zram_set_disksize(struct zram *zram, size_t totalram_bytes)
> >-{
> >-if (!zram->disksize) {
> >-pr_info(
> >-"disk size not provided. You can use disksize_kb module "
> >-"param to specify size.\nUsing default: (%u%% of RAM).\n",
> >-default_disksize_perc_ram
> >-);
> >-zram->disksize = default_disksize_perc_ram *
> >-(totalram_bytes / 100);
> >-}
> >-
> >-if (zram->disksize > 2 * (totalram_bytes)) {
> >-pr_info(
> >-"There is little point creating a zram of greater than "
> >-"twice the size of memory since we expect a 2:1 compression "
> >-"ratio. Note that zram uses about 0.1%% of the size of "
> >-"the disk when not in use so a huge zram is "
> >-"wasteful.\n"
> >-"\tMemory Size: %zu kB\n"
> >-"\tSize you selected: %llu kB\n"
> >-"Continuing anyway ...\n",
> >-totalram_bytes >> 10, zram->disksize
> >-);
> >-}
> >-
> >-zram->disksize &= PAGE_MASK;
> >-}
> >-
> >  static void zram_free_page(struct zram *zram, size_t index)
> >  {
> > unsigned long handle = zram->table[index].handle;
> >@@ -497,6 +468,9 @@ void __zram_reset_device(struct zram *zram)
> >  {
> > size_t index;
> >
> >+if (!zram->init_done)
> >+goto out;
> >+
> > zram->init_done = 0;
> >
> > /* Free various per-device buffers */
> >@@ -523,8 +497,9 @@ void __zram_reset_device(struct zram *zram)
> >
> >

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Jiang Liu

On 2012-11-28 11:24, Bob Liu wrote:
> On Tue, Nov 27, 2012 at 8:49 PM, Tang Chen  wrote:
>> On 11/27/2012 08:09 PM, Bob Liu wrote:
>>>
>>> On Tue, Nov 27, 2012 at 4:29 PM, Tang Chen
>>> wrote:

 Hi Liu,


 This feature is used in memory hotplug.

 In order to implement a whole node hotplug, we need to make sure the
 node contains no kernel memory, because memory used by kernel could
 not be migrated. (Since the kernel memory is directly mapped,
 VA = PA + __PAGE_OFFSET. So the physical address could not be changed.)

 User could specify all the memory on a node to be movable, so that the
 node could be hot-removed.

>>>
>>> Thank you for your explanation. It's reasonable.
>>>
>>> But i think it's a bit duplicated with CMA, i'm not sure but maybe we
>>> can combine it with CMA which already in mainline?
>>>
>> Hi Liu,
>>
>> Thanks for your advice. :)
>>
>> CMA is Contiguous Memory Allocator, right?  What I'm trying to do is
>> controlling where is the start of ZONE_MOVABLE of each node. Could
>> CMA do this job ?
> 
> cma will not control the start of ZONE_MOVABLE of each node, but it
> can declare a memory that always movable
> and all non movable allocate request will not happen on that area.
> 
> Currently cma use a boot parameter "cma=" to declare a memory size
> that always movable.
> I think it might fulfill your requirement if extending the boot
> parameter with a start address.
> 
> more info at http://lwn.net/Articles/468044/
>>
>> And also, after a short investigation, CMA seems need to base on
>> memblock. But we need to limit memblock not to allocate memory on
>> ZONE_MOVABLE. As a result, we need to know the ranges before memblock
>> could be used. I'm afraid we still need an approach to get the ranges,
>> such as a boot option, or from static ACPI tables such as SRAT/MPST.
>>
> 
> Yes, it's based on memblock and with boot option.
> In setup_arch32()
> dma_contiguous_reserve(0);   => will declare a cma area using
> memblock_reserve()
> 
>> I'm don't know much about CMA for now. So if you have any better idea,
>> please share with us, thanks. :)
> 
> My idea is reuse cma like below patch(even not compiled) and boot with
> "cma=size@start_address".
> I don't know whether it can work and whether suitable for your
> requirement, if not forgive me for this noises.
> 
> diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
> index 612afcc..564962a 100644
> --- a/drivers/base/dma-contiguous.c
> +++ b/drivers/base/dma-contiguous.c
> @@ -59,11 +59,18 @@ struct cma *dma_contiguous_default_area;
>   */
>  static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
>  static long size_cmdline = -1;
> +static long cma_start_cmdline = -1;
> 
>  static int __init early_cma(char *p)
>  {
> +   char *oldp;
> pr_debug("%s(%s)\n", __func__, p);
> +   oldp = p;
> size_cmdline = memparse(p, );
> +
> +   if (*p == '@')
> +   cma_start_cmdline = memparse(p+1, );
> +   printk("cma start:0x%x, size: 0x%x\n", size_cmdline, 
> cma_start_cmdline);
> return 0;
>  }
>  early_param("cma", early_cma);
> @@ -127,8 +134,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
> if (selected_size) {
> pr_debug("%s: reserving %ld MiB for global area\n", __func__,
>  selected_size / SZ_1M);
> -
> -   dma_declare_contiguous(NULL, selected_size, 0, limit);
> +   if (cma_size_cmdline != -1)
> +   dma_declare_contiguous(NULL, selected_size,
> cma_start_cmdline, limit);
> +   else
> +   dma_declare_contiguous(NULL, selected_size, 0, limit);
> }
>  };
Seems a good idea to reserve memory by reusing CMA logic, though it needs more
investigation here. One of CMA's goals is to ensure pages in CMA are really
movable, and this patchset tries to achieve the same goal at first glance.

 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] block_dev: don't take the write lock if block size doesn't change

2012-11-27 Thread Mikulas Patocka

block_dev: don't take the write lock if block size doesn't change

Taking the write lock has a big performance impact on the whole system
(because of synchronize_sched_expedited). This patch avoids taking the
write lock if the block size doesn't change (i.e. when mounting
filesystem with block size equal to the default block size).

The logic to test if the block device is mapped was moved to a separate
function is_bdev_mapped to avoid code duplication.

Signed-off-by: Mikulas Patocka 

---
 fs/block_dev.c |   25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

Index: linux-3.7-rc7/fs/block_dev.c
===
--- linux-3.7-rc7.orig/fs/block_dev.c   2012-11-28 04:09:01.0 +0100
+++ linux-3.7-rc7/fs/block_dev.c2012-11-28 04:13:53.0 +0100
@@ -114,10 +114,18 @@ void invalidate_bdev(struct block_device
 }
 EXPORT_SYMBOL(invalidate_bdev);
 
-int set_blocksize(struct block_device *bdev, int size)
+static int is_bdev_mapped(struct block_device *bdev)
 {
-   struct address_space *mapping;
+   int ret_val;
+   struct address_space *mapping = bdev->bd_inode->i_mapping;
+   mutex_lock(>i_mmap_mutex);
+   ret_val = mapping_mapped(mapping);
+   mutex_unlock(>i_mmap_mutex);
+   return ret_val;
+}
 
+int set_blocksize(struct block_device *bdev, int size)
+{
/* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
return -EINVAL;
@@ -126,18 +134,21 @@ int set_blocksize(struct block_device *b
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
 
+   /*
+* If the block size doesn't change, don't take the write lock.
+* We check for is_bdev_mapped anyway, for consistent behavior.
+*/
+   if (size == bdev->bd_block_size)
+   return is_bdev_mapped(bdev) ? -EBUSY : 0;
+
/* Prevent starting I/O or mapping the device */
percpu_down_write(>bd_block_size_semaphore);
 
/* Check that the block device is not memory mapped */
-   mapping = bdev->bd_inode->i_mapping;
-   mutex_lock(>i_mmap_mutex);
-   if (mapping_mapped(mapping)) {
-   mutex_unlock(>i_mmap_mutex);
+   if (is_bdev_mapped(bdev)) {
percpu_up_write(>bd_block_size_semaphore);
return -EBUSY;
}
-   mutex_unlock(>i_mmap_mutex);
 
/* Don't change the size if it is same as current */
if (bdev->bd_block_size != size) {

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] target: Make spc_get_write_same_sectors return sector_t

2012-11-27 Thread Nicholas A. Bellinger

From: Nicholas Bellinger 

We already expect TFO->get_blocks() to return sector_t for zero value case
when doing WRITE_SAME to the end of the backend device, so go ahead and return
sector_t from spc_get_write_same_sectors() to handle this case properly.

Also, update the single iblock_execute_write_same() caller of this code.

Cc: Christoph Hellwig 
Cc: Martin K. Petersen 
Signed-off-by: Nicholas Bellinger 
---
 drivers/target/target_core_iblock.c  |2 +-
 drivers/target/target_core_sbc.c |2 +-
 include/target/target_core_backend.h |2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/target/target_core_iblock.c 
b/drivers/target/target_core_iblock.c
index 42cc3fb..cfdb949 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -483,7 +483,7 @@ iblock_execute_write_same(struct se_cmd *cmd)
struct bio *bio;
struct bio_list list;
sector_t block_lba = cmd->t_task_lba;
-   unsigned int sectors = spc_get_write_same_sectors(cmd);
+   sector_t sectors = spc_get_write_same_sectors(cmd);
 
sg = >t_data_sg[0];
 
diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c
index 4fd22cd..8be5bd7 100644
--- a/drivers/target/target_core_sbc.c
+++ b/drivers/target/target_core_sbc.c
@@ -109,7 +109,7 @@ sbc_emulate_readcapacity_16(struct se_cmd *cmd)
return 0;
 }
 
-int spc_get_write_same_sectors(struct se_cmd *cmd)
+sector_t spc_get_write_same_sectors(struct se_cmd *cmd)
 {
u32 num_blocks;
 
diff --git a/include/target/target_core_backend.h 
b/include/target/target_core_backend.h
index 3393ab1..5079109 100644
--- a/include/target/target_core_backend.h
+++ b/include/target/target_core_backend.h
@@ -52,7 +52,7 @@ void  target_complete_cmd(struct se_cmd *, u8);
 
 sense_reason_t spc_parse_cdb(struct se_cmd *cmd, unsigned int *size);
 sense_reason_t spc_emulate_report_luns(struct se_cmd *cmd);
-intspc_get_write_same_sectors(struct se_cmd *cmd);
+sector_t   spc_get_write_same_sectors(struct se_cmd *cmd);
 
 sense_reason_t sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops);
 u32sbc_get_device_rev(struct se_device *dev);
-- 
1.7.2.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Jiang Liu

On 2012-11-28 11:47, Tang Chen wrote:
> On 11/27/2012 11:10 AM, wujianguo wrote:
>>
>> Hi Tang,
>> DMA addresses can't be set as movable. If someone boots the kernel with
>> movablecore_map=4G@0xa0 or another memory region that contains DMA addresses,
>> the system may fail to boot. Should this case be handled or mentioned
>> in the change log and kernel-parameters.txt?
> 
> Hi Wu,
> 
> I think we can use MAX_DMA_PFN and MAX_DMA32_PFN to prevent setting DMA
> address as movable. Just ignore the address lower than them, and set
> the rest as movable. What do you think?
> 
> And, since we cannot figure out the minimum of memory kernel needs, I
> think for now, we can just add some warning into kernel-parameters.txt.
> 
> Thanks. :)
On one other OS, there is a mechanism to dynamically convert pages from
movable zones into normal zones.

Regards!
Gerry

> 
>>
>> Thanks,
>> Jianguo Wu
>>
> 
> .
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: build failure after merge of the pm tree

2012-11-27 Thread Stephen Rothwell

Hi Rafael,

After merging the pm tree, today's linux-next build (x86_64 allmodconfig)
failed like this:

ERROR: "devfreq_monitor_resume" [drivers/devfreq/governor_simpleondemand.ko] 
undefined!
ERROR: "devfreq_monitor_suspend" [drivers/devfreq/governor_simpleondemand.ko] 
undefined!
ERROR: "devfreq_interval_update" [drivers/devfreq/governor_simpleondemand.ko] 
undefined!
ERROR: "devfreq_monitor_stop" [drivers/devfreq/governor_simpleondemand.ko] 
undefined!
ERROR: "devfreq_monitor_start" [drivers/devfreq/governor_simpleondemand.ko] 
undefined!

Caused by commit 206c30cfeb7c ("PM / devfreq: Add suspend and resume apis").

I have used the pm tree from next-20121127 for today.
-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgp5TjT00uQya.pgp
Description: PGP signature

Re: Recent kernel "mount" slow

2012-11-27 Thread Mikulas Patocka



On Tue, 27 Nov 2012, Jens Axboe wrote:

> On 2012-11-27 11:06, Jeff Chua wrote:
> > On Tue, Nov 27, 2012 at 3:38 PM, Jens Axboe  wrote:
> >> On 2012-11-27 06:57, Jeff Chua wrote:
> >>> On Sun, Nov 25, 2012 at 7:23 AM, Jeff Chua  
> >>> wrote:
>  On Sun, Nov 25, 2012 at 5:09 AM, Mikulas Patocka  
>  wrote:
> > So it's better to slow down mount.
> 
>  I am quite proud of the linux boot time pitting against other OS. Even
>  with 10 partitions. Linux can boot up in just a few seconds, but now
>  you're saying that we need to do this semaphore check at boot up. By
>  doing so, it's inducing additional 4 seconds during boot up.
> >>>
> >>> By the way, I'm using a pretty fast SSD (Samsung PM830) and fast CPU
> >>> (2.8GHz). I wonder if those on slower hard disk or slower CPU, what
> >>> kind of degradation would this cause or just the same?
> >>
> >> It'd likely be the same slow down time wise, but as a percentage it
> >> would appear smaller on a slower disk.
> >>
> >> Could you please test Mikulas' suggestion of changing
> >> synchronize_sched() in include/linux/percpu-rwsem.h to
> >> synchronize_sched_expedited()?
> > 
> > Tested. It seems as fast as before, but may be a "tick" slower. Just
> > perception. I was getting pretty much 0.012s with everything reverted.
> > With synchronize_sched_expedited(), it seems to be 0.012s ~ 0.013s.
> > So, it's good.
> 
> Excellent
> 
> >> linux-next also has a re-write of the per-cpu rw sems, out of Andrew's
> >> tree. It would be a good data point if you could test that, too.
> > 
> > Tested. It's slower. 0.350s. But still faster than 0.500s without the patch.
> 
> Makes sense, it's 2 synchronize_sched() instead of 3. So it doesn't fix
> the real issue, which is having to do synchronize_sched() in the first
> place.
> 
> > # time mount /dev/sda1 /mnt; sync; sync; umount /mnt
> > 
> > 
> > So, here's the comparison ...
> > 
> > 0.500s 3.7.0-rc7
> > 0.168s 3.7.0-rc2
> > 0.012s 3.6.0
> > 0.013s 3.7.0-rc7 + synchronize_sched_expedited()
> > 0.350s 3.7.0-rc7 + Oleg's patch.
> 
> I wonder how many of them are due to changing to the same block size.
> Does the below patch make a difference?

This patch is wrong because you must check if the device is mapped while
holding bdev->bd_block_size_semaphore (because
bdev->bd_block_size_semaphore prevents new mappings from being created).

I'm sending another patch that has the same effect.


Note that ext[234] filesystems set blocksize to 1024 temporarily during 
mount, so it doesn't help much (it only helps for other filesystems, such 
as jfs). For ext[234], you have a device with default block size 4096, the 
filesystem sets block size to 1024 during mount, reads the super block and 
sets it back to 4096.

If you want, you can fix ext[234] to avoid this behavior.

Mikulas


> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 1a1e5e3..f041c56 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -126,29 +126,28 @@ int set_blocksize(struct block_device *bdev, int size)
>   if (size < bdev_logical_block_size(bdev))
>   return -EINVAL;
>  
> - /* Prevent starting I/O or mapping the device */
> - percpu_down_write(>bd_block_size_semaphore);
> -
>   /* Check that the block device is not memory mapped */
>   mapping = bdev->bd_inode->i_mapping;
>   mutex_lock(>i_mmap_mutex);
>   if (mapping_mapped(mapping)) {
>   mutex_unlock(>i_mmap_mutex);
> - percpu_up_write(>bd_block_size_semaphore);
>   return -EBUSY;
>   }
>   mutex_unlock(>i_mmap_mutex);
>  
>   /* Don't change the size if it is same as current */
>   if (bdev->bd_block_size != size) {
> - sync_blockdev(bdev);
> - bdev->bd_block_size = size;
> - bdev->bd_inode->i_blkbits = blksize_bits(size);
> - kill_bdev(bdev);
> + /* Prevent starting I/O */
> + percpu_down_write(>bd_block_size_semaphore);
> + if (bdev->bd_block_size != size) {
> + sync_blockdev(bdev);
> + bdev->bd_block_size = size;
> + bdev->bd_inode->i_blkbits = blksize_bits(size);
> + kill_bdev(bdev);
> + }
> + percpu_up_write(>bd_block_size_semaphore);
>   }
>  
> - percpu_up_write(>bd_block_size_semaphore);
> -
>   return 0;
>  }
>  
> @@ -1649,14 +1648,12 @@ EXPORT_SYMBOL_GPL(blkdev_aio_write);
>  
>  static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> + struct address_space *mapping = file->f_mapping;
>   int ret;
> - struct block_device *bdev = I_BDEV(file->f_mapping->host);
> -
> - percpu_down_read(>bd_block_size_semaphore);
>  
> + mutex_lock(>i_mmap_mutex);
>   ret = generic_file_mmap(file, vma);
> -
> - percpu_up_read(>bd_block_size_semaphore);
> + mutex_unlock(>i_mmap_mutex);
>  
>   return ret;
>  }
> 
> --

Re: [PATCH v6 2/6] PM / Runtime: introduce pm_runtime_set_memalloc_noio()

2012-11-27 Thread Ming Lei

On Wed, Nov 28, 2012 at 5:19 AM, Rafael J. Wysocki  wrote:
> On Saturday, November 24, 2012 08:59:14 PM Ming Lei wrote:
>> The patch introduces the flag of memalloc_noio in 'struct dev_pm_info'
>> to help PM core to teach mm not allocating memory with GFP_KERNEL
>> flag for avoiding probable deadlock.
>>
>> As explained in the comment, any GFP_KERNEL allocation inside
>> runtime_resume() or runtime_suspend() on any one of device in
>> the path from one block or network device to the root device
>> in the device tree may cause deadlock, the introduced
>> pm_runtime_set_memalloc_noio() sets or clears the flag on
>> device in the path recursively.
>>
>> Cc: Alan Stern 
>> Cc: "Rafael J. Wysocki" 
>> Signed-off-by: Ming Lei 
>> ---
>> v5:
>>   - fix code style error
>>   - add comment on clear the device memalloc_noio flag
>> v4:
>>   - rename memalloc_noio_resume as memalloc_noio
>>   - remove pm_runtime_get_memalloc_noio()
>>   - add comments on pm_runtime_set_memalloc_noio
>> v3:
>>   - introduce pm_runtime_get_memalloc_noio()
>>   - hold one global lock on pm_runtime_set_memalloc_noio
>>   - hold device power lock when accessing memalloc_noio_resume
>> flag suggested by Alan Stern
>>   - implement pm_runtime_set_memalloc_noio without recursion
>> suggested by Alan Stern
>> v2:
>>   - introduce pm_runtime_set_memalloc_noio()
>> ---
>>  drivers/base/power/runtime.c |   60 
>> ++
>>  include/linux/pm.h   |1 +
>>  include/linux/pm_runtime.h   |3 +++
>>  3 files changed, 64 insertions(+)
>>
>> diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
>> index 3148b10..3e198a0 100644
>> --- a/drivers/base/power/runtime.c
>> +++ b/drivers/base/power/runtime.c
>> @@ -124,6 +124,66 @@ unsigned long pm_runtime_autosuspend_expiration(struct 
>> device *dev)
>>  }
>>  EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
>>
>> +static int dev_memalloc_noio(struct device *dev, void *data)
>> +{
>> + return dev->power.memalloc_noio;
>> +}
>> +
>> +/*
>> + * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
>> + * @dev: Device to handle.
>> + * @enable: True for setting the flag and False for clearing the flag.
>> + *
>> + * Set the flag for all devices in the path from the device to the
>> + * root device in the device tree if @enable is true, otherwise clear
>> + * the flag for devices in the path whose siblings don't set the flag.
>> + *
>
> Please use counters instead of walking the whole path every time.  Ie. in
> addition to the flag add a counter to store the number of the device's
> children having that flag set.

Thanks for your review.

IMO, pm_runtime_set_memalloc_noio() is only called in
probe() and release() of block device and network device, which is
on a very infrequent path, so I am wondering whether it is worth introducing
another counter for all devices.

Also, the current implementation of pm_runtime_set_memalloc_noio()
looks simple and clean enough with the flag, IMO.

> I would use the flag only to store the information that
> pm_runtime_set_memalloc_noio(dev, true) has been run for this device directly
> and I'd use a counter for everything else.
>
> That is, have power.memalloc_count that would be incremented when (1)
> pm_runtime_set_memalloc_noio(dev, true) is called for that device and (2) when
> power.memalloc_count for one of its children changes from 0 to 1 (and
> analogously for decrementation).  Then, check the counter in rpm_callback().

Sorry, could you explain in a bit more detail why we need the counter? It looks
like only checking the flag in rpm_callback() is enough, isn't it?

>
> Besides, don't you need to check children for the arg device itself?

It isn't needed since the children of a network/block device can't be
involved in the deadlock in the runtime PM path.

Also, the function is only called by the network device or block device
subsystem; both kinds of device are class devices and should
have no children.

>
>> + * The function should only be called by block device, or network
>> + * device driver for solving the deadlock problem during runtime
>> + * resume/suspend:
>> + *
>> + * If memory allocation with GFP_KERNEL is called inside runtime
>> + * resume/suspend callback of any one of its ancestors(or the
>> + * block device itself), the deadlock may be triggered inside the
>> + * memory allocation since it might not complete until the block
>> + * device becomes active and the involed page I/O finishes. The
>> + * situation is pointed out first by Alan Stern. Network device
>> + * are involved in iSCSI kind of situation.
>> + *
>> + * The lock of dev_hotplug_mutex is held in the function for handling
>> + * hotplug race because pm_runtime_set_memalloc_noio() may be called
>> + * in async probe().
>> + *
>> + * The function should be called between device_add() and device_del()
>> + * on the affected

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Tang Chen


On 11/27/2012 11:10 AM, wujianguo wrote:


Hi Tang,
DMA addresses can't be set as movable. If someone boots the kernel with
movablecore_map=4G@0xa0 or another memory region that contains DMA addresses,
the system may fail to boot. Should this case be handled or mentioned
in the change log and kernel-parameters.txt?


Hi Wu,

I think we can use MAX_DMA_PFN and MAX_DMA32_PFN to prevent setting DMA
address as movable. Just ignore the address lower than them, and set
the rest as movable. What do you think?

And, since we cannot figure out the minimum of memory kernel needs, I
think for now, we can just add some warning into kernel-parameters.txt.

Thanks. :)



Thanks,
Jianguo Wu


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH]sched/rt.c: Add reschedule to switched_from_rt()

2012-11-27 Thread Steven Rostedt

On Fri, 2012-11-23 at 00:02 +0400, Kirill Tkhai wrote:
> Reschedule rq->curr if the first RT task has just been
> pulled to the rq.
> 
> Signed-off-by: Kirill V Tkhai 
> CC: Steven Rostedt 
> CC: Ingo Molnar 
> CC: Peter Zijlstra 
> ---
>  kernel/sched/rt.c |7 +--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 418feb0..29bda5b 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1889,8 +1889,11 @@ static void switched_from_rt(struct rq *rq, struct 
> task_struct *p)
>* we may need to handle the pulling of RT tasks
>* now.
>*/
> - if (p->on_rq && !rq->rt.rt_nr_running)
> - pull_rt_task(rq);
> + if (!p->on_rq || rq->rt.rt_nr_running)
> + return;
> +
> + if (pull_rt_task(rq))
> + resched_task(rq->curr);

Wow really? We never set NEED_RESCHED after pulling an RT task to a
queue that is about to run SCHED_OTHER?

Hmm, this is usually called before switched_to(), and looking at
switched_to_fair() there's a good chance that it does it.

But anyway, might as well add it here, I don't think it will hurt. The
prio_changed_rt() does it.

Acked-by: Steven Rostedt 

-- Steve

>  }
>  
>  void init_sched_rt_class(void)


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

switcheroo registration vs switching race...

2012-11-27 Thread Daniel J Blueman

Hi Seth, Dave, Takashi,

If I power down the unused discrete GPU before lightdm starts by
fiddling with the sysfs file [1] in the upstart script, I see a race
manifesting as the discrete GPU's HDA controller timing out to
commands [2].

Adding some debug, I see that the registered audio devices are put
into D3 before the GPU is, but it turns out that the discrete (and
internal) GPU's HDA controller gets registered a bit later, so the
list is empty. The symptom occurs because the HDA driver is talking to
hardware which is now in D3.

We could add a mutex to nouveau to allow us to wait for the DGPU HDA
controller, but perhaps this should be solved at a higher level in the
vgaswitcheroo code; what do you think?

Thanks,
  Daniel

--- [1]

echo OFF >/sys/kernel/debug/vgaswitcheroo/switch

--- [2]

snd_hda_intel :00:1b.0: enabling device ( -> 0002)
snd_hda_intel :00:1b.0: irq 51 for MSI/MSI-X
input: HDA Intel PCH Mic as /devices/pci:00/:00:1b.0/sound/card0/input10
input: HDA Intel PCH Headphone as
/devices/pci:00/:00:1b.0/sound/card0/input11
nouveau [  VBIOS][:01:00.0] ... appears to be valid
nouveau [  VBIOS][:01:00.0] using image from PRAMIN
nouveau [  VBIOS][:01:00.0] BIT signature found
nouveau [  VBIOS][:01:00.0] version 80.07.26.04
nouveau [   MXM][:01:00.0] no VBIOS data, nothing to do
nouveau [   PFB][:01:00.0] RAM type: GDDR5
nouveau [   PFB][:01:00.0] RAM size: 1024 MiB
nouveau W[ PGRAPH][:01:00.0] disabled, PGRAPH=1 to enable
vga_switcheroo: enabled
[TTM] Zone kernel: Available graphics memory: 4076308 kiB
[TTM] Zone  dma32: Available graphics memory: 2097152 kiB
[TTM] Initializing pool allocator
[TTM] Initializing DMA pool allocator
nouveau [   DRM] VRAM: 1024 MiB
nouveau [   DRM] GART: 512 MiB
nouveau [   DRM] BIT BIOS found
nouveau [   DRM] Bios version 80.07.26.04
nouveau [   DRM] TMDS table version 2.0
nouveau [   DRM] DCB version 4.0
nouveau [   DRM] DCB outp 00: 048101b6 0f230010
nouveau [   DRM] DCB outp 01: 018212d6 0f220020
nouveau [   DRM] DCB outp 02: 01021212 00020020
nouveau [   DRM] DCB outp 03: 088324c6 0f220010
nouveau [   DRM] DCB outp 04: 08032402 00020010
nouveau [   DRM] DCB outp 05: 02843862 00020010
nouveau [   DRM] DCB conn 00: 00020047
nouveau [   DRM] DCB conn 01: 02208146
nouveau [   DRM] DCB conn 02: 01104246
nouveau [   DRM] DCB conn 03: 00410361
[drm] Supports vblank timestamp caching Rev 1 (10.10.2010).
[drm] No driver support for vblank timestamp query.
nouveau W[   DRM] voltage table 0x50 unknown
nouveau [   DRM] 4 available performance level(s)
nouveau [   DRM] 1: core 209MHz shader 419MHz memory 405MHz voltage 520mV
nouveau [   DRM] 2: core 390MHz shader 780MHz memory 1080MHz voltage 610mV
nouveau [   DRM] 3: core 1000MHz shader 2000MHz memory 1080MHz voltage 630mV
nouveau [   DRM] 4: core 1254MHz shader 2508MHz memory 1080MHz voltage 630mV
nouveau [   DRM] c:
nouveau E[   DRM] failed to create kernel channel, -22
No connectors reported connected with modes
[drm] Cannot find any crtc or sizes - going 1024x768
nouveau [   DRM] allocated 1024x768 fb: 0x6, bo 880264974400
fb1: nouveaufb frame buffer device
[drm] Initialized nouveau 1.1.0 20120801 for :01:00.0 on minor 1
snd_hda_intel :01:00.1: enabling device ( -> 0002)
hda-intel: :01:00.1: Handle VGA-switcheroo audio client
snd_hda_intel :01:00.1: irq 52 for MSI/MSI-X
VGA switcheroo: switched nouveau off
nouveau [   DRM] suspending fbcon...
nouveau [   DRM] suspending display...
nouveau [   DRM] unpinning framebuffer(s)...
nouveau [   DRM] evicting buffers...
nouveau [   DRM] suspending client object trees...
input: HDA NVidia HDMI/DP,pcm=8 as
/devices/pci:00/:00:01.0/:01:00.1/sound/card1/input12
input: HDA NVidia HDMI/DP,pcm=7 as
/devices/pci:00/:00:01.0/:01:00.1/sound/card1/input13
input: HDA NVidia HDMI/DP,pcm=3 as
/devices/pci:00/:00:01.0/:01:00.1/sound/card1/input14
nouveau E[   I2C][:01:00.0] AUXCH(3): begin idle timeout 0x
nouveau E[   I2C][:01:00.0] AUXCH(2): begin idle timeout 0x
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503
hda-intel: spurious response 0x0:0x0, last cmd=0x170503

Re: How about a gpio_get(device , char ) function?

2012-11-27 Thread Alex Courbot

On Monday 26 November 2012 19:14:31 Grant Likely wrote:
> I don't have any problem with a gpio_get function, but I do agree that
> making it return an opaque handle is how it should be written with a new
> set of accessors. The handle should probably be simply the pointer to
> the &gpio_desc[number] which is a private table in gpiolib.c. The
> definition of it isn't available outside of gpiolib.c

That looks like a reasonable approach, but this would make the new API 
available only to systems that use GPIOlib. Shouldn't we be concerned about 
making this available to all GPIO implementations? Or is GPIOlib so widely 
used that we don't care?

Right now I have a very simple wrapper (for testing purposes) around the 
current integer-base GPIO namespace that accepts tables mapping consumers to 
GPIO numbers, much like Thierry did for the PWM subsystem. Integrating it into 
GPIOlib does not seem to be much more difficult ; it would require some 
refactoring though as most of the code should be shared by the two APIs.

This also seems to be the right opportunity (although not directly related) to 
switch the gpio_desc table into something more flexible. Two approaches come to 
mind: either a linked-list of gpio_chips ordered by base GPIO, or a radix-
tree. The small number of gpio chips in a system seem to make the first 
approach reasonable enough - GPIO lookup time would become linear instead of 
constant, but it should not be noticeable from the consumer perspective.

Alex.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/3] KVM: x86: improve reexecute_instruction

2012-11-27 Thread Xiao Guangrong

On 11/28/2012 07:42 AM, Marcelo Tosatti wrote:
> On Tue, Nov 27, 2012 at 11:30:24AM +0800, Xiao Guangrong wrote:
>> On 11/27/2012 06:41 AM, Marcelo Tosatti wrote:
>>

 -  return false;
 +again:
 +  page_fault_count = ACCESS_ONCE(vcpu->kvm->arch.page_fault_count);
 +
 +  /*
 +   * if emulation was due to access to shadowed page table
 +   * and it failed try to unshadow page and re-enter the
 +   * guest to let CPU execute the instruction.
 +   */
 +  kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
 +  emulate = vcpu->arch.mmu.page_fault(vcpu, cr3, PFERR_WRITE_MASK, false);
>>>
>>> Can you explain what is the objective here?
>>>
>>
>> Sure. :)
>>
>> The instruction emulation is caused by fault access on cr3. After unprotect
>> the target page, we call vcpu->arch.mmu.page_fault to fix the mapping of cr3.
>> if it return 1, mmu can not fix the mapping, we should report the error,
>> otherwise it is good to return to guest and let it re-execute the instruction
>> again.
>>
>> page_fault_count is used to avoid the race on other vcpus, since after we
>> unprotect the target page, other cpu can enter page fault path and let the
>> page be write-protected again.
>>
>> This way can help us to detect all the case that mmu can not be fixed.
> 
> How about recording the gfn number for shadow pages that have been
> shadowed in the current pagefault run? (which is cheap, compared to
> shadowing these pages).
> 

Marcelo,

Thanks for your idea!

If we use this way, we should cache gfns in vcpu struct.

Actually, I have considered an approach like yours, that is, getting
all page tables of the guest, then to see whether the page table gfns
are contained in the target gfn. But we would need to change mmu->gva_to_pfn
or introduce a new method to get the page tables of the guest.

But reexecute_instruction is really the unlikely path; both of these
approaches can make the mmu code more complex and/or introduce unnecessary
overhead for the common cases.

It looks like the approach used in this patch is the simplest and does no harm to
the core code.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: linux-next: manual merge of the arm-soc tree with the slave-dma tree

2012-11-27 Thread Stephen Rothwell

Hi,

On Tue, 27 Nov 2012 11:08:36 +0530 Viresh Kumar  wrote:
>
> On 27 November 2012 10:14, Stephen Rothwell  wrote:
> > Today's linux-next merge of the arm-soc tree got a conflict in
> > arch/arm/mach-spear13xx/spear1310.c between commit b47394911c26 ("ARM:
> > SPEAr13xx: Pass DW DMAC platform data from DT") from the slave-dma tree
> > and commit 300a6856324a ("ARM: SPEAr1310: Fix AUXDATA for compact flash
> > controller") from the arm-soc tree.
> >
> > I have no idea how to fix this up, so I just effectively dropped the
> > arm-soc tree patch.
> 
> So sorry for that, Can you please take arm-soc version here? Patch 
> 300a6856324a
> is doing the correct thing. i.e. we need
> 
> +   OF_DEV_AUXDATA("arasan,cf-spear1340", MCIF_CF_BASE, NULL, _pdata),
> 
> instead of
> 
> +   OF_DEV_AUXDATA("arasan,cf-spear1340", MCIF_CF_BASE, NULL, "cf"),

Done, it should be correct on today's linux-next.
-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpRLhgxr5F0h.pgp
Description: PGP signature

Re: [PATCH v2 0/5] Add movablecore_map boot option

2012-11-27 Thread Bob Liu

On Tue, Nov 27, 2012 at 8:49 PM, Tang Chen  wrote:
> On 11/27/2012 08:09 PM, Bob Liu wrote:
>>
>> On Tue, Nov 27, 2012 at 4:29 PM, Tang Chen
>> wrote:
>>>
>>> Hi Liu,
>>>
>>>
>>> This feature is used in memory hotplug.
>>>
>>> In order to implement a whole node hotplug, we need to make sure the
>>> node contains no kernel memory, because memory used by kernel could
>>> not be migrated. (Since the kernel memory is directly mapped,
>>> VA = PA + __PAGE_OFFSET. So the physical address could not be changed.)
>>>
>>> User could specify all the memory on a node to be movable, so that the
>>> node could be hot-removed.
>>>
>>
>> Thank you for your explanation. It's reasonable.
>>
>> But i think it's a bit duplicated with CMA, i'm not sure but maybe we
>> can combine it with CMA which already in mainline?
>>
> Hi Liu,
>
> Thanks for your advice. :)
>
> CMA is Contiguous Memory Allocator, right?  What I'm trying to do is
> controlling where is the start of ZONE_MOVABLE of each node. Could
> CMA do this job ?

CMA will not control the start of ZONE_MOVABLE of each node, but it
can declare a memory area that is always movable,
and no non-movable allocation requests will be served from that area.

Currently CMA uses a boot parameter "cma=" to declare the size of memory
that is always movable.
I think it might fulfill your requirement if the boot
parameter were extended with a start address.

more info at http://lwn.net/Articles/468044/
>
> And also, after a short investigation, CMA seems need to base on
> memblock. But we need to limit memblock not to allocate memory on
> ZONE_MOVABLE. As a result, we need to know the ranges before memblock
> could be used. I'm afraid we still need an approach to get the ranges,
> such as a boot option, or from static ACPI tables such as SRAT/MPST.
>

Yes, it's based on memblock and with boot option.
In setup_arch32()
dma_contiguous_reserve(0);   => will declare a cma area using
memblock_reserve()

> I don't know much about CMA for now. So if you have any better idea,
> please share with us, thanks. :)

My idea is to reuse CMA as in the patch below (not even compiled) and boot with
"cma=size@start_address".
I don't know whether it can work and whether it is suitable for your
requirement; if not, forgive me for the noise.

diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
index 612afcc..564962a 100644
--- a/drivers/base/dma-contiguous.c
+++ b/drivers/base/dma-contiguous.c
@@ -59,11 +59,18 @@ struct cma *dma_contiguous_default_area;
  */
 static const unsigned long size_bytes = CMA_SIZE_MBYTES * SZ_1M;
 static long size_cmdline = -1;
+static long cma_start_cmdline = -1;

 static int __init early_cma(char *p)
 {
+   char *oldp;
pr_debug("%s(%s)\n", __func__, p);
+   oldp = p;
size_cmdline = memparse(p, );
+
+   if (*p == '@')
+   cma_start_cmdline = memparse(p+1, );
+   printk("cma start:0x%x, size: 0x%x\n", size_cmdline, cma_start_cmdline);
return 0;
 }
 early_param("cma", early_cma);
@@ -127,8 +134,10 @@ void __init dma_contiguous_reserve(phys_addr_t limit)
if (selected_size) {
pr_debug("%s: reserving %ld MiB for global area\n", __func__,
 selected_size / SZ_1M);
-
-   dma_declare_contiguous(NULL, selected_size, 0, limit);
+   if (cma_size_cmdline != -1)
+   dma_declare_contiguous(NULL, selected_size,
cma_start_cmdline, limit);
+   else
+   dma_declare_contiguous(NULL, selected_size, 0, limit);
}
 };

-- 
Regards,
--Bob
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 3.7-rc7: BUG: MAX_STACK_TRACE_ENTRIES too low!

2012-11-27 Thread Christian Kujau

On Tue, 27 Nov 2012 at 19:06, Christian Kujau wrote:
> the same thing[0] happened again in 3.7-rc7, after ~20h uptime:

I found the following on patchwork, but this seems to deal with powerpc64 
only, while this PowerBook G4 of mine is powerpc32:

  http://patchwork.ozlabs.org/patch/193414/

It looks related, but then again, I fail to parse assembler...

Christian.

> [40007.339487] [sched_delayed] sched: RT throttling activated
> [69731.388717] BUG: MAX_STACK_TRACE_ENTRIES too low!
> [69731.390371] turning off the locking correctness validator.
> [69731.391942] Call Trace:
> [69731.393525] [c9a61c10] [c0009064] show_stack+0x70/0x1bc (unreliable)
> [69731.395152] [c9a61c50] [c0077460] save_trace+0xfc/0x114
> [69731.396735] [c9a61c60] [c007be20] __lock_acquire+0x1568/0x19b8
> [69731.398296] [c9a61d00] [c007c2c0] lock_acquire+0x50/0x70
> [69731.399857] [c9a61d20] [c0550e28] _raw_spin_lock_irq+0x5c/0x78
> [69731.401419] [c9a61d40] [c054fb58] __schedule+0xd8/0x534
> [69731.402972] [c9a61da0] [c0550094] _cond_resched+0x50/0x68
> [69731.404527] [c9a61db0] [c0479908] dst_gc_task+0xbc/0x258
> [69731.406070] [c9a61e40] [c004eeb8] process_one_work+0x1f4/0x49c
> [69731.407585] [c9a61e80] [c004f644] worker_thread+0x14c/0x400
> [69731.409075] [c9a61eb0] [c0057634] kthread+0xbc/0xc0
> [69731.410521] [c9a61f40] [c0011ad4] ret_from_kernel_thread+0x5c/0x64
> [...repeated 54 times...]
> 
> Anyone knows what this is about?
> 
> Thanks,
> Christian.
> 
> [0] http://lkml.indiana.edu/hypermail/linux/kernel/1211.0/03025.html
-- 
BOFH excuse #191:

Just type 'mv * /dev/null'.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the mfd tree with Linus' tree

2012-11-27 Thread Stephen Rothwell

Hi Samuel,

Today's linux-next merge of the mfd tree got a conflict in
drivers/mfd/twl-core.c between commit 78a3c5ab1749 ("mfd: twl-core: Fix
chip ID for the twl6030-pwm module") from Linus' tree and commit
afc45898f62c ("mfd: twl-core: Support for proper PWM drivers") from the
mfd tree.

I fixed it up (using the version from the mfd tree) and can carry the fix
as necessary (no action is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpbe3PAO3IZK.pgp
Description: PGP signature

Re: [PATCH 17/19] drivers: convert shrinkers to new count/scan API

2012-11-27 Thread Dave Chinner

On Wed, Nov 28, 2012 at 01:13:11AM +, Chris Wilson wrote:
> On Wed, 28 Nov 2012 10:14:44 +1100, Dave Chinner  wrote:
> > +/*
> > + * XXX: (dchinner) This is one of the worst cases of shrinker abuse I've 
> > seen.
> > + *
> > + * i915_gem_purge() expects a byte count to be passed, and the minimum 
> > object
> > + * size is PAGE_SIZE.
> 
> No, purge() expects a count of pages to be freed. Each pass of the
> shrinker therefore tries to free a minimum of 128 pages.

Ah, I got the shifts mixed up. I'd been looking at way too much crap
already when I saw this. But the fact this can be misunderstood says
something about the level of documentation that the code has (i.e.
none).

> > The shrinker doesn't work on bytes - it works on
> > + * *objects*.
> 
> And I thought you were reviewing the shrinker API to be useful where a
> single object may range between 4K and 4G.

Which requires rewriting all the algorithms to not be dependent on
the subsystems using a fixed size object. The shrinker control
function is called shrink_slab() for a reason - it was expected to
be used to shrink caches of fixed sized objects allocated from slab
memory.

It has no concept of the amount of memory that each object consumes,
just an idea of how much *IO* it takes to replace the object in
memory once it's been reclaimed. The DEFAULT_SEEKS is designed to
encode the fact it generally takes 2 IOs to replace either a LRU
page or a filesystem slab object, and so balances the scanning based
on that value. i.e. the shrinker algorithms are solidly based around
fixed sized objects that have some relationship to the cost of
physical IO operations to replace them in the cache.

The API change is the first step in the path to removing these built
in assumptions. The current API is just insane and any attempt to
build on it is going to be futile. The way I see this developing is
this:

- make the shrink_slab count -> scan algorithm per node

- add information about size of objects in the cache for
  fixed size object caches.
- the shrinker now has some idea of how many objects
  need to be freed to be able to free a page of
  memory, as well as the relative penalty for
  replacing them.
- tells the shrinker the size of the cache
  in bytes so overall memory footprint of the caches
  can be taken into account

- add new count and scan operations for caches that are
  based on memory used, not object counts
- allows us to use the same count/scan algorithm for
  calculating how much pressure to put on caches
  with variable size objects.

My care factor mostly ends here, as it will allow XFS to correctly
balance the metadata buffer cache (variable size objects) against the
inode, dentry and dquot caches which are object based. The next
steps that I'm about to give you are based on some discussions with
some MM people over bottles of red wine, so take it with a grain of
salt...

- calculate a "pressure" value for each cache controlled by a
  shrinker so that the relative memory pressure between
  caches can be compared. This allows the shrinkers to bias
  reclaim based on where the memory pressure is being
  generated

- start grouping shrinkers into a hierarchy, allowing
  related shrinkers (e.g. all the caches in a memcg) to be
  shrunk according to resource limits that can be placed on the
  group. i.e. memory pressure is proportioned across
  groups rather than many individual shrinkers.

- comments have been made to the extent that with generic
  per-node lists and a node aware shrinker, all of the page
  scanning could be driven by the shrinker infrastructure,
  rather than the shrinkers being driven by how many pages
  in the page cache just got scanned for reclaim.

  IOWs, the main memory reclaim algorithm walks all the
  shrinkers groups to calculate overall memory pressure,
  calculate how much reclaim is necessary, and then
  proportion reclaim across all the shrinker groups. i.e.
  everything is a shrinker.

This patch set is really just the start of a long process. balance
between the page cache and VFS/filesystem shrinkers is critical to
the efficient operation of the OS under many, many workloads, so I'm
not about to change more than one little thing at a time. This API
change is just one little step. You'll get what you want eventually,
but you're not going to get it as a first step.

> > + * But the craziest part comes when i915_gem_purge() has walked all the 
> > objects
> > + * and can't free any memory. That results in i915_gem_shrink_all() being
> > + * called, which idles the GPU and frees everything the driver has in it's
> > + * active and inactive lists. It's basically

Re: [RFC] [PATCH] fix infinite loop; increase robustness of debugfs_remove_recursive

2012-11-27 Thread Steven Rostedt

On Fri, 2012-11-23 at 18:15 +0100, Lars Ellenberg wrote:
> When toying around with debugfs, intentionally trying to break things,
> I managed to get it into a reproducible endless loop when cleaning up.
> 
> debugfs_remove_recursive() completely ignores that entries found
> on ->d_subdirs may already be d_delete()d, but not yet d_kill()ed.
> 
> In this case, the first two entries found have both been such dentries
> (d_iname = ".", btw), while later in the list there was still a real,
> not yet deleted entry.
> 
> That way, the "goto next_sibling;" did not catch this case,
> the "break the infinit loop if we fail to remove something"
> did not catch it either.
> 
> 
> Disclaimer: I'm not a dentries and inodes guy...

I'm not a dentries or inodes guy either, so I won't comment on the actual
merits of this patch.

> 
> Still, I think the "is this child a non-empty subdir" check
> was just wrong. This is my fix:
> - if (list_empty(>d_subdirs)) 
> + if (!simple_emty(child))

"simple_empty"

> 
> Also, always trying to __debugfs_remove() the first entry found from
> parent->d_subdirs.next is wrong, we need to skip over any already
> deleted children. I introduced the debugfs_find_next_positive_child()
> helper for this.
> 
> If I understand it correctly, if we do it this way, it can never fail.
> That is, as long as we can be sure that no new dentries will be created
> while we are in debugfs_remove_recursive().
> So the 
>   if (debugfs_positive(child)) {
>   /*
>* Avoid infinite loop if we fail to remove
>* one dentry.
> is probably dead code now?
> 
> 
> As an additional fix, to prevent an access after free and resulting Oops,
> I serialize dereferencing of attr->get/set and attr->data with d_delete(),
> using the file->f_dentry->d_parent->d_inode->i_mutex.
> 
> If this file->f_dentry meanwhile has been deleted, simple_attr_read()
> and simple_attr_write() now returns -ESTALE. (Any other error code 
> preferences?)
> 
> 
> With this patch, I was able to
> cd /sys/debugfs/test-module/some/dir
> exec 7< some-file
> rmmod test-module
> cat <&7

I saw this and thought "hmm, I wonder if the trace_events have issues,
as they create debugfs directories and files via modules too". I went
and tried to reproduce but couldn't get past the rmmod, as the module
count gets incremented for any open files that the module creates. I
take it that you didn't add that feature to your test module.

> 
> without any infinite loops, hangs, oopses or other problems,
> and as expected get an ESTALE for the cat.
> 
> Without the patch, I'll get either an infinite loop and rmmod never
> terminates, or cat oopses.
> 
> 
> If you think this is correct, please comment on the FIXME
> below, and help me write a nice commit message.
> 
> If you think this is still wrong or even makes things worse,
> please help me with a proper fix ;-)
> 
> 
> Patch is against upstream as of yesterday,
> but looks like it still applies way back into 2009, 2.6.3x,
> so if it is correct, it may even qualify for the stable branches?
> 

Now, is there any current user of debugfs that is susceptible to this
bug? I'm not saying that these issues shouldn't be fixed. But I'm also
concerned about exploits and other ways that just a root user may
accidentally cause harm. If there's no current problem then maybe this
isn't needed for stable. But it should probably be fixed for the future.

-- Steve


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/3] KVM: x86: let reexecute_instruction work for tdp

2012-11-27 Thread Xiao Guangrong

On 11/28/2012 07:32 AM, Marcelo Tosatti wrote:
> On Tue, Nov 27, 2012 at 11:13:11AM +0800, Xiao Guangrong wrote:
 +static bool reexecute_instruction(struct kvm_vcpu *vcpu, unsigned long 
 cr2)
  {
 -  gpa_t gpa;
 +  gpa_t gpa = cr2;
pfn_t pfn;

 -  if (tdp_enabled)
 +  if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
return false;
>>>
>>> How is indirect_shadow_pages protected? Why is ACCESS_ONCE() being used
>>> to read it?
>>
>> Hi Marcelo,
>>
>> It is protected by mmu-lock for it only be changed when mmu-lock is hold. And
>> ACCESS_ONCE is used on read path avoiding magic optimization from compiler.
> 
> Please switch to mmu_lock protection, there is no reason to have access
> to this variable locklessly - not performance critical.
> 
> For example, there is no use of barriers when modifying the variable.

This is not bad; the worst case is that the direct mmu fails to unprotect the
shadow pages (it sees indirect_shadow_pages = 0 while there are still pages
being shadowed). After entering the guest, we will go into
reexecute_instruction again, and then it will remove the shadow pages.

But, i do not have strong opinion on it, i respect your idea! :)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v6 5/6] PM / Runtime: force memory allocation with no I/O during Runtime PM callback

2012-11-27 Thread Ming Lei

On Wed, Nov 28, 2012 at 5:24 AM, Rafael J. Wysocki  wrote:
>
> Please don't duplicate code this way.
>
> You can move that whole thing to rpm_callback().  Yes, you'll probably need to
> check dev->power.memalloc_noio twice in there, but that's OK.

Good idea, I will update it in v7.

Thanks,
--
Ming Lei
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

3.7-rc7: BUG: MAX_STACK_TRACE_ENTRIES too low!

2012-11-27 Thread Christian Kujau

Hi,

the same thing[0] happened again in 3.7-rc7, after ~20h uptime:

[40007.339487] [sched_delayed] sched: RT throttling activated
[69731.388717] BUG: MAX_STACK_TRACE_ENTRIES too low!
[69731.390371] turning off the locking correctness validator.
[69731.391942] Call Trace:
[69731.393525] [c9a61c10] [c0009064] show_stack+0x70/0x1bc (unreliable)
[69731.395152] [c9a61c50] [c0077460] save_trace+0xfc/0x114
[69731.396735] [c9a61c60] [c007be20] __lock_acquire+0x1568/0x19b8
[69731.398296] [c9a61d00] [c007c2c0] lock_acquire+0x50/0x70
[69731.399857] [c9a61d20] [c0550e28] _raw_spin_lock_irq+0x5c/0x78
[69731.401419] [c9a61d40] [c054fb58] __schedule+0xd8/0x534
[69731.402972] [c9a61da0] [c0550094] _cond_resched+0x50/0x68
[69731.404527] [c9a61db0] [c0479908] dst_gc_task+0xbc/0x258
[69731.406070] [c9a61e40] [c004eeb8] process_one_work+0x1f4/0x49c
[69731.407585] [c9a61e80] [c004f644] worker_thread+0x14c/0x400
[69731.409075] [c9a61eb0] [c0057634] kthread+0xbc/0xc0
[69731.410521] [c9a61f40] [c0011ad4] ret_from_kernel_thread+0x5c/0x64
[...repeated 54 times...]

Anyone knows what this is about?

Thanks,
Christian.

[0] http://lkml.indiana.edu/hypermail/linux/kernel/1211.0/03025.html
-- 
BOFH excuse #235:

The new frame relay network hasn't bedded down the software loop transmitter 
yet.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: build failure after merge of the powerpc tree

2012-11-27 Thread Stephen Rothwell

Hi all,

After merging the powerpc tree, next-20121115's build (powerpc
allmodconfig) failed like this:

ERROR: ".of_reconfig_notifier_register" [drivers/crypto/nx/nx-compress.ko] 
undefined!
ERROR: ".of_reconfig_notifier_unregister" [drivers/crypto/nx/nx-compress.ko] 
undefined!

Caused by commit 1cf3d8b3d24c ("powerpc+of: Add of node/property
notification chain for adds and removes").

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpxQllmM276w.pgp
Description: PGP signature

A question about proc file system

2012-11-27 Thread Yangbin (Robin)

Hi all:

I wrote a kernel module which makes a dir and creates a file in the proc file
system.
I use proc_mkdir() and create_proc_entry() in init and remove_proc_entry()
in exit.
Now I hit a problem during my test:
1. Open terminal A, insmod, and cd into the dir the module created in proc.
2. Open terminal B, rmmod and insmod again.
Then I cannot see the file which should be created by the module, unless
terminal A leaves the dir.

This works fine with a regular file system like ext3, where one terminal's
operations do not affect others.
Is this a bug or not?

Regards
Robin

Re: [PATCH v4] drivers: staging: remove last usage of NIPQUAD and NIP6 in gdm72xx

2012-11-27 Thread Shan Wei

Greg KH said, at 2012/11/28 1:19:
> On Tue, Nov 27, 2012 at 10:37:46AM +0800, Shan Wei wrote:
>> From: Shan Wei 
>>
>> commit cf4ca4874fc45 removed the definition of NIPQUAD and NIPQUAD_FMT,
>> and NIP6 also is out of date.
>>
>> Because DEBUG_SDU is not defined in gdm_wimax.h, no error message when 
>> compiling
>> this code. And remove constant condition judge.
>>
>>
>> Signed-off-by: Shan Wei 
>> ---
>> V3:
>>As suggestion of greg k-h's patch email bot, recreate this patch base on 
>> greg's staging tree,
>>v2 is on David Miller's net-next tree.
> 
> Are you sure you got the right branch?  This does not apply to my
> staging-next branch of the staging.git tree, what did you use?

I just created the patch based on the master branch of your staging.git, not
the staging-next branch.
(It's different from David Miller, who has a separate git tree for next
development. :-( )

-

>From 0b48de95a28f68f6d32f25b77ecc006a9d8a94b9 Mon Sep 17 00:00:00 2001
From: Shan Wei 
Date: Wed, 28 Nov 2012 10:45:00 +0800
Subject: [PATCH staging-next v4] drivers: staging: use %pI4 format to print 
IPv4 address and remove last usage of NIP6

commit cf4ca4874fc45 removed the definition of NIPQUAD and NIPQUAD_FMT,
and NIP6 also is out of date.

commit 2874762b31d8d replaced the deprecated NIPQUAD macro with C code, but we
can use %pI4 to print an IPv4 address more simply. Also remove the constant
condition check.

Because DEBUG_SDU is not defined in gdm_wimax.h, there is no error message when
compiling.


Signed-off-by: Shan Wei 
---
 drivers/staging/gdm72xx/gdm_wimax.c |   13 +
 1 files changed, 1 insertions(+), 12 deletions(-)

diff --git a/drivers/staging/gdm72xx/gdm_wimax.c 
b/drivers/staging/gdm72xx/gdm_wimax.c
index c302769..41efbee 100644
--- a/drivers/staging/gdm72xx/gdm_wimax.c
+++ b/drivers/staging/gdm72xx/gdm_wimax.c
@@ -168,24 +168,13 @@ static void dump_eth_packet(const char *title, u8 *data, 
int len)
get_ip_protocol_name(ip_protocol),
get_port_name(port));
 
-   #if 1
if (!(data[0] == 0xff && data[1] == 0xff)) {
if (protocol == ETH_P_IP) {
-   printk(KERN_DEBUG " src=%u.%u.%u.%u\n",
-   ((unsigned char *)&(ih->saddr))[0],
-   ((unsigned char *)&(ih->saddr))[1],
-   ((unsigned char *)&(ih->saddr))[2],
-   ((unsigned char *)&(ih->saddr))[3]);
+   printk(KERN_DEBUG " src=%pI4\n", >saddr);
} else if (protocol == ETH_P_IPV6) {
-   #ifdef NIP6
-   printk(KERN_DEBUG " src=%x:%x:%x:%x:%x:%x:%x:%x\n",
-   NIP6(ih->saddr));
-   #else
printk(KERN_DEBUG " src=%pI6\n", >saddr);
-   #endif
}
}
-   #endif
 
#if (DUMP_PACKET & DUMP_SDU_ALL)
printk_hex(data, len);
-- 
1.7.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH 05/06] input/rmi4: F01 - device control

2012-11-27 Thread Christopher Heiny


On 11/27/2012 01:29 AM, Dmitry Torokhov wrote:

On Mon, Nov 26, 2012 at 02:31:27PM -0800, Christopher Heiny wrote:

>On 11/26/2012 01:40 AM, Dmitry Torokhov wrote:

> >Hi Christopher,
> >
> >On Fri, Nov 16, 2012 at 07:58:53PM -0800, Christopher Heiny wrote:

> >>RMI Function 01 implements basic device control and power management
> >>behaviors for the RMI4 sensor.
> >>
> >>rmi_f01.h exports definitions that we expect to be used by other 
functionality
> >>in the future (such as firmware reflash).

> >
> >Please see my comments below.

>
>Hi Dmitry,
>
>Thanks for the feedback and the patch.  I've got just one question,
>included below, with a bunch of snipping).
>
>Chris
>

> >

> >>
> >>
> >>Signed-off-by: Christopher Heiny
> >>
> >>Cc: Dmitry Torokhov
> >>Cc: Linus Walleij
> >>Cc: Naveen Kumar Gaddipati
> >>Cc: Joeri de Gram
> >>
> >>
> >>---
> >>
> >>  drivers/input/rmi4/rmi_f01.c | 1348 
++
> >>  drivers/input/rmi4/rmi_f01.h |  160 +
> >>  2 files changed, 1508 insertions(+), 0 deletions(-)
> >>
> >>diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c
> >>new file mode 100644
> >>index 000..038266c
> >>--- /dev/null
> >>+++ b/drivers/input/rmi4/rmi_f01.c
> >>@@ -0,0 +1,1348 @@
> >>+/*
> >>+ * Copyright (c) 2011-2012 Synaptics Incorporated
> >>+ * Copyright (c) 2011 Unixphere
> >>+ *
> >>+ * This program is free software; you can redistribute it and/or modify
> >>+ * it under the terms of the GNU General Public License as published by
> >>+ * the Free Software Foundation; either version 2 of the License, or
> >>+ * (at your option) any later version.

>
>[snip]
>

> >>+/**
> >>+ * @reset - set this bit to force a firmware reset of the sensor.
> >>+ */
> >>+struct f01_device_commands {
> >>+  bool reset:1;
> >>+  u8 reserved:7;

> >
> >When specifying bitwise fields please use u8, u16, etc only.

>
>Um, OK.  Previously patch feedback suggested to use bool instead of
>u8 for single bit fields (see here:
>http://www.spinics.net/lists/linux-input/msg22198.html).  So I'm a
>little confused.  It's no big deal to change it back, but I'd like
>confirmation that it is really what we should do.

>

I believe that it is better to specify exact bitness of the base type of
the bitfield so you are not surprised by the alignment.


OK, thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[ANNOUNCE] 3.0.53-rt77

2012-11-27 Thread Steven Rostedt


Dear RT Folks,

I'm pleased to announce the 3.0.53-rt77 stable release.


This release is just an update to the new stable 3.0.53 version
and no RT specific changes have been made.


You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  Head SHA1: 7399b1a4e535957c4ab7c07818e25433ac414d23


Or to build 3.0.53-rt77 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v3.0/linux-3.0.tar.xz

  http://www.kernel.org/pub/linux/kernel/v3.0/patch-3.0.53.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.0/patch-3.0.53-rt77.patch.xz



Enjoy,

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[ANNOUNCE] 3.4.20-rt31

2012-11-27 Thread Steven Rostedt


Dear RT Folks,

I'm pleased to announce the 3.4.20-rt31 stable release.


This release is just an update to the new stable 3.4.20 version
and no RT specific changes have been made.


You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  Head SHA1: f547c9596af656dff474b0fa2f5c0c1a4e6876fa


Or to build 3.4.20-rt31 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v3.x/linux-3.4.tar.xz

  http://www.kernel.org/pub/linux/kernel/v3.x/patch-3.4.20.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.4/patch-3.4.20-rt31.patch.xz



Enjoy,

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFT RESEND] regulator: pcf50633: Use linear_min_sel and regulator_[map|list]_voltage_linear

2012-11-27 Thread Axel Lin

This driver can be converted to use linear_min_sel and
regulator_[map|list]_voltage_linear.

Below shows the equation (from Datasheet) for each LDOs.

For AUTOOUT:
VO(prog) = 0.625 + auto_out x 0.025 V; e.g.
(00000000 to 00101110: reserved)
00101111: 1.8 V (min)
01010011: 2.7 V
01101010: 3.275 V
01101011: 3.300 V
01101100: 3.325 V
01111111 : 3.800 V (max)

The linear mapping starts from selector 0x2f.

Thus we convert this equation to:
VO(prog) = 1.8 + (selector - linear_min_sel) x 0.025 V
(min_uV = 1800000, uV_step = 25000, linear_min_sel = 0x2f)

For DOWNxOUT:
VO(prog) = 0.625 + downx_out x 0.025 V; e.g.
00000000 : 0.625 V (min)
00010111 : 1.200 V
00101111 : 1.800 V
01011111 : 3.000 V (max)

For xLDOOUT:
VO(prog) = 0.9 + xldo_out x 0.1 V; e.g.
00000 : 0.9 V
00001 : 1.0 V
11000 : 3.3 V
11011 : 3.6 V

Signed-off-by: Axel Lin 
---
It seems that copying and pasting directly from the datasheet (PDF) added some
UTF-8 characters to the patch.
Fixed it and resending.

Axel
 drivers/regulator/pcf50633-regulator.c |  176 +---
 1 file changed, 28 insertions(+), 148 deletions(-)

diff --git a/drivers/regulator/pcf50633-regulator.c 
b/drivers/regulator/pcf50633-regulator.c
index d776f51..534075e 100644
--- a/drivers/regulator/pcf50633-regulator.c
+++ b/drivers/regulator/pcf50633-regulator.c
@@ -24,12 +24,15 @@
 #include 
 #include 
 
-#define PCF50633_REGULATOR(_name, _id, _n) \
+#define PCF50633_REGULATOR(_name, _id, _min_uV, _uV_step, _min_sel, _n) \
{   \
.name = _name,  \
.id = PCF50633_REGULATOR_##_id, \
.ops = _regulator_ops, \
.n_voltages = _n,   \
+   .min_uV = _min_uV,  \
+   .uV_step = _uV_step,\
+   .linear_min_sel = _min_sel, \
.type = REGULATOR_VOLTAGE,  \
.owner = THIS_MODULE,   \
.vsel_reg = PCF50633_REG_##_id##OUT,\
@@ -38,162 +41,39 @@
.enable_mask = PCF50633_REGULATOR_ON,   \
}
 
-/* Bits from voltage value */
-static u8 auto_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 1800)
-   return 0x2f;
-   if (millivolts > 3800)
-   return 0xff;
-
-   millivolts -= 625;
-
-   return millivolts / 25;
-}
-
-static u8 down_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 625)
-   return 0;
-   else if (millivolts > 3000)
-   return 0xff;
-
-   millivolts -= 625;
-
-   return millivolts / 25;
-}
-
-static u8 ldo_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 900)
-   return 0;
-   else if (millivolts > 3600)
-   return 0x1f;
-
-   millivolts -= 900;
-   return millivolts / 100;
-}
-
-/* Obtain voltage value from bits */
-static unsigned int auto_voltage_value(u8 bits)
-{
-   /* AUTOOUT:  to 00101110 are reserved.
-* Return 0 for bits in reserved range, which means this selector code
-* can't be used on this system */
-   if (bits < 0x2f)
-   return 0;
-
-   return 625 + (bits * 25);
-}
-
-
-static unsigned int down_voltage_value(u8 bits)
-{
-   return 625 + (bits * 25);
-}
-
-
-static unsigned int ldo_voltage_value(u8 bits)
-{
-   bits &= 0x1f;
-
-   return 900 + (bits * 100);
-}
-
-static int pcf50633_regulator_map_voltage(struct regulator_dev *rdev,
- int min_uV, int max_uV)
-{
-   struct pcf50633 *pcf;
-   int regulator_id, millivolts;
-   u8 volt_bits;
-
-   pcf = rdev_get_drvdata(rdev);
-
-   regulator_id = rdev_get_id(rdev);
-   if (regulator_id >= PCF50633_NUM_REGULATORS)
-   return -EINVAL;
-
-   millivolts = min_uV / 1000;
-
-   switch (regulator_id) {
-   case PCF50633_REGULATOR_AUTO:
-   volt_bits = auto_voltage_bits(millivolts);
-   break;
-   case PCF50633_REGULATOR_DOWN1:
-   case PCF50633_REGULATOR_DOWN2:
-   volt_bits = down_voltage_bits(millivolts);
-   break;
-   case PCF50633_REGULATOR_LDO1:
-   case PCF50633_REGULATOR_LDO2:
-   case PCF50633_REGULATOR_LDO3:
-   case PCF50633_REGULATOR_LDO4:
-   case PCF50633_REGULATOR_LDO5:
-   case PCF50633_REGULATOR_LDO6:
-   case PCF50633_REGULATOR_HCLDO:
-   case PCF50633_REGULATOR_MEMLDO:
-   volt_bits = ldo_voltage_bits(millivolts);
-   break;
-   default:
-   return

[PATCH -next 3/3] staging/fwserial: Remove superfluous free

2012-11-27 Thread Peter Hurley

Now that the dma fifo is allocated on activate and freed on
shutdown, this extra free is harmless but unnecessary.

Signed-off-by: Peter Hurley 
---
 drivers/staging/fwserial/fwserial.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/staging/fwserial/fwserial.c 
b/drivers/staging/fwserial/fwserial.c
index 0681967..61ee290 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -949,7 +949,6 @@ static void fwserial_destroy(struct kref *kref)
 
for (j = 0; j < num_ports; ++j) {
fw_core_remove_address_handler([j]->rx_handler);
-   dma_fifo_free([j]->tx_fifo);
tty_port_destroy([j]->port);
kfree(ports[j]);
}
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -next 2/3] staging/fwserial: Use WARN_ONCE when port table is corrupted

2012-11-27 Thread Peter Hurley


Signed-off-by: Peter Hurley 
---
 drivers/staging/fwserial/fwserial.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/staging/fwserial/fwserial.c 
b/drivers/staging/fwserial/fwserial.c
index 99a2d2d..0681967 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -939,14 +939,9 @@ static void fwserial_destroy(struct kref *kref)
 
mutex_lock(_table_lock);
for (j = 0; j < num_ports; ++i, ++j) {
-   static bool once;
-   int corrupt = port_table[i] != ports[j];
-   if (corrupt && !once) {
-   WARN(corrupt, "port_table[%d]: %p != ports[%d]: %p",
-i, port_table[i], j, ports[j]);
-   once = true;
-   port_table_corrupt = true;
-   }
+   port_table_corrupt |= port_table[i] != ports[j];
+   WARN_ONCE(port_table_corrupt, "port_table[%d]: %p != ports[%d]: 
%p",
+i, port_table[i], j, ports[j]);
 
port_table[i] = NULL;
}
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -next 1/3] staging/fwserial: Destruct embedded tty_port on teardown

2012-11-27 Thread Peter Hurley

For TTY drivers that manage the port lifetime, the tty_port should
be specifically destructed when the port lifetime ends. Now that
a method has been added to do this, use it.

Signed-off-by: Peter Hurley 
Cc: Jiri Slaby 
Cc: Alan Cox 
---
 drivers/staging/fwserial/fwserial.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/staging/fwserial/fwserial.c 
b/drivers/staging/fwserial/fwserial.c
index 5d4d64a..99a2d2d 100644
--- a/drivers/staging/fwserial/fwserial.c
+++ b/drivers/staging/fwserial/fwserial.c
@@ -955,6 +955,7 @@ static void fwserial_destroy(struct kref *kref)
for (j = 0; j < num_ports; ++j) {
fw_core_remove_address_handler([j]->rx_handler);
dma_fifo_free([j]->tx_fifo);
+   tty_port_destroy([j]->port);
kfree(ports[j]);
}
kfree(serial);
@@ -2369,8 +2370,10 @@ unregister_ttys:
return err;
 
 free_ports:
-   for (--i; i >= 0; --i)
+   for (--i; i >= 0; --i) {
+   tty_port_destroy(>ports[i]->port);
kfree(serial->ports[i]);
+   }
kfree(serial);
return err;
 }
-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH -next 0/3] staging/fwserial: teardown cleanup

2012-11-27 Thread Peter Hurley

Hi Greg,

This patch series fixes up staging/fwserial to meet the new requirements
in Jiri Slaby's series "TTY: memory leaks patchset" (in tty-next).

Strictly speaking, only PATCH 1/3 implements the necessary cleanup.
PATCHES 2/3 & 3/3 are additional cleanups in the same area.

Peter Hurley (3):
  staging/fwserial: Destruct embedded tty_port on teardown
  staging/fwserial: Use WARN_ONCE when port table is corrupted
  staging/fwserial: Remove superfluous free

 drivers/staging/fwserial/fwserial.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

-- 
1.8.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/3] zram: give up lazy initialization of zram metadata

2012-11-27 Thread Minchan Kim

1) Users of zram normally do mkfs.xxx or mkswap before using
   the zram block device (e.g. normally at boot time).
   That ends up allocating the zram metadata before real usage, so the
   benefit of lazy initialization is mitigated.

2) Some users want to use zram when memory pressure is high (i.e. load zram
   dynamically, NOT at boot time). That makes sense because people don't
   want to waste memory until memory pressure is high (i.e. when zram is really
   helpful). In this case, lazy initialization could easily fail
   because we will use GFP_NOIO instead of GFP_KERNEL to avoid deadlock.
   So the benefit of lazy initialization is mitigated, too.

3) Metadata overhead is not critical and Nitin has a plan to diet it.
   4K : 12 byte(64bit machine) -> 64G : 192M so 0.3% isn't big overhead
   If an insane user uses up to 20 such big zram devices, it could consume 6% of ram
   but the efficiency of zram will cover the waste.

So this patch gives up lazy initialization and instead we initialize metadata
at disksize setting time.

Signed-off-by: Minchan Kim 
---
 drivers/staging/zram/zram_drv.c   |   19 ---
 drivers/staging/zram/zram_sysfs.c |1 +
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index b036dcc..e04aefc 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -441,16 +441,13 @@ static void zram_make_request(struct request_queue 
*queue, struct bio *bio)
 {
struct zram *zram = queue->queuedata;
 
-   if (unlikely(!zram->init_done) && zram_init_device(zram))
-   goto error;
-
down_read(>init_lock);
if (unlikely(!zram->init_done))
-   goto error_unlock;
+   goto error;
 
if (!valid_io_request(zram, bio)) {
zram_stat64_inc(zram, >stats.invalid_io);
-   goto error_unlock;
+   goto error;
}
 
__zram_make_request(zram, bio, bio_data_dir(bio));
@@ -458,9 +455,8 @@ static void zram_make_request(struct request_queue *queue, 
struct bio *bio)
 
return;
 
-error_unlock:
-   up_read(>init_lock);
 error:
+   up_read(>init_lock);
bio_io_error(bio);
 }
 
@@ -509,17 +505,12 @@ void zram_reset_device(struct zram *zram)
up_write(>init_lock);
 }
 
+/* zram->init_lock should be held */
 int zram_init_device(struct zram *zram)
 {
int ret;
size_t num_pages;
 
-   down_write(>init_lock);
-   if (zram->init_done) {
-   up_write(>init_lock);
-   return 0;
-   }
-
if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) {
pr_info(
"There is little point creating a zram of greater than "
@@ -568,7 +559,6 @@ int zram_init_device(struct zram *zram)
}
 
zram->init_done = 1;
-   up_write(>init_lock);
 
pr_debug("Initialization done!\n");
return 0;
@@ -578,7 +568,6 @@ fail_no_table:
zram->disksize = 0;
 fail:
__zram_reset_device(zram);
-   up_write(>init_lock);
pr_err("Initialization failed: err=%d\n", ret);
return ret;
 }
diff --git a/drivers/staging/zram/zram_sysfs.c 
b/drivers/staging/zram/zram_sysfs.c
index 4143af9..369db12 100644
--- a/drivers/staging/zram/zram_sysfs.c
+++ b/drivers/staging/zram/zram_sysfs.c
@@ -71,6 +71,7 @@ static ssize_t disksize_store(struct device *dev,
 
zram->disksize = PAGE_ALIGN(disksize);
set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+   zram_init_device(zram);
up_write(>init_lock);
 
return len;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/3] zram: get rid of lockdep warning

2012-11-27 Thread Minchan Kim

Lockdep complains about a recursive deadlock on zram->init_lock.
[1] made this a false positive, because we can't issue I/O to zram
before setting disksize. Still, we should silence lockdep to
avoid a flood of reports from users.

This patch allocates zram's metadata outside of the lock to fix it.
In addition, this patch replaces GFP_KERNEL with GFP_NOIO/GFP_ATOMIC
in the request handling path for partial I/O.

[1] zram: give up lazy initialization of zram metadata

Signed-off-by: Minchan Kim 
---
 drivers/staging/zram/zram_drv.c   |  139 -
 drivers/staging/zram/zram_drv.h   |   12 +++-
 drivers/staging/zram/zram_sysfs.c |   13 ++--
 3 files changed, 63 insertions(+), 101 deletions(-)

diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index e04aefc..a19059e 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -71,22 +71,22 @@ static void zram_stat64_inc(struct zram *zram, u64 *v)
zram_stat64_add(zram, v, 1);
 }
 
-static int zram_test_flag(struct zram *zram, u32 index,
+static int zram_test_flag(struct zram_meta *meta, u32 index,
enum zram_pageflags flag)
 {
-   return zram->table[index].flags & BIT(flag);
+   return meta->table[index].flags & BIT(flag);
 }
 
-static void zram_set_flag(struct zram *zram, u32 index,
+static void zram_set_flag(struct zram_meta *meta, u32 index,
enum zram_pageflags flag)
 {
-   zram->table[index].flags |= BIT(flag);
+   meta->table[index].flags |= BIT(flag);
 }
 
-static void zram_clear_flag(struct zram *zram, u32 index,
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
enum zram_pageflags flag)
 {
-   zram->table[index].flags &= ~BIT(flag);
+   meta->table[index].flags &= ~BIT(flag);
 }
 
 static int page_zero_filled(void *ptr)
@@ -106,16 +106,17 @@ static int page_zero_filled(void *ptr)
 
 static void zram_free_page(struct zram *zram, size_t index)
 {
-   unsigned long handle = zram->table[index].handle;
-   u16 size = zram->table[index].size;
+   struct zram_meta *meta = zram->meta;
+   unsigned long handle = meta->table[index].handle;
+   u16 size = meta->table[index].size;
 
if (unlikely(!handle)) {
/*
 * No memory is allocated for zero filled pages.
 * Simply clear zero page flag.
 */
-   if (zram_test_flag(zram, index, ZRAM_ZERO)) {
-   zram_clear_flag(zram, index, ZRAM_ZERO);
+   if (zram_test_flag(meta, index, ZRAM_ZERO)) {
+   zram_clear_flag(meta, index, ZRAM_ZERO);
zram_stat_dec(>stats.pages_zero);
}
return;
@@ -124,17 +125,17 @@ static void zram_free_page(struct zram *zram, size_t 
index)
if (unlikely(size > max_zpage_size))
zram_stat_dec(>stats.bad_compress);
 
-   zs_free(zram->mem_pool, handle);
+   zs_free(meta->mem_pool, handle);
 
if (size <= PAGE_SIZE / 2)
zram_stat_dec(>stats.good_compress);
 
zram_stat64_sub(zram, >stats.compr_size,
-   zram->table[index].size);
+   meta->table[index].size);
zram_stat_dec(>stats.pages_stored);
 
-   zram->table[index].handle = 0;
-   zram->table[index].size = 0;
+   meta->table[index].handle = 0;
+   meta->table[index].size = 0;
 }
 
 static void handle_zero_page(struct bio_vec *bvec)
@@ -159,20 +160,21 @@ static int zram_decompress_page(struct zram *zram, char 
*mem, u32 index)
int ret = LZO_E_OK;
size_t clen = PAGE_SIZE;
unsigned char *cmem;
-   unsigned long handle = zram->table[index].handle;
+   struct zram_meta *meta = zram->meta;
+   unsigned long handle = meta->table[index].handle;
 
-   if (!handle || zram_test_flag(zram, index, ZRAM_ZERO)) {
+   if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
memset(mem, 0, PAGE_SIZE);
return 0;
}
 
-   cmem = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
-   if (zram->table[index].size == PAGE_SIZE)
+   cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+   if (meta->table[index].size == PAGE_SIZE)
memcpy(mem, cmem, PAGE_SIZE);
else
-   ret = lzo1x_decompress_safe(cmem, zram->table[index].size,
+   ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
mem, );
-   zs_unmap_object(zram->mem_pool, handle);
+   zs_unmap_object(meta->mem_pool, handle);
 
/* Should NEVER happen. Return bio error if it does. */
if (unlikely(ret != LZO_E_OK)) {
@@ -190,11 +192,11 @@ static int zram_bvec_read(struct zram *zram, struct 
bio_vec *bvec,
int ret;
struct page *page;
unsigned char *user_mem, *uncmem =

[PATCH 1/3] zram: force disksize setting before using zram

2012-11-27 Thread Minchan Kim

Currently the zram documentation says "set disksize is optional",
but that is partly wrong. The first time you use zram after
booting, you must set disksize; otherwise zram can't work because
the zram gendisk's size is 0. But once you have done so, you can use zram
freely after a reset because, paradoxically, reset doesn't reset the
disksize to zero. So in that case, setting disksize is optional. :(
This is inconsistent behavior for the user and not straightforward.

This patch always forces disksize to be set before zram is used.
Yes, it changes current behavior, so someone could complain when
upgrading zram. That could be a problem if zram were in mainline,
but it still lives in staging, so its behavior can be changed to do the
right thing. We ask users to excuse the change.

Signed-off-by: Minchan Kim 
---
 drivers/staging/zram/zram.txt |   27 +-
 drivers/staging/zram/zram_drv.c   |   55 +
 drivers/staging/zram/zram_drv.h   |5 +---
 drivers/staging/zram/zram_sysfs.c |6 +---
 4 files changed, 35 insertions(+), 58 deletions(-)

diff --git a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt
index 5f75d29..765d790 100644
--- a/drivers/staging/zram/zram.txt
+++ b/drivers/staging/zram/zram.txt
@@ -23,17 +23,17 @@ Following shows a typical sequence of steps for using zram.
This creates 4 devices: /dev/zram{0,1,2,3}
(num_devices parameter is optional. Default: 1)
 
-2) Set Disksize (Optional):
-   Set disk size by writing the value to sysfs node 'disksize'
-   (in bytes). If disksize is not given, default value of 25%
-   of RAM is used.
-
-   # Initialize /dev/zram0 with 50MB disksize
-   echo $((50*1024*1024)) > /sys/block/zram0/disksize
-
-   NOTE: disksize cannot be changed if the disk contains any
-   data. So, for such a disk, you need to issue 'reset' (see below)
-   before you can change its disksize.
+2) Set Disksize
+Set disk size by writing the value to sysfs node 'disksize'.
+The value can be either in bytes or you can use mem suffixes.
+Examples:
+# Initialize /dev/zram0 with 50MB disksize
+echo $((50*1024*1024)) > /sys/block/zram0/disksize
+
+# Using mem suffixes
+echo 256K > /sys/block/zram0/disksize
+echo 512M > /sys/block/zram0/disksize
+echo 1G > /sys/block/zram0/disksize
 
 3) Activate:
mkswap /dev/zram0
@@ -65,8 +65,9 @@ Following shows a typical sequence of steps for using zram.
echo 1 > /sys/block/zram0/reset
echo 1 > /sys/block/zram1/reset
 
-   (This frees all the memory allocated for the given device).
-
+   This frees all the memory allocated for the given device and
+   resets the disksize to zero. You must set the disksize again
+   before reusing the device.
 
 Please report any problems at:
  - Mailing list: linux-mm-cc at laptop dot org
diff --git a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
index fb4a7c9..b036dcc 100644
--- a/drivers/staging/zram/zram_drv.c
+++ b/drivers/staging/zram/zram_drv.c
@@ -104,35 +104,6 @@ static int page_zero_filled(void *ptr)
return 1;
 }
 
-static void zram_set_disksize(struct zram *zram, size_t totalram_bytes)
-{
-   if (!zram->disksize) {
-   pr_info(
-   "disk size not provided. You can use disksize_kb module "
-   "param to specify size.\nUsing default: (%u%% of RAM).\n",
-   default_disksize_perc_ram
-   );
-   zram->disksize = default_disksize_perc_ram *
-   (totalram_bytes / 100);
-   }
-
-   if (zram->disksize > 2 * (totalram_bytes)) {
-   pr_info(
-   "There is little point creating a zram of greater than "
-   "twice the size of memory since we expect a 2:1 compression "
-   "ratio. Note that zram uses about 0.1%% of the size of "
-   "the disk when not in use so a huge zram is "
-   "wasteful.\n"
-   "\tMemory Size: %zu kB\n"
-   "\tSize you selected: %llu kB\n"
-   "Continuing anyway ...\n",
-   totalram_bytes >> 10, zram->disksize
-   );
-   }
-
-   zram->disksize &= PAGE_MASK;
-}
-
 static void zram_free_page(struct zram *zram, size_t index)
 {
unsigned long handle = zram->table[index].handle;
@@ -497,6 +468,9 @@ void __zram_reset_device(struct zram *zram)
 {
size_t index;
 
+   if (!zram->init_done)
+   goto out;
+
zram->init_done = 0;
 
/* Free various per-device buffers */
@@ -523,8 +497,9 @@ void __zram_reset_device(struct zram *zram)
 
/* Reset stats */
memset(>stats, 0, sizeof(zram->stats));
-
+out:
zram->disksize = 0;
+   set_capacity(zram->disk, 0);
 }
 
 void zram_reset_device(struct zram *zram)
@@ -540,13 +515,24 @@ int zram_init_device(struct zram *zram)
size_t

[PATCH V2] watchdog: optimizing the hrtimer interval for power saving

2012-11-27 Thread Chuansheng Liu


By default the watchdog threshold is 10, which means every CPU
receives one hrtimer interrupt every 4s. On low-power
devices this causes a 4-5mV power impact while the device is in deep
sleep.

So here want to optimize it as below:
4s + 4s + 4s + 4s + 4s
== >
1s + 9s + 9s ...
Or
1s + 1s..+ 9s + 9s 

For soft lockup detection, there will be more than 5 chances to
hit; once one chance succeeds, we start the 9s hrtimer
instead of the 1s one.

For hard lockup detection, there will be more than 2 chances to hit.
As Don said, the minimum window is 10s, and only when the CPU is always
running at MAX frequency. In most cases the window is 60s, so we should
have many more than 2 chances.

With this patch, in most cases the hrtimer interval will be 9s instead
of 4s on average, which does save device power.

Change log:
Since V1: In V1, Don pointed out, "12 seconds will miss the window
  repeatedly." So here set the long period < min window 10s.

Signed-off-by: liu chuansheng 
---
 kernel/watchdog.c |   30 --
 1 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dd4b80a..b37d682 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -125,7 +125,24 @@ static u64 get_sample_period(void)
 * and hard thresholds) to increment before the
 * hardlockup detector generates a warning
 */
-   return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+   return get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 20);
+}
+
+static u64 get_long_sample_period(void)
+{
+   /*
+* convert watchdog_thresh from seconds to ns
+* We want to give 5 chances to detect softlockup,
+* for power saving, once one chance is succeeding,
+* we can set long period to avoid power consumption.
+* Currently, set the long sample period is:
+* 9s = 10s - 1s, the reason is for covering the window
+* of nmi interrupt 10s interval.
+* So at least, for hard lockup, it has >=2 chances,
+* for soft lockup, it has >= 5 chances.
+*
+*/
+   return (watchdog_thresh * (u64)NSEC_PER_SEC) - get_sample_period();
 }
 
 /* Commands for resetting the watchdog */
@@ -267,6 +284,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct 
hrtimer *hrtimer)
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
+   bool is_touched;
+
+   is_touched = (__this_cpu_read(hrtimer_interrupts) ==
+   __this_cpu_read(soft_lockup_hrtimer_cnt));
 
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -275,7 +296,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct 
hrtimer *hrtimer)
wake_up_process(__this_cpu_read(softlockup_watchdog));
 
/* .. and repeat */
-   hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+   if (is_touched) {
+   hrtimer_forward_now(hrtimer,
+   ns_to_ktime(get_long_sample_period()));
+   } else {
+   hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
+   }
 
if (touch_ts == 0) {
if (unlikely(__this_cpu_read(softlockup_touch_sync))) {
-- 
1.7.0.4



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 6/6 v6] cpufreq, highbank: add support for highbank cpufreq

2012-11-27 Thread Shawn Guo

On Tue, Nov 27, 2012 at 02:04:32PM -0600, Mark Langsdorf wrote:
> Highbank processors depend on the external ECME to perform voltage
> management based on a requested frequency. Communication between the
> A9 cores and the ECME happens over the pl320 IPC channel.

...

> +static int hb_voltage_change(unsigned int freq)
> +{
> + int i;
> + u32 msg[7];
> +
> + msg[0] = HB_CPUFREQ_CHANGE_NOTE;
> + msg[1] = freq / 1000;
> + for (i = 2; i < 7; i++)
> + msg[i] = 0;
> +
> + return pl320_ipc_transmit(msg);

Is it possible to have this handled inside clk_set_rate() call of cpu
clock?

Shawn

> +}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFT] regulator: pcf50633: Use linear_min_sel and regulator_[map|list]_voltage_linear

2012-11-27 Thread Axel Lin

This driver can be converted to use linear_min_sel and
regulator_[map|list]_voltage_linear.

Below shows the equations from datasheet:

For AUTOOUT:
VO(prog) = 0.625 + auto_out × 0.025 V; e.g.
(00000000 to 00101110: reserved)
00101111: 1.8 V (min)
01010011: 2.7 V
01101010: 3.275 V
01101011: 3.300 V
01101100: 3.325 V
01111111 : 3.800 V (max)

The linear mapping start from 0x2f selector.

Thus convert this equation to:
VO(prog) = 1.8 + (selector - linear_min_sel) × 0.025 V,
(min_uV = 1800000, uV_step = 25000, linear_min_sel = 0x2f)

For DOWNxOUT:
VO(prog) = 0.625 + downx_out × 0.025 V; e.g.
00000000 : 0.625 V (min)
00010111 : 1.200 V
00101111 : 1.800 V
01011111 : 3.000 V (max)

For xLDOOUT:
VO(prog) = 0.9 + xldo_out × 0.1 V; e.g.
00000 : 0.9 V
00001 : 1.0 V
11000 : 3.3 V
11011 : 3.6 V

Signed-off-by: Axel Lin 
---
 drivers/regulator/pcf50633-regulator.c |  176 +---
 1 file changed, 28 insertions(+), 148 deletions(-)

diff --git a/drivers/regulator/pcf50633-regulator.c 
b/drivers/regulator/pcf50633-regulator.c
index d776f51..534075e 100644
--- a/drivers/regulator/pcf50633-regulator.c
+++ b/drivers/regulator/pcf50633-regulator.c
@@ -24,12 +24,15 @@
 #include 
 #include 
 
-#define PCF50633_REGULATOR(_name, _id, _n) \
+#define PCF50633_REGULATOR(_name, _id, _min_uV, _uV_step, _min_sel, _n) \
{   \
.name = _name,  \
.id = PCF50633_REGULATOR_##_id, \
.ops = _regulator_ops, \
.n_voltages = _n,   \
+   .min_uV = _min_uV,  \
+   .uV_step = _uV_step,\
+   .linear_min_sel = _min_sel, \
.type = REGULATOR_VOLTAGE,  \
.owner = THIS_MODULE,   \
.vsel_reg = PCF50633_REG_##_id##OUT,\
@@ -38,162 +41,39 @@
.enable_mask = PCF50633_REGULATOR_ON,   \
}
 
-/* Bits from voltage value */
-static u8 auto_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 1800)
-   return 0x2f;
-   if (millivolts > 3800)
-   return 0xff;
-
-   millivolts -= 625;
-
-   return millivolts / 25;
-}
-
-static u8 down_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 625)
-   return 0;
-   else if (millivolts > 3000)
-   return 0xff;
-
-   millivolts -= 625;
-
-   return millivolts / 25;
-}
-
-static u8 ldo_voltage_bits(unsigned int millivolts)
-{
-   if (millivolts < 900)
-   return 0;
-   else if (millivolts > 3600)
-   return 0x1f;
-
-   millivolts -= 900;
-   return millivolts / 100;
-}
-
-/* Obtain voltage value from bits */
-static unsigned int auto_voltage_value(u8 bits)
-{
-   /* AUTOOUT:  to 00101110 are reserved.
-* Return 0 for bits in reserved range, which means this selector code
-* can't be used on this system */
-   if (bits < 0x2f)
-   return 0;
-
-   return 625 + (bits * 25);
-}
-
-
-static unsigned int down_voltage_value(u8 bits)
-{
-   return 625 + (bits * 25);
-}
-
-
-static unsigned int ldo_voltage_value(u8 bits)
-{
-   bits &= 0x1f;
-
-   return 900 + (bits * 100);
-}
-
-static int pcf50633_regulator_map_voltage(struct regulator_dev *rdev,
- int min_uV, int max_uV)
-{
-   struct pcf50633 *pcf;
-   int regulator_id, millivolts;
-   u8 volt_bits;
-
-   pcf = rdev_get_drvdata(rdev);
-
-   regulator_id = rdev_get_id(rdev);
-   if (regulator_id >= PCF50633_NUM_REGULATORS)
-   return -EINVAL;
-
-   millivolts = min_uV / 1000;
-
-   switch (regulator_id) {
-   case PCF50633_REGULATOR_AUTO:
-   volt_bits = auto_voltage_bits(millivolts);
-   break;
-   case PCF50633_REGULATOR_DOWN1:
-   case PCF50633_REGULATOR_DOWN2:
-   volt_bits = down_voltage_bits(millivolts);
-   break;
-   case PCF50633_REGULATOR_LDO1:
-   case PCF50633_REGULATOR_LDO2:
-   case PCF50633_REGULATOR_LDO3:
-   case PCF50633_REGULATOR_LDO4:
-   case PCF50633_REGULATOR_LDO5:
-   case PCF50633_REGULATOR_LDO6:
-   case PCF50633_REGULATOR_HCLDO:
-   case PCF50633_REGULATOR_MEMLDO:
-   volt_bits = ldo_voltage_bits(millivolts);
-   break;
-   default:
-   return -EINVAL;
-   }
-
-   return volt_bits;
-}
-
-static int pcf50633_regulator_list_voltage(struct regulator_dev *rdev,
-

Re: [PATCH V3 3/3] mfd: stmpe: Update DT support in stmpe driver

2012-11-27 Thread Viresh Kumar

On 28 November 2012 01:25, Rabin Vincent  wrote:
> 2012/11/27 Viresh Kumar :
>> On 27 November 2012 14:10, Lee Jones  wrote:
>> I haven't seen this in any of SPEAr boards i have worked on. Maybe Rabin
>> would have, that's why he added that part of code :)
>>
>> @Rabin/Linus: Do you remember why have you added this in stmpe driver:
>>
>> +   if (stmpe->pdata->irq_invert_polarity)
>> +   icr ^= STMPE_ICR_LSB_HIGH;
>> +
>>
>> Does somebody actually need it?
>
> It was (as irq_rev_pol) part of Luotao Fu's proposed STMPE811 patchset
> (https://patchwork.kernel.org/patch/106173/) which I integrated into my
> version of the STMPE driver, which didn't have it in its initial version
> (https://patchwork.kernel.org/patch/103273/).
>
> It's not something _I_ ever used.

I grep'd kernel and nobody is using it there, so lets get rid of it
completely :)
I will patch it.

--
viresh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 6/7] mfd: max8925: support dt for backlight

2012-11-27 Thread Qing Xu


On 11/23/2012 05:08 PM, Haojian Zhuang wrote:

On Tue, Nov 6, 2012 at 3:47 PM, Qing Xu  wrote:

From: Qing Xu 

Signed-off-by: Qing Xu 
---
  drivers/video/backlight/max8925_bl.c |   31 ++-
  1 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/drivers/video/backlight/max8925_bl.c 
b/drivers/video/backlight/max8925_bl.c
index f72ba54..7de02ed 100644
--- a/drivers/video/backlight/max8925_bl.c
+++ b/drivers/video/backlight/max8925_bl.c
@@ -101,6 +101,29 @@ static const struct backlight_ops max8925_backlight_ops = {
 .get_brightness = max8925_backlight_get_brightness,
  };

+#ifdef CONFIG_OF
+static int max8925_backlight_dt_init(struct platform_device *pdev,
+ struct max8925_backlight_pdata *pdata)
+{
+   struct device_node *nproot = pdev->dev.parent->of_node, *np;
+   int dual_string;
+
+   if (!nproot)
+   return -ENODEV;
+   np = of_find_node_by_name(nproot, "backlight");
+   if (!np) {
+   dev_err(>dev, "failed to find backlight node\n");
+   return -ENODEV;
+   }
+
+   of_property_read_u32(np, "dual-string", _string);

Please use the property like "maxium,max8925-dual-string" at here. It's used
to avoid naming conflict.


patch updated in v2, and dts patch update in v3


+   pdata->dual_string = dual_string;
+   return 0;
+}
+#else
+#define max8925_backlight_dt_init(x, y)(-1)
+#endif
+
  static int __devinit max8925_backlight_probe(struct platform_device *pdev)
  {
 struct max8925_chip *chip = dev_get_drvdata(pdev->dev.parent);
@@ -150,6 +173,13 @@ static int __devinit max8925_backlight_probe(struct 
platform_device *pdev)
 platform_set_drvdata(pdev, bl);

 value = 0;
+   if (pdev->dev.parent->of_node && !pdata) {
+   pdata = devm_kzalloc(>dev,
+sizeof(struct max8925_backlight_pdata),
+GFP_KERNEL);
+   max8925_backlight_dt_init(pdev, pdata);
+   }
+
 if (pdata) {
 if (pdata->lxw_scl)
 value |= (1 << 7);
@@ -161,7 +191,6 @@ static int __devinit max8925_backlight_probe(struct 
platform_device *pdev)
 ret = max8925_set_bits(chip->i2c, data->reg_mode_cntl, 0xfe, value);
 if (ret < 0)
 goto out_brt;
-
 backlight_update_status(bl);
 return 0;
  out_brt:
--
1.7.0.4



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v3 7/7] mfd: max8925: add dts

2012-11-27 Thread Qing Xu

From: Qing Xu 

add max8925 dts support into mmp2 brownstone platform

Signed-off-by: Qing Xu 
---
 arch/arm/boot/dts/mmp2-brownstone.dts |  158 +
 arch/arm/boot/dts/mmp2.dtsi   |4 +-
 2 files changed, 161 insertions(+), 1 deletions(-)

diff --git a/arch/arm/boot/dts/mmp2-brownstone.dts 
b/arch/arm/boot/dts/mmp2-brownstone.dts
index c9b4f27..d94210b 100644
--- a/arch/arm/boot/dts/mmp2-brownstone.dts
+++ b/arch/arm/boot/dts/mmp2-brownstone.dts
@@ -29,6 +29,164 @@
};
twsi1: i2c@d4011000 {
status = "okay";
+   pmic: max8925@3c {
+   compatible = "maxium,max8925";
+   reg = <0x3c>;
+   interrupts = <1>;
+   interrupt-parent = <>;
+   interrupt-controller;
+   #interrupt-cells = <1>;
+   tsc-irq = <0>;
+
+   regulators {
+   SDV1 {
+   regulator-min-microvolt 
= <637500>;
+   regulator-max-microvolt 
= <1425000>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   SDV2 {
+   regulator-min-microvolt 
= <65>;
+   regulator-max-microvolt 
= <2225000>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   SDV3 {
+   regulator-min-microvolt 
= <75>;
+   regulator-max-microvolt 
= <390>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO1 {
+   regulator-min-microvolt 
= <75>;
+   regulator-max-microvolt 
= <390>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO2 {
+   regulator-min-microvolt 
= <65>;
+   regulator-max-microvolt 
= <225>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO3 {
+   regulator-min-microvolt 
= <65>;
+   regulator-max-microvolt 
= <225>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO4 {
+   regulator-min-microvolt 
= <75>;
+   regulator-max-microvolt 
= <390>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO5 {
+   regulator-min-microvolt 
= <75>;
+   regulator-max-microvolt 
= <390>;
+   regulator-boot-on;
+   regulator-always-on;
+   };
+   LDO6 {
+   regulator-min-microvolt 
= <75>;
+   regulator-max-microvolt 
= <390>;
+

[PATCH v2 6/7] mfd: max8925: support dt for backlight

2012-11-27 Thread Qing Xu

From: Qing Xu 

Signed-off-by: Qing Xu 
---
 drivers/video/backlight/max8925_bl.c |   31 ++-
 1 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/drivers/video/backlight/max8925_bl.c 
b/drivers/video/backlight/max8925_bl.c
index f72ba54..1682d00 100644
--- a/drivers/video/backlight/max8925_bl.c
+++ b/drivers/video/backlight/max8925_bl.c
@@ -101,6 +101,29 @@ static const struct backlight_ops max8925_backlight_ops = {
.get_brightness = max8925_backlight_get_brightness,
 };
 
+#ifdef CONFIG_OF
+static int max8925_backlight_dt_init(struct platform_device *pdev,
+ struct max8925_backlight_pdata *pdata)
+{
+   struct device_node *nproot = pdev->dev.parent->of_node, *np;
+   int dual_string;
+
+   if (!nproot)
+   return -ENODEV;
+   np = of_find_node_by_name(nproot, "backlight");
+   if (!np) {
+   dev_err(>dev, "failed to find backlight node\n");
+   return -ENODEV;
+   }
+
+   of_property_read_u32(np, "max8925-dual-string", _string);
+   pdata->dual_string = dual_string;
+   return 0;
+}
+#else
+#define max8925_backlight_dt_init(x, y)(-1)
+#endif
+
 static int __devinit max8925_backlight_probe(struct platform_device *pdev)
 {
struct max8925_chip *chip = dev_get_drvdata(pdev->dev.parent);
@@ -150,6 +173,13 @@ static int __devinit max8925_backlight_probe(struct 
platform_device *pdev)
platform_set_drvdata(pdev, bl);
 
value = 0;
+   if (pdev->dev.parent->of_node && !pdata) {
+   pdata = devm_kzalloc(>dev,
+sizeof(struct max8925_backlight_pdata),
+GFP_KERNEL);
+   max8925_backlight_dt_init(pdev, pdata);
+   }
+
if (pdata) {
if (pdata->lxw_scl)
value |= (1 << 7);
@@ -161,7 +191,6 @@ static int __devinit max8925_backlight_probe(struct 
platform_device *pdev)
ret = max8925_set_bits(chip->i2c, data->reg_mode_cntl, 0xfe, value);
if (ret < 0)
goto out_brt;
-
backlight_update_status(bl);
return 0;
 out_brt:
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] context_tracking: New context tracking susbsystem

2012-11-27 Thread Paul E. McKenney

On Tue, Nov 27, 2012 at 08:50:38PM -0500, Steven Rostedt wrote:
> On Tue, 2012-11-27 at 16:59 -0800, Paul E. McKenney wrote:
> > On Tue, Nov 27, 2012 at 07:33:25PM +0100, Frederic Weisbecker wrote:
> > > Create a new subsystem that probes on kernel boundaries
> > > to keep track of the transitions between level contexts
> > > with two basic initial contexts: user or kernel.
> > > 
> > > This is an abstraction of some RCU code that use such tracking
> > > to implement its userspace extended quiescent state.
> > > 
> > > We need to pull this up from RCU into this new level of indirection
> > > because this tracking is also going to be used to implement an "on
> > > demand" generic virtual cputime accounting. A necessary step to
> > > shutdown the tick while still accounting the cputime.
> > 
> > I have queued this, and if it passes tests and inspection will try
> > pushing it for 3.8.
> > 
> 
> You can add my Reviewed-by: Steven Rostedt  if you
> like.

Very happy to do so!  ;-)

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] context_tracking: New context tracking susbsystem

2012-11-27 Thread Steven Rostedt

On Tue, 2012-11-27 at 16:59 -0800, Paul E. McKenney wrote:
> On Tue, Nov 27, 2012 at 07:33:25PM +0100, Frederic Weisbecker wrote:
> > Create a new subsystem that probes on kernel boundaries
> > to keep track of the transitions between level contexts
> > with two basic initial contexts: user or kernel.
> > 
> > This is an abstraction of some RCU code that use such tracking
> > to implement its userspace extended quiescent state.
> > 
> > We need to pull this up from RCU into this new level of indirection
> > because this tracking is also going to be used to implement an "on
> > demand" generic virtual cputime accounting. A necessary step to
> > shutdown the tick while still accounting the cputime.
> 
> I have queued this, and if it passes tests and inspection will try
> pushing it for 3.8.
> 

You can add my Reviewed-by: Steven Rostedt  if you
like.

-- Steve


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: fix balloon_page_movable() page->flags check

2012-11-27 Thread Rafael Aquini

On Tue, Nov 27, 2012 at 05:15:44PM -0800, Andrew Morton wrote:
> On Tue, 27 Nov 2012 22:34:10 -0200 Rafael Aquini  wrote:
> 
> > Do you want me to resubmit this patch with the changes you suggested?
> 
> oh, I think I can reach that far.  How's this look?
>

It looks great to me.

Just a small nitpick, 
here __balloon_page_flags should be changed to page_flags_cleared too:
> @@ -109,18 +110,16 @@ static inline void balloon_mapping_free(
>  /*
>   * __balloon_page_flags - helper to perform balloon @page ->flags tests.
>   *

Thanks!
--Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] module: Remove a extra null character at the top of module->strtab.

2012-11-27 Thread sat

From: Satoru Takeuchi 

There is an extra null character ('\0') at the top of module->strtab for
each module. Commit 59ef28b introduced this bug and this patch fixes it.

Live dump log of the current linus git kernel(HEAD is 2844a4870):

crash> mod | grep loop
a01db0a0  loop 16689  (not loaded)  [CONFIG_KALLSYMS]
crash> module.core_symtab a01db0a0
  core_symtab = 0xa01db320crash> rd 0xa01db320 12
a01db320:  00550001    U...
a01db330:   000200740002   t...
a01db340:  a01d8000 0038   8...
a01db350:  001a0064000e a01daeb0   d...
a01db360:  00a0 000200740019   t...
a01db370:  a01d8068 001b   h...
crash> module.core_strtab a01db0a0
  core_strtab = 0xa01dbb30 ""
crash> rd 0xa01dbb30 4
a01dbb30:  615f70616d6b 66780063696d6f74   ..kmap_atomic.xf
a01dbb40:  73636e75665f7265 72665f646e696600   er_funcs.find_fr


We expect just the first byte to be '\0', but actually the first two bytes
are '\0'. Here is the relationship between symtab and strtab.

symtab_idx  strtab_idx  symbol
---
0   0x1 "\0" # strtab_idx should be 0
1   0x2 "kmap_atomic"
2   0xe "xfer_funcs"
3   0x19"find_fr..."

By applying this patch, it becomes as follows.

symtab_idx  strtab_idx  symbol
---
0   0x0 "\0"# extra byte is removed
1   0x1 "kmap_atomic"
2   0xd "xfer_funcs"
3   0x18"find_fr..."

Signed-off-by: Satoru Takeuchi 
Cc: Masaki Kimura 
Cc: Rusty Russell 
Cc: Greg Kroah-Hartman 
---
 kernel/module.c |6 +-
 1 files changed, 1 insertions(+), 5 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 6e48c3a..7af5d5d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2282,7 +2282,7 @@ static void layout_symtab(struct module *mod, struct 
load_info *info)
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
-   unsigned int i, nsrc, ndst, strtab_size;
+   unsigned int i, nsrc, ndst, strtab_size = 0;
 
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
@@ -2293,9 +2293,6 @@ static void layout_symtab(struct module *mod, struct 
load_info *info)
src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
 
-   /* strtab always starts with a nul, so offset 0 is the empty string. */
-   strtab_size = 1;
-
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
@@ -2337,7 +2334,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_symtab = dst = mod->module_core + info->symoffs;
mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
-   *s++ = 0;
for (ndst = i = 0; i < mod->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
-- 1.7.2.5 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 3.6.8: dmar: DRHD: handling fault status reg 602

2012-11-27 Thread Robert Hancock


On 11/27/2012 09:16 AM, Justin Piszcz wrote:

Hello,

Any idea why this is happening (e.g. why is PTE Read Access not set?)

[   13.204560] dmar: DRHD: handling fault status reg 602
[   13.208078] dmar: DMAR:[DMA Read] Request device [04:00.0] fault addr 0
[   13.208078] DMAR:[fault reason 06] PTE Read access is not set
[   15.777874] dmar: DRHD: handling fault status reg 702
[   15.777879] dmar: DMAR:[DMA Read] Request device [04:00.0] fault addr 0
[   15.777879] DMAR:[fault reason 06] PTE Read access is not set
[   16.100453] dmar: DRHD: handling fault status reg 2
[   16.100458] dmar: DMAR:[DMA Read] Request device [04:00.0] fault addr 0
[   16.100458] DMAR:[fault reason 06] PTE Read access is not set
[   16.141058] dmar: DRHD: handling fault status reg 102
[   16.141062] dmar: DMAR:[DMA Read] Request device [04:00.0] fault addr 0
[   16.141062] DMAR:[fault reason 06] PTE Read access is not set
[   16.210102] dmar: DRHD: handling fault status reg 202
[   16.210111] dmar: DMAR:[DMA Read] Request device [04:00.0] fault addr 0
[   16.210111] DMAR:[fault reason 06] PTE Read access is not set
[   16.918149] ixgbe :86:00.0: eth2: NIC Link is Up 10 Gbps, Flow
Control: RX/TX

This is from:
http://lkml.org/lkml/2012/11/27/263

Justin.



From the dmesg you posted (and some comments on that thread) it might 
have something to do with CONFIG_PCI_MMCONFIG being disabled. If so, try 
enabling that.


Of course the DMAR stuff should be recovering from that more gracefully 
if that's the problem.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 RFC 2/2] kvm: Handle yield_to failure return code for potential undercommit case

2012-11-27 Thread Marcelo Tosatti


Don't understand the reasoning behind why 3 is a good choice.

On Mon, Nov 26, 2012 at 05:38:04PM +0530, Raghavendra K T wrote:
> From: Raghavendra K T 
> 
> yield_to returns -ESRCH when the run queue length of both the source and
> the target of yield_to is one. When we see three successive failures of
> yield_to, we assume we are in a potential undercommit case and abort
> from the PLE handler.
> The assumption is backed by low probability of wrong decision
> for even worst case scenarios such as average runqueue length
> between 1 and 2.
> 
> Note that we do not update the last boosted vcpu in failure cases.
> Thanks to Avi for raising the question of aborting after the first failure from yield_to.
> 
> Reviewed-by: Srikar Dronamraju 
> Signed-off-by: Raghavendra K T 
> ---
>  virt/kvm/kvm_main.c |   26 --
>  1 file changed, 16 insertions(+), 10 deletions(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index be70035..053f494 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1639,6 +1639,7 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
>  {
>   struct pid *pid;
>   struct task_struct *task = NULL;
> + bool ret = false;
>  
>   rcu_read_lock();
>   pid = rcu_dereference(target->pid);
> @@ -1646,17 +1647,15 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
>   task = get_pid_task(target->pid, PIDTYPE_PID);
>   rcu_read_unlock();
>   if (!task)
> - return false;
> + return ret;
>   if (task->flags & PF_VCPU) {
>   put_task_struct(task);
> - return false;
> - }
> - if (yield_to(task, 1)) {
> - put_task_struct(task);
> - return true;
> + return ret;
>   }
> + ret = yield_to(task, 1);
>   put_task_struct(task);
> - return false;
> +
> + return ret;
>  }
>  EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
>  
> @@ -1697,12 +1696,14 @@ bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
>   return eligible;
>  }
>  #endif
> +
>  void kvm_vcpu_on_spin(struct kvm_vcpu *me)
>  {
>   struct kvm *kvm = me->kvm;
>   struct kvm_vcpu *vcpu;
>   int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
>   int yielded = 0;
> + int try = 3;
>   int pass;
>   int i;
>  
> @@ -1714,7 +1715,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
>* VCPU is holding the lock that we need and will release it.
>* We approximate round-robin by starting at the last boosted VCPU.
>*/
> - for (pass = 0; pass < 2 && !yielded; pass++) {
> + for (pass = 0; pass < 2 && !yielded && try; pass++) {
>   kvm_for_each_vcpu(i, vcpu, kvm) {
>   if (!pass && i <= last_boosted_vcpu) {
>   i = last_boosted_vcpu;
> @@ -1727,10 +1728,15 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me)
>   continue;
>   if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
>   continue;
> - if (kvm_vcpu_yield_to(vcpu)) {
> +
> + yielded = kvm_vcpu_yield_to(vcpu);
> + if (yielded > 0) {
>   kvm->last_boosted_vcpu = i;
> - yielded = 1;
>   break;
> + } else if (yielded < 0) {
> + try--;
> + if (!try)
> + break;
>   }
>   }
>   }
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: fix balloon_page_movable() page->flags check

2012-11-27 Thread Andrew Morton

On Tue, 27 Nov 2012 22:34:10 -0200 Rafael Aquini  wrote:

> Do you want me to resubmit this patch with the changes you suggested?

oh, I think I can reach that far.  How's this look?

From: Andrew Morton 
Subject: 
mm-introduce-a-common-interface-for-balloon-pages-mobility-mm-fix-balloon_page_movable-page-flags-check-fix

use PAGE_FLAGS_CHECK_AT_PREP, s/__balloon_page_flags/page_flags_cleared/, small cleanups

Cc: "Michael S. Tsirkin" 
Cc: Andi Kleen 
Cc: Konrad Rzeszutek Wilk 
Cc: Mel Gorman 
Cc: Minchan Kim 
Cc: Rafael Aquini 
Cc: Rik van Riel 
Cc: Rusty Russell 
Cc: Sasha Levin 
Signed-off-by: Andrew Morton 
---

 include/linux/balloon_compaction.h |   21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff -puN include/linux/balloon_compaction.h~mm-introduce-a-common-interface-for-balloon-pages-mobility-mm-fix-balloon_page_movable-page-flags-check-fix include/linux/balloon_compaction.h
--- a/include/linux/balloon_compaction.h~mm-introduce-a-common-interface-for-balloon-pages-mobility-mm-fix-balloon_page_movable-page-flags-check-fix
+++ a/include/linux/balloon_compaction.h
@@ -41,6 +41,7 @@
 #ifndef _LINUX_BALLOON_COMPACTION_H
 #define _LINUX_BALLOON_COMPACTION_H
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -109,18 +110,16 @@ static inline void balloon_mapping_free(
 /*
  * __balloon_page_flags - helper to perform balloon @page ->flags tests.
  *
- * As balloon pages are got from Buddy, and we do not play with page->flags
+ * As balloon pages are obtained from buddy and we do not play with page->flags
  * at driver level (exception made when we get the page lock for compaction),
- * therefore we can safely identify a ballooned page by checking if the
- * NR_PAGEFLAGS rightmost bits from the page->flags are all cleared.
- * This approach also helps on skipping ballooned pages that are locked for
- * compaction or release, thus mitigating their racy check at
- * balloon_page_movable()
+ * we can safely identify a ballooned page by checking if the
+ * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared.  This approach also
+ * helps us skip ballooned pages that are locked for compaction or release, thus
+ * mitigating their racy check at balloon_page_movable()
  */
-#define BALLOON_PAGE_FLAGS_MASK   ((1UL << NR_PAGEFLAGS) - 1)
-static inline bool __balloon_page_flags(struct page *page)
+static inline bool page_flags_cleared(struct page *page)
 {
-   return page->flags & BALLOON_PAGE_FLAGS_MASK ? false : true;
+   return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP);
 }
 
 /*
@@ -149,10 +148,10 @@ static inline bool __is_movable_balloon_
 static inline bool balloon_page_movable(struct page *page)
 {
/*
-* Before dereferencing and testing mapping->flags, lets make sure
+* Before dereferencing and testing mapping->flags, let's make sure
 * this is not a page that uses ->mapping in a different way
 */
-   if (__balloon_page_flags(page) && !page_mapped(page) &&
+   if (page_flags_cleared(page) && !page_mapped(page) &&
page_count(page) == 1)
return __is_movable_balloon_page(page);
 
_

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1260 matches

Mail list logo