Re: pipe/page fault oddness.

2014-10-08 Thread Aneesh Kumar K.V
Linus Torvalds  writes:

> On Mon, Oct 6, 2014 at 3:18 PM, Aneesh Kumar K.V
>  wrote:
>>
>> Are we still looking at these options ? I could look at implementing the
>> first option which will also enable us to free up one pte bit.
>
> We definitely are. If you can test my patch (with the small follow-up
> fix), and do the necessary changes for ppc64, that would be good.
>
> I looked quickly at the ppc64 side, and it didn't look too painful.
> Using pte_protnone() instead of pte_numa() should remove move lines
> than it adds there too..
>

This is a quick hack and gets it running. With perf bench numa mem

-bash-4.2# grep numa /proc/vmstat   

   
numa_hit 3310633
numa_miss 0
numa_foreign 0
numa_interleave 6451
numa_local 3162369
numa_other 148264
numa_pte_updates 27708982
numa_huge_pte_updates 76987
numa_hint_faults 268439275
numa_hint_faults_local 5359216
numa_pages_migrated 3349573
-bash-4.2# 

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index d98c1ecc3266..2a9bbe4d2364 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -41,7 +41,7 @@ static inline pgprot_t pte_pgprot(pte_t pte)  { return 
__pgprot(pte_val(pte) & PA
 
 static inline int pte_present(pte_t pte)
 {
-   return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+   return pte_val(pte) & _PAGE_PRESENT;
 }
 
 #define pte_present_nonuma pte_present_nonuma
@@ -50,78 +50,20 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte) & (_PAGE_PRESENT);
 }
 
-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
+#define pte_protnone pte_protnone
+static inline int pte_protnone(pte_t pte)
 {
return (pte_val(pte) &
-   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+   (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
 }
 
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
+#define pmd_protnone pmd_protnone
+static inline int pmd_protnone(pmd_t pmd)
 {
-   pte_val(pte) &= ~_PAGE_NUMA;
-   pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
-   return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
-   /*
-* We should not set _PAGE_NUMA on non present ptes. Also clear the
-* present bit so that hash_page will return 1 and we collect this
-* as numa fault.
-*/
-   if (pte_present(pte)) {
-   pte_val(pte) |= _PAGE_NUMA;
-   pte_val(pte) &= ~_PAGE_PRESENT;
-   } else
-   VM_BUG_ON(1);
-   return pte;
+   return pte_protnone(pmd_pte(pmd));
 }
 
-#define ptep_set_numa ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-pte_t *ptep)
-{
-   if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
-   VM_BUG_ON(1);
-
-   pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
-   return;
-}
-
-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
-   return pte_numa(pmd_pte(pmd));
-}
-
-#define pmdp_set_numa pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-pmd_t *pmdp)
-{
-   if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
-   VM_BUG_ON(1);
-
-   pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
-   return;
-}
-
-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-   return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
-}
-
-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-   return pte_pmd(pte_mknuma(pmd_pte(pmd)));
-}
-
-# else
+#else
 
 static inline int pte_present(pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 2505d8eab15c..68d0a5d01dc3 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -30,7 +30,7 @@
 /*
  * Used for tracking numa faults
  */
-#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */
+//#define _PAGE_NUMA   0x0010 /* Gather numa placement stats */
 
 
 /* No separate kernel read-only */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 084ad54c73cd..27004431b576 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -235,7 +235,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
pte_size = psize;
pte = lookup_linux_pte_and_update(pgdir, hva, writing,
  _size);
-   if (pte_present(pte) && !pte_numa(pte)) {
+   /*
+* Skip the ptes 

Re: pipe/page fault oddness.

2014-10-08 Thread Aneesh Kumar K.V
Linus Torvalds torva...@linux-foundation.org writes:

 On Mon, Oct 6, 2014 at 3:18 PM, Aneesh Kumar K.V
 aneesh.ku...@linux.vnet.ibm.com wrote:

 Are we still looking at these options ? I could look at implementing the
 first option which will also enable us to free up one pte bit.

 We definitely are. If you can test my patch (with the small follow-up
 fix), and do the necessary changes for ppc64, that would be good.

 I looked quickly at the ppc64 side, and it didn't look too painful.
 Using pte_protnone() instead of pte_numa() should remove move lines
 than it adds there too..


This is a quick hack and gets it running. With perf bench numa mem

-bash-4.2# grep numa /proc/vmstat   

   
numa_hit 3310633
numa_miss 0
numa_foreign 0
numa_interleave 6451
numa_local 3162369
numa_other 148264
numa_pte_updates 27708982
numa_huge_pte_updates 76987
numa_hint_faults 268439275
numa_hint_faults_local 5359216
numa_pages_migrated 3349573
-bash-4.2# 

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index d98c1ecc3266..2a9bbe4d2364 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -41,7 +41,7 @@ static inline pgprot_t pte_pgprot(pte_t pte)  { return 
__pgprot(pte_val(pte)  PA
 
 static inline int pte_present(pte_t pte)
 {
-   return pte_val(pte)  (_PAGE_PRESENT | _PAGE_NUMA);
+   return pte_val(pte)  _PAGE_PRESENT;
 }
 
 #define pte_present_nonuma pte_present_nonuma
@@ -50,78 +50,20 @@ static inline int pte_present_nonuma(pte_t pte)
return pte_val(pte)  (_PAGE_PRESENT);
 }
 
-#define pte_numa pte_numa
-static inline int pte_numa(pte_t pte)
+#define pte_protnone pte_protnone
+static inline int pte_protnone(pte_t pte)
 {
return (pte_val(pte) 
-   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+   (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT;
 }
 
-#define pte_mknonnuma pte_mknonnuma
-static inline pte_t pte_mknonnuma(pte_t pte)
+#define pmd_protnone pmd_protnone
+static inline int pmd_protnone(pmd_t pmd)
 {
-   pte_val(pte) = ~_PAGE_NUMA;
-   pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
-   return pte;
-}
-
-#define pte_mknuma pte_mknuma
-static inline pte_t pte_mknuma(pte_t pte)
-{
-   /*
-* We should not set _PAGE_NUMA on non present ptes. Also clear the
-* present bit so that hash_page will return 1 and we collect this
-* as numa fault.
-*/
-   if (pte_present(pte)) {
-   pte_val(pte) |= _PAGE_NUMA;
-   pte_val(pte) = ~_PAGE_PRESENT;
-   } else
-   VM_BUG_ON(1);
-   return pte;
+   return pte_protnone(pmd_pte(pmd));
 }
 
-#define ptep_set_numa ptep_set_numa
-static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
-pte_t *ptep)
-{
-   if ((pte_val(*ptep)  _PAGE_PRESENT) == 0)
-   VM_BUG_ON(1);
-
-   pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
-   return;
-}
-
-#define pmd_numa pmd_numa
-static inline int pmd_numa(pmd_t pmd)
-{
-   return pte_numa(pmd_pte(pmd));
-}
-
-#define pmdp_set_numa pmdp_set_numa
-static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
-pmd_t *pmdp)
-{
-   if ((pmd_val(*pmdp)  _PAGE_PRESENT) == 0)
-   VM_BUG_ON(1);
-
-   pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
-   return;
-}
-
-#define pmd_mknonnuma pmd_mknonnuma
-static inline pmd_t pmd_mknonnuma(pmd_t pmd)
-{
-   return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
-}
-
-#define pmd_mknuma pmd_mknuma
-static inline pmd_t pmd_mknuma(pmd_t pmd)
-{
-   return pte_pmd(pte_mknuma(pmd_pte(pmd)));
-}
-
-# else
+#else
 
 static inline int pte_present(pte_t pte)
 {
diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 2505d8eab15c..68d0a5d01dc3 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -30,7 +30,7 @@
 /*
  * Used for tracking numa faults
  */
-#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */
+//#define _PAGE_NUMA   0x0010 /* Gather numa placement stats */
 
 
 /* No separate kernel read-only */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 084ad54c73cd..27004431b576 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -235,7 +235,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
pte_size = psize;
pte = lookup_linux_pte_and_update(pgdir, hva, writing,
  pte_size);
-   if (pte_present(pte)  !pte_numa(pte)) {
+   /*
+ 

Re: pipe/page fault oddness.

2014-10-07 Thread Linus Torvalds
On Mon, Oct 6, 2014 at 3:18 PM, Aneesh Kumar K.V
 wrote:
>
> Are we still looking at these options ? I could look at implementing the
> first option which will also enable us to free up one pte bit.

We definitely are. If you can test my patch (with the small follow-up
fix), and do the necessary changes for ppc64, that would be good.

I looked quickly at the ppc64 side, and it didn't look too painful.
Using pte_protnone() instead of pte_numa() should remove move lines
than it adds there too..

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-07 Thread Linus Torvalds
On Mon, Oct 6, 2014 at 3:18 PM, Aneesh Kumar K.V
aneesh.ku...@linux.vnet.ibm.com wrote:

 Are we still looking at these options ? I could look at implementing the
 first option which will also enable us to free up one pte bit.

We definitely are. If you can test my patch (with the small follow-up
fix), and do the necessary changes for ppc64, that would be good.

I looked quickly at the ppc64 side, and it didn't look too painful.
Using pte_protnone() instead of pte_numa() should remove move lines
than it adds there too..

Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-06 Thread Aneesh Kumar K.V
Mel Gorman  writes:

> On Wed, Oct 01, 2014 at 09:18:25AM -0700, Linus Torvalds wrote:
>> On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
>>  wrote:
>> >
>> > We need to get rid of it, and just make it the same as pte_protnone().
>> > And then the real protnone is in the vma flags, and if you actually
>> > ever get to a pte that is marked protnone, you know it's a numa page.
>> 
>> So I'd really suggest we do exactly that. Get rid of "pte_numa()"
>> entirely, get rid of "_PAGE_[BIT_]NUMA" entirely, and instead add a
>> "pte_protnone()" helper to check for the "protnone" case (which on x86
>> is testing the _PAGE_PROTNONE bit, and on most other architectures is
>> just testing that the page has no access rights).
>> 
>
> Do not interpret the following as being against the idea of taking the
> pte_protnone approach. This is intended to give background.
>
> At the time the changes were made to the _PAGE_NUMA bits it was acknowledged
> that a full move to prot_none was an option but it was not the preferred
> solution at the time. It replaced one set of corner cases with another and
> the last time like this time, there was considerable time pressure. The
> VMA would be required to distinguish between a NUMA hinting fault and a
> real prot_none bit. In most cases, we have the VMA now with the exception
> of GUP. GUP would have to unconditionally go into the slow path to do the
> VMA lookup. That is not likely to be a big of a problem but it was a concern.
>
> In early implementations based on prot_none there were some VMA-based
> protection checks that had higher overhead. At the time, there were severe
> problems with overhead due to NUMA balancing and adding more was not
> desirable. This has been addressed since with changes in multiple other
> areas so it's much less of a concern now than it was. In the current shape,
> these probably is not as much a problem as long as any check on pte_numa
> was first guarded by a VMA check. One way of handling the corner cases
> where would be to pass in the VMA where available and have a VM_BUG_ON that
> fires if its a PROT_NONE VMA. That would catch problems during debugging
> without adding overhead in the !debug case.
>
> Going back to the start, the PTE bit was used as the approach due to
> concerns that a pte_protnone helper would not work on all architectures,
> ppc64 in particular.  There was no PROT_NONE bit there and instead prot_none
> protections rely on PAGE_USER not being set so it's inaccessible from
> userspace. There was discussion at the time that this could conceivably be
> broken from some sub-architectures but I don't recall the details. Looking
> at the current shape and your patch, it's conceivable that the pte_protnone
> could be implemented as a _PAGE_PRESENT && !_PAGE_USER check as long
> as it was guarded by a VMA check which x86 requires anyway. Not sure
> if that would work for PMDs as I'm not familiar with with ppc64 to tell
> offhand. Alternatively, ppc64 would potentially use the bit currently used
> for _PAGE_NUMA as a _PROT_NONE bit.

Are we still looking at these options ? I could look at implementing the
first option which will also enable us to free up one pte bit.

Note: Freeing up one bit will enable us to implement soft dirty tracking
needed for CRIU.

-aneesh

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-06 Thread Aneesh Kumar K.V
Mel Gorman mgor...@suse.de writes:

 On Wed, Oct 01, 2014 at 09:18:25AM -0700, Linus Torvalds wrote:
 On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
 torva...@linux-foundation.org wrote:
 
  We need to get rid of it, and just make it the same as pte_protnone().
  And then the real protnone is in the vma flags, and if you actually
  ever get to a pte that is marked protnone, you know it's a numa page.
 
 So I'd really suggest we do exactly that. Get rid of pte_numa()
 entirely, get rid of _PAGE_[BIT_]NUMA entirely, and instead add a
 pte_protnone() helper to check for the protnone case (which on x86
 is testing the _PAGE_PROTNONE bit, and on most other architectures is
 just testing that the page has no access rights).
 

 Do not interpret the following as being against the idea of taking the
 pte_protnone approach. This is intended to give background.

 At the time the changes were made to the _PAGE_NUMA bits it was acknowledged
 that a full move to prot_none was an option but it was not the preferred
 solution at the time. It replaced one set of corner cases with another and
 the last time like this time, there was considerable time pressure. The
 VMA would be required to distinguish between a NUMA hinting fault and a
 real prot_none bit. In most cases, we have the VMA now with the exception
 of GUP. GUP would have to unconditionally go into the slow path to do the
 VMA lookup. That is not likely to be a big of a problem but it was a concern.

 In early implementations based on prot_none there were some VMA-based
 protection checks that had higher overhead. At the time, there were severe
 problems with overhead due to NUMA balancing and adding more was not
 desirable. This has been addressed since with changes in multiple other
 areas so it's much less of a concern now than it was. In the current shape,
 these probably is not as much a problem as long as any check on pte_numa
 was first guarded by a VMA check. One way of handling the corner cases
 where would be to pass in the VMA where available and have a VM_BUG_ON that
 fires if its a PROT_NONE VMA. That would catch problems during debugging
 without adding overhead in the !debug case.

 Going back to the start, the PTE bit was used as the approach due to
 concerns that a pte_protnone helper would not work on all architectures,
 ppc64 in particular.  There was no PROT_NONE bit there and instead prot_none
 protections rely on PAGE_USER not being set so it's inaccessible from
 userspace. There was discussion at the time that this could conceivably be
 broken from some sub-architectures but I don't recall the details. Looking
 at the current shape and your patch, it's conceivable that the pte_protnone
 could be implemented as a _PAGE_PRESENT  !_PAGE_USER check as long
 as it was guarded by a VMA check which x86 requires anyway. Not sure
 if that would work for PMDs as I'm not familiar with with ppc64 to tell
 offhand. Alternatively, ppc64 would potentially use the bit currently used
 for _PAGE_NUMA as a _PROT_NONE bit.

Are we still looking at these options ? I could look at implementing the
first option which will also enable us to free up one pte bit.

Note: Freeing up one bit will enable us to implement soft dirty tracking
needed for CRIU.

-aneesh

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Sasha Levin
On 10/03/2014 11:58 AM, Dave Jones wrote:
> On Fri, Oct 03, 2014 at 08:43:57AM -0700, Linus Torvalds wrote:
>  > On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin  
> wrote:
>  > >
>  > > For the record, I tweaked the environment to put some more pressure on 
> the
>  > > scheduler and found out what broke (which is not related to this thread 
> at
>  > > all).
>  > 
>  > Ok. It's probably still worth testing Mel's patches, since that's what
>  > goes into 3.17. But it would be interesting to eventually go back to
>  > stability-testing the protnone set, since it *looked* like it was
>  > working well aside from this issue. I might decide to just do it
>  > during the 3.18 merge window..

Linus, I'm running with Mel's patches since yesterday, nothing interesting
to report.

> FYI, I've been trying to reproduce that initial bug all week, and
> haven't seen a single reoccurance of it, just other bugs.
> Kind of a pain in the ass for confirming this numa stuff was indeed
> the cause, but that's been true of so many of the more obscure bugs
> trinity has shaken out.

+1, it seems to have "disappeared", quite annoying :/


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Dave Jones
On Fri, Oct 03, 2014 at 08:43:57AM -0700, Linus Torvalds wrote:
 > On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin  wrote:
 > >
 > > For the record, I tweaked the environment to put some more pressure on the
 > > scheduler and found out what broke (which is not related to this thread at
 > > all).
 > 
 > Ok. It's probably still worth testing Mel's patches, since that's what
 > goes into 3.17. But it would be interesting to eventually go back to
 > stability-testing the protnone set, since it *looked* like it was
 > working well aside from this issue. I might decide to just do it
 > during the 3.18 merge window..

FYI, I've been trying to reproduce that initial bug all week, and
haven't seen a single reoccurance of it, just other bugs.
Kind of a pain in the ass for confirming this numa stuff was indeed
the cause, but that's been true of so many of the more obscure bugs
trinity has shaken out.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin  wrote:
>
> For the record, I tweaked the environment to put some more pressure on the
> scheduler and found out what broke (which is not related to this thread at
> all).

Ok. It's probably still worth testing Mel's patches, since that's what
goes into 3.17. But it would be interesting to eventually go back to
stability-testing the protnone set, since it *looked* like it was
working well aside from this issue. I might decide to just do it
during the 3.18 merge window..

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin sasha.le...@oracle.com wrote:

 For the record, I tweaked the environment to put some more pressure on the
 scheduler and found out what broke (which is not related to this thread at
 all).

Ok. It's probably still worth testing Mel's patches, since that's what
goes into 3.17. But it would be interesting to eventually go back to
stability-testing the protnone set, since it *looked* like it was
working well aside from this issue. I might decide to just do it
during the 3.18 merge window..

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Dave Jones
On Fri, Oct 03, 2014 at 08:43:57AM -0700, Linus Torvalds wrote:
  On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin sasha.le...@oracle.com wrote:
  
   For the record, I tweaked the environment to put some more pressure on the
   scheduler and found out what broke (which is not related to this thread at
   all).
  
  Ok. It's probably still worth testing Mel's patches, since that's what
  goes into 3.17. But it would be interesting to eventually go back to
  stability-testing the protnone set, since it *looked* like it was
  working well aside from this issue. I might decide to just do it
  during the 3.18 merge window..

FYI, I've been trying to reproduce that initial bug all week, and
haven't seen a single reoccurance of it, just other bugs.
Kind of a pain in the ass for confirming this numa stuff was indeed
the cause, but that's been true of so many of the more obscure bugs
trinity has shaken out.

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-03 Thread Sasha Levin
On 10/03/2014 11:58 AM, Dave Jones wrote:
 On Fri, Oct 03, 2014 at 08:43:57AM -0700, Linus Torvalds wrote:
   On Thu, Oct 2, 2014 at 10:00 PM, Sasha Levin sasha.le...@oracle.com 
 wrote:
   
For the record, I tweaked the environment to put some more pressure on 
 the
scheduler and found out what broke (which is not related to this thread 
 at
all).
   
   Ok. It's probably still worth testing Mel's patches, since that's what
   goes into 3.17. But it would be interesting to eventually go back to
   stability-testing the protnone set, since it *looked* like it was
   working well aside from this issue. I might decide to just do it
   during the 3.18 merge window..

Linus, I'm running with Mel's patches since yesterday, nothing interesting
to report.

 FYI, I've been trying to reproduce that initial bug all week, and
 haven't seen a single reoccurance of it, just other bugs.
 Kind of a pain in the ass for confirming this numa stuff was indeed
 the cause, but that's been true of so many of the more obscure bugs
 trinity has shaken out.

+1, it seems to have disappeared, quite annoying :/


Thanks,
Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/02/2014 12:10 PM, Linus Torvalds wrote:
> On Thu, Oct 2, 2014 at 8:04 AM, Sasha Levin  wrote:
>> >
>> > I have a new one for you. I know it doesn't say "numa" anywhere, but I
>> > haven't ever seen that trace before so I'll just go ahead and blame it
>> > on your patch...
> Fair enough, but the oops doesn't really give even a hint of what
> could be wrong.
> 
> The stack is clearly too deep:
> 
>   Thread overran stack, or stack corrupted
>   task.ti: 880dba2ec000
> RSP: 880dba2ebf48
> 
> but my patch shouldn't have added any deeper call-chains anywhere.

For the record, I tweaked the environment to put some more pressure on the
scheduler and found out what broke (which is not related to this thread at
all).

For the curious ones: https://lkml.org/lkml/2014/10/3/5


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Kirill A. Shutemov
On Thu, Oct 02, 2014 at 09:01:38AM -0700, Linus Torvalds wrote:
> On Thu, Oct 2, 2014 at 7:25 AM, Kirill A. Shutemov  
> wrote:
> >
> > I don't see what prevents the code to make zero page writable here.
> > We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();
> 
> Do we? If it's the zero page, it had better be an anonymous mapping,
> and vm_page_prot had better not be writable.
> 
> Anonymous pages don't _start_ out writable, we explicitly make them so
> with code like
> 
> if (vma->vm_flags & VM_WRITE)
> entry = pte_mkwrite(pte_mkdirty(entry));
> 
> so it should be fine to just use "pmd_modify(pmd, vma->vm_page_prot);" 
> directly.
> 
> But hey, this is the kind of thing that maybe I'm missing something on..

You're right.
It means we have redundant pmd_wrprotect() in set_huge_zero_page().

==

Subject: [PATCH] thp: do not mark zero-page pmd write-protected explicitly

Zero pages can be used only in anonymous mappings, which never have
writable vma->vm_page_prot: see protection_map in mm/mmap.c and __PX1X
definitions.

Let's drop redundant pmd_wrprotect() in set_huge_zero_page().

Signed-off-by: Kirill A. Shutemov 
---
 mm/huge_memory.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..2c17d184b56d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct 
mm_struct *mm,
if (!pmd_none(*pmd))
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
-   entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 8:04 AM, Sasha Levin  wrote:
>
> I have a new one for you. I know it doesn't say "numa" anywhere, but I
> haven't ever seen that trace before so I'll just go ahead and blame it
> on your patch...

Fair enough, but the oops doesn't really give even a hint of what
could be wrong.

The stack is clearly too deep:

  Thread overran stack, or stack corrupted
  task.ti: 880dba2ec000
RSP: 880dba2ebf48

but my patch shouldn't have added any deeper call-chains anywhere.

Anyway, unless you can get some other interesting oops with more hints
about where we go wrong, you might want to try Mel's four cleanup
patches instead. I love how you're testing my quick-and-dirty hack,
and I think we'll need to do this eventually, but in the short term
Mel's patches are probably worth applying.

In particular, his 4/4 patch removes the code case that I was
particularly nervous about, so it looks worth a try, because that may
just remove the bug with much smaller effort.

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 7:25 AM, Kirill A. Shutemov  wrote:
>
> I don't see what prevents the code to make zero page writable here.
> We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();

Do we? If it's the zero page, it had better be an anonymous mapping,
and vm_page_prot had better not be writable.

Anonymous pages don't _start_ out writable, we explicitly make them so
with code like

if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));

so it should be fine to just use "pmd_modify(pmd, vma->vm_page_prot);" directly.

But hey, this is the kind of thing that maybe I'm missing something on..

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 1:47 AM, Hugh Dickins  wrote:
>
> I hesitate to admit, I still don't see it: please illuminate further.

No, your'e looking at what I was looking.

> We're talking about the loop in __split_huge_page_map(), where it does

Yes.

> entry = mk_pte(page + i, vma->vm_page_prot);
> entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> if (!pmd_write(*pmd))
> entry = pte_wrprotect(entry);
> if (!pmd_young(*pmd))
> entry = pte_mkold(entry);
> if (pmd_numa(*pmd))
> entry = pte_mknuma(entry);
>
> , right?  I only see that adding _PAGE_NUMA to _PAGE_PROTNONE if
> pmd_numa(*pmd): but that would mean we had already gone wrong, setting
> pmd_numa in a PROT_NONE vma, which task_numa_work takes care not to do;
> or have mprotected an area to PROT_NONE without doing the pmd_mknonnuma.

Fair enough. Except this code has no locking that I see, so if we
*ever* see that numa entry in the pmd while walking the page tables in
vmscan, we're basically screwed.

> Or are you noticing a deficiency in the pmd locking?  I have not
> worked my way through that, so cannot guarantee it, but please
> point me to the weakness where you see it.

So I don't see any locking at all wrt mprotect (or new mmap). That's
kind of the whole point for page-out - it bypasses all the normal VM
locks, and only uses the last pte locking.

So the whole use of vma->vm_page_prot here is a bit scary. That gets
modified outside of the page table locks. So how do you know it's not
already PROT_NONE, but mprotect just hasn't gotten to actually take
the page table locks yet?

I dunno. It all makes me just very nervous. The whole "numa bit is
separate from the protections, has different locking, and is just
oddly and subtly different" is really what I fundamentally object to.
And it seems so _unnecessary_. All this odd complexity for no actual
gain - just extra code, and extra room for subtle bugs. Which is
exactly why I hate that magic NUMA bit so much.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/01/2014 06:42 PM, Linus Torvalds wrote:
> On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin  wrote:
>> >
>> > I've tried this patch on the same configuration that was triggering
>> > the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
>> > ran fine for ~20 minutes before exploding with:
> Well, that's somewhat encouraging. I didn't expect it to be perfect.
> 
> That said, "ran fine" isn't necessarily the same thing as "worked".
> Who knows how buggy it was without showing overt symptoms until the
> BUG_ON() triggered. But hey, I'll be optimistic.
> 
>> > [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
> So that's
> 
> BUG_ON(is_huge_zero_page(page));
> 
> and the reason is trivial: the old code used to have a magical special
> case for the zero-page hugepage (see change_huge_pmd()) and I got rid
> of that (because now it's just about setting protections, and the
> zero-page hugepage is in no way special.
> 
> So I think the solution is equally trivial: just accept that the
> zero-page can happen, and ignore it (just un-numa it).
> 
> Appended is a incremental diff on top of the previous one. Even less
> tested than the last case, but I think you get the idea if it doesn't
> work as-is.

I have a new one for you. I know it doesn't say "numa" anywhere, but I
haven't ever seen that trace before so I'll just go ahead and blame it
on your patch...

[ 2838.403382] BUG: unable to handle kernel paging request at 00055d996e80
[ 2838.405740] IP: task_curr (kernel/sched/core.c:1010)
[ 2838.407076] PGD dba2c6067 PUD 0
[ 2838.407926] Thread overran stack, or stack corrupted
[ 2838.409093] Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2838.411454] Dumping ftrace buffer:
[ 2838.412602](ftrace buffer empty)
[ 2838.413187] Modules linked in:
[ 2838.413187] CPU: 38 PID: 9342 Comm: trinity-c38 Not tainted 
3.17.0-rc7-sasha-00041-g6c9c81b #1260
[ 2838.413187] task: 880dba2f ti: 880dba2ec000 task.ti: 
880dba2ec000
[ 2838.413187] RIP: task_curr (kernel/sched/core.c:1010)
[ 2838.413187] RSP: 0018:880dba2ebf48  EFLAGS: 00010046
[ 2838.413187] RAX: f080 RBX: 880dba2f RCX: 000a
[ 2838.413187] RDX: ba1a9560 RSI: 880dba2f RDI: 880dba2f
[ 2838.413187] RBP: 880dba2ebf98 R08: 0004862a R09: 
[ 2838.413187] R10: 0038 R11: 001f R12: 880dba2f
[ 2838.413187] R13: 880dd5420740 R14: 000b R15: 8cc92000
[ 2838.413187] FS:  7f05f3dbc700() GS:880701e0() 
knlGS:
[ 2838.413187] CS:  0010 DS:  ES:  CR0: 8005003b
[ 2838.413187] CR2: 00055d996e80 CR3: 000dba2c5000 CR4: 06a0
[ 2838.413187] DR0: 006ee000 DR1:  DR2: 
[ 2838.413187] DR3:  DR6: 0ff0 DR7: 00090602
[ 2838.413187] Stack:
[ 2838.413187]  8816218b  880d000a 
000b
[ 2838.413187]  0082 880dba2f 000b 
880dba2ec070
[ 2838.413187]   8cc92000 880dba2ebff8 
88162a84
[ 2838.413187] Call Trace:
[ 2838.413187]  
[ 2838.413187] Code: 87 60 09 00 00 01 e8 8d ee ff ff 5d c3 66 66 2e 0f 1f 84 
00 00 00 00 00 48 8b 57 08 55 48 c7 c0 80 f0 00 00 48 89 e5 5d 8b 52 18 <48> 8b 
14 d5 80 c3 c4 8c 48 39 bc 10 68 09 00 00 0f 94 c0 0f b6
All code

   0:   87 60 09xchg   %esp,0x9(%rax)
   3:   00 00   add%al,(%rax)
   5:   01 e8   add%ebp,%eax
   7:   8d  (bad)
   8:   ee  out%al,(%dx)
   9:   ff  (bad)
   a:   ff 5d c3lcallq *-0x3d(%rbp)
   d:   66 66 2e 0f 1f 84 00data32 nopw %cs:0x0(%rax,%rax,1)
  14:   00 00 00 00
  18:   48 8b 57 08 mov0x8(%rdi),%rdx
  1c:   55  push   %rbp
  1d:   48 c7 c0 80 f0 00 00mov$0xf080,%rax
  24:   48 89 e5mov%rsp,%rbp
  27:   5d  pop%rbp
  28:   8b 52 18mov0x18(%rdx),%edx
  2b:*  48 8b 14 d5 80 c3 c4mov-0x733b3c80(,%rdx,8),%rdx
<-- trapping instruction
  32:   8c
  33:   48 39 bc 10 68 09 00cmp%rdi,0x968(%rax,%rdx,1)
  3a:   00
  3b:   0f 94 c0sete   %al
  3e:   0f b6 00movzbl (%rax),%eax

Code starting with the faulting instruction
===
   0:   48 8b 14 d5 80 c3 c4mov-0x733b3c80(,%rdx,8),%rdx
   7:   8c
   8:   48 39 bc 10 68 09 00cmp%rdi,0x968(%rax,%rdx,1)
   f:   00
  10:   0f 94 c0sete   %al
  13:   0f b6 00movzbl (%rax),%eax
[ 2838.413187] RIP task_curr (kernel/sched/core.c:1010)
[ 2838.413187]  RSP 
[ 2838.413187] CR2: 00055d996e80


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/02/2014 04:03 AM, Chuck Ebbert wrote:
> On Wed, 01 Oct 2014 23:32:15 -0400
> Sasha Levin  wrote:
> 
>> On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
>>> On Wed, 01 Oct 2014 18:08:30 -0400
>>> Sasha Levin  wrote:
>>>
> On 10/01/2014 04:20 PM, Linus Torvalds wrote:
>>> So I'm really sending this patch out in the hope that it will get
>>> comments, fixup and possibly even testing by people who actually know
>>> the NUMA balancing code. Rik?  Anybody?
>
> Hi Linus,
>
> I've tried this patch on the same configuration that was triggering
> the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
> ran fine for ~20 minutes before exploding with:
>
> [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
>>> That's:
>>> BUG_ON(is_huge_zero_page(page));
>>>
>>> Can you change your scripts to show the source code line when
>>> the error is a BUG_ON()? The machine code disassembly after the
>>> oops message doesn't really help.
>>>
>>
>> Hum? The source code line is the first line in the trace:
>>
>>  [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
>>
> 
> I meant, display the contents of that line so we can see what the
> BUG_ON() was triggered by. In some cases you might have a custom patch
> applied or be running a version that some people don't have handy.

Ah, the actual content? That's a good point - let me look into that.


Thanks,
Sasha

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Kirill A. Shutemov
On Wed, Oct 01, 2014 at 03:42:53PM -0700, Linus Torvalds wrote:
> On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin  wrote:
> >
> > I've tried this patch on the same configuration that was triggering
> > the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
> > ran fine for ~20 minutes before exploding with:
> 
> Well, that's somewhat encouraging. I didn't expect it to be perfect.
> 
> That said, "ran fine" isn't necessarily the same thing as "worked".
> Who knows how buggy it was without showing overt symptoms until the
> BUG_ON() triggered. But hey, I'll be optimistic.
> 
> > [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
> 
> So that's
> 
> BUG_ON(is_huge_zero_page(page));
> 
> and the reason is trivial: the old code used to have a magical special
> case for the zero-page hugepage (see change_huge_pmd()) and I got rid
> of that (because now it's just about setting protections, and the
> zero-page hugepage is in no way special.
> 
> So I think the solution is equally trivial: just accept that the
> zero-page can happen, and ignore it (just un-numa it).
> 
> Appended is a incremental diff on top of the previous one. Even less
> tested than the last case, but I think you get the idea if it doesn't
> work as-is.
> 
>  Linus

> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 14de54af6c38..fc33952d59c4 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1290,7 +1290,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct 
> vm_area_struct *vma,
>   }
>  
>   page = pmd_page(pmd);
> - BUG_ON(is_huge_zero_page(page));
> + if (is_huge_zero_page(page))
> + goto huge_zero_page;
> +
>   page_nid = page_to_nid(page);
>   last_cpupid = page_cpupid_last(page);
>   count_vm_numa_event(NUMA_HINT_FAULTS);
> @@ -1381,6 +1383,11 @@ out:
>   task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
>  
>   return 0;
> +huge_zero_page:
> + pmd = pmd_modify(pmd, vma->vm_page_prot);
> + set_pmd_at(mm, haddr, pmdp, pmd);
> + update_mmu_cache_pmd(vma, addr, pmdp);
> + goto out_unlock;

I don't see what prevents the code to make zero page writable here.
We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Mel Gorman
On Wed, Oct 01, 2014 at 09:18:25AM -0700, Linus Torvalds wrote:
> On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
>  wrote:
> >
> > We need to get rid of it, and just make it the same as pte_protnone().
> > And then the real protnone is in the vma flags, and if you actually
> > ever get to a pte that is marked protnone, you know it's a numa page.
> 
> So I'd really suggest we do exactly that. Get rid of "pte_numa()"
> entirely, get rid of "_PAGE_[BIT_]NUMA" entirely, and instead add a
> "pte_protnone()" helper to check for the "protnone" case (which on x86
> is testing the _PAGE_PROTNONE bit, and on most other architectures is
> just testing that the page has no access rights).
> 

Do not interpret the following as being against the idea of taking the
pte_protnone approach. This is intended to give background.

At the time the changes were made to the _PAGE_NUMA bits it was acknowledged
that a full move to prot_none was an option but it was not the preferred
solution at the time. It replaced one set of corner cases with another and
the last time like this time, there was considerable time pressure. The
VMA would be required to distinguish between a NUMA hinting fault and a
real prot_none bit. In most cases, we have the VMA now with the exception
of GUP. GUP would have to unconditionally go into the slow path to do the
VMA lookup. That is not likely to be a big of a problem but it was a concern.

In early implementations based on prot_none there were some VMA-based
protection checks that had higher overhead. At the time, there were severe
problems with overhead due to NUMA balancing and adding more was not
desirable. This has been addressed since with changes in multiple other
areas so it's much less of a concern now than it was. In the current shape,
these probably is not as much a problem as long as any check on pte_numa
was first guarded by a VMA check. One way of handling the corner cases
where would be to pass in the VMA where available and have a VM_BUG_ON that
fires if its a PROT_NONE VMA. That would catch problems during debugging
without adding overhead in the !debug case.

Going back to the start, the PTE bit was used as the approach due to
concerns that a pte_protnone helper would not work on all architectures,
ppc64 in particular.  There was no PROT_NONE bit there and instead prot_none
protections rely on PAGE_USER not being set so it's inaccessible from
userspace. There was discussion at the time that this could conceivably be
broken from some sub-architectures but I don't recall the details. Looking
at the current shape and your patch, it's conceivable that the pte_protnone
could be implemented as a _PAGE_PRESENT && !_PAGE_USER check as long
as it was guarded by a VMA check which x86 requires anyway. Not sure
if that would work for PMDs as I'm not familiar with with ppc64 to tell
offhand. Alternatively, ppc64 would potentially use the bit currently used
for _PAGE_NUMA as a _PROT_NONE bit.

I finished a small series that cleans up a number of issues discovered
recently due to Sasha's testing. It passed light testing and NUMA balancing
works but I cannot comment if they help Dave or Sasha's bugs as I never
managed to reproduce them. I'll post it shortly after I sent this mail.

Again, I'm not opposed to the pte_protnone issue as many of the concerns
I had before have been addressed since or rendered redundant. However,
I won't be able to digest and/or finish your patch in a reasonable time
frame. I'm only partially in work at the moment due to sick (going back
to bed after this mail) and out for a good chunk of next week and the week
after. Minor comments from the patch though

1. ppc64 needs work. Added Aneesh to the cc so he's at least aware
2. That check in pte_offset_kernel can probably go away
3. Ideally, VM_BUG_ON checks should be added to the pte_protnone check to
   ensure the VMA checks have already completed to avoid confusion between
   NUMA hints and real prot_none protections
4. GUP on prot_none now always hits the slow path but can't think of a
   case where that really matters.
5. SWP_OFFSET_SHIFT should be readjusted back if _PAGE_BIT_NUMA is
   removed.
6. It probably should at least be rebased on top of "mm: remove
   misleading ARCH_USES_NUMA_PROT_NONE" simply on the grounds that
   it cleans up some cruft
7. At the risk of pissing you off, pte_protnone_numa might be cleared
   at indicating why protnone is being checked at all

> Then we throw away "pte_mknuma()" and "pte_mknonnuma()" entirely,
> because they are brainless sh*t, and we just use
> 
> ptent = ptep_modify_prot_start(mm, addr, pte);
> ptent = pte_modify(ptent, newprot);
> ptep_modify_prot_commit(mm, addr, pte, ptent);
> 
> reliably instead (where for the mknuma case "newprot" is PROT_NONE,
> and for mknonnuma() it is vma->vm_page_prot. Yes, that means that you
> have to pass in the vma to those functions, but that just makes sense
> anyway.
> 
> And if that means that we lose the numa flag on mprotect 

Re: pipe/page fault oddness.

2014-10-02 Thread Hugh Dickins
On Wed, 1 Oct 2014, Linus Torvalds wrote:
> On Wed, Oct 1, 2014 at 1:19 AM, Hugh Dickins  wrote:
> 
> Can we please just get rid of _PAGE_NUMA. There is no excuse for it.

I'm no lover of _PAGE_NUMA, and hope that it can be simplified away
as you outline.  What we have in 3.16+3.17 is already an attempt to
improve on what you hated before, but not obviously an improvement.

Mel is the one who knows these issues inside out: maybe he's been
blinkered, but I wouldn't dare to pull it apart without his input.
Myself, I'm not looking beyond fixing whatever is the bug for 3.17.

> > However, that would still not explain Dave's endless refaulting;
> 
> Why not? You start out with a PROTNONE, trigger shrink_page_list() on
> a hugepage,.which calls add_to_swap(), which does
> split_huge_page_to_list(), which in turn calls __split_huge_page(),
> and that turns (_PAGE_PROTNONE) into (_PAGE_PROTNONE|_PAGE_NUMA),
> which you will then fault on forever, because the kernel thinks the
> page is present, but not a NUMA page.

I hesitate to admit, I still don't see it: please illuminate further.

We're talking about the loop in __split_huge_page_map(), where it does

entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
if (pmd_numa(*pmd))
entry = pte_mknuma(entry);

, right?  I only see that adding _PAGE_NUMA to _PAGE_PROTNONE if
pmd_numa(*pmd): but that would mean we had already gone wrong, setting
pmd_numa in a PROT_NONE vma, which task_numa_work takes care not to do;
or have mprotected an area to PROT_NONE without doing the pmd_mknonnuma.

Ah, we won't have mmap_sem in the add_to_swap case; so we could be
racing with an mprotect which already updated vm_flags and vm_page_prot,
but has not yet reached this pmd: is that a part of how you see it?

Or are you noticing a deficiency in the pmd locking?  I have not
worked my way through that, so cannot guarantee it, but please
point me to the weakness where you see it.

But when you convince me on that, then I still don't see how we get to
doing the repeated write fault, instead of hitting access_error(), as
you pointed out originally.  That still seems to require a PROTNONE
pte to be left behind in a VM_WRITE vma, which I do not see happening
here.  pte_modify leaves _PAGE_NUMA alone, but updates _PAGE_PROTNONE.
The pte_numa<->pte_special confusion is messy, but I don't yet get
how it would manifest in the manner observed.

But certainly, a bug in the THP splitting feels just right to
match the frequency of the sightings: I hope you've got it.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Peter Zijlstra
On Wed, Oct 01, 2014 at 01:29:04PM -0400, Rik van Riel wrote:
> On 10/01/2014 12:18 PM, Linus Torvalds wrote:
> 
> > Seriously, why can't we just do this, and throw away all the crap that
> > is "numa special case". This would make all the random games in
> > change_pte_range() just go away entirely, because the whole NUMA thing
> > really wouldn't be a special case for the pte AT ALL any more. All it
> > would be is that a pte could be marked PROT_NONE even if the
> > vma->vm_flags aren't.
> 
> That's what I suggested quite a while back, but IIRC either
> Peter or Mel brought up a reason why this was not possible.

Hey, I'm the one that did the numa faulting with PROTNONE to begin with.
Andrea and Mel (and possibly you) all didn't like that and did the per
arch pagetable nonensen :-) I've never understood why.

https://lkml.org/lkml/2012/9/27/76
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Chuck Ebbert
On Wed, 01 Oct 2014 23:32:15 -0400
Sasha Levin  wrote:

> On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
> > On Wed, 01 Oct 2014 18:08:30 -0400
> > Sasha Levin  wrote:
> > 
> >> > On 10/01/2014 04:20 PM, Linus Torvalds wrote:
> >>> > > So I'm really sending this patch out in the hope that it will get
> >>> > > comments, fixup and possibly even testing by people who actually know
> >>> > > the NUMA balancing code. Rik?  Anybody?
> >> > 
> >> > Hi Linus,
> >> > 
> >> > I've tried this patch on the same configuration that was triggering
> >> > the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
> >> > ran fine for ~20 minutes before exploding with:
> >> > 
> >> > [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
> > That's:
> > BUG_ON(is_huge_zero_page(page));
> > 
> > Can you change your scripts to show the source code line when
> > the error is a BUG_ON()? The machine code disassembly after the
> > oops message doesn't really help.
> > 
> 
> Hum? The source code line is the first line in the trace:
> 
>   [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
> 

I meant, display the contents of that line so we can see what the
BUG_ON() was triggered by. In some cases you might have a custom patch
applied or be running a version that some people don't have handy.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/02/2014 12:10 PM, Linus Torvalds wrote:
 On Thu, Oct 2, 2014 at 8:04 AM, Sasha Levin sasha.le...@oracle.com wrote:
 
  I have a new one for you. I know it doesn't say numa anywhere, but I
  haven't ever seen that trace before so I'll just go ahead and blame it
  on your patch...
 Fair enough, but the oops doesn't really give even a hint of what
 could be wrong.
 
 The stack is clearly too deep:
 
   Thread overran stack, or stack corrupted
   task.ti: 880dba2ec000
 RSP: 880dba2ebf48
 
 but my patch shouldn't have added any deeper call-chains anywhere.

For the record, I tweaked the environment to put some more pressure on the
scheduler and found out what broke (which is not related to this thread at
all).

For the curious ones: https://lkml.org/lkml/2014/10/3/5


Thanks,
Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Chuck Ebbert
On Wed, 01 Oct 2014 23:32:15 -0400
Sasha Levin sasha.le...@oracle.com wrote:

 On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
  On Wed, 01 Oct 2014 18:08:30 -0400
  Sasha Levin sasha.le...@oracle.com wrote:
  
   On 10/01/2014 04:20 PM, Linus Torvalds wrote:
So I'm really sending this patch out in the hope that it will get
comments, fixup and possibly even testing by people who actually know
the NUMA balancing code. Rik?  Anybody?
   
   Hi Linus,
   
   I've tried this patch on the same configuration that was triggering
   the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
   ran fine for ~20 minutes before exploding with:
   
   [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
  That's:
  BUG_ON(is_huge_zero_page(page));
  
  Can you change your scripts to show the source code line when
  the error is a BUG_ON()? The machine code disassembly after the
  oops message doesn't really help.
  
 
 Hum? The source code line is the first line in the trace:
 
   [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
 

I meant, display the contents of that line so we can see what the
BUG_ON() was triggered by. In some cases you might have a custom patch
applied or be running a version that some people don't have handy.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Peter Zijlstra
On Wed, Oct 01, 2014 at 01:29:04PM -0400, Rik van Riel wrote:
 On 10/01/2014 12:18 PM, Linus Torvalds wrote:
 
  Seriously, why can't we just do this, and throw away all the crap that
  is numa special case. This would make all the random games in
  change_pte_range() just go away entirely, because the whole NUMA thing
  really wouldn't be a special case for the pte AT ALL any more. All it
  would be is that a pte could be marked PROT_NONE even if the
  vma-vm_flags aren't.
 
 That's what I suggested quite a while back, but IIRC either
 Peter or Mel brought up a reason why this was not possible.

Hey, I'm the one that did the numa faulting with PROTNONE to begin with.
Andrea and Mel (and possibly you) all didn't like that and did the per
arch pagetable nonensen :-) I've never understood why.

https://lkml.org/lkml/2012/9/27/76
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Hugh Dickins
On Wed, 1 Oct 2014, Linus Torvalds wrote:
 On Wed, Oct 1, 2014 at 1:19 AM, Hugh Dickins hu...@google.com wrote:
 
 Can we please just get rid of _PAGE_NUMA. There is no excuse for it.

I'm no lover of _PAGE_NUMA, and hope that it can be simplified away
as you outline.  What we have in 3.16+3.17 is already an attempt to
improve on what you hated before, but not obviously an improvement.

Mel is the one who knows these issues inside out: maybe he's been
blinkered, but I wouldn't dare to pull it apart without his input.
Myself, I'm not looking beyond fixing whatever is the bug for 3.17.

  However, that would still not explain Dave's endless refaulting;
 
 Why not? You start out with a PROTNONE, trigger shrink_page_list() on
 a hugepage,.which calls add_to_swap(), which does
 split_huge_page_to_list(), which in turn calls __split_huge_page(),
 and that turns (_PAGE_PROTNONE) into (_PAGE_PROTNONE|_PAGE_NUMA),
 which you will then fault on forever, because the kernel thinks the
 page is present, but not a NUMA page.

I hesitate to admit, I still don't see it: please illuminate further.

We're talking about the loop in __split_huge_page_map(), where it does

entry = mk_pte(page + i, vma-vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
if (pmd_numa(*pmd))
entry = pte_mknuma(entry);

, right?  I only see that adding _PAGE_NUMA to _PAGE_PROTNONE if
pmd_numa(*pmd): but that would mean we had already gone wrong, setting
pmd_numa in a PROT_NONE vma, which task_numa_work takes care not to do;
or have mprotected an area to PROT_NONE without doing the pmd_mknonnuma.

Ah, we won't have mmap_sem in the add_to_swap case; so we could be
racing with an mprotect which already updated vm_flags and vm_page_prot,
but has not yet reached this pmd: is that a part of how you see it?

Or are you noticing a deficiency in the pmd locking?  I have not
worked my way through that, so cannot guarantee it, but please
point me to the weakness where you see it.

But when you convince me on that, then I still don't see how we get to
doing the repeated write fault, instead of hitting access_error(), as
you pointed out originally.  That still seems to require a PROTNONE
pte to be left behind in a VM_WRITE vma, which I do not see happening
here.  pte_modify leaves _PAGE_NUMA alone, but updates _PAGE_PROTNONE.
The pte_numa-pte_special confusion is messy, but I don't yet get
how it would manifest in the manner observed.

But certainly, a bug in the THP splitting feels just right to
match the frequency of the sightings: I hope you've got it.

Hugh
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Mel Gorman
On Wed, Oct 01, 2014 at 09:18:25AM -0700, Linus Torvalds wrote:
 On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
 torva...@linux-foundation.org wrote:
 
  We need to get rid of it, and just make it the same as pte_protnone().
  And then the real protnone is in the vma flags, and if you actually
  ever get to a pte that is marked protnone, you know it's a numa page.
 
 So I'd really suggest we do exactly that. Get rid of pte_numa()
 entirely, get rid of _PAGE_[BIT_]NUMA entirely, and instead add a
 pte_protnone() helper to check for the protnone case (which on x86
 is testing the _PAGE_PROTNONE bit, and on most other architectures is
 just testing that the page has no access rights).
 

Do not interpret the following as being against the idea of taking the
pte_protnone approach. This is intended to give background.

At the time the changes were made to the _PAGE_NUMA bits it was acknowledged
that a full move to prot_none was an option but it was not the preferred
solution at the time. It replaced one set of corner cases with another and
the last time like this time, there was considerable time pressure. The
VMA would be required to distinguish between a NUMA hinting fault and a
real prot_none bit. In most cases, we have the VMA now with the exception
of GUP. GUP would have to unconditionally go into the slow path to do the
VMA lookup. That is not likely to be a big of a problem but it was a concern.

In early implementations based on prot_none there were some VMA-based
protection checks that had higher overhead. At the time, there were severe
problems with overhead due to NUMA balancing and adding more was not
desirable. This has been addressed since with changes in multiple other
areas so it's much less of a concern now than it was. In the current shape,
these probably is not as much a problem as long as any check on pte_numa
was first guarded by a VMA check. One way of handling the corner cases
where would be to pass in the VMA where available and have a VM_BUG_ON that
fires if its a PROT_NONE VMA. That would catch problems during debugging
without adding overhead in the !debug case.

Going back to the start, the PTE bit was used as the approach due to
concerns that a pte_protnone helper would not work on all architectures,
ppc64 in particular.  There was no PROT_NONE bit there and instead prot_none
protections rely on PAGE_USER not being set so it's inaccessible from
userspace. There was discussion at the time that this could conceivably be
broken from some sub-architectures but I don't recall the details. Looking
at the current shape and your patch, it's conceivable that the pte_protnone
could be implemented as a _PAGE_PRESENT  !_PAGE_USER check as long
as it was guarded by a VMA check which x86 requires anyway. Not sure
if that would work for PMDs as I'm not familiar with with ppc64 to tell
offhand. Alternatively, ppc64 would potentially use the bit currently used
for _PAGE_NUMA as a _PROT_NONE bit.

I finished a small series that cleans up a number of issues discovered
recently due to Sasha's testing. It passed light testing and NUMA balancing
works but I cannot comment if they help Dave or Sasha's bugs as I never
managed to reproduce them. I'll post it shortly after I sent this mail.

Again, I'm not opposed to the pte_protnone issue as many of the concerns
I had before have been addressed since or rendered redundant. However,
I won't be able to digest and/or finish your patch in a reasonable time
frame. I'm only partially in work at the moment due to sick (going back
to bed after this mail) and out for a good chunk of next week and the week
after. Minor comments from the patch though

1. ppc64 needs work. Added Aneesh to the cc so he's at least aware
2. That check in pte_offset_kernel can probably go away
3. Ideally, VM_BUG_ON checks should be added to the pte_protnone check to
   ensure the VMA checks have already completed to avoid confusion between
   NUMA hints and real prot_none protections
4. GUP on prot_none now always hits the slow path but can't think of a
   case where that really matters.
5. SWP_OFFSET_SHIFT should be readjusted back if _PAGE_BIT_NUMA is
   removed.
6. It probably should at least be rebased on top of mm: remove
   misleading ARCH_USES_NUMA_PROT_NONE simply on the grounds that
   it cleans up some cruft
7. At the risk of pissing you off, pte_protnone_numa might be cleared
   at indicating why protnone is being checked at all

 Then we throw away pte_mknuma() and pte_mknonnuma() entirely,
 because they are brainless sh*t, and we just use
 
 ptent = ptep_modify_prot_start(mm, addr, pte);
 ptent = pte_modify(ptent, newprot);
 ptep_modify_prot_commit(mm, addr, pte, ptent);
 
 reliably instead (where for the mknuma case newprot is PROT_NONE,
 and for mknonnuma() it is vma-vm_page_prot. Yes, that means that you
 have to pass in the vma to those functions, but that just makes sense
 anyway.
 
 And if that means that we lose the numa flag on mprotect etc, nobody sane 
 

Re: pipe/page fault oddness.

2014-10-02 Thread Kirill A. Shutemov
On Wed, Oct 01, 2014 at 03:42:53PM -0700, Linus Torvalds wrote:
 On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin sasha.le...@oracle.com wrote:
 
  I've tried this patch on the same configuration that was triggering
  the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
  ran fine for ~20 minutes before exploding with:
 
 Well, that's somewhat encouraging. I didn't expect it to be perfect.
 
 That said, ran fine isn't necessarily the same thing as worked.
 Who knows how buggy it was without showing overt symptoms until the
 BUG_ON() triggered. But hey, I'll be optimistic.
 
  [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
 
 So that's
 
 BUG_ON(is_huge_zero_page(page));
 
 and the reason is trivial: the old code used to have a magical special
 case for the zero-page hugepage (see change_huge_pmd()) and I got rid
 of that (because now it's just about setting protections, and the
 zero-page hugepage is in no way special.
 
 So I think the solution is equally trivial: just accept that the
 zero-page can happen, and ignore it (just un-numa it).
 
 Appended is a incremental diff on top of the previous one. Even less
 tested than the last case, but I think you get the idea if it doesn't
 work as-is.
 
  Linus

 diff --git a/mm/huge_memory.c b/mm/huge_memory.c
 index 14de54af6c38..fc33952d59c4 100644
 --- a/mm/huge_memory.c
 +++ b/mm/huge_memory.c
 @@ -1290,7 +1290,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct 
 vm_area_struct *vma,
   }
  
   page = pmd_page(pmd);
 - BUG_ON(is_huge_zero_page(page));
 + if (is_huge_zero_page(page))
 + goto huge_zero_page;
 +
   page_nid = page_to_nid(page);
   last_cpupid = page_cpupid_last(page);
   count_vm_numa_event(NUMA_HINT_FAULTS);
 @@ -1381,6 +1383,11 @@ out:
   task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
  
   return 0;
 +huge_zero_page:
 + pmd = pmd_modify(pmd, vma-vm_page_prot);
 + set_pmd_at(mm, haddr, pmdp, pmd);
 + update_mmu_cache_pmd(vma, addr, pmdp);
 + goto out_unlock;

I don't see what prevents the code to make zero page writable here.
We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();

-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/02/2014 04:03 AM, Chuck Ebbert wrote:
 On Wed, 01 Oct 2014 23:32:15 -0400
 Sasha Levin sasha.le...@oracle.com wrote:
 
 On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
 On Wed, 01 Oct 2014 18:08:30 -0400
 Sasha Levin sasha.le...@oracle.com wrote:

 On 10/01/2014 04:20 PM, Linus Torvalds wrote:
 So I'm really sending this patch out in the hope that it will get
 comments, fixup and possibly even testing by people who actually know
 the NUMA balancing code. Rik?  Anybody?

 Hi Linus,

 I've tried this patch on the same configuration that was triggering
 the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
 ran fine for ~20 minutes before exploding with:

 [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
 That's:
 BUG_ON(is_huge_zero_page(page));

 Can you change your scripts to show the source code line when
 the error is a BUG_ON()? The machine code disassembly after the
 oops message doesn't really help.


 Hum? The source code line is the first line in the trace:

  [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!

 
 I meant, display the contents of that line so we can see what the
 BUG_ON() was triggered by. In some cases you might have a custom patch
 applied or be running a version that some people don't have handy.

Ah, the actual content? That's a good point - let me look into that.


Thanks,
Sasha

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Sasha Levin
On 10/01/2014 06:42 PM, Linus Torvalds wrote:
 On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin sasha.le...@oracle.com wrote:
 
  I've tried this patch on the same configuration that was triggering
  the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
  ran fine for ~20 minutes before exploding with:
 Well, that's somewhat encouraging. I didn't expect it to be perfect.
 
 That said, ran fine isn't necessarily the same thing as worked.
 Who knows how buggy it was without showing overt symptoms until the
 BUG_ON() triggered. But hey, I'll be optimistic.
 
  [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
 So that's
 
 BUG_ON(is_huge_zero_page(page));
 
 and the reason is trivial: the old code used to have a magical special
 case for the zero-page hugepage (see change_huge_pmd()) and I got rid
 of that (because now it's just about setting protections, and the
 zero-page hugepage is in no way special.
 
 So I think the solution is equally trivial: just accept that the
 zero-page can happen, and ignore it (just un-numa it).
 
 Appended is a incremental diff on top of the previous one. Even less
 tested than the last case, but I think you get the idea if it doesn't
 work as-is.

I have a new one for you. I know it doesn't say numa anywhere, but I
haven't ever seen that trace before so I'll just go ahead and blame it
on your patch...

[ 2838.403382] BUG: unable to handle kernel paging request at 00055d996e80
[ 2838.405740] IP: task_curr (kernel/sched/core.c:1010)
[ 2838.407076] PGD dba2c6067 PUD 0
[ 2838.407926] Thread overran stack, or stack corrupted
[ 2838.409093] Oops:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2838.411454] Dumping ftrace buffer:
[ 2838.412602](ftrace buffer empty)
[ 2838.413187] Modules linked in:
[ 2838.413187] CPU: 38 PID: 9342 Comm: trinity-c38 Not tainted 
3.17.0-rc7-sasha-00041-g6c9c81b #1260
[ 2838.413187] task: 880dba2f ti: 880dba2ec000 task.ti: 
880dba2ec000
[ 2838.413187] RIP: task_curr (kernel/sched/core.c:1010)
[ 2838.413187] RSP: 0018:880dba2ebf48  EFLAGS: 00010046
[ 2838.413187] RAX: f080 RBX: 880dba2f RCX: 000a
[ 2838.413187] RDX: ba1a9560 RSI: 880dba2f RDI: 880dba2f
[ 2838.413187] RBP: 880dba2ebf98 R08: 0004862a R09: 
[ 2838.413187] R10: 0038 R11: 001f R12: 880dba2f
[ 2838.413187] R13: 880dd5420740 R14: 000b R15: 8cc92000
[ 2838.413187] FS:  7f05f3dbc700() GS:880701e0() 
knlGS:
[ 2838.413187] CS:  0010 DS:  ES:  CR0: 8005003b
[ 2838.413187] CR2: 00055d996e80 CR3: 000dba2c5000 CR4: 06a0
[ 2838.413187] DR0: 006ee000 DR1:  DR2: 
[ 2838.413187] DR3:  DR6: 0ff0 DR7: 00090602
[ 2838.413187] Stack:
[ 2838.413187]  8816218b  880d000a 
000b
[ 2838.413187]  0082 880dba2f 000b 
880dba2ec070
[ 2838.413187]   8cc92000 880dba2ebff8 
88162a84
[ 2838.413187] Call Trace:
[ 2838.413187]  UNK
[ 2838.413187] Code: 87 60 09 00 00 01 e8 8d ee ff ff 5d c3 66 66 2e 0f 1f 84 
00 00 00 00 00 48 8b 57 08 55 48 c7 c0 80 f0 00 00 48 89 e5 5d 8b 52 18 48 8b 
14 d5 80 c3 c4 8c 48 39 bc 10 68 09 00 00 0f 94 c0 0f b6
All code

   0:   87 60 09xchg   %esp,0x9(%rax)
   3:   00 00   add%al,(%rax)
   5:   01 e8   add%ebp,%eax
   7:   8d  (bad)
   8:   ee  out%al,(%dx)
   9:   ff  (bad)
   a:   ff 5d c3lcallq *-0x3d(%rbp)
   d:   66 66 2e 0f 1f 84 00data32 nopw %cs:0x0(%rax,%rax,1)
  14:   00 00 00 00
  18:   48 8b 57 08 mov0x8(%rdi),%rdx
  1c:   55  push   %rbp
  1d:   48 c7 c0 80 f0 00 00mov$0xf080,%rax
  24:   48 89 e5mov%rsp,%rbp
  27:   5d  pop%rbp
  28:   8b 52 18mov0x18(%rdx),%edx
  2b:*  48 8b 14 d5 80 c3 c4mov-0x733b3c80(,%rdx,8),%rdx
-- trapping instruction
  32:   8c
  33:   48 39 bc 10 68 09 00cmp%rdi,0x968(%rax,%rdx,1)
  3a:   00
  3b:   0f 94 c0sete   %al
  3e:   0f b6 00movzbl (%rax),%eax

Code starting with the faulting instruction
===
   0:   48 8b 14 d5 80 c3 c4mov-0x733b3c80(,%rdx,8),%rdx
   7:   8c
   8:   48 39 bc 10 68 09 00cmp%rdi,0x968(%rax,%rdx,1)
   f:   00
  10:   0f 94 c0sete   %al
  13:   0f b6 00movzbl (%rax),%eax
[ 2838.413187] RIP task_curr (kernel/sched/core.c:1010)
[ 2838.413187]  RSP 880dba2ebf48
[ 2838.413187] CR2: 00055d996e80


Thanks,
Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the 

Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 1:47 AM, Hugh Dickins hu...@google.com wrote:

 I hesitate to admit, I still don't see it: please illuminate further.

No, your'e looking at what I was looking.

 We're talking about the loop in __split_huge_page_map(), where it does

Yes.

 entry = mk_pte(page + i, vma-vm_page_prot);
 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 if (!pmd_write(*pmd))
 entry = pte_wrprotect(entry);
 if (!pmd_young(*pmd))
 entry = pte_mkold(entry);
 if (pmd_numa(*pmd))
 entry = pte_mknuma(entry);

 , right?  I only see that adding _PAGE_NUMA to _PAGE_PROTNONE if
 pmd_numa(*pmd): but that would mean we had already gone wrong, setting
 pmd_numa in a PROT_NONE vma, which task_numa_work takes care not to do;
 or have mprotected an area to PROT_NONE without doing the pmd_mknonnuma.

Fair enough. Except this code has no locking that I see, so if we
*ever* see that numa entry in the pmd while walking the page tables in
vmscan, we're basically screwed.

 Or are you noticing a deficiency in the pmd locking?  I have not
 worked my way through that, so cannot guarantee it, but please
 point me to the weakness where you see it.

So I don't see any locking at all wrt mprotect (or new mmap). That's
kind of the whole point for page-out - it bypasses all the normal VM
locks, and only uses the last pte locking.

So the whole use of vma-vm_page_prot here is a bit scary. That gets
modified outside of the page table locks. So how do you know it's not
already PROT_NONE, but mprotect just hasn't gotten to actually take
the page table locks yet?

I dunno. It all makes me just very nervous. The whole numa bit is
separate from the protections, has different locking, and is just
oddly and subtly different is really what I fundamentally object to.
And it seems so _unnecessary_. All this odd complexity for no actual
gain - just extra code, and extra room for subtle bugs. Which is
exactly why I hate that magic NUMA bit so much.

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 7:25 AM, Kirill A. Shutemov kir...@shutemov.name wrote:

 I don't see what prevents the code to make zero page writable here.
 We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();

Do we? If it's the zero page, it had better be an anonymous mapping,
and vm_page_prot had better not be writable.

Anonymous pages don't _start_ out writable, we explicitly make them so
with code like

if (vma-vm_flags  VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));

so it should be fine to just use pmd_modify(pmd, vma-vm_page_prot); directly.

But hey, this is the kind of thing that maybe I'm missing something on..

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Linus Torvalds
On Thu, Oct 2, 2014 at 8:04 AM, Sasha Levin sasha.le...@oracle.com wrote:

 I have a new one for you. I know it doesn't say numa anywhere, but I
 haven't ever seen that trace before so I'll just go ahead and blame it
 on your patch...

Fair enough, but the oops doesn't really give even a hint of what
could be wrong.

The stack is clearly too deep:

  Thread overran stack, or stack corrupted
  task.ti: 880dba2ec000
RSP: 880dba2ebf48

but my patch shouldn't have added any deeper call-chains anywhere.

Anyway, unless you can get some other interesting oops with more hints
about where we go wrong, you might want to try Mel's four cleanup
patches instead. I love how you're testing my quick-and-dirty hack,
and I think we'll need to do this eventually, but in the short term
Mel's patches are probably worth applying.

In particular, his 4/4 patch removes the code case that I was
particularly nervous about, so it looks worth a try, because that may
just remove the bug with much smaller effort.

   Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-02 Thread Kirill A. Shutemov
On Thu, Oct 02, 2014 at 09:01:38AM -0700, Linus Torvalds wrote:
 On Thu, Oct 2, 2014 at 7:25 AM, Kirill A. Shutemov kir...@shutemov.name 
 wrote:
 
  I don't see what prevents the code to make zero page writable here.
  We need at least pmd = pmd_wrprotect(pmd) before set_pmd_at();
 
 Do we? If it's the zero page, it had better be an anonymous mapping,
 and vm_page_prot had better not be writable.
 
 Anonymous pages don't _start_ out writable, we explicitly make them so
 with code like
 
 if (vma-vm_flags  VM_WRITE)
 entry = pte_mkwrite(pte_mkdirty(entry));
 
 so it should be fine to just use pmd_modify(pmd, vma-vm_page_prot); 
 directly.
 
 But hey, this is the kind of thing that maybe I'm missing something on..

You're right.
It means we have redundant pmd_wrprotect() in set_huge_zero_page().

==

Subject: [PATCH] thp: do not mark zero-page pmd write-protected explicitly

Zero pages can be used only in anonymous mappings, which never have
writable vma-vm_page_prot: see protection_map in mm/mmap.c and __PX1X
definitions.

Let's drop redundant pmd_wrprotect() in set_huge_zero_page().

Signed-off-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
---
 mm/huge_memory.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..2c17d184b56d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct 
mm_struct *mm,
if (!pmd_none(*pmd))
return false;
entry = mk_pmd(zero_page, vma-vm_page_prot);
-   entry = pmd_wrprotect(entry);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
-- 
 Kirill A. Shutemov
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Sasha Levin
On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
> On Wed, 01 Oct 2014 18:08:30 -0400
> Sasha Levin  wrote:
> 
>> > On 10/01/2014 04:20 PM, Linus Torvalds wrote:
>>> > > So I'm really sending this patch out in the hope that it will get
>>> > > comments, fixup and possibly even testing by people who actually know
>>> > > the NUMA balancing code. Rik?  Anybody?
>> > 
>> > Hi Linus,
>> > 
>> > I've tried this patch on the same configuration that was triggering
>> > the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
>> > ran fine for ~20 minutes before exploding with:
>> > 
>> > [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
> That's:
>   BUG_ON(is_huge_zero_page(page));
> 
> Can you change your scripts to show the source code line when
> the error is a BUG_ON()? The machine code disassembly after the
> oops message doesn't really help.
> 

Hum? The source code line is the first line in the trace:

[ 2781.566206] kernel BUG at mm/huge_memory.c:1293!


Thanks,
Sasha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin  wrote:
>
> I've tried this patch on the same configuration that was triggering
> the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
> ran fine for ~20 minutes before exploding with:

Well, that's somewhat encouraging. I didn't expect it to be perfect.

That said, "ran fine" isn't necessarily the same thing as "worked".
Who knows how buggy it was without showing overt symptoms until the
BUG_ON() triggered. But hey, I'll be optimistic.

> [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!

So that's

BUG_ON(is_huge_zero_page(page));

and the reason is trivial: the old code used to have a magical special
case for the zero-page hugepage (see change_huge_pmd()) and I got rid
of that (because now it's just about setting protections, and the
zero-page hugepage is in no way special.

So I think the solution is equally trivial: just accept that the
zero-page can happen, and ignore it (just un-numa it).

Appended is a incremental diff on top of the previous one. Even less
tested than the last case, but I think you get the idea if it doesn't
work as-is.

 Linus
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 14de54af6c38..fc33952d59c4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1290,7 +1290,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
}
 
page = pmd_page(pmd);
-   BUG_ON(is_huge_zero_page(page));
+   if (is_huge_zero_page(page))
+   goto huge_zero_page;
+
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
@@ -1381,6 +1383,11 @@ out:
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
 
return 0;
+huge_zero_page:
+   pmd = pmd_modify(pmd, vma->vm_page_prot);
+   set_pmd_at(mm, haddr, pmdp, pmd);
+   update_mmu_cache_pmd(vma, addr, pmdp);
+   goto out_unlock;
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,


Re: pipe/page fault oddness.

2014-10-01 Thread Chuck Ebbert
On Wed, 01 Oct 2014 18:08:30 -0400
Sasha Levin  wrote:

> On 10/01/2014 04:20 PM, Linus Torvalds wrote:
> > So I'm really sending this patch out in the hope that it will get
> > comments, fixup and possibly even testing by people who actually know
> > the NUMA balancing code. Rik?  Anybody?
> 
> Hi Linus,
> 
> I've tried this patch on the same configuration that was triggering
> the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
> ran fine for ~20 minutes before exploding with:
> 
> [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!

That's:
BUG_ON(is_huge_zero_page(page));

Can you change your scripts to show the source code line when
the error is a BUG_ON()? The machine code disassembly after the
oops message doesn't really help.

> [ 2781.566953] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
> [ 2781.568054] Dumping ftrace buffer:
> [ 2781.568826](ftrace buffer empty)
> [ 2781.569392] Modules linked in:
> [ 2781.569909] CPU: 61 PID: 13111 Comm: trinity-c61 Not tainted 
> 3.17.0-rc7-sasha-00040-g65e1cb2 #1259
> [ 2781.571077] task: 88050ba8 ti: 880418ecc000 task.ti: 
> 880418ecc000
> [ 2781.571077] RIP: do_huge_pmd_numa_page (mm/huge_memory.c:1293 
> (discriminator 1))
> [ 2781.571077] RSP: :880418ecfc60  EFLAGS: 00010246
> [ 2781.571077] RAX: ea0074c6 RBX: ea0074c6 RCX: 
> 001d318009e0
> [ 2781.571077] RDX: ea00 RSI: b5706ef3 RDI: 
> 001d318009e0
> [ 2781.571077] RBP: 880418ecfcc8 R08: 0038 R09: 
> 0001
> [ 2781.571077] R10: 0038 R11: 0001 R12: 
> 8804f9b52000
> [ 2781.571077] R13: 001d318009e0 R14: 880508a1f840 R15: 
> 0028
> [ 2781.571077] FS:  7f5502fc9700() GS:881d77e0() 
> knlGS:
> [ 2781.571077] CS:  0010 DS:  ES:  CR0: 80050033
> [ 2781.571077] CR2:  CR3: 0004bfac4000 CR4: 
> 06a0
> [ 2781.571077] Stack:
> [ 2781.571077]  880418ecfc98 0282 88050ba8 
> 000b
> [ 2781.571077]  88060d2ab000 8806001d  
> 881d30b3ec00
> [ 2781.571077]   881d30b3ec00 88060d2ab000 
> 0100
> [ 2781.571077] Call Trace:
> [ 2781.571077] handle_mm_fault (mm/memory.c:3304 mm/memory.c:3368)
> [ 2781.571077] __do_page_fault (arch/x86/mm/fault.c:1231)
> [ 2781.571077] ? kvm_clock_read (./arch/x86/include/asm/preempt.h:90 
> arch/x86/kernel/kvmclock.c:86)
> [ 2781.571077] ? sched_clock (./arch/x86/include/asm/paravirt.h:192 
> arch/x86/kernel/tsc.c:304)
> [ 2781.571077] ? sched_clock_local (kernel/sched/clock.c:214)
> [ 2781.571077] ? context_tracking_user_exit (kernel/context_tracking.c:184)
> [ 2781.571077] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
> [ 2781.571077] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
> (discriminator 8))
> [ 2781.571077] trace_do_page_fault (arch/x86/mm/fault.c:1314 
> include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
> include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
> [ 2781.571077] do_async_page_fault (arch/x86/kernel/kvm.c:279)
> [ 2781.571077] async_page_fault (arch/x86/kernel/entry_64.S:1314)
> [ 2781.571077] ? copy_user_generic_unrolled (arch/x86/lib/copy_user_64.S:166)
> [ 2781.571077] ? sys32_mmap (arch/x86/ia32/sys_ia32.c:159)
> [ 2781.571077] ia32_do_call (arch/x86/ia32/ia32entry.S:430)
> [ 2781.571077] Code: b4 eb e0 0f 1f 84 00 00 00 00 00 4c 89 f7 e8 88 2f 0c 03 
> 48 8b 45 d0 4c 89 e6 48 8b b8 88 00 00 00 e8 85 c7 ff ff e9 90 fe ff ff <0f> 
> 0b 66 0f 1f 44 00 00 48 89 df e8 90 e9 f9 ff 84 c0 0f 85 be
> All code
> 
>0: b4 eb   mov$0xeb,%ah
>2: e0 0f   loopne 0x13
>4: 1f  (bad)
>5: 84 00   test   %al,(%rax)
>7: 00 00   add%al,(%rax)
>9: 00 00   add%al,(%rax)
>b: 4c 89 f7mov%r14,%rdi
>e: e8 88 2f 0c 03  callq  0x30c2f9b
>   13: 48 8b 45 d0 mov-0x30(%rbp),%rax
>   17: 4c 89 e6mov%r12,%rsi
>   1a: 48 8b b8 88 00 00 00mov0x88(%rax),%rdi
>   21: e8 85 c7 ff ff  callq  0xc7ab
>   26: e9 90 fe ff ff  jmpq   0xfebb
>   2b:*0f 0b   ud2 <-- trapping instruction
>   2d: 66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
>   33: 48 89 dfmov%rbx,%rdi
>   36: e8 90 e9 f9 ff  callq  0xfff9e9cb
>   3b: 84 c0   test   %al,%al
>   3d: 0f  .byte 0xf
>   3e: 85  .byte 0x85
>   3f: be  .byte 0xbe
>   ...
> 
> Code starting with the faulting instruction
> ===
>0: 0f 0b   ud2
>2: 66 0f 1f 44 00 00   nopw   

Re: pipe/page fault oddness.

2014-10-01 Thread Sasha Levin
On 10/01/2014 04:20 PM, Linus Torvalds wrote:
> So I'm really sending this patch out in the hope that it will get
> comments, fixup and possibly even testing by people who actually know
> the NUMA balancing code. Rik?  Anybody?

Hi Linus,

I've tried this patch on the same configuration that was triggering
the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
ran fine for ~20 minutes before exploding with:

[ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
[ 2781.566953] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2781.568054] Dumping ftrace buffer:
[ 2781.568826](ftrace buffer empty)
[ 2781.569392] Modules linked in:
[ 2781.569909] CPU: 61 PID: 13111 Comm: trinity-c61 Not tainted 
3.17.0-rc7-sasha-00040-g65e1cb2 #1259
[ 2781.571077] task: 88050ba8 ti: 880418ecc000 task.ti: 
880418ecc000
[ 2781.571077] RIP: do_huge_pmd_numa_page (mm/huge_memory.c:1293 (discriminator 
1))
[ 2781.571077] RSP: :880418ecfc60  EFLAGS: 00010246
[ 2781.571077] RAX: ea0074c6 RBX: ea0074c6 RCX: 001d318009e0
[ 2781.571077] RDX: ea00 RSI: b5706ef3 RDI: 001d318009e0
[ 2781.571077] RBP: 880418ecfcc8 R08: 0038 R09: 0001
[ 2781.571077] R10: 0038 R11: 0001 R12: 8804f9b52000
[ 2781.571077] R13: 001d318009e0 R14: 880508a1f840 R15: 0028
[ 2781.571077] FS:  7f5502fc9700() GS:881d77e0() 
knlGS:
[ 2781.571077] CS:  0010 DS:  ES:  CR0: 80050033
[ 2781.571077] CR2:  CR3: 0004bfac4000 CR4: 06a0
[ 2781.571077] Stack:
[ 2781.571077]  880418ecfc98 0282 88050ba8 
000b
[ 2781.571077]  88060d2ab000 8806001d  
881d30b3ec00
[ 2781.571077]   881d30b3ec00 88060d2ab000 
0100
[ 2781.571077] Call Trace:
[ 2781.571077] handle_mm_fault (mm/memory.c:3304 mm/memory.c:3368)
[ 2781.571077] __do_page_fault (arch/x86/mm/fault.c:1231)
[ 2781.571077] ? kvm_clock_read (./arch/x86/include/asm/preempt.h:90 
arch/x86/kernel/kvmclock.c:86)
[ 2781.571077] ? sched_clock (./arch/x86/include/asm/paravirt.h:192 
arch/x86/kernel/tsc.c:304)
[ 2781.571077] ? sched_clock_local (kernel/sched/clock.c:214)
[ 2781.571077] ? context_tracking_user_exit (kernel/context_tracking.c:184)
[ 2781.571077] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
[ 2781.571077] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
(discriminator 8))
[ 2781.571077] trace_do_page_fault (arch/x86/mm/fault.c:1314 
include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
[ 2781.571077] do_async_page_fault (arch/x86/kernel/kvm.c:279)
[ 2781.571077] async_page_fault (arch/x86/kernel/entry_64.S:1314)
[ 2781.571077] ? copy_user_generic_unrolled (arch/x86/lib/copy_user_64.S:166)
[ 2781.571077] ? sys32_mmap (arch/x86/ia32/sys_ia32.c:159)
[ 2781.571077] ia32_do_call (arch/x86/ia32/ia32entry.S:430)
[ 2781.571077] Code: b4 eb e0 0f 1f 84 00 00 00 00 00 4c 89 f7 e8 88 2f 0c 03 
48 8b 45 d0 4c 89 e6 48 8b b8 88 00 00 00 e8 85 c7 ff ff e9 90 fe ff ff <0f> 0b 
66 0f 1f 44 00 00 48 89 df e8 90 e9 f9 ff 84 c0 0f 85 be
All code

   0:   b4 eb   mov$0xeb,%ah
   2:   e0 0f   loopne 0x13
   4:   1f  (bad)
   5:   84 00   test   %al,(%rax)
   7:   00 00   add%al,(%rax)
   9:   00 00   add%al,(%rax)
   b:   4c 89 f7mov%r14,%rdi
   e:   e8 88 2f 0c 03  callq  0x30c2f9b
  13:   48 8b 45 d0 mov-0x30(%rbp),%rax
  17:   4c 89 e6mov%r12,%rsi
  1a:   48 8b b8 88 00 00 00mov0x88(%rax),%rdi
  21:   e8 85 c7 ff ff  callq  0xc7ab
  26:   e9 90 fe ff ff  jmpq   0xfebb
  2b:*  0f 0b   ud2 <-- trapping instruction
  2d:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
  33:   48 89 dfmov%rbx,%rdi
  36:   e8 90 e9 f9 ff  callq  0xfff9e9cb
  3b:   84 c0   test   %al,%al
  3d:   0f  .byte 0xf
  3e:   85  .byte 0x85
  3f:   be  .byte 0xbe
...

Code starting with the faulting instruction
===
   0:   0f 0b   ud2
   2:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
   8:   48 89 dfmov%rbx,%rdi
   b:   e8 90 e9 f9 ff  callq  0xfff9e9a0
  10:   84 c0   test   %al,%al
  12:   0f  .byte 0xf
  13:   85  .byte 0x85
  14:   be  .byte 0xbe
...
[ 2781.571077] RIP do_huge_pmd_numa_page (mm/huge_memory.c:1293 (discriminator 
1))
[ 2781.571077]  RSP 


Thanks,
Sasha

Re: pipe/page fault oddness.

2014-10-01 Thread Rik van Riel
On 10/01/2014 04:20 PM, Linus Torvalds wrote:

> Now, I'll be honest: this patch *migth* just work, but I expect it to
> have some stupid problem. It compiles. I haven't even dared boot it,
> much less try any numa benchmarks that woudln't show anything sane on
> my machine anyway.
> 
> So I'm really sending this patch out in the hope that it will get
> comments, fixup and possibly even testing by people who actually know
> the NUMA balancing code. Rik?  Anybody?

I'll add it to the list. No guarantees I'll be able to get
to it before Linux Con / KVM Forum though. I have a bit of
a backlog to get through this coming week...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 9:18 AM, Linus Torvalds
 wrote:
>
> So I'd really suggest we do exactly that. Get rid of "pte_numa()"
> entirely, get rid of "_PAGE_[BIT_]NUMA" entirely, and instead add a
> "pte_protnone()" helper to check for the "protnone" case (which on x86
> is testing the _PAGE_PROTNONE bit, and on most other architectures is
> just testing that the page has no access rights).
>
> Then we throw away "pte_mknuma()" and "pte_mknonnuma()" entirely,
> because they are brainless sh*t, and we just use
>
> ptent = ptep_modify_prot_start(mm, addr, pte);
> ptent = pte_modify(ptent, newprot);
> ptep_modify_prot_commit(mm, addr, pte, ptent);
>
> reliably instead (where for the mknuma case "newprot" is PROT_NONE,
> and for mknonnuma() it is vma->vm_page_prot. Yes, that means that you
> have to pass in the vma to those functions, but that just makes sense
> anyway.
>
> And if that means that we lose the numa flag on mprotect etc, nobody sane 
> cares.

So here is a *COMPLETELY UNTESTED* and probably seriously buggy first
version of such a patch. It doesn't do the powerpc conversion, so
somebody would need to check that eventually, but aside from that
obvious issue, can people fix this up? Or comment on why it doesn't
work.

Now, I like this because it gets rid of the horrible PAGE_NUMA special
cases, but it really seems to simplify things in general. Lookie here:

 13 files changed, 74 insertions(+), 268 deletions(-)

that's really mainly just the removal of odd and broken numa pte/pmd
helper functions from  that aren't needed any
more because the normal "change protections" functions just DTRT
automatically. Although there are actually a few other cases that got
simpler too, so it's not *just* removal of those _PAGE_NUMA-specific
helpers.

One thing this does *not* remove is the special pte locking rule in
the "change_*_range()" functions: they still take that broken
"prot_numa" argument. HOWEVER, it isn't actually used for any page
table modifications, the only reason for it existing is the hacky
locking issue (see lock_pte_protection(), and the comment about races
with the transhuge accesses).

Now, I'll be honest: this patch *migth* just work, but I expect it to
have some stupid problem. It compiles. I haven't even dared boot it,
much less try any numa benchmarks that woudln't show anything sane on
my machine anyway.

So I'm really sending this patch out in the hope that it will get
comments, fixup and possibly even testing by people who actually know
the NUMA balancing code. Rik?  Anybody?

Linus
 arch/x86/include/asm/pgtable.h   |  31 +--
 arch/x86/include/asm/pgtable_types.h |  27 +-
 arch/x86/mm/gup.c|   4 +-
 include/asm-generic/pgtable.h| 159 ++-
 include/linux/huge_mm.h  |   3 +-
 init/Kconfig |  11 ---
 mm/gup.c |   4 +-
 mm/huge_memory.c |  42 +++--
 mm/memory.c  |  16 ++--
 mm/mempolicy.c   |   2 +-
 mm/migrate.c |   1 -
 mm/mprotect.c|  40 +++--
 mm/pgtable-generic.c |   2 -
 13 files changed, 74 insertions(+), 268 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index aa97a070f09f..0d4ec3a62fed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -299,7 +299,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-   return pmd_clear_flags(pmd, _PAGE_PRESENT);
+   return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -457,8 +457,7 @@ static inline int pte_same(pte_t a, pte_t b)
 
 static inline int pte_present(pte_t a)
 {
-   return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
-  _PAGE_NUMA);
+   return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #define pte_present_nonuma pte_present_nonuma
@@ -473,7 +472,7 @@ static inline bool pte_accessible(struct mm_struct *mm, 
pte_t a)
if (pte_flags(a) & _PAGE_PRESENT)
return true;
 
-   if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+   if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm))
return true;
 
@@ -493,10 +492,26 @@ static inline int pmd_present(pmd_t pmd)
 * the _PAGE_PSE flag will remain set at all times while the
 * _PAGE_PRESENT bit is clear).
 */
-   return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-_PAGE_NUMA);
+   return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Technically these work even when not doing NUMA
+ * balancing, but we don't care and have simpler
+ * "always 

Re: pipe/page fault oddness.

2014-10-01 Thread Rik van Riel
On 10/01/2014 12:18 PM, Linus Torvalds wrote:

> Seriously, why can't we just do this, and throw away all the crap that
> is "numa special case". This would make all the random games in
> change_pte_range() just go away entirely, because the whole NUMA thing
> really wouldn't be a special case for the pte AT ALL any more. All it
> would be is that a pte could be marked PROT_NONE even if the
> vma->vm_flags aren't.

That's what I suggested quite a while back, but IIRC either
Peter or Mel brought up a reason why this was not possible.

Unfortunately I do not remember what it was, just that I
was convinced by it at the time...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
 wrote:
>
> We need to get rid of it, and just make it the same as pte_protnone().
> And then the real protnone is in the vma flags, and if you actually
> ever get to a pte that is marked protnone, you know it's a numa page.

So I'd really suggest we do exactly that. Get rid of "pte_numa()"
entirely, get rid of "_PAGE_[BIT_]NUMA" entirely, and instead add a
"pte_protnone()" helper to check for the "protnone" case (which on x86
is testing the _PAGE_PROTNONE bit, and on most other architectures is
just testing that the page has no access rights).

Then we throw away "pte_mknuma()" and "pte_mknonnuma()" entirely,
because they are brainless sh*t, and we just use

ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
ptep_modify_prot_commit(mm, addr, pte, ptent);

reliably instead (where for the mknuma case "newprot" is PROT_NONE,
and for mknonnuma() it is vma->vm_page_prot. Yes, that means that you
have to pass in the vma to those functions, but that just makes sense
anyway.

And if that means that we lose the numa flag on mprotect etc, nobody sane cares.

Seriously, why can't we just do this, and throw away all the crap that
is "numa special case". This would make all the random games in
change_pte_range() just go away entirely, because the whole NUMA thing
really wouldn't be a special case for the pte AT ALL any more. All it
would be is that a pte could be marked PROT_NONE even if the
vma->vm_flags aren't.

Please, please, please? The current _PAGE_NUMA really is a horrible
horrible thing, and may well be the source of this bug.

The fact that it took DaveJ a long time to trigger his lockup would be
entirely consistent with "you have to split a PROTNONE large page due
to memory pressure", so the problem with our current pte_mknuma() that
Hugh points out looks entirely possible to me.

Now, there may be some reason why it can't happen, but even in the
absense of this bug, I really think that _PAGE_NUMA has been a huge
mistake from day one.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 1:19 AM, Hugh Dickins  wrote:
>
> Irrelevance follows...

Maybe not irrelevant.

> There *appears* to be a risk of hitting the VM_BUG_ON, or with no
> VM_BUG_ON (as in 3.17-rc) pte_mknuma proceeding to add _PAGE_NUMA
> to _PAGE_PROTNONE - making the pte then fail the pte_numa test,
> but pass the pte_special test, hence fail the vm_normal_page test:
> when coming from change_prot_numa serving MPOL_MF_LAZY for mbind.

Ugh, yes. The whole _PAGE_NUMA is still a f*cking mess. I hate it.
Hate it, hate it, hate it.

We need to get rid of it, and just make it the same as pte_protnone().
And then the real protnone is in the vma flags, and if you actually
ever get to a pte that is marked protnone, you know it's a numa page.

Seriously.

I never understood what the objection to that was, but every time I
tell people to do it, they go crazy and think _PAGE_NUMA makes sense.
It doesn't. There's no excuse. Rik, what was your broken excuse again?
Something to do with Powerpc, but it is obviously not true, since
powerpc supports protnone just fine.

Even our own comments are confused, with include/asm-generic/pgtable.h saying:

 * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
 * same bit too).

but no, it's not the same bit.

Can we please just get rid of _PAGE_NUMA. There is no excuse for it.

> However, that would still not explain Dave's endless refaulting;

Why not? You start out with a PROTNONE, trigger shrink_page_list() on
a hugepage,.which calls add_to_swap(), which does
split_huge_page_to_list(), which in turn calls __split_huge_page(),
and that turns (_PAGE_PROTNONE) into (_PAGE_PROTNONE|_PAGE_NUMA),
which you will then fault on forever, because the kernel thinks the
page is present, but not a NUMA page.

IOW, it's *exactly* the same f*cking confusion between _PAGE_NUMA and
_PAGE_PROTNONE I've complained about before.

> Some time wasted on that, but I learnt a valuable debugging technique:
> #undef EINVAL
> #define EINVAL __LINE__

Wow. There's a certain beauty in the pure crazyness.

However, be careful: our IS_ERR() handling knows that error numbers
are < 4096. So on a big file, and error pointers, that doesn't work
reliably.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Hugh Dickins
On Tue, 30 Sep 2014, Linus Torvalds wrote:
> On Tue, Sep 30, 2014 at 11:20 AM, Dave Jones  wrote:
> >
> > page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter 
> > error_code=0x2
> 
> Interesting. "error_code" in particular. The value "2" means that the
> CPU thinks that the page is not present (bit zero is clear).
> 
> (That "address" is useless - it's tried to turn a user address into a
> kernel symbol, and the percpu symbols are zero-based, so it picks the
> last of them. The "ip" is useless too, since it doesn't give the
> offset)
> 
> So the CPU thinks it's a write to a not-present page, which means that
> _PAGE_PRESENT bit is clear.
> 
> Now the *kernel* thinks a page is present not just if _PAGE_PRESENT is
> set, but also if _PAGE_PROTNONE or _PAGE_NUMA are set. Sadly, your
> trace is not very useful, because inlining has caused pretty much all
> the cases to be in "handle_mm_fault()", so the trace doesn't really
> tell which path this all takes.
> 
> But we can still do *some* analysis on the trace: do_wp_page()
> shouldn't have been inlined, so it would have shown up in the trace if
> it had been called. So I think we can be pretty confident that the
> ptep_set_access_flags() we see is the one from handle_pte_fault().
> 
> And if that is the case, then we know that "pte_present()" is indeed
> true as far a the kernel is concerned. So with _PAGE_PRESENT not being
> set (based on the error code), we know that _PAGE_PROTNONE must be
> set, otherwise we'd have triggered the pte_numa() check and exited
> through do_numa_page().
> 
> So it smells like we have a PROT_NONE VM area (at least the paeg table
> entries imply that). But "access_error()" should have flagged that (it
> checks "vma->vm_flags & VM_WRITE"). How do we have a page table entry
> marked _PAGE_PROTNONE, but VM_WRITE set in the vma?
> 
> Or, possibly, we have some confusion about the page tables themselves
> (corruption, wrong %cr3 value, whatever), explaining why the CPU
> thinks one thing, but our software page table walker thinks another.
> 
> I'm not seeing how this all happens. But I'm adding Kirill to the cc,
> since he might see something I missed, and he touched some of this
> code last ("tag, you're it").
> 
> Kirill: the thread is on lkml, but basically it boils down to the
> second byte write in fault_in_pages_writeable() faulting forever,
> despite handle_mm_fault() apparently thinking that everything is fine.
> 
> Also adding Hugh Dickins, just because the more people who know this
> code that are involved, the better.

I've tried, but failed to explain it.

I think it's likely related to the VM_BUG_ON(!(val & _PAGE_PRESENT))
which linux-next has in pte_mknuma(), which Sasha Levin first reported
hitting in https://lkml.org/lkml/2014/8/26/869 (a resumption of the
"mm: BUG in unmap_page_range" thread, though its subject bug is fixed).

Mel and I gave it a lot of thought, but that too remains unexplained.
Sasha could reproduce it fairly easily on linux-next, but could not
reproduce it on 3.17-rc4 (plus the VM_BUG_ON); maybe Dave is doing
something different enough to get it on 3.17-rc7.

I say they're likely related because both could be explained if
there's some way in which a PROTNONE pte can get left behind after
the vma has been mprotected back from PROT_NONE to read-writable.
But we cannot see how (even when racing with page migration).

Irrelevance follows...

There *appears* to be a risk of hitting the VM_BUG_ON, or with no
VM_BUG_ON (as in 3.17-rc) pte_mknuma proceeding to add _PAGE_NUMA
to _PAGE_PROTNONE - making the pte then fail the pte_numa test,
but pass the pte_special test, hence fail the vm_normal_page test:
when coming from change_prot_numa serving MPOL_MF_LAZY for mbind.

However, that would still not explain Dave's endless refaulting;
though I was reminded to send you a patch to fix it, except that
when I came to test the fix, I could not produce the problem, and
eventually discovered a720094ded8c ("mm: mempolicy: Hide MPOL_NOOP
and MPOL_MF_LAZY from userspace for now") - that call to
change_prot_numa is still just dead code, so we're still safe from
its use on PROT_NONE areas (which task_numa_work carefully avoids).

Some time wasted on that, but I learnt a valuable debugging technique:
#undef EINVAL
#define EINVAL __LINE__

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Hugh Dickins
On Tue, 30 Sep 2014, Linus Torvalds wrote:
 On Tue, Sep 30, 2014 at 11:20 AM, Dave Jones da...@redhat.com wrote:
 
  page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter 
  error_code=0x2
 
 Interesting. error_code in particular. The value 2 means that the
 CPU thinks that the page is not present (bit zero is clear).
 
 (That address is useless - it's tried to turn a user address into a
 kernel symbol, and the percpu symbols are zero-based, so it picks the
 last of them. The ip is useless too, since it doesn't give the
 offset)
 
 So the CPU thinks it's a write to a not-present page, which means that
 _PAGE_PRESENT bit is clear.
 
 Now the *kernel* thinks a page is present not just if _PAGE_PRESENT is
 set, but also if _PAGE_PROTNONE or _PAGE_NUMA are set. Sadly, your
 trace is not very useful, because inlining has caused pretty much all
 the cases to be in handle_mm_fault(), so the trace doesn't really
 tell which path this all takes.
 
 But we can still do *some* analysis on the trace: do_wp_page()
 shouldn't have been inlined, so it would have shown up in the trace if
 it had been called. So I think we can be pretty confident that the
 ptep_set_access_flags() we see is the one from handle_pte_fault().
 
 And if that is the case, then we know that pte_present() is indeed
 true as far a the kernel is concerned. So with _PAGE_PRESENT not being
 set (based on the error code), we know that _PAGE_PROTNONE must be
 set, otherwise we'd have triggered the pte_numa() check and exited
 through do_numa_page().
 
 So it smells like we have a PROT_NONE VM area (at least the paeg table
 entries imply that). But access_error() should have flagged that (it
 checks vma-vm_flags  VM_WRITE). How do we have a page table entry
 marked _PAGE_PROTNONE, but VM_WRITE set in the vma?
 
 Or, possibly, we have some confusion about the page tables themselves
 (corruption, wrong %cr3 value, whatever), explaining why the CPU
 thinks one thing, but our software page table walker thinks another.
 
 I'm not seeing how this all happens. But I'm adding Kirill to the cc,
 since he might see something I missed, and he touched some of this
 code last (tag, you're it).
 
 Kirill: the thread is on lkml, but basically it boils down to the
 second byte write in fault_in_pages_writeable() faulting forever,
 despite handle_mm_fault() apparently thinking that everything is fine.
 
 Also adding Hugh Dickins, just because the more people who know this
 code that are involved, the better.

I've tried, but failed to explain it.

I think it's likely related to the VM_BUG_ON(!(val  _PAGE_PRESENT))
which linux-next has in pte_mknuma(), which Sasha Levin first reported
hitting in https://lkml.org/lkml/2014/8/26/869 (a resumption of the
mm: BUG in unmap_page_range thread, though its subject bug is fixed).

Mel and I gave it a lot of thought, but that too remains unexplained.
Sasha could reproduce it fairly easily on linux-next, but could not
reproduce it on 3.17-rc4 (plus the VM_BUG_ON); maybe Dave is doing
something different enough to get it on 3.17-rc7.

I say they're likely related because both could be explained if
there's some way in which a PROTNONE pte can get left behind after
the vma has been mprotected back from PROT_NONE to read-writable.
But we cannot see how (even when racing with page migration).

Irrelevance follows...

There *appears* to be a risk of hitting the VM_BUG_ON, or with no
VM_BUG_ON (as in 3.17-rc) pte_mknuma proceeding to add _PAGE_NUMA
to _PAGE_PROTNONE - making the pte then fail the pte_numa test,
but pass the pte_special test, hence fail the vm_normal_page test:
when coming from change_prot_numa serving MPOL_MF_LAZY for mbind.

However, that would still not explain Dave's endless refaulting;
though I was reminded to send you a patch to fix it, except that
when I came to test the fix, I could not produce the problem, and
eventually discovered a720094ded8c (mm: mempolicy: Hide MPOL_NOOP
and MPOL_MF_LAZY from userspace for now) - that call to
change_prot_numa is still just dead code, so we're still safe from
its use on PROT_NONE areas (which task_numa_work carefully avoids).

Some time wasted on that, but I learnt a valuable debugging technique:
#undef EINVAL
#define EINVAL __LINE__

Hugh
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 1:19 AM, Hugh Dickins hu...@google.com wrote:

 Irrelevance follows...

Maybe not irrelevant.

 There *appears* to be a risk of hitting the VM_BUG_ON, or with no
 VM_BUG_ON (as in 3.17-rc) pte_mknuma proceeding to add _PAGE_NUMA
 to _PAGE_PROTNONE - making the pte then fail the pte_numa test,
 but pass the pte_special test, hence fail the vm_normal_page test:
 when coming from change_prot_numa serving MPOL_MF_LAZY for mbind.

Ugh, yes. The whole _PAGE_NUMA is still a f*cking mess. I hate it.
Hate it, hate it, hate it.

We need to get rid of it, and just make it the same as pte_protnone().
And then the real protnone is in the vma flags, and if you actually
ever get to a pte that is marked protnone, you know it's a numa page.

Seriously.

I never understood what the objection to that was, but every time I
tell people to do it, they go crazy and think _PAGE_NUMA makes sense.
It doesn't. There's no excuse. Rik, what was your broken excuse again?
Something to do with Powerpc, but it is obviously not true, since
powerpc supports protnone just fine.

Even our own comments are confused, with include/asm-generic/pgtable.h saying:

 * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
 * same bit too).

but no, it's not the same bit.

Can we please just get rid of _PAGE_NUMA. There is no excuse for it.

 However, that would still not explain Dave's endless refaulting;

Why not? You start out with a PROTNONE, trigger shrink_page_list() on
a hugepage,.which calls add_to_swap(), which does
split_huge_page_to_list(), which in turn calls __split_huge_page(),
and that turns (_PAGE_PROTNONE) into (_PAGE_PROTNONE|_PAGE_NUMA),
which you will then fault on forever, because the kernel thinks the
page is present, but not a NUMA page.

IOW, it's *exactly* the same f*cking confusion between _PAGE_NUMA and
_PAGE_PROTNONE I've complained about before.

 Some time wasted on that, but I learnt a valuable debugging technique:
 #undef EINVAL
 #define EINVAL __LINE__

Wow. There's a certain beauty in the pure crazyness.

However, be careful: our IS_ERR() handling knows that error numbers
are  4096. So on a big file, and error pointers, that doesn't work
reliably.

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 9:01 AM, Linus Torvalds
torva...@linux-foundation.org wrote:

 We need to get rid of it, and just make it the same as pte_protnone().
 And then the real protnone is in the vma flags, and if you actually
 ever get to a pte that is marked protnone, you know it's a numa page.

So I'd really suggest we do exactly that. Get rid of pte_numa()
entirely, get rid of _PAGE_[BIT_]NUMA entirely, and instead add a
pte_protnone() helper to check for the protnone case (which on x86
is testing the _PAGE_PROTNONE bit, and on most other architectures is
just testing that the page has no access rights).

Then we throw away pte_mknuma() and pte_mknonnuma() entirely,
because they are brainless sh*t, and we just use

ptent = ptep_modify_prot_start(mm, addr, pte);
ptent = pte_modify(ptent, newprot);
ptep_modify_prot_commit(mm, addr, pte, ptent);

reliably instead (where for the mknuma case newprot is PROT_NONE,
and for mknonnuma() it is vma-vm_page_prot. Yes, that means that you
have to pass in the vma to those functions, but that just makes sense
anyway.

And if that means that we lose the numa flag on mprotect etc, nobody sane cares.

Seriously, why can't we just do this, and throw away all the crap that
is numa special case. This would make all the random games in
change_pte_range() just go away entirely, because the whole NUMA thing
really wouldn't be a special case for the pte AT ALL any more. All it
would be is that a pte could be marked PROT_NONE even if the
vma-vm_flags aren't.

Please, please, please? The current _PAGE_NUMA really is a horrible
horrible thing, and may well be the source of this bug.

The fact that it took DaveJ a long time to trigger his lockup would be
entirely consistent with you have to split a PROTNONE large page due
to memory pressure, so the problem with our current pte_mknuma() that
Hugh points out looks entirely possible to me.

Now, there may be some reason why it can't happen, but even in the
absense of this bug, I really think that _PAGE_NUMA has been a huge
mistake from day one.

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Rik van Riel
On 10/01/2014 12:18 PM, Linus Torvalds wrote:

 Seriously, why can't we just do this, and throw away all the crap that
 is numa special case. This would make all the random games in
 change_pte_range() just go away entirely, because the whole NUMA thing
 really wouldn't be a special case for the pte AT ALL any more. All it
 would be is that a pte could be marked PROT_NONE even if the
 vma-vm_flags aren't.

That's what I suggested quite a while back, but IIRC either
Peter or Mel brought up a reason why this was not possible.

Unfortunately I do not remember what it was, just that I
was convinced by it at the time...
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 9:18 AM, Linus Torvalds
torva...@linux-foundation.org wrote:

 So I'd really suggest we do exactly that. Get rid of pte_numa()
 entirely, get rid of _PAGE_[BIT_]NUMA entirely, and instead add a
 pte_protnone() helper to check for the protnone case (which on x86
 is testing the _PAGE_PROTNONE bit, and on most other architectures is
 just testing that the page has no access rights).

 Then we throw away pte_mknuma() and pte_mknonnuma() entirely,
 because they are brainless sh*t, and we just use

 ptent = ptep_modify_prot_start(mm, addr, pte);
 ptent = pte_modify(ptent, newprot);
 ptep_modify_prot_commit(mm, addr, pte, ptent);

 reliably instead (where for the mknuma case newprot is PROT_NONE,
 and for mknonnuma() it is vma-vm_page_prot. Yes, that means that you
 have to pass in the vma to those functions, but that just makes sense
 anyway.

 And if that means that we lose the numa flag on mprotect etc, nobody sane 
 cares.

So here is a *COMPLETELY UNTESTED* and probably seriously buggy first
version of such a patch. It doesn't do the powerpc conversion, so
somebody would need to check that eventually, but aside from that
obvious issue, can people fix this up? Or comment on why it doesn't
work.

Now, I like this because it gets rid of the horrible PAGE_NUMA special
cases, but it really seems to simplify things in general. Lookie here:

 13 files changed, 74 insertions(+), 268 deletions(-)

that's really mainly just the removal of odd and broken numa pte/pmd
helper functions from asm-generic/pgtable.h that aren't needed any
more because the normal change protections functions just DTRT
automatically. Although there are actually a few other cases that got
simpler too, so it's not *just* removal of those _PAGE_NUMA-specific
helpers.

One thing this does *not* remove is the special pte locking rule in
the change_*_range() functions: they still take that broken
prot_numa argument. HOWEVER, it isn't actually used for any page
table modifications, the only reason for it existing is the hacky
locking issue (see lock_pte_protection(), and the comment about races
with the transhuge accesses).

Now, I'll be honest: this patch *migth* just work, but I expect it to
have some stupid problem. It compiles. I haven't even dared boot it,
much less try any numa benchmarks that woudln't show anything sane on
my machine anyway.

So I'm really sending this patch out in the hope that it will get
comments, fixup and possibly even testing by people who actually know
the NUMA balancing code. Rik?  Anybody?

Linus
 arch/x86/include/asm/pgtable.h   |  31 +--
 arch/x86/include/asm/pgtable_types.h |  27 +-
 arch/x86/mm/gup.c|   4 +-
 include/asm-generic/pgtable.h| 159 ++-
 include/linux/huge_mm.h  |   3 +-
 init/Kconfig |  11 ---
 mm/gup.c |   4 +-
 mm/huge_memory.c |  42 +++--
 mm/memory.c  |  16 ++--
 mm/mempolicy.c   |   2 +-
 mm/migrate.c |   1 -
 mm/mprotect.c|  40 +++--
 mm/pgtable-generic.c |   2 -
 13 files changed, 74 insertions(+), 268 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index aa97a070f09f..0d4ec3a62fed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -299,7 +299,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
 
 static inline pmd_t pmd_mknotpresent(pmd_t pmd)
 {
-   return pmd_clear_flags(pmd, _PAGE_PRESENT);
+   return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -457,8 +457,7 @@ static inline int pte_same(pte_t a, pte_t b)
 
 static inline int pte_present(pte_t a)
 {
-   return pte_flags(a)  (_PAGE_PRESENT | _PAGE_PROTNONE |
-  _PAGE_NUMA);
+   return pte_flags(a)  (_PAGE_PRESENT | _PAGE_PROTNONE);
 }
 
 #define pte_present_nonuma pte_present_nonuma
@@ -473,7 +472,7 @@ static inline bool pte_accessible(struct mm_struct *mm, 
pte_t a)
if (pte_flags(a)  _PAGE_PRESENT)
return true;
 
-   if ((pte_flags(a)  (_PAGE_PROTNONE | _PAGE_NUMA)) 
+   if ((pte_flags(a)  _PAGE_PROTNONE) 
mm_tlb_flush_pending(mm))
return true;
 
@@ -493,10 +492,26 @@ static inline int pmd_present(pmd_t pmd)
 * the _PAGE_PSE flag will remain set at all times while the
 * _PAGE_PRESENT bit is clear).
 */
-   return pmd_flags(pmd)  (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
-_PAGE_NUMA);
+   return pmd_flags(pmd)  (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Technically these work even when not doing NUMA
+ * balancing, but we don't care and have simpler
+ * always false 

Re: pipe/page fault oddness.

2014-10-01 Thread Rik van Riel
On 10/01/2014 04:20 PM, Linus Torvalds wrote:

 Now, I'll be honest: this patch *migth* just work, but I expect it to
 have some stupid problem. It compiles. I haven't even dared boot it,
 much less try any numa benchmarks that woudln't show anything sane on
 my machine anyway.
 
 So I'm really sending this patch out in the hope that it will get
 comments, fixup and possibly even testing by people who actually know
 the NUMA balancing code. Rik?  Anybody?

I'll add it to the list. No guarantees I'll be able to get
to it before Linux Con / KVM Forum though. I have a bit of
a backlog to get through this coming week...

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-10-01 Thread Sasha Levin
On 10/01/2014 04:20 PM, Linus Torvalds wrote:
 So I'm really sending this patch out in the hope that it will get
 comments, fixup and possibly even testing by people who actually know
 the NUMA balancing code. Rik?  Anybody?

Hi Linus,

I've tried this patch on the same configuration that was triggering
the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
ran fine for ~20 minutes before exploding with:

[ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
[ 2781.566953] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
[ 2781.568054] Dumping ftrace buffer:
[ 2781.568826](ftrace buffer empty)
[ 2781.569392] Modules linked in:
[ 2781.569909] CPU: 61 PID: 13111 Comm: trinity-c61 Not tainted 
3.17.0-rc7-sasha-00040-g65e1cb2 #1259
[ 2781.571077] task: 88050ba8 ti: 880418ecc000 task.ti: 
880418ecc000
[ 2781.571077] RIP: do_huge_pmd_numa_page (mm/huge_memory.c:1293 (discriminator 
1))
[ 2781.571077] RSP: :880418ecfc60  EFLAGS: 00010246
[ 2781.571077] RAX: ea0074c6 RBX: ea0074c6 RCX: 001d318009e0
[ 2781.571077] RDX: ea00 RSI: b5706ef3 RDI: 001d318009e0
[ 2781.571077] RBP: 880418ecfcc8 R08: 0038 R09: 0001
[ 2781.571077] R10: 0038 R11: 0001 R12: 8804f9b52000
[ 2781.571077] R13: 001d318009e0 R14: 880508a1f840 R15: 0028
[ 2781.571077] FS:  7f5502fc9700() GS:881d77e0() 
knlGS:
[ 2781.571077] CS:  0010 DS:  ES:  CR0: 80050033
[ 2781.571077] CR2:  CR3: 0004bfac4000 CR4: 06a0
[ 2781.571077] Stack:
[ 2781.571077]  880418ecfc98 0282 88050ba8 
000b
[ 2781.571077]  88060d2ab000 8806001d  
881d30b3ec00
[ 2781.571077]   881d30b3ec00 88060d2ab000 
0100
[ 2781.571077] Call Trace:
[ 2781.571077] handle_mm_fault (mm/memory.c:3304 mm/memory.c:3368)
[ 2781.571077] __do_page_fault (arch/x86/mm/fault.c:1231)
[ 2781.571077] ? kvm_clock_read (./arch/x86/include/asm/preempt.h:90 
arch/x86/kernel/kvmclock.c:86)
[ 2781.571077] ? sched_clock (./arch/x86/include/asm/paravirt.h:192 
arch/x86/kernel/tsc.c:304)
[ 2781.571077] ? sched_clock_local (kernel/sched/clock.c:214)
[ 2781.571077] ? context_tracking_user_exit (kernel/context_tracking.c:184)
[ 2781.571077] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
[ 2781.571077] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
(discriminator 8))
[ 2781.571077] trace_do_page_fault (arch/x86/mm/fault.c:1314 
include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
[ 2781.571077] do_async_page_fault (arch/x86/kernel/kvm.c:279)
[ 2781.571077] async_page_fault (arch/x86/kernel/entry_64.S:1314)
[ 2781.571077] ? copy_user_generic_unrolled (arch/x86/lib/copy_user_64.S:166)
[ 2781.571077] ? sys32_mmap (arch/x86/ia32/sys_ia32.c:159)
[ 2781.571077] ia32_do_call (arch/x86/ia32/ia32entry.S:430)
[ 2781.571077] Code: b4 eb e0 0f 1f 84 00 00 00 00 00 4c 89 f7 e8 88 2f 0c 03 
48 8b 45 d0 4c 89 e6 48 8b b8 88 00 00 00 e8 85 c7 ff ff e9 90 fe ff ff 0f 0b 
66 0f 1f 44 00 00 48 89 df e8 90 e9 f9 ff 84 c0 0f 85 be
All code

   0:   b4 eb   mov$0xeb,%ah
   2:   e0 0f   loopne 0x13
   4:   1f  (bad)
   5:   84 00   test   %al,(%rax)
   7:   00 00   add%al,(%rax)
   9:   00 00   add%al,(%rax)
   b:   4c 89 f7mov%r14,%rdi
   e:   e8 88 2f 0c 03  callq  0x30c2f9b
  13:   48 8b 45 d0 mov-0x30(%rbp),%rax
  17:   4c 89 e6mov%r12,%rsi
  1a:   48 8b b8 88 00 00 00mov0x88(%rax),%rdi
  21:   e8 85 c7 ff ff  callq  0xc7ab
  26:   e9 90 fe ff ff  jmpq   0xfebb
  2b:*  0f 0b   ud2 -- trapping instruction
  2d:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
  33:   48 89 dfmov%rbx,%rdi
  36:   e8 90 e9 f9 ff  callq  0xfff9e9cb
  3b:   84 c0   test   %al,%al
  3d:   0f  .byte 0xf
  3e:   85  .byte 0x85
  3f:   be  .byte 0xbe
...

Code starting with the faulting instruction
===
   0:   0f 0b   ud2
   2:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
   8:   48 89 dfmov%rbx,%rdi
   b:   e8 90 e9 f9 ff  callq  0xfff9e9a0
  10:   84 c0   test   %al,%al
  12:   0f  .byte 0xf
  13:   85  .byte 0x85
  14:   be  .byte 0xbe
...
[ 2781.571077] RIP do_huge_pmd_numa_page (mm/huge_memory.c:1293 (discriminator 
1))
[ 2781.571077]  RSP 880418ecfc60



Re: pipe/page fault oddness.

2014-10-01 Thread Chuck Ebbert
On Wed, 01 Oct 2014 18:08:30 -0400
Sasha Levin sasha.le...@oracle.com wrote:

 On 10/01/2014 04:20 PM, Linus Torvalds wrote:
  So I'm really sending this patch out in the hope that it will get
  comments, fixup and possibly even testing by people who actually know
  the NUMA balancing code. Rik?  Anybody?
 
 Hi Linus,
 
 I've tried this patch on the same configuration that was triggering
 the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
 ran fine for ~20 minutes before exploding with:
 
 [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!

That's:
BUG_ON(is_huge_zero_page(page));

Can you change your scripts to show the source code line when
the error is a BUG_ON()? The machine code disassembly after the
oops message doesn't really help.

 [ 2781.566953] invalid opcode:  [#1] PREEMPT SMP DEBUG_PAGEALLOC
 [ 2781.568054] Dumping ftrace buffer:
 [ 2781.568826](ftrace buffer empty)
 [ 2781.569392] Modules linked in:
 [ 2781.569909] CPU: 61 PID: 13111 Comm: trinity-c61 Not tainted 
 3.17.0-rc7-sasha-00040-g65e1cb2 #1259
 [ 2781.571077] task: 88050ba8 ti: 880418ecc000 task.ti: 
 880418ecc000
 [ 2781.571077] RIP: do_huge_pmd_numa_page (mm/huge_memory.c:1293 
 (discriminator 1))
 [ 2781.571077] RSP: :880418ecfc60  EFLAGS: 00010246
 [ 2781.571077] RAX: ea0074c6 RBX: ea0074c6 RCX: 
 001d318009e0
 [ 2781.571077] RDX: ea00 RSI: b5706ef3 RDI: 
 001d318009e0
 [ 2781.571077] RBP: 880418ecfcc8 R08: 0038 R09: 
 0001
 [ 2781.571077] R10: 0038 R11: 0001 R12: 
 8804f9b52000
 [ 2781.571077] R13: 001d318009e0 R14: 880508a1f840 R15: 
 0028
 [ 2781.571077] FS:  7f5502fc9700() GS:881d77e0() 
 knlGS:
 [ 2781.571077] CS:  0010 DS:  ES:  CR0: 80050033
 [ 2781.571077] CR2:  CR3: 0004bfac4000 CR4: 
 06a0
 [ 2781.571077] Stack:
 [ 2781.571077]  880418ecfc98 0282 88050ba8 
 000b
 [ 2781.571077]  88060d2ab000 8806001d  
 881d30b3ec00
 [ 2781.571077]   881d30b3ec00 88060d2ab000 
 0100
 [ 2781.571077] Call Trace:
 [ 2781.571077] handle_mm_fault (mm/memory.c:3304 mm/memory.c:3368)
 [ 2781.571077] __do_page_fault (arch/x86/mm/fault.c:1231)
 [ 2781.571077] ? kvm_clock_read (./arch/x86/include/asm/preempt.h:90 
 arch/x86/kernel/kvmclock.c:86)
 [ 2781.571077] ? sched_clock (./arch/x86/include/asm/paravirt.h:192 
 arch/x86/kernel/tsc.c:304)
 [ 2781.571077] ? sched_clock_local (kernel/sched/clock.c:214)
 [ 2781.571077] ? context_tracking_user_exit (kernel/context_tracking.c:184)
 [ 2781.571077] ? __this_cpu_preempt_check (lib/smp_processor_id.c:63)
 [ 2781.571077] ? trace_hardirqs_off_caller (kernel/locking/lockdep.c:2641 
 (discriminator 8))
 [ 2781.571077] trace_do_page_fault (arch/x86/mm/fault.c:1314 
 include/linux/jump_label.h:115 include/linux/context_tracking_state.h:27 
 include/linux/context_tracking.h:45 arch/x86/mm/fault.c:1315)
 [ 2781.571077] do_async_page_fault (arch/x86/kernel/kvm.c:279)
 [ 2781.571077] async_page_fault (arch/x86/kernel/entry_64.S:1314)
 [ 2781.571077] ? copy_user_generic_unrolled (arch/x86/lib/copy_user_64.S:166)
 [ 2781.571077] ? sys32_mmap (arch/x86/ia32/sys_ia32.c:159)
 [ 2781.571077] ia32_do_call (arch/x86/ia32/ia32entry.S:430)
 [ 2781.571077] Code: b4 eb e0 0f 1f 84 00 00 00 00 00 4c 89 f7 e8 88 2f 0c 03 
 48 8b 45 d0 4c 89 e6 48 8b b8 88 00 00 00 e8 85 c7 ff ff e9 90 fe ff ff 0f 
 0b 66 0f 1f 44 00 00 48 89 df e8 90 e9 f9 ff 84 c0 0f 85 be
 All code
 
0: b4 eb   mov$0xeb,%ah
2: e0 0f   loopne 0x13
4: 1f  (bad)
5: 84 00   test   %al,(%rax)
7: 00 00   add%al,(%rax)
9: 00 00   add%al,(%rax)
b: 4c 89 f7mov%r14,%rdi
e: e8 88 2f 0c 03  callq  0x30c2f9b
   13: 48 8b 45 d0 mov-0x30(%rbp),%rax
   17: 4c 89 e6mov%r12,%rsi
   1a: 48 8b b8 88 00 00 00mov0x88(%rax),%rdi
   21: e8 85 c7 ff ff  callq  0xc7ab
   26: e9 90 fe ff ff  jmpq   0xfebb
   2b:*0f 0b   ud2 -- trapping instruction
   2d: 66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
   33: 48 89 dfmov%rbx,%rdi
   36: e8 90 e9 f9 ff  callq  0xfff9e9cb
   3b: 84 c0   test   %al,%al
   3d: 0f  .byte 0xf
   3e: 85  .byte 0x85
   3f: be  .byte 0xbe
   ...
 
 Code starting with the faulting instruction
 ===
0: 0f 0b   ud2
2: 66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
8: 48 89 dfmov%rbx,%rdi
b: e8 90 e9 f9 

Re: pipe/page fault oddness.

2014-10-01 Thread Linus Torvalds
On Wed, Oct 1, 2014 at 3:08 PM, Sasha Levin sasha.le...@oracle.com wrote:

 I've tried this patch on the same configuration that was triggering
 the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
 ran fine for ~20 minutes before exploding with:

Well, that's somewhat encouraging. I didn't expect it to be perfect.

That said, ran fine isn't necessarily the same thing as worked.
Who knows how buggy it was without showing overt symptoms until the
BUG_ON() triggered. But hey, I'll be optimistic.

 [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!

So that's

BUG_ON(is_huge_zero_page(page));

and the reason is trivial: the old code used to have a magical special
case for the zero-page hugepage (see change_huge_pmd()) and I got rid
of that (because now it's just about setting protections, and the
zero-page hugepage is in no way special.

So I think the solution is equally trivial: just accept that the
zero-page can happen, and ignore it (just un-numa it).

Appended is a incremental diff on top of the previous one. Even less
tested than the last case, but I think you get the idea if it doesn't
work as-is.

 Linus
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 14de54af6c38..fc33952d59c4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1290,7 +1290,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
}
 
page = pmd_page(pmd);
-   BUG_ON(is_huge_zero_page(page));
+   if (is_huge_zero_page(page))
+   goto huge_zero_page;
+
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
@@ -1381,6 +1383,11 @@ out:
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
 
return 0;
+huge_zero_page:
+   pmd = pmd_modify(pmd, vma-vm_page_prot);
+   set_pmd_at(mm, haddr, pmdp, pmd);
+   update_mmu_cache_pmd(vma, addr, pmdp);
+   goto out_unlock;
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,


Re: pipe/page fault oddness.

2014-10-01 Thread Sasha Levin
On 10/01/2014 06:28 PM, Chuck Ebbert wrote:
 On Wed, 01 Oct 2014 18:08:30 -0400
 Sasha Levin sasha.le...@oracle.com wrote:
 
  On 10/01/2014 04:20 PM, Linus Torvalds wrote:
   So I'm really sending this patch out in the hope that it will get
   comments, fixup and possibly even testing by people who actually know
   the NUMA balancing code. Rik?  Anybody?
  
  Hi Linus,
  
  I've tried this patch on the same configuration that was triggering
  the VM_BUG_ON that Hugh mentioned previously. Surprisingly enough it
  ran fine for ~20 minutes before exploding with:
  
  [ 2781.566206] kernel BUG at mm/huge_memory.c:1293!
 That's:
   BUG_ON(is_huge_zero_page(page));
 
 Can you change your scripts to show the source code line when
 the error is a BUG_ON()? The machine code disassembly after the
 oops message doesn't really help.
 

Hum? The source code line is the first line in the trace:

[ 2781.566206] kernel BUG at mm/huge_memory.c:1293!


Thanks,
Sasha
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 11:20 AM, Dave Jones  wrote:
>
> page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter 
> error_code=0x2

Interesting. "error_code" in particular. The value "2" means that the
CPU thinks that the page is not present (bit zero is clear).

(That "address" is useless - it's tried to turn a user address into a
kernel symbol, and the percpu symbols are zero-based, so it picks the
last of them. The "ip" is useless too, since it doesn't give the
offset)

So the CPU thinks it's a write to a not-present page, which means that
_PAGE_PRESENT bit is clear.

Now the *kernel* thinks a page is present not just if _PAGE_PRESENT is
set, but also if _PAGE_PROTNONE or _PAGE_NUMA are set. Sadly, your
trace is not very useful, because inlining has caused pretty much all
the cases to be in "handle_mm_fault()", so the trace doesn't really
tell which path this all takes.

But we can still do *some* analysis on the trace: do_wp_page()
shouldn't have been inlined, so it would have shown up in the trace if
it had been called. So I think we can be pretty confident that the
ptep_set_access_flags() we see is the one from handle_pte_fault().

And if that is the case, then we know that "pte_present()" is indeed
true as far a the kernel is concerned. So with _PAGE_PRESENT not being
set (based on the error code), we know that _PAGE_PROTNONE must be
set, otherwise we'd have triggered the pte_numa() check and exited
through do_numa_page().

So it smells like we have a PROT_NONE VM area (at least the paeg table
entries imply that). But "access_error()" should have flagged that (it
checks "vma->vm_flags & VM_WRITE"). How do we have a page table entry
marked _PAGE_PROTNONE, but VM_WRITE set in the vma?

Or, possibly, we have some confusion about the page tables themselves
(corruption, wrong %cr3 value, whatever), explaining why the CPU
thinks one thing, but our software page table walker thinks another.

I'm not seeing how this all happens. But I'm adding Kirill to the cc,
since he might see something I missed, and he touched some of this
code last ("tag, you're it").

Kirill: the thread is on lkml, but basically it boils down to the
second byte write in fault_in_pages_writeable() faulting forever,
despite handle_mm_fault() apparently thinking that everything is fine.

Also adding Hugh Dickins, just because the more people who know this
code that are involved, the better.

  Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 09:46:45AM -0700, Linus Torvalds wrote:
 > On Tue, Sep 30, 2014 at 9:40 AM, Dave Jones  wrote:
 > >
 > > ah, echo 0 > tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
 > > Never mind.  Subsequent traces all looking the same, minus the tracing
 > > junk though.
 > 
 > Yeah, looks like that copy_page_to_iter+0x3b3 is always there.
 > 
 > > Is there some way I can figure out what address it's faulting on ?
 > > I wonder if that might give some clues.
 > 
 > Could be useful. There's a trace_page_fault_kernel() entry that should
 > give it to you, along with error code.

Hm.

page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter error_code=0x2

over and over..

Dave


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:40 AM, Dave Jones  wrote:
>
> ah, echo 0 > tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
> Never mind.  Subsequent traces all looking the same, minus the tracing
> junk though.

Yeah, looks like that copy_page_to_iter+0x3b3 is always there.

> Is there some way I can figure out what address it's faulting on ?
> I wonder if that might give some clues.

Could be useful. There's a trace_page_fault_kernel() entry that should
give it to you, along with error code.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 12:22:01PM -0400, Dave Jones wrote:
 
 > [] ? trace_graph_entry+0x123/0x250
 > [] ? trace_buffer_lock_reserve+0x1e/0x60
 > [] ? handle_mm_fault+0x3a7/0xcd0
 > [] ? trace_hardirqs_on+0xd/0x10
 > [] ? trace_graph_entry+0x108/0x250
 > [] ? __do_page_fault+0x234/0x600
 > [] ? prepare_ftrace_return+0x73/0xe0
 > [] ? down_write_nested+0xc0/0xc0
 > [] ? get_parent_ip+0xd/0x50
 > [] ? __do_page_fault+0x234/0x600
 > [] ? prepare_ftrace_return+0x73/0xe0
 > [] ? ftrace_graph_caller+0x5a/0x85
 > [] ? trace_hardirqs_on_thunk+0x3a/0x3f
 > [] ? context_tracking_user_exit+0x67/0x1b0
 > [] ? do_page_fault+0x1e/0x70
 > [] ? trace_hardirqs_off_thunk+0x3a/0x3c
 > [] ? copy_page_to_iter+0x3b3/0x500
 > [] ? pipe_read+0xdf/0x330
 > [] ? pipe_write+0x490/0x490
 > [] ? do_sync_readv_writev+0xa0/0xa0
 > [] ? do_iter_readv_writev+0x78/0xc0
 > [] ? do_readv_writev+0xce/0x280
 > [] ? pipe_write+0x490/0x490
 > [] ? lock_release_holdtime.part.29+0xe6/0x160
 > [] ? get_parent_ip+0xd/0x50
 > [] ? get_parent_ip+0xd/0x50
 > [] ? preempt_count_sub+0x6b/0xf0
 > [] ? vfs_readv+0x39/0x50
 > [] ? SyS_readv+0x5c/0x100
 > [] ? tracesys+0xdd/0xe2
 > 
 > That second one is odd, because I had disabled all the function tracing
 > before doing the sysrq-t, so still seeing all the ftrace stuff on the
 > stack suprised me.

ah, echo 0 > tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
Never mind.  Subsequent traces all looking the same, minus the tracing
junk though.

Is there some way I can figure out what address it's faulting on ?
I wonder if that might give some clues.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:03 AM, Rik van Riel  wrote:
>
> On the other hand, do_wp_page does not seem to do a tlb flush when
> the old page is reused, so CPUs do get rid of inappropriate TLB
> entries. We would have noticed do_wp_page not working right :)

Hmm? do_wp_page() uses the same ptep_set_access_flags(). So it too
used to do the TLB flush before that got removed there..

I do agree that clearly the CPU must *usually* do a TLB flush, or we'd
have noticed the lack immediately. I just worry that there are some
very specific circumstances when it might be missed.

That said, I don't actually believe it's that commit. I would just
like to remove it from the list of suspects.

I'm wondering what else could cause us to take effectively the same
page fault over and over again on what is a very simple instruction:

movb   $0x0,(%rdx)

where handle_mm_fault() doesn't see anything wrong with the page
tables, but the CPU does.

Hmm. We also have that

if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)))
return;

thing. It assumes that the fatal signal will kill the process, but
that's only true if returning to user space. So that looks like a
potential mis-feature (although it should sort itself out *eventually*
when the IO has completed), but the trace Dave had didn't go through
the retry paths.

Anybody see anyting else?

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 09:10:26AM -0700, Linus Torvalds wrote:
 > On Tue, Sep 30, 2014 at 9:05 AM, Dave Jones  wrote:
 > >
 > > I left it spinning overnight in case someone wanted me to probe it
 > > further, so I haven't tried reproducing it yet.  It took ~12 hours
 > > yesterday before it got in that state.  I'll restart it, and tell it
 > > to only use pipe fd's, which might speed things up a little.
 > 
 > Actually, if you haven't restarted it yet, do a few more "Sysrq-T"'s
 > to see if the stack below the page fault ever changes. Is it *always*
 > that "second access by fault_in_pages_writeable()" or migth there be
 > some looping going on in copy_page_to_iter() after all? (Quite
 > frankly, I don't see how such looping could happen with a good
 > compiler, and 4.8.3 should be good, but just in case).

Here's another snap of the same process..

trinity-c49 R  running task12544 19464   7633 0x00080004
910ac790 91042e54 8800a09bf9e8 0064c0206056
0001 08dd0d40 8800a09bfaa0 8800a13427e8
8800a13427d0 88010b017e00 0002 910ac795
Call Trace:
[] ? lock_acquired+0x1d1/0x3c0
[] ? do_raw_spin_unlock+0x5/0x90
[] ? do_raw_spin_trylock+0x5/0x50
[] ? get_parent_ip+0xd/0x50
[] ? _raw_spin_unlock+0x31/0x50
[] ? handle_mm_fault+0x3a7/0xcd0
[] ? __do_page_fault+0x1a4/0x600
[] ? prepare_ftrace_return+0x73/0xe0
[] ? ftrace_graph_caller+0x5a/0x85
[] ? trace_hardirqs_off+0xd/0x10
[] ? retint_restore_args+0xe/0xe
[] ? context_tracking_user_exit+0x67/0x1b0
[] ? do_page_fault+0x1e/0x70
[] ? trace_hardirqs_on_thunk+0x3a/0x3f
[] ? page_fault+0x22/0x30
[] ? copy_page_to_iter+0x3b3/0x500
[] ? pipe_read+0xdf/0x330
[] ? pipe_write+0x490/0x490
[] ? do_sync_readv_writev+0xa0/0xa0
[] ? do_iter_readv_writev+0x78/0xc0
[] ? do_readv_writev+0xce/0x280
[] ? pipe_write+0x490/0x490
[] ? lock_release_holdtime.part.29+0xe6/0x160
[] ? get_parent_ip+0xd/0x50
[] ? get_parent_ip+0xd/0x50
[] ? preempt_count_sub+0x6b/0xf0
[] ? vfs_readv+0x39/0x50
[] ? SyS_readv+0x5c/0x100
[] ? tracesys+0xdd/0xe2

and a few seconds later..

trinity-c49 R  running task12544 19464   7633 0x00080004
880235073840 8800a09bf920 0046 8800a09bf930
8800a09bfa28 910ac840  8800a09bf968
 a8ef9693 880244c203c0 a8ef9693
Call Trace:
[] ? do_raw_spin_trylock+0x50/0x50
[] ? trace_graph_entry+0x123/0x250
[] ? trace_buffer_lock_reserve+0x1e/0x60
[] ? handle_mm_fault+0x3a7/0xcd0
[] ? trace_hardirqs_on+0xd/0x10
[] ? trace_graph_entry+0x108/0x250
[] ? __do_page_fault+0x234/0x600
[] ? prepare_ftrace_return+0x73/0xe0
[] ? down_write_nested+0xc0/0xc0
[] ? get_parent_ip+0xd/0x50
[] ? __do_page_fault+0x234/0x600
[] ? prepare_ftrace_return+0x73/0xe0
[] ? ftrace_graph_caller+0x5a/0x85
[] ? trace_hardirqs_on_thunk+0x3a/0x3f
[] ? context_tracking_user_exit+0x67/0x1b0
[] ? do_page_fault+0x1e/0x70
[] ? trace_hardirqs_off_thunk+0x3a/0x3c
[] ? copy_page_to_iter+0x3b3/0x500
[] ? pipe_read+0xdf/0x330
[] ? pipe_write+0x490/0x490
[] ? do_sync_readv_writev+0xa0/0xa0
[] ? do_iter_readv_writev+0x78/0xc0
[] ? do_readv_writev+0xce/0x280
[] ? pipe_write+0x490/0x490
[] ? lock_release_holdtime.part.29+0xe6/0x160
[] ? get_parent_ip+0xd/0x50
[] ? get_parent_ip+0xd/0x50
[] ? preempt_count_sub+0x6b/0xf0
[] ? vfs_readv+0x39/0x50
[] ? SyS_readv+0x5c/0x100
[] ? tracesys+0xdd/0xe2

That second one is odd, because I had disabled all the function tracing
before doing the sysrq-t, so still seeing all the ftrace stuff on the
stack suprised me.  Also puzzling is how every symbol in both cases
is '?'.


I've got to run to the dentist for an hour or so, so if you still have
more things you want me to test on this before I reboot, I'll put that
off until I get back.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:05 AM, Dave Jones  wrote:
>
> I left it spinning overnight in case someone wanted me to probe it
> further, so I haven't tried reproducing it yet.  It took ~12 hours
> yesterday before it got in that state.  I'll restart it, and tell it
> to only use pipe fd's, which might speed things up a little.

Actually, if you haven't restarted it yet, do a few more "Sysrq-T"'s
to see if the stack below the page fault ever changes. Is it *always*
that "second access by fault_in_pages_writeable()" or migth there be
some looping going on in copy_page_to_iter() after all? (Quite
frankly, I don't see how such looping could happen with a good
compiler, and 4.8.3 should be good, but just in case).

It's an unlikely scenario, so if you already rebooted, not a big deal.

> model name  : Intel(R) Core(TM) i5-4670T CPU @ 2.30GHz

Ok, that pretty much throws the "maybe AMD did something different"
theory out of the water. Something subtler. Still worth trying the
revert.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 12:03:06PM -0400, Rik van Riel wrote:

 > > What kind of CPU is the problematic machine? There was some
 > > question about just how architectural the whole "TLB entry causing
 > > a page fault gets invalidated automatically" really is.
 > 
 > Intel people told me at the time that the guarantee was architectural.
 > I don't know whether other x86 manufacturers know this...
 > 
 > Doing a local tlb flush from ptep_set_access_flags seems appropriate,
 > if that is indeed the issue.
 > 
 > On the other hand, do_wp_page does not seem to do a tlb flush when
 > the old page is reused, so CPUs do get rid of inappropriate TLB
 > entries. We would have noticed do_wp_page not working right :)
 
The puzzling thing is we've had that code change for two years
without issue. This isn't a new machine, so why would it only
be showing up bad effects now ?

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 08:52:08AM -0700, Linus Torvalds wrote:

 > So if it's looping on that fault, what seems to happen is that the
 > page fault keeps happening.
 > 
 > Can you recreate this? Because if you can, please try to revert commit
 > e4a1cc56e4d7 ("x86: mm: drop TLB flush from ptep_set_access_flags").
 > Maybe the TLB has it read-only, and it doesn't get flushed, and the
 > page fault happens over and over again.

I left it spinning overnight in case someone wanted me to probe it
further, so I haven't tried reproducing it yet.  It took ~12 hours
yesterday before it got in that state.  I'll restart it, and tell it
to only use pipe fd's, which might speed things up a little.

If I can reproduce it, I'll then try that revert.

 > What kind of CPU is the problematic machine? There was some question
 > about just how architectural the whole "TLB entry causing a page fault
 > gets invalidated automatically" really is.

model name  : Intel(R) Core(TM) i5-4670T CPU @ 2.30GHz

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Rik van Riel
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

On 09/30/2014 11:52 AM, Linus Torvalds wrote:
> On Mon, Sep 29, 2014 at 9:54 PM, Linus Torvalds 
>  wrote:
>> 
>> Odd. The 0x3b3 offset seems to be the single-byte write of zero,
>> which is just the initial probe (aka
>> "fault_in_pages_writeable()").
>> 
>> How *that* could loop, I have no idea. Unless the exception table
>> is broken. I'll take another look tomorrow.
> 
> Confirmed. It's the second write in fault_in_pages_writeable()
> (the one that writes to the "end" pointer).
> 
> And there's no loop in software. And in fact, the trace shows that 
> there is no exception case for the fault either, so the fault is 
> perfectly successful.
> 
> So if it's looping on that fault, what seems to happen is that the 
> page fault keeps happening.
> 
> Can you recreate this? Because if you can, please try to revert
> commit e4a1cc56e4d7 ("x86: mm: drop TLB flush from
> ptep_set_access_flags"). Maybe the TLB has it read-only, and it
> doesn't get flushed, and the page fault happens over and over
> again.
> 
> What kind of CPU is the problematic machine? There was some
> question about just how architectural the whole "TLB entry causing
> a page fault gets invalidated automatically" really is.

Intel people told me at the time that the guarantee was architectural.
I don't know whether other x86 manufacturers know this...

Doing a local tlb flush from ptep_set_access_flags seems appropriate,
if that is indeed the issue.

On the other hand, do_wp_page does not seem to do a tlb flush when
the old page is reused, so CPUs do get rid of inappropriate TLB
entries. We would have noticed do_wp_page not working right :)

- -- 
All rights reversed
-BEGIN PGP SIGNATURE-
Version: GnuPG v1

iQEcBAEBAgAGBQJUKtQ6AAoJEM553pKExN6D0r4H/088aviaZYq87YoWcHVkc0ld
MD9PhuKtQugDW+CahmkQ1CDchEo25NUNV2eZhOkEzj/kCqcxzGGFVfBYmtZowp7X
X7675LIqueESIqdmlxXt8zaxnTnaC1xFiqilnL1YNvx+11EfgjIKRY3bSjrvFhBh
XBPymhVXg0DlC7FBdX4+ekGrWPu0JpAfdjGaONtImO2hEbXuYAylYsbx/vpLRD6S
Fcml2oQYE+f2Mp2+SKYNL94XWh4yz7l6UaSd8aJWIr2ssqWAYgQJ7v/N2Sa/2qq8
WJkmMcHFDWCesH+Hw5OhRAEW48WbH7EZZmR/rzIfEOgs1LaxTkfsONqjUoi/6+g=
=cd+t
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Mon, Sep 29, 2014 at 9:54 PM, Linus Torvalds
 wrote:
>
> Odd. The 0x3b3 offset seems to be the single-byte write of zero, which is
> just the initial probe (aka "fault_in_pages_writeable()").
>
> How *that* could loop, I have no idea. Unless the exception table is broken.
> I'll take another look tomorrow.

Confirmed. It's the second write in fault_in_pages_writeable() (the
one that writes to the "end" pointer).

And there's no loop in software. And in fact, the trace shows that
there is no exception case for the fault either, so the fault is
perfectly successful.

So if it's looping on that fault, what seems to happen is that the
page fault keeps happening.

Can you recreate this? Because if you can, please try to revert commit
e4a1cc56e4d7 ("x86: mm: drop TLB flush from ptep_set_access_flags").
Maybe the TLB has it read-only, and it doesn't get flushed, and the
page fault happens over and over again.

What kind of CPU is the problematic machine? There was some question
about just how architectural the whole "TLB entry causing a page fault
gets invalidated automatically" really is.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Mon, Sep 29, 2014 at 9:54 PM, Linus Torvalds
torva...@linux-foundation.org wrote:

 Odd. The 0x3b3 offset seems to be the single-byte write of zero, which is
 just the initial probe (aka fault_in_pages_writeable()).

 How *that* could loop, I have no idea. Unless the exception table is broken.
 I'll take another look tomorrow.

Confirmed. It's the second write in fault_in_pages_writeable() (the
one that writes to the end pointer).

And there's no loop in software. And in fact, the trace shows that
there is no exception case for the fault either, so the fault is
perfectly successful.

So if it's looping on that fault, what seems to happen is that the
page fault keeps happening.

Can you recreate this? Because if you can, please try to revert commit
e4a1cc56e4d7 (x86: mm: drop TLB flush from ptep_set_access_flags).
Maybe the TLB has it read-only, and it doesn't get flushed, and the
page fault happens over and over again.

What kind of CPU is the problematic machine? There was some question
about just how architectural the whole TLB entry causing a page fault
gets invalidated automatically really is.

Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Rik van Riel
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

On 09/30/2014 11:52 AM, Linus Torvalds wrote:
 On Mon, Sep 29, 2014 at 9:54 PM, Linus Torvalds 
 torva...@linux-foundation.org wrote:
 
 Odd. The 0x3b3 offset seems to be the single-byte write of zero,
 which is just the initial probe (aka
 fault_in_pages_writeable()).
 
 How *that* could loop, I have no idea. Unless the exception table
 is broken. I'll take another look tomorrow.
 
 Confirmed. It's the second write in fault_in_pages_writeable()
 (the one that writes to the end pointer).
 
 And there's no loop in software. And in fact, the trace shows that 
 there is no exception case for the fault either, so the fault is 
 perfectly successful.
 
 So if it's looping on that fault, what seems to happen is that the 
 page fault keeps happening.
 
 Can you recreate this? Because if you can, please try to revert
 commit e4a1cc56e4d7 (x86: mm: drop TLB flush from
 ptep_set_access_flags). Maybe the TLB has it read-only, and it
 doesn't get flushed, and the page fault happens over and over
 again.
 
 What kind of CPU is the problematic machine? There was some
 question about just how architectural the whole TLB entry causing
 a page fault gets invalidated automatically really is.

Intel people told me at the time that the guarantee was architectural.
I don't know whether other x86 manufacturers know this...

Doing a local tlb flush from ptep_set_access_flags seems appropriate,
if that is indeed the issue.

On the other hand, do_wp_page does not seem to do a tlb flush when
the old page is reused, so CPUs do get rid of inappropriate TLB
entries. We would have noticed do_wp_page not working right :)

- -- 
All rights reversed
-BEGIN PGP SIGNATURE-
Version: GnuPG v1

iQEcBAEBAgAGBQJUKtQ6AAoJEM553pKExN6D0r4H/088aviaZYq87YoWcHVkc0ld
MD9PhuKtQugDW+CahmkQ1CDchEo25NUNV2eZhOkEzj/kCqcxzGGFVfBYmtZowp7X
X7675LIqueESIqdmlxXt8zaxnTnaC1xFiqilnL1YNvx+11EfgjIKRY3bSjrvFhBh
XBPymhVXg0DlC7FBdX4+ekGrWPu0JpAfdjGaONtImO2hEbXuYAylYsbx/vpLRD6S
Fcml2oQYE+f2Mp2+SKYNL94XWh4yz7l6UaSd8aJWIr2ssqWAYgQJ7v/N2Sa/2qq8
WJkmMcHFDWCesH+Hw5OhRAEW48WbH7EZZmR/rzIfEOgs1LaxTkfsONqjUoi/6+g=
=cd+t
-END PGP SIGNATURE-
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 08:52:08AM -0700, Linus Torvalds wrote:

  So if it's looping on that fault, what seems to happen is that the
  page fault keeps happening.
  
  Can you recreate this? Because if you can, please try to revert commit
  e4a1cc56e4d7 (x86: mm: drop TLB flush from ptep_set_access_flags).
  Maybe the TLB has it read-only, and it doesn't get flushed, and the
  page fault happens over and over again.

I left it spinning overnight in case someone wanted me to probe it
further, so I haven't tried reproducing it yet.  It took ~12 hours
yesterday before it got in that state.  I'll restart it, and tell it
to only use pipe fd's, which might speed things up a little.

If I can reproduce it, I'll then try that revert.

  What kind of CPU is the problematic machine? There was some question
  about just how architectural the whole TLB entry causing a page fault
  gets invalidated automatically really is.

model name  : Intel(R) Core(TM) i5-4670T CPU @ 2.30GHz

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 12:03:06PM -0400, Rik van Riel wrote:

   What kind of CPU is the problematic machine? There was some
   question about just how architectural the whole TLB entry causing
   a page fault gets invalidated automatically really is.
  
  Intel people told me at the time that the guarantee was architectural.
  I don't know whether other x86 manufacturers know this...
  
  Doing a local tlb flush from ptep_set_access_flags seems appropriate,
  if that is indeed the issue.
  
  On the other hand, do_wp_page does not seem to do a tlb flush when
  the old page is reused, so CPUs do get rid of inappropriate TLB
  entries. We would have noticed do_wp_page not working right :)
 
The puzzling thing is we've had that code change for two years
without issue. This isn't a new machine, so why would it only
be showing up bad effects now ?

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:05 AM, Dave Jones da...@redhat.com wrote:

 I left it spinning overnight in case someone wanted me to probe it
 further, so I haven't tried reproducing it yet.  It took ~12 hours
 yesterday before it got in that state.  I'll restart it, and tell it
 to only use pipe fd's, which might speed things up a little.

Actually, if you haven't restarted it yet, do a few more Sysrq-T's
to see if the stack below the page fault ever changes. Is it *always*
that second access by fault_in_pages_writeable() or migth there be
some looping going on in copy_page_to_iter() after all? (Quite
frankly, I don't see how such looping could happen with a good
compiler, and 4.8.3 should be good, but just in case).

It's an unlikely scenario, so if you already rebooted, not a big deal.

 model name  : Intel(R) Core(TM) i5-4670T CPU @ 2.30GHz

Ok, that pretty much throws the maybe AMD did something different
theory out of the water. Something subtler. Still worth trying the
revert.

Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 09:10:26AM -0700, Linus Torvalds wrote:
  On Tue, Sep 30, 2014 at 9:05 AM, Dave Jones da...@redhat.com wrote:
  
   I left it spinning overnight in case someone wanted me to probe it
   further, so I haven't tried reproducing it yet.  It took ~12 hours
   yesterday before it got in that state.  I'll restart it, and tell it
   to only use pipe fd's, which might speed things up a little.
  
  Actually, if you haven't restarted it yet, do a few more Sysrq-T's
  to see if the stack below the page fault ever changes. Is it *always*
  that second access by fault_in_pages_writeable() or migth there be
  some looping going on in copy_page_to_iter() after all? (Quite
  frankly, I don't see how such looping could happen with a good
  compiler, and 4.8.3 should be good, but just in case).

Here's another snap of the same process..

trinity-c49 R  running task12544 19464   7633 0x00080004
910ac790 91042e54 8800a09bf9e8 0064c0206056
0001 08dd0d40 8800a09bfaa0 8800a13427e8
8800a13427d0 88010b017e00 0002 910ac795
Call Trace:
[910cd401] ? lock_acquired+0x1d1/0x3c0
[910d4885] ? do_raw_spin_unlock+0x5/0x90
[910d4835] ? do_raw_spin_trylock+0x5/0x50
[910ac74d] ? get_parent_ip+0xd/0x50
[918239f1] ? _raw_spin_unlock+0x31/0x50
[911c3c67] ? handle_mm_fault+0x3a7/0xcd0
[91042c84] ? __do_page_fault+0x1a4/0x600
[9103ad23] ? prepare_ftrace_return+0x73/0xe0
[91826b3a] ? ftrace_graph_caller+0x5a/0x85
[910cb1ad] ? trace_hardirqs_off+0xd/0x10
[918253e4] ? retint_restore_args+0xe/0xe
[9118eda7] ? context_tracking_user_exit+0x67/0x1b0
[910430fe] ? do_page_fault+0x1e/0x70
[913a5ebe] ? trace_hardirqs_on_thunk+0x3a/0x3f
[918264b2] ? page_fault+0x22/0x30
[911bd7e3] ? copy_page_to_iter+0x3b3/0x500
[9120eddf] ? pipe_read+0xdf/0x330
[9120ed00] ? pipe_write+0x490/0x490
[912051a0] ? do_sync_readv_writev+0xa0/0xa0
[912053b8] ? do_iter_readv_writev+0x78/0xc0
[91206bbe] ? do_readv_writev+0xce/0x280
[9120ed00] ? pipe_write+0x490/0x490
[910cbbf6] ? lock_release_holdtime.part.29+0xe6/0x160
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac8ab] ? preempt_count_sub+0x6b/0xf0
[91206da9] ? vfs_readv+0x39/0x50
[91206e6c] ? SyS_readv+0x5c/0x100
[918249e4] ? tracesys+0xdd/0xe2

and a few seconds later..

trinity-c49 R  running task12544 19464   7633 0x00080004
880235073840 8800a09bf920 0046 8800a09bf930
8800a09bfa28 910ac840  8800a09bf968
 a8ef9693 880244c203c0 a8ef9693
Call Trace:
[910d4880] ? do_raw_spin_trylock+0x50/0x50
[91167913] ? trace_graph_entry+0x123/0x250
[9115b7ce] ? trace_buffer_lock_reserve+0x1e/0x60
[911c3c67] ? handle_mm_fault+0x3a7/0xcd0
[910ce68d] ? trace_hardirqs_on+0xd/0x10
[911678f8] ? trace_graph_entry+0x108/0x250
[91042d14] ? __do_page_fault+0x234/0x600
[9103ad23] ? prepare_ftrace_return+0x73/0xe0
[910ca040] ? down_write_nested+0xc0/0xc0
[910ac74d] ? get_parent_ip+0xd/0x50
[91042d14] ? __do_page_fault+0x234/0x600
[9103ad23] ? prepare_ftrace_return+0x73/0xe0
[91826b3a] ? ftrace_graph_caller+0x5a/0x85
[913a5ebe] ? trace_hardirqs_on_thunk+0x3a/0x3f
[9118eda7] ? context_tracking_user_exit+0x67/0x1b0
[910430fe] ? do_page_fault+0x1e/0x70
[913a5efd] ? trace_hardirqs_off_thunk+0x3a/0x3c
[911bd7e3] ? copy_page_to_iter+0x3b3/0x500
[9120eddf] ? pipe_read+0xdf/0x330
[9120ed00] ? pipe_write+0x490/0x490
[912051a0] ? do_sync_readv_writev+0xa0/0xa0
[912053b8] ? do_iter_readv_writev+0x78/0xc0
[91206bbe] ? do_readv_writev+0xce/0x280
[9120ed00] ? pipe_write+0x490/0x490
[910cbbf6] ? lock_release_holdtime.part.29+0xe6/0x160
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac8ab] ? preempt_count_sub+0x6b/0xf0
[91206da9] ? vfs_readv+0x39/0x50
[91206e6c] ? SyS_readv+0x5c/0x100
[918249e4] ? tracesys+0xdd/0xe2

That second one is odd, because I had disabled all the function tracing
before doing the sysrq-t, so still seeing all the ftrace stuff on the
stack suprised me.  Also puzzling is how every symbol in both cases
is '?'.


I've got to run to the dentist for an hour or so, so if you still have
more things you want me to test on this before I reboot, I'll put that
off until I get back.

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  

Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:03 AM, Rik van Riel r...@redhat.com wrote:

 On the other hand, do_wp_page does not seem to do a tlb flush when
 the old page is reused, so CPUs do get rid of inappropriate TLB
 entries. We would have noticed do_wp_page not working right :)

Hmm? do_wp_page() uses the same ptep_set_access_flags(). So it too
used to do the TLB flush before that got removed there..

I do agree that clearly the CPU must *usually* do a TLB flush, or we'd
have noticed the lack immediately. I just worry that there are some
very specific circumstances when it might be missed.

That said, I don't actually believe it's that commit. I would just
like to remove it from the list of suspects.

I'm wondering what else could cause us to take effectively the same
page fault over and over again on what is a very simple instruction:

movb   $0x0,(%rdx)

where handle_mm_fault() doesn't see anything wrong with the page
tables, but the CPU does.

Hmm. We also have that

if (unlikely((fault  VM_FAULT_RETRY)  fatal_signal_pending(current)))
return;

thing. It assumes that the fatal signal will kill the process, but
that's only true if returning to user space. So that looks like a
potential mis-feature (although it should sort itself out *eventually*
when the IO has completed), but the trace Dave had didn't go through
the retry paths.

Anybody see anyting else?

   Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 12:22:01PM -0400, Dave Jones wrote:
 
  [91167913] ? trace_graph_entry+0x123/0x250
  [9115b7ce] ? trace_buffer_lock_reserve+0x1e/0x60
  [911c3c67] ? handle_mm_fault+0x3a7/0xcd0
  [910ce68d] ? trace_hardirqs_on+0xd/0x10
  [911678f8] ? trace_graph_entry+0x108/0x250
  [91042d14] ? __do_page_fault+0x234/0x600
  [9103ad23] ? prepare_ftrace_return+0x73/0xe0
  [910ca040] ? down_write_nested+0xc0/0xc0
  [910ac74d] ? get_parent_ip+0xd/0x50
  [91042d14] ? __do_page_fault+0x234/0x600
  [9103ad23] ? prepare_ftrace_return+0x73/0xe0
  [91826b3a] ? ftrace_graph_caller+0x5a/0x85
  [913a5ebe] ? trace_hardirqs_on_thunk+0x3a/0x3f
  [9118eda7] ? context_tracking_user_exit+0x67/0x1b0
  [910430fe] ? do_page_fault+0x1e/0x70
  [913a5efd] ? trace_hardirqs_off_thunk+0x3a/0x3c
  [911bd7e3] ? copy_page_to_iter+0x3b3/0x500
  [9120eddf] ? pipe_read+0xdf/0x330
  [9120ed00] ? pipe_write+0x490/0x490
  [912051a0] ? do_sync_readv_writev+0xa0/0xa0
  [912053b8] ? do_iter_readv_writev+0x78/0xc0
  [91206bbe] ? do_readv_writev+0xce/0x280
  [9120ed00] ? pipe_write+0x490/0x490
  [910cbbf6] ? lock_release_holdtime.part.29+0xe6/0x160
  [910ac74d] ? get_parent_ip+0xd/0x50
  [910ac74d] ? get_parent_ip+0xd/0x50
  [910ac8ab] ? preempt_count_sub+0x6b/0xf0
  [91206da9] ? vfs_readv+0x39/0x50
  [91206e6c] ? SyS_readv+0x5c/0x100
  [918249e4] ? tracesys+0xdd/0xe2
  
  That second one is odd, because I had disabled all the function tracing
  before doing the sysrq-t, so still seeing all the ftrace stuff on the
  stack suprised me.

ah, echo 0  tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
Never mind.  Subsequent traces all looking the same, minus the tracing
junk though.

Is there some way I can figure out what address it's faulting on ?
I wonder if that might give some clues.

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 9:40 AM, Dave Jones da...@redhat.com wrote:

 ah, echo 0  tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
 Never mind.  Subsequent traces all looking the same, minus the tracing
 junk though.

Yeah, looks like that copy_page_to_iter+0x3b3 is always there.

 Is there some way I can figure out what address it's faulting on ?
 I wonder if that might give some clues.

Could be useful. There's a trace_page_fault_kernel() entry that should
give it to you, along with error code.

Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Dave Jones
On Tue, Sep 30, 2014 at 09:46:45AM -0700, Linus Torvalds wrote:
  On Tue, Sep 30, 2014 at 9:40 AM, Dave Jones da...@redhat.com wrote:
  
   ah, echo 0  tracing_on isn't enough, I had to 0 out set_ftrace_pid too.
   Never mind.  Subsequent traces all looking the same, minus the tracing
   junk though.
  
  Yeah, looks like that copy_page_to_iter+0x3b3 is always there.
  
   Is there some way I can figure out what address it's faulting on ?
   I wonder if that might give some clues.
  
  Could be useful. There's a trace_page_fault_kernel() entry that should
  give it to you, along with error code.

Hm.

page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter error_code=0x2

over and over..

Dave


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-30 Thread Linus Torvalds
On Tue, Sep 30, 2014 at 11:20 AM, Dave Jones da...@redhat.com wrote:

 page_fault_kernel:address=__per_cpu_end ip=copy_page_to_iter 
 error_code=0x2

Interesting. error_code in particular. The value 2 means that the
CPU thinks that the page is not present (bit zero is clear).

(That address is useless - it's tried to turn a user address into a
kernel symbol, and the percpu symbols are zero-based, so it picks the
last of them. The ip is useless too, since it doesn't give the
offset)

So the CPU thinks it's a write to a not-present page, which means that
_PAGE_PRESENT bit is clear.

Now the *kernel* thinks a page is present not just if _PAGE_PRESENT is
set, but also if _PAGE_PROTNONE or _PAGE_NUMA are set. Sadly, your
trace is not very useful, because inlining has caused pretty much all
the cases to be in handle_mm_fault(), so the trace doesn't really
tell which path this all takes.

But we can still do *some* analysis on the trace: do_wp_page()
shouldn't have been inlined, so it would have shown up in the trace if
it had been called. So I think we can be pretty confident that the
ptep_set_access_flags() we see is the one from handle_pte_fault().

And if that is the case, then we know that pte_present() is indeed
true as far a the kernel is concerned. So with _PAGE_PRESENT not being
set (based on the error code), we know that _PAGE_PROTNONE must be
set, otherwise we'd have triggered the pte_numa() check and exited
through do_numa_page().

So it smells like we have a PROT_NONE VM area (at least the paeg table
entries imply that). But access_error() should have flagged that (it
checks vma-vm_flags  VM_WRITE). How do we have a page table entry
marked _PAGE_PROTNONE, but VM_WRITE set in the vma?

Or, possibly, we have some confusion about the page tables themselves
(corruption, wrong %cr3 value, whatever), explaining why the CPU
thinks one thing, but our software page table walker thinks another.

I'm not seeing how this all happens. But I'm adding Kirill to the cc,
since he might see something I missed, and he touched some of this
code last (tag, you're it).

Kirill: the thread is on lkml, but basically it boils down to the
second byte write in fault_in_pages_writeable() faulting forever,
despite handle_mm_fault() apparently thinking that everything is fine.

Also adding Hugh Dickins, just because the more people who know this
code that are involved, the better.

  Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Al Viro
On Mon, Sep 29, 2014 at 09:27:09PM -0700, Linus Torvalds wrote:
> On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones  wrote:
> >
> > Looking at the dump, there's only one running trinity child,
> > with all the others blocking on it.
> >
> > trinity-c49 R  running task12856 19464   7633 0x0004
> > 8800a09bf960 0002 8800a09bf9f8 88021965
> > 001d4080  8800a09bffd8 001d4080
> > 88023f755bc0 88021965 8800a09bffd8 88010b017e00
> > Call Trace:
> > [] handle_mm_fault+0x3a7/0xcd0
> > [] __do_page_fault+0x1a4/0x600
> > [] do_page_fault+0x1e/0x70
> > [] page_fault+0x22/0x30
> > [] ? copy_page_to_iter+0x3b3/0x500
> > [] pipe_read+0xdf/0x330
> >
> > Running the function tracer on that pid shows it spinning forever..
> > http://codemonkey.org.uk/junk/pipe-trace.txt
> >
> > Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
> > case where the fuzzer asked the kernel to do something stupid, and it 
> > obliged ?
> 
> Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
> making any progress while just faulting.
> 
> I don't see how that could happen, though. All the loops there are
> conditional on the user copies *not* failing (ie "!left"), and they
> seem to properly update "iov".
> 
> Mind sending a disassembly of your "copy_page_to_iter" function, in
> particular around that whole "0x3b3/0x500" area which is where the
> page fault seems to happen?
> 
> Adding Al to the cc, since this code is from his commit 6e58e79db8a1
> ("introduce copy_page_to_iter, kill loop over iovec in
> generic_file_aio_read()") but I don't see anything obviously wrong
> there.
> 
> Al? Do you see something I don't? Dave's function trace does seem to
> say that it doesn't even get back to pipe_read(), though, so the loop
> really must be inside copy_page_to_iter().

I'll take a look tomorrow morning after I get some sleep - 19 hours of uptime,
on top of 5 hours of sleep, on top of ~20 hours of uptime ;-/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Dave Jones
On Mon, Sep 29, 2014 at 09:27:09PM -0700, Linus Torvalds wrote:
 > On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones  wrote:
 > >
 > > Looking at the dump, there's only one running trinity child,
 > > with all the others blocking on it.
 > >
 > > trinity-c49 R  running task12856 19464   7633 0x0004
 > > 8800a09bf960 0002 8800a09bf9f8 88021965
 > > 001d4080  8800a09bffd8 001d4080
 > > 88023f755bc0 88021965 8800a09bffd8 88010b017e00
 > > Call Trace:
 > > [] handle_mm_fault+0x3a7/0xcd0
 > > [] __do_page_fault+0x1a4/0x600
 > > [] do_page_fault+0x1e/0x70
 > > [] page_fault+0x22/0x30
 > > [] ? copy_page_to_iter+0x3b3/0x500
 > > [] pipe_read+0xdf/0x330
 > >
 > > Running the function tracer on that pid shows it spinning forever..
 > > http://codemonkey.org.uk/junk/pipe-trace.txt
 > >
 > > Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
 > > case where the fuzzer asked the kernel to do something stupid, and it 
 > > obliged ?
 > 
 > Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
 > making any progress while just faulting.
 > 
 > I don't see how that could happen, though. All the loops there are
 > conditional on the user copies *not* failing (ie "!left"), and they
 > seem to properly update "iov".
 > 
 > Mind sending a disassembly of your "copy_page_to_iter" function, in
 > particular around that whole "0x3b3/0x500" area which is where the
 > page fault seems to happen?
 
Whole file is at http://codemonkey.org.uk/junk/iov_iter.txt

gcc is 4.8.3 btw

Dave
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Linus Torvalds
On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones  wrote:
>
> Looking at the dump, there's only one running trinity child,
> with all the others blocking on it.
>
> trinity-c49 R  running task12856 19464   7633 0x0004
> 8800a09bf960 0002 8800a09bf9f8 88021965
> 001d4080  8800a09bffd8 001d4080
> 88023f755bc0 88021965 8800a09bffd8 88010b017e00
> Call Trace:
> [] handle_mm_fault+0x3a7/0xcd0
> [] __do_page_fault+0x1a4/0x600
> [] do_page_fault+0x1e/0x70
> [] page_fault+0x22/0x30
> [] ? copy_page_to_iter+0x3b3/0x500
> [] pipe_read+0xdf/0x330
>
> Running the function tracer on that pid shows it spinning forever..
> http://codemonkey.org.uk/junk/pipe-trace.txt
>
> Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
> case where the fuzzer asked the kernel to do something stupid, and it obliged 
> ?

Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
making any progress while just faulting.

I don't see how that could happen, though. All the loops there are
conditional on the user copies *not* failing (ie "!left"), and they
seem to properly update "iov".

Mind sending a disassembly of your "copy_page_to_iter" function, in
particular around that whole "0x3b3/0x500" area which is where the
page fault seems to happen?

Adding Al to the cc, since this code is from his commit 6e58e79db8a1
("introduce copy_page_to_iter, kill loop over iovec in
generic_file_aio_read()") but I don't see anything obviously wrong
there.

Al? Do you see something I don't? Dave's function trace does seem to
say that it doesn't even get back to pipe_read(), though, so the loop
really must be inside copy_page_to_iter().

 Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


pipe/page fault oddness.

2014-09-29 Thread Dave Jones
My fuzz tester ground to a halt, with many child processes blocked
on pipe_lock.  sysrq-t output: http://codemonkey.org.uk/junk/pipe-lock-wtf.txt

Looking at the dump, there's only one running trinity child,
with all the others blocking on it.

trinity-c49 R  running task12856 19464   7633 0x0004
8800a09bf960 0002 8800a09bf9f8 88021965
001d4080  8800a09bffd8 001d4080
88023f755bc0 88021965 8800a09bffd8 88010b017e00
Call Trace:
[] preempt_schedule+0x36/0x60
[] ___preempt_schedule+0x56/0xb0
[] ? handle_mm_fault+0x3a7/0xcd0
[] ? _raw_spin_unlock+0x31/0x50
[] ? _raw_spin_unlock+0x45/0x50
[] handle_mm_fault+0x3a7/0xcd0
[] ? __lock_is_held+0x57/0x80
[] __do_page_fault+0x1a4/0x600
[] ? mark_held_locks+0x75/0xa0
[] ? trace_hardirqs_on_caller+0x10d/0x1d0
[] ? trace_hardirqs_on+0xd/0x10
[] ? context_tracking_user_exit+0x67/0x1b0
[] do_page_fault+0x1e/0x70
[] page_fault+0x22/0x30
[] ? copy_page_to_iter+0x3b3/0x500
[] pipe_read+0xdf/0x330
[] ? pipe_write+0x490/0x490
[] ? do_sync_readv_writev+0xa0/0xa0
[] do_iter_readv_writev+0x78/0xc0
[] do_readv_writev+0xce/0x280
[] ? pipe_write+0x490/0x490
[] ? lock_release_holdtime.part.29+0xe6/0x160
[] ? get_parent_ip+0xd/0x50
[] ? get_parent_ip+0xd/0x50
[] ? preempt_count_sub+0x6b/0xf0
[] vfs_readv+0x39/0x50
[] SyS_readv+0x5c/0x100
[] tracesys+0xdd/0xe2

Running the function tracer on that pid shows it spinning forever..
http://codemonkey.org.uk/junk/pipe-trace.txt

Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
case where the fuzzer asked the kernel to do something stupid, and it obliged ?

Trinity's watchdog process has been repeatedly sending SIGKILL's to this
running pid, but we never seem to get out of this state long enough for
it to take effect.

This is 3.17-rc7 fwiw.

Dave

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


pipe/page fault oddness.

2014-09-29 Thread Dave Jones
My fuzz tester ground to a halt, with many child processes blocked
on pipe_lock.  sysrq-t output: http://codemonkey.org.uk/junk/pipe-lock-wtf.txt

Looking at the dump, there's only one running trinity child,
with all the others blocking on it.

trinity-c49 R  running task12856 19464   7633 0x0004
8800a09bf960 0002 8800a09bf9f8 88021965
001d4080  8800a09bffd8 001d4080
88023f755bc0 88021965 8800a09bffd8 88010b017e00
Call Trace:
[9181df46] preempt_schedule+0x36/0x60
[9100e3d6] ___preempt_schedule+0x56/0xb0
[911c3c67] ? handle_mm_fault+0x3a7/0xcd0
[918239f1] ? _raw_spin_unlock+0x31/0x50
[91823a05] ? _raw_spin_unlock+0x45/0x50
[911c3c67] handle_mm_fault+0x3a7/0xcd0
[910cb687] ? __lock_is_held+0x57/0x80
[91042c84] __do_page_fault+0x1a4/0x600
[910ce485] ? mark_held_locks+0x75/0xa0
[910ce5bd] ? trace_hardirqs_on_caller+0x10d/0x1d0
[910ce68d] ? trace_hardirqs_on+0xd/0x10
[9118eda7] ? context_tracking_user_exit+0x67/0x1b0
[910430fe] do_page_fault+0x1e/0x70
[918264b2] page_fault+0x22/0x30
[911bd7e3] ? copy_page_to_iter+0x3b3/0x500
[9120eddf] pipe_read+0xdf/0x330
[9120ed00] ? pipe_write+0x490/0x490
[912051a0] ? do_sync_readv_writev+0xa0/0xa0
[912053b8] do_iter_readv_writev+0x78/0xc0
[91206bbe] do_readv_writev+0xce/0x280
[9120ed00] ? pipe_write+0x490/0x490
[910cbbf6] ? lock_release_holdtime.part.29+0xe6/0x160
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac74d] ? get_parent_ip+0xd/0x50
[910ac8ab] ? preempt_count_sub+0x6b/0xf0
[91206da9] vfs_readv+0x39/0x50
[91206e6c] SyS_readv+0x5c/0x100
[918249e4] tracesys+0xdd/0xe2

Running the function tracer on that pid shows it spinning forever..
http://codemonkey.org.uk/junk/pipe-trace.txt

Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
case where the fuzzer asked the kernel to do something stupid, and it obliged ?

Trinity's watchdog process has been repeatedly sending SIGKILL's to this
running pid, but we never seem to get out of this state long enough for
it to take effect.

This is 3.17-rc7 fwiw.

Dave

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Linus Torvalds
On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones da...@redhat.com wrote:

 Looking at the dump, there's only one running trinity child,
 with all the others blocking on it.

 trinity-c49 R  running task12856 19464   7633 0x0004
 8800a09bf960 0002 8800a09bf9f8 88021965
 001d4080  8800a09bffd8 001d4080
 88023f755bc0 88021965 8800a09bffd8 88010b017e00
 Call Trace:
 [911c3c67] handle_mm_fault+0x3a7/0xcd0
 [91042c84] __do_page_fault+0x1a4/0x600
 [910430fe] do_page_fault+0x1e/0x70
 [918264b2] page_fault+0x22/0x30
 [911bd7e3] ? copy_page_to_iter+0x3b3/0x500
 [9120eddf] pipe_read+0xdf/0x330

 Running the function tracer on that pid shows it spinning forever..
 http://codemonkey.org.uk/junk/pipe-trace.txt

 Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
 case where the fuzzer asked the kernel to do something stupid, and it obliged 
 ?

Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
making any progress while just faulting.

I don't see how that could happen, though. All the loops there are
conditional on the user copies *not* failing (ie !left), and they
seem to properly update iov.

Mind sending a disassembly of your copy_page_to_iter function, in
particular around that whole 0x3b3/0x500 area which is where the
page fault seems to happen?

Adding Al to the cc, since this code is from his commit 6e58e79db8a1
(introduce copy_page_to_iter, kill loop over iovec in
generic_file_aio_read()) but I don't see anything obviously wrong
there.

Al? Do you see something I don't? Dave's function trace does seem to
say that it doesn't even get back to pipe_read(), though, so the loop
really must be inside copy_page_to_iter().

 Linus
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Dave Jones
On Mon, Sep 29, 2014 at 09:27:09PM -0700, Linus Torvalds wrote:
  On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones da...@redhat.com wrote:
  
   Looking at the dump, there's only one running trinity child,
   with all the others blocking on it.
  
   trinity-c49 R  running task12856 19464   7633 0x0004
   8800a09bf960 0002 8800a09bf9f8 88021965
   001d4080  8800a09bffd8 001d4080
   88023f755bc0 88021965 8800a09bffd8 88010b017e00
   Call Trace:
   [911c3c67] handle_mm_fault+0x3a7/0xcd0
   [91042c84] __do_page_fault+0x1a4/0x600
   [910430fe] do_page_fault+0x1e/0x70
   [918264b2] page_fault+0x22/0x30
   [911bd7e3] ? copy_page_to_iter+0x3b3/0x500
   [9120eddf] pipe_read+0xdf/0x330
  
   Running the function tracer on that pid shows it spinning forever..
   http://codemonkey.org.uk/junk/pipe-trace.txt
  
   Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
   case where the fuzzer asked the kernel to do something stupid, and it 
   obliged ?
  
  Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
  making any progress while just faulting.
  
  I don't see how that could happen, though. All the loops there are
  conditional on the user copies *not* failing (ie !left), and they
  seem to properly update iov.
  
  Mind sending a disassembly of your copy_page_to_iter function, in
  particular around that whole 0x3b3/0x500 area which is where the
  page fault seems to happen?
 
Whole file is at http://codemonkey.org.uk/junk/iov_iter.txt

gcc is 4.8.3 btw

Dave
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: pipe/page fault oddness.

2014-09-29 Thread Al Viro
On Mon, Sep 29, 2014 at 09:27:09PM -0700, Linus Torvalds wrote:
 On Mon, Sep 29, 2014 at 8:33 PM, Dave Jones da...@redhat.com wrote:
 
  Looking at the dump, there's only one running trinity child,
  with all the others blocking on it.
 
  trinity-c49 R  running task12856 19464   7633 0x0004
  8800a09bf960 0002 8800a09bf9f8 88021965
  001d4080  8800a09bffd8 001d4080
  88023f755bc0 88021965 8800a09bffd8 88010b017e00
  Call Trace:
  [911c3c67] handle_mm_fault+0x3a7/0xcd0
  [91042c84] __do_page_fault+0x1a4/0x600
  [910430fe] do_page_fault+0x1e/0x70
  [918264b2] page_fault+0x22/0x30
  [911bd7e3] ? copy_page_to_iter+0x3b3/0x500
  [9120eddf] pipe_read+0xdf/0x330
 
  Running the function tracer on that pid shows it spinning forever..
  http://codemonkey.org.uk/junk/pipe-trace.txt
 
  Kernel bug (missing EFAULT check somewhere perhaps?), or is this a
  case where the fuzzer asked the kernel to do something stupid, and it 
  obliged ?
 
 Hmm. It looks like copy_page_to_iter_iovec() is broken and keeps not
 making any progress while just faulting.
 
 I don't see how that could happen, though. All the loops there are
 conditional on the user copies *not* failing (ie !left), and they
 seem to properly update iov.
 
 Mind sending a disassembly of your copy_page_to_iter function, in
 particular around that whole 0x3b3/0x500 area which is where the
 page fault seems to happen?
 
 Adding Al to the cc, since this code is from his commit 6e58e79db8a1
 (introduce copy_page_to_iter, kill loop over iovec in
 generic_file_aio_read()) but I don't see anything obviously wrong
 there.
 
 Al? Do you see something I don't? Dave's function trace does seem to
 say that it doesn't even get back to pipe_read(), though, so the loop
 really must be inside copy_page_to_iter().

I'll take a look tomorrow morning after I get some sleep - 19 hours of uptime,
on top of 5 hours of sleep, on top of ~20 hours of uptime ;-/
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/