Re: [PATCH] lkdtm: Convert from jprobe to kprobe
On Fri, 20 Oct 2017 06:31:27 -0700 Kees Cookwrote: > The jprobe subsystem is being removed, so convert to using kprobe instead. > Looks good to me:) Acked-by: Masami Hiramatsu Thanks, > Cc: Masami Hiramatsu > Signed-off-by: Kees Cook > --- > drivers/misc/lkdtm_core.c | 154 > ++ > 1 file changed, 45 insertions(+), 109 deletions(-) > > diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c > index 981b3ef71e47..ed7f0c61c59a 100644 > --- a/drivers/misc/lkdtm_core.c > +++ b/drivers/misc/lkdtm_core.c > @@ -56,122 +56,54 @@ static ssize_t direct_entry(struct file *f, const char > __user *user_buf, > size_t count, loff_t *off); > > #ifdef CONFIG_KPROBES > -static void lkdtm_handler(void); > +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); > static ssize_t lkdtm_debugfs_entry(struct file *f, > const char __user *user_buf, > size_t count, loff_t *off); > - > - > -/* jprobe entry point handlers. */ > -static unsigned int jp_do_irq(unsigned int irq) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static irqreturn_t jp_handle_irq_event(unsigned int irq, > -struct irqaction *action) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static void jp_tasklet_action(struct softirq_action *a) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -static void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -struct scan_control; > - > -static unsigned long jp_shrink_inactive_list(unsigned long max_scan, > - struct zone *zone, > - struct scan_control *sc) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, > - const enum hrtimer_mode mode) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -# ifdef 
CONFIG_IDE > -static int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, > - struct block_device *bdev, unsigned int cmd, > - unsigned long arg) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > -# endif > +# define CRASHPOINT_KPROBE(_symbol) \ > + .kprobe = { \ > + .symbol_name = (_symbol), \ > + .pre_handler = lkdtm_kprobe_handler,\ > + }, > +# define CRASHPOINT_WRITE(_symbol) \ > + (_symbol) ? lkdtm_debugfs_entry : direct_entry > +#else > +# define CRASHPOINT_KPROBE(_symbol) > +# define CRASHPOINT_WRITE(_symbol) direct_entry > #endif > > /* Crash points */ > struct crashpoint { > const char *name; > const struct file_operations fops; > - struct jprobe jprobe; > + struct kprobe kprobe; > }; > > -#define CRASHPOINT(_name, _write, _symbol, _entry) \ > +#define CRASHPOINT(_name, _symbol) \ > { \ > .name = _name, \ > .fops = { \ > .read = lkdtm_debugfs_read, \ > .llseek = generic_file_llseek, \ > .open = lkdtm_debugfs_open, \ > - .write = _write, \ > - }, \ > - .jprobe = { \ > - .kp.symbol_name = _symbol, \ > - .entry = (kprobe_opcode_t *)_entry, \ > + .write = CRASHPOINT_WRITE(_symbol) \ > }, \ > + CRASHPOINT_KPROBE(_symbol) \ > } > > /* Define the possible places where we can trigger a crash point. */ > -struct crashpoint crashpoints[] = { > - CRASHPOINT("DIRECT",direct_entry, > -NULL,NULL), > +static struct crashpoint crashpoints[] = { > + CRASHPOINT("DIRECT", NULL), > #ifdef CONFIG_KPROBES > - CRASHPOINT("INT_HARDWARE_ENTRY",lkdtm_debugfs_entry, > -"do_IRQ",
Re: [PATCH] lkdtm: Convert from jprobe to kprobe
On Fri, 20 Oct 2017 06:31:27 -0700 Kees Cook wrote: > The jprobe subsystem is being removed, so convert to using kprobe instead. > Looks good to me:) Acked-by: Masami Hiramatsu Thanks, > Cc: Masami Hiramatsu > Signed-off-by: Kees Cook > --- > drivers/misc/lkdtm_core.c | 154 > ++ > 1 file changed, 45 insertions(+), 109 deletions(-) > > diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c > index 981b3ef71e47..ed7f0c61c59a 100644 > --- a/drivers/misc/lkdtm_core.c > +++ b/drivers/misc/lkdtm_core.c > @@ -56,122 +56,54 @@ static ssize_t direct_entry(struct file *f, const char > __user *user_buf, > size_t count, loff_t *off); > > #ifdef CONFIG_KPROBES > -static void lkdtm_handler(void); > +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); > static ssize_t lkdtm_debugfs_entry(struct file *f, > const char __user *user_buf, > size_t count, loff_t *off); > - > - > -/* jprobe entry point handlers. */ > -static unsigned int jp_do_irq(unsigned int irq) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static irqreturn_t jp_handle_irq_event(unsigned int irq, > -struct irqaction *action) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static void jp_tasklet_action(struct softirq_action *a) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -static void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -struct scan_control; > - > -static unsigned long jp_shrink_inactive_list(unsigned long max_scan, > - struct zone *zone, > - struct scan_control *sc) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, > - const enum hrtimer_mode mode) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -# ifdef 
CONFIG_IDE > -static int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, > - struct block_device *bdev, unsigned int cmd, > - unsigned long arg) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > -# endif > +# define CRASHPOINT_KPROBE(_symbol) \ > + .kprobe = { \ > + .symbol_name = (_symbol), \ > + .pre_handler = lkdtm_kprobe_handler,\ > + }, > +# define CRASHPOINT_WRITE(_symbol) \ > + (_symbol) ? lkdtm_debugfs_entry : direct_entry > +#else > +# define CRASHPOINT_KPROBE(_symbol) > +# define CRASHPOINT_WRITE(_symbol) direct_entry > #endif > > /* Crash points */ > struct crashpoint { > const char *name; > const struct file_operations fops; > - struct jprobe jprobe; > + struct kprobe kprobe; > }; > > -#define CRASHPOINT(_name, _write, _symbol, _entry) \ > +#define CRASHPOINT(_name, _symbol) \ > { \ > .name = _name, \ > .fops = { \ > .read = lkdtm_debugfs_read, \ > .llseek = generic_file_llseek, \ > .open = lkdtm_debugfs_open, \ > - .write = _write, \ > - }, \ > - .jprobe = { \ > - .kp.symbol_name = _symbol, \ > - .entry = (kprobe_opcode_t *)_entry, \ > + .write = CRASHPOINT_WRITE(_symbol) \ > }, \ > + CRASHPOINT_KPROBE(_symbol) \ > } > > /* Define the possible places where we can trigger a crash point. */ > -struct crashpoint crashpoints[] = { > - CRASHPOINT("DIRECT",direct_entry, > -NULL,NULL), > +static struct crashpoint crashpoints[] = { > + CRASHPOINT("DIRECT", NULL), > #ifdef CONFIG_KPROBES > - CRASHPOINT("INT_HARDWARE_ENTRY",lkdtm_debugfs_entry, > -"do_IRQ",jp_do_irq), > - CRASHPOINT("INT_HW_IRQ_EN", lkdtm_debugfs_entry, > -
Re: [PATCH 1/2] mm/mmu_notifier: avoid double notification when it is useless v2
On Thu, 2017-10-19 at 12:58 -0400, Jerome Glisse wrote: > On Thu, Oct 19, 2017 at 09:53:11PM +1100, Balbir Singh wrote: > > On Thu, Oct 19, 2017 at 2:28 PM, Jerome Glissewrote: > > > On Thu, Oct 19, 2017 at 02:04:26PM +1100, Balbir Singh wrote: > > > > On Mon, 16 Oct 2017 23:10:02 -0400 > > > > jgli...@redhat.com wrote: > > > > > > > > > From: Jérôme Glisse > > > > > > > > > > + /* > > > > > +* No need to call mmu_notifier_invalidate_range() as we > > > > > are > > > > > +* downgrading page table protection not changing it to > > > > > point > > > > > +* to a new page. > > > > > +* > > > > > +* See Documentation/vm/mmu_notifier.txt > > > > > +*/ > > > > > if (pmdp) { > > > > > #ifdef CONFIG_FS_DAX_PMD > > > > > pmd_t pmd; > > > > > @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pmd = pmd_wrprotect(pmd); > > > > > pmd = pmd_mkclean(pmd); > > > > > set_pmd_at(vma->vm_mm, address, pmdp, pmd); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Could the secondary TLB still see the mapping as dirty and propagate > > > > the dirty bit back? > > > > > > I am assuming hardware does sane thing of setting the dirty bit only > > > when walking the CPU page table when device does a write fault ie > > > once the device get a write TLB entry the dirty is set by the IOMMU > > > when walking the page table before returning the lookup result to the > > > device and that it won't be set again latter (ie propagated back > > > latter). > > > > > > > The other possibility is that the hardware things the page is writable > > and already > > marked dirty. It allows writes and does not set the dirty bit? > > I thought about this some more and the patch can not regress anything > that is not broken today. 
So if we assume that device can propagate > dirty bit because it can cache the write protection than all current > code is broken for two reasons: > > First one is current code clear pte entry, build a new pte value with > write protection and update pte entry with new pte value. So any PASID/ > ATS platform that allows device to cache the write bit and set dirty > bit anytime after that can race during that window and you would loose > the dirty bit of the device. That is not that bad as you are gonna > propagate the dirty bit to the struct page. But they stay consistent with the notifiers, so from the OS perspective it notifies of any PTE changes as they happen. When the ATS platform sees invalidation, it invalidates it's PTE's as well. I was speaking of the case where the ATS platform could assume it has write access and has not seen any invalidation, the OS could return back to user space or the caller with write bit clear, but the ATS platform could still do a write since it's not seen the invalidation. > > Second one is if the dirty bit is propagated back to the new write > protected pte. Quick look at code it seems that when we zap pte or > or mkclean we don't check that the pte has write permission but only > care about the dirty bit. So it should not have any bad consequence. > > After this patch only the second window is bigger and thus more likely > to happen. But nothing sinister should happen from that. > > > > > > > I should probably have spell that out and maybe some of the ATS/PASID > > > implementer did not do that. 
> > > > > > > > > > > > unlock_pmd: > > > > > spin_unlock(ptl); > > > > > #endif > > > > > @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pte = pte_wrprotect(pte); > > > > > pte = pte_mkclean(pte); > > > > > set_pte_at(vma->vm_mm, address, ptep, pte); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Ditto > > > > > > > > > unlock_pte: > > > > > pte_unmap_unlock(ptep, ptl); > > > > > } > > > > > diff --git a/include/linux/mmu_notifier.h > > > > > b/include/linux/mmu_notifier.h > > > > > index 6866e8126982..49c925c96b8a 100644 > > > > > --- a/include/linux/mmu_notifier.h > > > > > +++ b/include/linux/mmu_notifier.h > > > > > @@ -155,7 +155,8 @@ struct mmu_notifier_ops { > > > > > * shared page-tables, it not necessary to implement the > > > > > * invalidate_range_start()/end() notifiers, as > > > > > * invalidate_range() alread catches the points in time when an > > > > > -* external TLB range needs to be flushed. > > > > > +* external TLB range needs to be flushed. For more in depth > > > > > +* discussion on this see Documentation/vm/mmu_notifier.txt > > > > > * > > > > >
Re: [PATCH 1/2] mm/mmu_notifier: avoid double notification when it is useless v2
On Thu, 2017-10-19 at 12:58 -0400, Jerome Glisse wrote: > On Thu, Oct 19, 2017 at 09:53:11PM +1100, Balbir Singh wrote: > > On Thu, Oct 19, 2017 at 2:28 PM, Jerome Glisse wrote: > > > On Thu, Oct 19, 2017 at 02:04:26PM +1100, Balbir Singh wrote: > > > > On Mon, 16 Oct 2017 23:10:02 -0400 > > > > jgli...@redhat.com wrote: > > > > > > > > > From: Jérôme Glisse > > > > > > > > > > + /* > > > > > +* No need to call mmu_notifier_invalidate_range() as we > > > > > are > > > > > +* downgrading page table protection not changing it to > > > > > point > > > > > +* to a new page. > > > > > +* > > > > > +* See Documentation/vm/mmu_notifier.txt > > > > > +*/ > > > > > if (pmdp) { > > > > > #ifdef CONFIG_FS_DAX_PMD > > > > > pmd_t pmd; > > > > > @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pmd = pmd_wrprotect(pmd); > > > > > pmd = pmd_mkclean(pmd); > > > > > set_pmd_at(vma->vm_mm, address, pmdp, pmd); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Could the secondary TLB still see the mapping as dirty and propagate > > > > the dirty bit back? > > > > > > I am assuming hardware does sane thing of setting the dirty bit only > > > when walking the CPU page table when device does a write fault ie > > > once the device get a write TLB entry the dirty is set by the IOMMU > > > when walking the page table before returning the lookup result to the > > > device and that it won't be set again latter (ie propagated back > > > latter). > > > > > > > The other possibility is that the hardware things the page is writable > > and already > > marked dirty. It allows writes and does not set the dirty bit? > > I thought about this some more and the patch can not regress anything > that is not broken today. 
So if we assume that device can propagate > dirty bit because it can cache the write protection than all current > code is broken for two reasons: > > First one is current code clear pte entry, build a new pte value with > write protection and update pte entry with new pte value. So any PASID/ > ATS platform that allows device to cache the write bit and set dirty > bit anytime after that can race during that window and you would loose > the dirty bit of the device. That is not that bad as you are gonna > propagate the dirty bit to the struct page. But they stay consistent with the notifiers, so from the OS perspective it notifies of any PTE changes as they happen. When the ATS platform sees invalidation, it invalidates it's PTE's as well. I was speaking of the case where the ATS platform could assume it has write access and has not seen any invalidation, the OS could return back to user space or the caller with write bit clear, but the ATS platform could still do a write since it's not seen the invalidation. > > Second one is if the dirty bit is propagated back to the new write > protected pte. Quick look at code it seems that when we zap pte or > or mkclean we don't check that the pte has write permission but only > care about the dirty bit. So it should not have any bad consequence. > > After this patch only the second window is bigger and thus more likely > to happen. But nothing sinister should happen from that. > > > > > > > I should probably have spell that out and maybe some of the ATS/PASID > > > implementer did not do that. 
> > > > > > > > > > > > unlock_pmd: > > > > > spin_unlock(ptl); > > > > > #endif > > > > > @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pte = pte_wrprotect(pte); > > > > > pte = pte_mkclean(pte); > > > > > set_pte_at(vma->vm_mm, address, ptep, pte); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Ditto > > > > > > > > > unlock_pte: > > > > > pte_unmap_unlock(ptep, ptl); > > > > > } > > > > > diff --git a/include/linux/mmu_notifier.h > > > > > b/include/linux/mmu_notifier.h > > > > > index 6866e8126982..49c925c96b8a 100644 > > > > > --- a/include/linux/mmu_notifier.h > > > > > +++ b/include/linux/mmu_notifier.h > > > > > @@ -155,7 +155,8 @@ struct mmu_notifier_ops { > > > > > * shared page-tables, it not necessary to implement the > > > > > * invalidate_range_start()/end() notifiers, as > > > > > * invalidate_range() alread catches the points in time when an > > > > > -* external TLB range needs to be flushed. > > > > > +* external TLB range needs to be flushed. For more in depth > > > > > +* discussion on this see Documentation/vm/mmu_notifier.txt > > > > > * > > > > > * The invalidate_range() function is
Re: [PATCH 12/27] x86/msr: Restrict MSR access when the kernel is locked down
On Fri, Oct 20, 2017 at 09:48:16PM +0100, David Howells wrote: > Alan Cox wrote: > > > There are a load of standard tools that use this so I think you are going > > to need a whitelist. Can you at least log *which* MSR in the failing case > > so a whitelist can be built over time ? > [...snip] > > And do you know where wrmsr_safe_regs() might be found? I can see things > using it and exporting it, but no implementation, so I'm guessing it's > macroised somewhere. Looks the definition is in arch/x86/lib/msr-reg.S #ifdef CONFIG_X86_64 /* * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx pushq %r12 ... Regards Joey Lee
Re: [PATCH 12/27] x86/msr: Restrict MSR access when the kernel is locked down
On Fri, Oct 20, 2017 at 09:48:16PM +0100, David Howells wrote: > Alan Cox wrote: > > > There are a load of standard tools that use this so I think you are going > > to need a whitelist. Can you at least log *which* MSR in the failing case > > so a whitelist can be built over time ? > [...snip] > > And do you know where wrmsr_safe_regs() might be found? I can see things > using it and exporting it, but no implementation, so I'm guessing it's > macroised somewhere. Looks the definition is in arch/x86/lib/msr-reg.S #ifdef CONFIG_X86_64 /* * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx pushq %r12 ... Regards Joey Lee
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 8:20 PM, Matthew Wilcox wrote: > On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: >> Ok, I'd also like to kill DAX support in the brd driver. It's a source >> of complexity and maintenance burden for zero benefit. It's the only >> ->direct_access() implementation that sleeps and it's the only >> implementation where there is a non-linear relationship between >> sectors and pfns. Having a 1:1 sector to pfn relationship will help >> with the dma-extent-busy management since we don't need to keep >> calling into the driver to map pfns back to sectors once we know the >> pfn[0] sector[0] relationship. > > But these are important things that other block devices may / will want. > > For example, I think it's entirely sensible to support ->direct_access > for RAID-0. Dell are looking at various different options for having > one pmemX device per DIMM and using RAID to lash them together. > ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes > sense to me. > > Last time we tried to take sleeping out, there were grumblings from people > with network block devices who thought they'd want to bring pages in > across the network. I'm a bit less sympathetic to this because I don't > know anyone actively working on it, but the RAID-0 case is something I > think we should care about. True, good point. In fact we already support device-mapper striping with ->direct_access(). I'd still like to go ahead with the sleeping removal. When those folks come back and add network direct_access they can do the hard work of figuring out cases where we need to call direct_access in atomic contexts.
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 8:20 PM, Matthew Wilcox wrote: > On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: >> Ok, I'd also like to kill DAX support in the brd driver. It's a source >> of complexity and maintenance burden for zero benefit. It's the only >> ->direct_access() implementation that sleeps and it's the only >> implementation where there is a non-linear relationship between >> sectors and pfns. Having a 1:1 sector to pfn relationship will help >> with the dma-extent-busy management since we don't need to keep >> calling into the driver to map pfns back to sectors once we know the >> pfn[0] sector[0] relationship. > > But these are important things that other block devices may / will want. > > For example, I think it's entirely sensible to support ->direct_access > for RAID-0. Dell are looking at various different options for having > one pmemX device per DIMM and using RAID to lash them together. > ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes > sense to me. > > Last time we tried to take sleeping out, there were grumblings from people > with network block devices who thought they'd want to bring pages in > across the network. I'm a bit less sympathetic to this because I don't > know anyone actively working on it, but the RAID-0 case is something I > think we should care about. True, good point. In fact we already support device-mapper striping with ->direct_access(). I'd still like to go ahead with the sleeping removal. When those folks come back and add network direct_access they can do the hard work of figuring out cases where we need to call direct_access in atomic contexts.
[PATCH] net: can: dev: remove unused code in can_restart
Value assigned to variable err is overwritten at line 562: err = priv->do_set_mode(dev, CAN_MODE_START); before it can be used. Also, notice that this code has been there since 2014. Addresses-Coverity-ID: 1227031 Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. drivers/net/can/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 365a8cc..064d0f6 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -543,10 +543,9 @@ static void can_restart(struct net_device *dev) /* send restart message upstream */ skb = alloc_can_err_skb(dev, ); - if (skb == NULL) { - err = -ENOMEM; + if (!skb) goto restart; - } + cf->can_id |= CAN_ERR_RESTARTED; netif_rx(skb); -- 2.7.4
[PATCH] net: can: dev: remove unused code in can_restart
Value assigned to variable err is overwritten at line 562: err = priv->do_set_mode(dev, CAN_MODE_START); before it can be used. Also, notice that this code has been there since 2014. Addresses-Coverity-ID: 1227031 Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. drivers/net/can/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 365a8cc..064d0f6 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -543,10 +543,9 @@ static void can_restart(struct net_device *dev) /* send restart message upstream */ skb = alloc_can_err_skb(dev, ); - if (skb == NULL) { - err = -ENOMEM; + if (!skb) goto restart; - } + cf->can_id |= CAN_ERR_RESTARTED; netif_rx(skb); -- 2.7.4
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: > Ok, I'd also like to kill DAX support in the brd driver. It's a source > of complexity and maintenance burden for zero benefit. It's the only > ->direct_access() implementation that sleeps and it's the only > implementation where there is a non-linear relationship between > sectors and pfns. Having a 1:1 sector to pfn relationship will help > with the dma-extent-busy management since we don't need to keep > calling into the driver to map pfns back to sectors once we know the > pfn[0] sector[0] relationship. But these are important things that other block devices may / will want. For example, I think it's entirely sensible to support ->direct_access for RAID-0. Dell are looking at various different options for having one pmemX device per DIMM and using RAID to lash them together. ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes sense to me. Last time we tried to take sleeping out, there were grumblings from people with network block devices who thought they'd want to bring pages in across the network. I'm a bit less sympathetic to this because I don't know anyone actively working on it, but the RAID-0 case is something I think we should care about.
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: > Ok, I'd also like to kill DAX support in the brd driver. It's a source > of complexity and maintenance burden for zero benefit. It's the only > ->direct_access() implementation that sleeps and it's the only > implementation where there is a non-linear relationship between > sectors and pfns. Having a 1:1 sector to pfn relationship will help > with the dma-extent-busy management since we don't need to keep > calling into the driver to map pfns back to sectors once we know the > pfn[0] sector[0] relationship. But these are important things that other block devices may / will want. For example, I think it's entirely sensible to support ->direct_access for RAID-0. Dell are looking at various different options for having one pmemX device per DIMM and using RAID to lash them together. ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes sense to me. Last time we tried to take sleeping out, there were grumblings from people with network block devices who thought they'd want to bring pages in across the network. I'm a bit less sympathetic to this because I don't know anyone actively working on it, but the RAID-0 case is something I think we should care about.
Re: [PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
On Fri, Oct 20, 2017 at 04:53:50PM -0700, Randy Dunlap wrote: > From: Randy Dunlap > > Fix build when CONFIG_MODULES is not enabled. > Fixes multiple build errors. Thanks. I already sent a patch earlier. -Andi > > Signed-off-by: Randy Dunlap > Reported-by: kbuild test robot > Cc: Andi Kleen
Re: [PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
On Fri, Oct 20, 2017 at 04:53:50PM -0700, Randy Dunlap wrote: > From: Randy Dunlap > > Fix build when CONFIG_MODULES is not enabled. > Fixes multiple build errors. Thanks. I already sent a patch earlier. -Andi > > Signed-off-by: Randy Dunlap > Reported-by: kbuild test robot > Cc: Andi Kleen
Re: [PATCH 00/23] Hardened usercopy whitelisting
On Fri, Oct 20, 2017 at 4:25 PM, Paolo Bonzini wrote: > On 21/10/2017 00:40, Paolo Bonzini wrote: >> This breaks KVM completely on x86, due to two ioctls >> (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct >> kvm_vcpu_arch. >> >> There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is >> obsolete and not a big deal at all. >> >> I can post some patches, but probably not until the beginning of >> November due to travelling. Please do not send this too close to the >> beginning of the merge window. > > Sleeping is overrated, sending patches now... Oh awesome, thank you very much for tracking this down and building fixes! I'll insert these into the usercopy whitelisting series, and see if I can find any similar cases. Thanks! -Kees -- Kees Cook Pixel Security
Re: [PATCH 00/23] Hardened usercopy whitelisting
On Fri, Oct 20, 2017 at 4:25 PM, Paolo Bonzini wrote: > On 21/10/2017 00:40, Paolo Bonzini wrote: >> This breaks KVM completely on x86, due to two ioctls >> (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct >> kvm_vcpu_arch. >> >> There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is >> obsolete and not a big deal at all. >> >> I can post some patches, but probably not until the beginning of >> November due to travelling. Please do not send this too close to the >> beginning of the merge window. > > Sleeping is overrated, sending patches now... Oh awesome, thank you very much for tracking this down and building fixes! I'll insert these into the usercopy whitelisting series, and see if I can find any similar cases. Thanks! -Kees -- Kees Cook Pixel Security
[PATCH v7] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler BaicarCc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
[PATCH v7] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler Baicar Cc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
Re: [PATCH 2/3] watchdog: hpwdt: SMBIOS check
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Correct test on SMBIOS table 219 Misc Features bits for UEFI supported. Please explain in more detail. There is no table 219 in the SMBIOS specification. There is table 9, BIOS Characteristics Extension Byte 2, which specifies bit 3 as "UEFI Specification is supported.", but nothing that really maps to the other byte, and no "misc features". Maybe this is HP specific, but then we'll need to have much better explanation. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index ef54b03..4c011e8 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -707,7 +707,7 @@ static void dmi_find_icru(const struct dmi_header *dm, void *dummy) smbios_proliant_ptr = (struct smbios_proliant_info *) dm; if (smbios_proliant_ptr->misc_features & 0x01) is_icru = 1; - if (smbios_proliant_ptr->misc_features & 0x408) + if (smbios_proliant_ptr->misc_features & 0x1400) is_uefi = 1; } } Presumably patch 2/3 and 3/3 are bug fixes and should come first so they can be applied to stable releases. Thanks, Guenter
Re: [PATCH 2/3] watchdog: hpwdt: SMBIOS check
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Correct test on SMBIOS table 219 Misc Features bits for UEFI supported. Please explain in more detail. There is no table 219 in the SMBIOS specification. There is table 9, BIOS Characteristics Extension Byte 2, which specifies bit 3 as "UEFI Specification is supported.", but nothing that really maps to the other byte, and no "misc features". Maybe this is HP specific, but then we'll need to have much better explanation. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index ef54b03..4c011e8 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -707,7 +707,7 @@ static void dmi_find_icru(const struct dmi_header *dm, void *dummy) smbios_proliant_ptr = (struct smbios_proliant_info *) dm; if (smbios_proliant_ptr->misc_features & 0x01) is_icru = 1; - if (smbios_proliant_ptr->misc_features & 0x408) + if (smbios_proliant_ptr->misc_features & 0x1400) is_uefi = 1; } } Presumably patch 2/3 and 3/3 are bug fixs and should come first so they can be applied to stable releases. Thanks, Guenter
Re: [PATCH 1/3] watchdog: hpwdt: add ioctl WDIOC_GETPRETIMEOUT
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Add support for WDIOC_GETPRETIMEOUT ioctl so that user applications can determine when the NMI should arrive. Signed-off-by: Jerry Hoemann--- drivers/watchdog/hpwdt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 67fbe35..ef54b03 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -50,6 +50,7 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static char expect_release; static unsigned long hpwdt_is_open; +static const int pretimeout = 9; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_timer_reg; @@ -622,6 +623,12 @@ static long hpwdt_ioctl(struct file *file, unsigned int cmd, } break; + case WDIOC_GETPRETIMEOUT: + ret = copy_to_user(argp, , sizeof(pretimeout)); + if (ret) + ret = -EFAULT; + break; + case WDIOC_SETTIMEOUT: ret = get_user(new_margin, p); if (ret) Can you please convert the driver to use the watchdog subsystem instead ? If there are still improvements needed afterwards, they can still be implemented, but we really should not make improvements which are already supported by the watchdog core. Thanks, Guenter
Re: [PATCH 1/3] watchdog: hpwdt: add ioctl WDIOC_GETPRETIMEOUT
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Add support for WDIOC_GETPRETIMEOUT ioctl so that user applications can determine when the NMI should arrive. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 67fbe35..ef54b03 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -50,6 +50,7 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static char expect_release; static unsigned long hpwdt_is_open; +static const int pretimeout = 9; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_timer_reg; @@ -622,6 +623,12 @@ static long hpwdt_ioctl(struct file *file, unsigned int cmd, } break; + case WDIOC_GETPRETIMEOUT: + ret = copy_to_user(argp, , sizeof(pretimeout)); + if (ret) + ret = -EFAULT; + break; + case WDIOC_SETTIMEOUT: ret = get_user(new_margin, p); if (ret) Can you please convert the driver to use the watchdog subsystem instead ? If there are still improvements needed afterwards, they can still be implemented, but we really should not make improvements which are already supported by the watchdog core. Thanks, Guenter
Re: [RESEND PATCH 1/3] completion: Add support for initializing completion with lockdep_map
On Sat, Oct 21, 2017 at 4:58 AM, Bart Van Asschewrote: > Sorry but I'm not sure that's the best possible answer. In my opinion > avoiding that completion objects have dependencies on other lock objects, > e.g. by avoiding to wait on a completion object while holding a mutex, is a > far superior strategy over adding cross-release checking to completion > objects. The former strategy namely makes it unnecessary to add > cross-release checking to completion objects because that strategy ensures > that these completion objects cannot get involved in a deadlock. The latter It's true if we force it. But do you think it's possible? > strategy can lead to false positive deadlock reports by the lockdep code, What do you think false positives come from? It comes from assigning lock classes falsely where we should more care, rather than lockdep code itself. The same is applicable to cross-release. > something none of us wants. > > A possible alternative strategy could be to enable cross-release checking > only for those completion objects for which waiting occurs inside a critical > section. Of course, it already did. Cross-release doesn't consider any waiting outside of critical sections at all, and it should do. > As explained in another e-mail thread, unlike the lock inversion checking > performed by the <= v4.13 lockdep code, cross-release checking is a heuristic > that does not have a sound theoretical basis. The lock validator is an It's not heuristic but based on the same theoretical basis as <=4.13 lockdep. I mean, the key basis is: 1) What causes deadlock 2) What is a dependency 3) Build a dependency when identified > important tool for kernel developers. It is important that it produces as few > false positives as possible. Since the cross-release checks are enabled > automatically when enabling lockdep, I think it is normal that I, as a kernel > developer, care that the cross-release checks produce as few false positives > as possible. No doubt. 
That's why I proposed these patches.
Re: [RESEND PATCH 1/3] completion: Add support for initializing completion with lockdep_map
On Sat, Oct 21, 2017 at 4:58 AM, Bart Van Assche wrote: > Sorry but I'm not sure that's the best possible answer. In my opinion > avoiding that completion objects have dependencies on other lock objects, > e.g. by avoiding to wait on a completion object while holding a mutex, is a > far superior strategy over adding cross-release checking to completion > objects. The former strategy namely makes it unnecessary to add > cross-release checking to completion objects because that strategy ensures > that these completion objects cannot get involved in a deadlock. The latter It's true if we force it. But do you think it's possible? > strategy can lead to false positive deadlock reports by the lockdep code, What do you think false positives come from? It comes from assigning lock classes falsely where we should more care, rather than lockdep code itself. The same is applicable to cross-release. > something none of us wants. > > A possible alternative strategy could be to enable cross-release checking > only for those completion objects for which waiting occurs inside a critical > section. Of course, it already did. Cross-release doesn't consider any waiting outside of critical sections at all, and it should do. > As explained in another e-mail thread, unlike the lock inversion checking > performed by the <= v4.13 lockdep code, cross-release checking is a heuristic > that does not have a sound theoretical basis. The lock validator is an It's not heuristic but based on the same theoretical basis as <=4.13 lockdep. I mean, the key basis is: 1) What causes deadlock 2) What is a dependency 3) Build a dependency when identified > important tool for kernel developers. It is important that it produces as few > false positives as possible. Since the cross-release checks are enabled > automatically when enabling lockdep, I think it is normal that I, as a kernel > developer, care that the cross-release checks produce as few false positives > as possible. No doubt. 
That's why I proposed these patches.
Re: [PATCH] [v2] tomoyo: fix timestamping for y2038
On Thu, 19 Oct 2017, Arnd Bergmann wrote: > Tomoyo uses an open-coded version of time_to_tm() to create a timestamp > from the current time as read by get_seconds(). This will overflow and > give wrong results on 32-bit systems in 2038. > > To correct this, this changes the code to use ktime_get_real_seconds() > and the generic time64_to_tm() function that are both y2038-safe. > Using the library function avoids adding an expensive 64-bit division > in this code and can benefit from any optimizations we do in common > code. > > Acked-by: Tetsuo Handa > Signed-off-by: Arnd Bergmann > --- > v2: fix year calculation Applied to: git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-general -- James Morris
Re: [PATCH] [v2] tomoyo: fix timestamping for y2038
On Thu, 19 Oct 2017, Arnd Bergmann wrote: > Tomoyo uses an open-coded version of time_to_tm() to create a timestamp > from the current time as read by get_seconds(). This will overflow and > give wrong results on 32-bit systems in 2038. > > To correct this, this changes the code to use ktime_get_real_seconds() > and the generic time64_to_tm() function that are both y2038-safe. > Using the library function avoids adding an expensive 64-bit division > in this code and can benefit from any optimizations we do in common > code. > > Acked-by: Tetsuo Handa > Signed-off-by: Arnd Bergmann > --- > v2: fix year calculation Applied to: git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-general -- James Morris
Re: [PATCH 26/27] efi: Add an EFI_SECURE_BOOT flag to indicate secure boot mode
On Thu, 19 Oct 2017, David Howells wrote: > + if (efi_enabled(EFI_BOOT)) { > + switch (mode) { > + case efi_secureboot_mode_disabled: > + pr_info("Secure boot disabled\n"); > + break; > + case efi_secureboot_mode_enabled: > + set_bit(EFI_SECURE_BOOT, ); > + pr_info("Secure boot enabled\n"); > + break; > + default: > + pr_info("Secure boot could not be determined\n"); Perhaps make this pr_warning and include the unknown mode value? -- James Morris
Re: [PATCH 26/27] efi: Add an EFI_SECURE_BOOT flag to indicate secure boot mode
On Thu, 19 Oct 2017, David Howells wrote: > + if (efi_enabled(EFI_BOOT)) { > + switch (mode) { > + case efi_secureboot_mode_disabled: > + pr_info("Secure boot disabled\n"); > + break; > + case efi_secureboot_mode_enabled: > + set_bit(EFI_SECURE_BOOT, ); > + pr_info("Secure boot enabled\n"); > + break; > + default: > + pr_info("Secure boot could not be determined\n"); Perhaps make this pr_warning and include the unknown mode value? -- James Morris
Re: [PATCH 25/27] Lock down /proc/kcore
On Thu, 19 Oct 2017, David Howells wrote: > Disallow access to /proc/kcore when the kernel is locked down to prevent > access to cryptographic data. > > Signed-off-by: David Howells Reviewed-by: James Morris I have to wonder, though, after everything is locked down, how easy will it be for new things to slip in which need to be included in the lockdown, but are not. -- James Morris
Re: [PATCH 25/27] Lock down /proc/kcore
On Thu, 19 Oct 2017, David Howells wrote: > Disallow access to /proc/kcore when the kernel is locked down to prevent > access to cryptographic data. > > Signed-off-by: David Howells Reviewed-by: James Morris I have to wonder, though, after everything is locked down, how easy will it be for new things to slip in which need to be included in the lockdown, but are not. -- James Morris
[PATCH v6] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler BaicarCc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
[PATCH v6] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler Baicar Cc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
Re: [PATCH 1/4] mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS
On Fri, Oct 20, 2017 at 12:59 PM, Kirill A. Shutemovwrote: > With boot-time switching between paging mode we will have variable > MAX_PHYSMEM_BITS. > > Let's use the maximum variable possible for CONFIG_X86_5LEVEL=y > configuration to define zsmalloc data structures. > > The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover such case. > It also suits well to handle PAE special case. > I see that with your upcoming patch, MAX_PHYSMEM_BITS is turned into a variable for x86_64 case as: (pgtable_l5_enabled ? 52 : 46). Even with this change, I don't see a need for this new MAX_POSSIBLE_PHYSMEM_BITS constant. > -#ifndef MAX_PHYSMEM_BITS > -#ifdef CONFIG_HIGHMEM64G > -#define MAX_PHYSMEM_BITS 36 > -#else /* !CONFIG_HIGHMEM64G */ > +#ifndef MAX_POSSIBLE_PHYSMEM_BITS > +#ifdef MAX_PHYSMEM_BITS > +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS > +#else This ifdef on HIGHMEM64G is redundant, as x86 already defines MAX_PHYSMEM_BITS = 36 in PAE case. So, all that zsmalloc should do is: #ifndef MAX_PHYSMEM_BITS #define MAX_PHYSMEM_BITS BITS_PER_LONG #endif .. and then no change is needed for rest of derived constants like _PFN_BITS. It is upto every arch to define correct MAX_PHYSMEM_BITS (variable or constant) based on whatever configurations the arch supports. If not defined, zsmalloc picks a reasonable default of BITS_PER_LONG. I will send a patch which makes the change to remove ifdef on CONFIG_HIGHMEM64G. Thanks, Nitin
Re: [PATCH 1/4] mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS
On Fri, Oct 20, 2017 at 12:59 PM, Kirill A. Shutemov wrote: > With boot-time switching between paging mode we will have variable > MAX_PHYSMEM_BITS. > > Let's use the maximum variable possible for CONFIG_X86_5LEVEL=y > configuration to define zsmalloc data structures. > > The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover such case. > It also suits well to handle PAE special case. > I see that with your upcoming patch, MAX_PHYSMEM_BITS is turned into a variable for x86_64 case as: (pgtable_l5_enabled ? 52 : 46). Even with this change, I don't see a need for this new MAX_POSSIBLE_PHYSMEM_BITS constant. > -#ifndef MAX_PHYSMEM_BITS > -#ifdef CONFIG_HIGHMEM64G > -#define MAX_PHYSMEM_BITS 36 > -#else /* !CONFIG_HIGHMEM64G */ > +#ifndef MAX_POSSIBLE_PHYSMEM_BITS > +#ifdef MAX_PHYSMEM_BITS > +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS > +#else This ifdef on HIGHMEM64G is redundant, as x86 already defines MAX_PHYSMEM_BITS = 36 in PAE case. So, all that zsmalloc should do is: #ifndef MAX_PHYSMEM_BITS #define MAX_PHYSMEM_BITS BITS_PER_LONG #endif .. and then no change is needed for rest of derived constants like _PFN_BITS. It is upto every arch to define correct MAX_PHYSMEM_BITS (variable or constant) based on whatever configurations the arch supports. If not defined, zsmalloc picks a reasonable default of BITS_PER_LONG. I will send a patch which makes the change to remove ifdef on CONFIG_HIGHMEM64G. Thanks, Nitin
Re: [PATCH net] hv_sock: add locking in the open/close/release code paths
From: Dexuan Cui Date: Thu, 19 Oct 2017 03:33:14 + > > Without the patch, when hvs_open_connection() hasn't completely established > a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't > inserted the sock into the connected queue), vsock_stream_connect() may see > the sk_state change and return the connection to the userspace, and next > when the userspace closes the connection quickly, hvs_release() may not see > the connection in the connected queue; finally hvs_open_connection() > inserts the connection into the queue, but we won't be able to purge the > connection for ever. > > Signed-off-by: Dexuan Cui Applied.
Re: [PATCH net] hv_sock: add locking in the open/close/release code paths
From: Dexuan Cui Date: Thu, 19 Oct 2017 03:33:14 + > > Without the patch, when hvs_open_connection() hasn't completely established > a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't > inserted the sock into the connected queue), vsock_stream_connect() may see > the sk_state change and return the connection to the userspace, and next > when the userspace closes the connection quickly, hvs_release() may not see > the connection in the connected queue; finally hvs_open_connection() > inserts the connection into the queue, but we won't be able to purge the > connection for ever. > > Signed-off-by: Dexuan Cui Applied.
[PATCH v1] mm: broken deferred calculation
In reset_deferred_meminit we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin--- include/linux/mmzone.h | 3 ++- mm/page_alloc.c| 27 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a6f361931d52..d45ba78c7e42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,7 +699,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97687b38da05..16419cdbbb7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -289,28 +289,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. 
+ */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -337,7 +346,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- 2.14.2
[PATCH v1] mm: broken deferred calculation
In reset_deferred_meminit we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c| 27 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a6f361931d52..d45ba78c7e42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,7 +699,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97687b38da05..16419cdbbb7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -289,28 +289,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. 
+ */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -337,7 +346,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- 2.14.2
Re: [PATCH 0/12] PM / sleep: Driver flags for system suspend/resume
On Friday, October 20, 2017 10:46:07 PM CEST Bjorn Helgaas wrote: > On Mon, Oct 16, 2017 at 03:12:35AM +0200, Rafael J. Wysocki wrote: > > Hi All, > > > > Well, this took more time than expected, as I tried to cover everything I > > had > > in mind regarding PM flags for drivers. > > For the parts that touch PCI, > > Acked-by: Bjorn Helgaas Thank you! > I doubt there'll be conflicts with changes in my tree, but let me know if > you trip over any so I can watch for them when merging. Well, if there are any conflicts, we'll see them in linux-next I guess. :-) Thanks, Rafael
Re: [PATCH 0/12] PM / sleep: Driver flags for system suspend/resume
On Friday, October 20, 2017 10:46:07 PM CEST Bjorn Helgaas wrote: > On Mon, Oct 16, 2017 at 03:12:35AM +0200, Rafael J. Wysocki wrote: > > Hi All, > > > > Well, this took more time than expected, as I tried to cover everything I > > had > > in mind regarding PM flags for drivers. > > For the parts that touch PCI, > > Acked-by: Bjorn Helgaas Thank you! > I doubt there'll be conflicts with changes in my tree, but let me know if > you trip over any so I can watch for them when merging. Well, if there are any conflicts, we'll see them in linux-next I guess. :-) Thanks, Rafael
Re: swap_info_get: Bad swap offset entry 0200f8a7
On Fri, 20 Oct 2017, huang ying wrote: > > 4 May < Linux version 4.11.2-1-ARCH > > 4 Jun < Linux version 4.11.3-1-ARCH > > 7 Jul < Linux version 4.11.9-1-ARCH > > 4 Aug < Linux version 4.12.8-2-ARCH > > 24 Sep < Linux version 4.12.13-1-ARCH > > 158 Oct < Linux version 4.13.5-1-ARCH > > So you have never seen this before 4.11 like 4.10? Unfortunately the kernel logs for that machine only go back until May 2017 and I cannot tell if that hasn't happened before. I've seen these messages appear since then but didn't bother much. But as it now happens more frequently, I thought I should mention this to the list. > Which operations will trigger this error messages? I'm not able to reproduce it at will, but I suspect that memory pressure triggers these messages. The machine in question is an Lenovo Ideapad S10 notebook running 24x7 and is equipped with 1 GB of RAM. Two Java processes are basically using up all the memory, so usually it tooks like this: $ free -m total used free shared buff/cache available Mem: 99486667 1 6020 Swap:760437 322 $ zramctl NAME ALGORITHM DISKSIZE DATA COMPR TOTAL STREAMS MOUNTPOINT /dev/zram0 lz4 248.7M 247M 92.3M 97.4M 2 [SWAP] I just assumed the message is triggered when the system is really low on memory and maybe zram is too slow to provide the memory requested. 
But that's just my layman's assumption :-) For example, today's message was emitted during the night: Oct 20 01:26:18 len kernel: [638973.207849] \ swap_info_get: Bad swap offset entry 0200f8a7 And here are the sysstat numbers for that time frame: $ sar -r -s 00:00 -e 02:00 Linux 4.13.5-1-ARCH (len) 10/20/2017 _i686_ (2 CPU) 12:00:01 AM kbmemfree kbmemused %memused kbbuffers kbcached kbcommit %commit kbactive kbinact kbdirty 12:10:06 AM 70076948404 93.12 4 19004 1556176 86.58376608379408 220 12:20:02 AM 80488937992 92.10 4180404 1563952 87.01380184327736 5568 12:30:03 AM 83296935184 91.82 4137260 1569776 87.3432951233 280 12:40:03 AM 65188953292 93.60 4 21156 1571048 87.41386644389820 1144 12:50:03 AM 67512950968 93.37 4 33452 1570628 87.38378936381580 1304 01:00:07 AM 65520952960 93.57 4 24996 1573180 87.53385396386152 904 01:10:03 AM 66956951524 93.43 4 35520 1572696 87.50379548379364 172 01:20:02 AM 67440951040 93.38 4 88736 1569864 87.34381764370472 7080 01:30:03 AM 70048948432 93.12 4 29212 1572504 87.49383516381900 1832 01:40:04 AM 71532946948 92.98 4 29220 1570096 87.35380120380284 1000 01:50:03 AM 65828952652 93.54 4 34408 1570604 87.38381040381028 1604 Average:70353948127 93.09 4 57579 1569139 87.30376661371613 1919 == If that is unreadable, here it is again: https://paste.debian.net/991927/ > Is it possible for you to check > whether the error exists for normal swap device (not ZRAM)? I have "normal" (but encrpted) swap configured but with a lower priority: cat /proc/swaps FilenameTypeSizeUsedPriority /dev/dm-0 partition 524284 194348 0 /dev/zram0 partition 254616 253536 32767 I shall disable the zram device and disable encryption too and will report back if the message appears again. > 32bit or 64bit kernel do you use? I'm using an i686 kernel for this Atom N270 processor (with HT enabled). Thanks for your response, Christian. -- BOFH excuse #403: Sysadmin didn't hear pager go off due to loud music from bar-room speakers.
Re: swap_info_get: Bad swap offset entry 0200f8a7
On Fri, 20 Oct 2017, huang ying wrote: > > 4 May < Linux version 4.11.2-1-ARCH > > 4 Jun < Linux version 4.11.3-1-ARCH > > 7 Jul < Linux version 4.11.9-1-ARCH > > 4 Aug < Linux version 4.12.8-2-ARCH > > 24 Sep < Linux version 4.12.13-1-ARCH > > 158 Oct < Linux version 4.13.5-1-ARCH > > So you have never seen this before 4.11 like 4.10? Unfortunately the kernel logs for that machine only go back until May 2017 and I cannot tell if that hasn't happened before. I've seen these messages appear since then but didn't bother much. But as it now happens more frequently, I thought I should mention this to the list. > Which operations will trigger this error messages? I'm not able to reproduce it at will, but I suspect that memory pressure triggers these messages. The machine in question is a Lenovo Ideapad S10 notebook running 24x7 and is equipped with 1 GB of RAM. Two Java processes are basically using up all the memory, so usually it looks like this: $ free -m total used free shared buff/cache available Mem: 99486667 1 6020 Swap:760437 322 $ zramctl NAME ALGORITHM DISKSIZE DATA COMPR TOTAL STREAMS MOUNTPOINT /dev/zram0 lz4 248.7M 247M 92.3M 97.4M 2 [SWAP] I just assumed the message is triggered when the system is really low on memory and maybe zram is too slow to provide the memory requested. 
But that's just my layman's assumption :-) For example, today's message was emitted during the night: Oct 20 01:26:18 len kernel: [638973.207849] \ swap_info_get: Bad swap offset entry 0200f8a7 And here are the sysstat numbers for that time frame: $ sar -r -s 00:00 -e 02:00 Linux 4.13.5-1-ARCH (len) 10/20/2017 _i686_ (2 CPU) 12:00:01 AM kbmemfree kbmemused %memused kbbuffers kbcached kbcommit %commit kbactive kbinact kbdirty 12:10:06 AM 70076948404 93.12 4 19004 1556176 86.58376608379408 220 12:20:02 AM 80488937992 92.10 4180404 1563952 87.01380184327736 5568 12:30:03 AM 83296935184 91.82 4137260 1569776 87.3432951233 280 12:40:03 AM 65188953292 93.60 4 21156 1571048 87.41386644389820 1144 12:50:03 AM 67512950968 93.37 4 33452 1570628 87.38378936381580 1304 01:00:07 AM 65520952960 93.57 4 24996 1573180 87.53385396386152 904 01:10:03 AM 66956951524 93.43 4 35520 1572696 87.50379548379364 172 01:20:02 AM 67440951040 93.38 4 88736 1569864 87.34381764370472 7080 01:30:03 AM 70048948432 93.12 4 29212 1572504 87.49383516381900 1832 01:40:04 AM 71532946948 92.98 4 29220 1570096 87.35380120380284 1000 01:50:03 AM 65828952652 93.54 4 34408 1570604 87.38381040381028 1604 Average:70353948127 93.09 4 57579 1569139 87.30376661371613 1919 == If that is unreadable, here it is again: https://paste.debian.net/991927/ > Is it possible for you to check > whether the error exists for normal swap device (not ZRAM)? I have "normal" (but encrpted) swap configured but with a lower priority: cat /proc/swaps FilenameTypeSizeUsedPriority /dev/dm-0 partition 524284 194348 0 /dev/zram0 partition 254616 253536 32767 I shall disable the zram device and disable encryption too and will report back if the message appears again. > 32bit or 64bit kernel do you use? I'm using an i686 kernel for this Atom N270 processor (with HT enabled). Thanks for your response, Christian. -- BOFH excuse #403: Sysadmin didn't hear pager go off due to loud music from bar-room speakers.
[PATCH v2 2/4] android: binder: Fix null ptr dereference in debug msg
Don't access next->data in kernel debug message when the next buffer is null. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e12072b1d507..c2819a3d58a6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -560,7 +560,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", alloc->pid, buffer->data, - prev->data, next->data); + prev->data, next ? next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE, NULL); -- 2.11.0 (Apple Git-81)
[PATCH v2 2/4] android: binder: Fix null ptr dereference in debug msg
Don't access next->data in kernel debug message when the next buffer is null. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e12072b1d507..c2819a3d58a6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -560,7 +560,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", alloc->pid, buffer->data, - prev->data, next->data); + prev->data, next ? next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE, NULL); -- 2.11.0 (Apple Git-81)
[PATCH v2 3/4] android: binder: Remove unused vma argument
The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. Acked-by: Arve HjønnevågSigned-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index c2819a3d58a6..36bcea1fc47c 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + if (need_mm && mmget_not_zero(alloc->vma_vm_mm)) mm = alloc->vma_vm_mm; if (mm) { @@ -437,7 +437,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -478,7 +478,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), -end_page_addr, NULL); +end_page_addr); return ERR_PTR(-ENOMEM); } @@ -562,8 +562,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next ? 
next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), -buffer_start_page(buffer) + PAGE_SIZE, -NULL); +buffer_start_page(buffer) + PAGE_SIZE); } list_del(>entry); kfree(buffer); @@ -600,8 +599,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(>rb_node, >allocated_buffers); buffer->free = 1; -- 2.11.0 (Apple Git-81)
[PATCH v2 3/4] android: binder: Remove unused vma argument
The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index c2819a3d58a6..36bcea1fc47c 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + if (need_mm && mmget_not_zero(alloc->vma_vm_mm)) mm = alloc->vma_vm_mm; if (mm) { @@ -437,7 +437,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -478,7 +478,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), -end_page_addr, NULL); +end_page_addr); return ERR_PTR(-ENOMEM); } @@ -562,8 +562,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next ? 
next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), -buffer_start_page(buffer) + PAGE_SIZE, -NULL); +buffer_start_page(buffer) + PAGE_SIZE); } list_del(>entry); kfree(buffer); @@ -600,8 +599,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(>rb_node, >allocated_buffers); buffer->free = 1; -- 2.11.0 (Apple Git-81)
[PATCH v2 1/4] android: binder: Don't get mm from task
Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. Acked-by: Arve HjønnevågSigned-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 22 +- drivers/android/binder_alloc.h | 1 - 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 064f5e31ec55..e12072b1d507 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,12 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) - mm = get_task_mm(alloc->tsk); + if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + mm = alloc->vma_vm_mm; if (mm) { down_write(>mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -720,6 +715,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + mmgrab(alloc->vma_vm_mm); return 0; @@ -795,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(>mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -889,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -926,9 +923,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + if 
(!mmget_not_zero(alloc->vma_vm_mm)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(>mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -963,7 +960,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(>mutex); err_get_alloc_mutex_failed: @@ -1002,7 +999,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(>mutex); INIT_LIST_HEAD(>buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; -- 2.11.0 (Apple Git-81)
[PATCH v2 4/4] android: binder: Change binder_shrinker to static
binder_shrinker struct is not used anywhere outside of binder_alloc.c and should be static. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 36bcea1fc47c..6f6f745605af 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -982,7 +982,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return ret; } -struct shrinker binder_shrinker = { +static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, -- 2.11.0 (Apple Git-81)
[PATCH v2 1/4] android: binder: Don't get mm from task
Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 22 +- drivers/android/binder_alloc.h | 1 - 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 064f5e31ec55..e12072b1d507 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,12 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) - mm = get_task_mm(alloc->tsk); + if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + mm = alloc->vma_vm_mm; if (mm) { down_write(>mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -720,6 +715,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + mmgrab(alloc->vma_vm_mm); return 0; @@ -795,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(>mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -889,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -926,9 +923,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + if 
(!mmget_not_zero(alloc->vma_vm_mm)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(>mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -963,7 +960,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(>mutex); err_get_alloc_mutex_failed: @@ -1002,7 +999,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(>mutex); INIT_LIST_HEAD(>buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; -- 2.11.0 (Apple Git-81)
[PATCH v2 4/4] android: binder: Change binder_shrinker to static
binder_shrinker struct is not used anywhere outside of binder_alloc.c and should be static. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 36bcea1fc47c..6f6f745605af 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -982,7 +982,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return ret; } -struct shrinker binder_shrinker = { +static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, -- 2.11.0 (Apple Git-81)
[PATCH v2 0/4] android: binder: fixes for memory allocator change
This patchset resolves a potential deadlock between lru lock, task lock and dentry lock reported by lockdep. It also fixes the null pointer dereference in kernel debug message in binder_alloc.c. Unused vma argument is removed, and binder shrinker is made static. android: binder: Don't get mm from task android: binder: Fix null ptr dereference in debug msg android: binder: Remove unused vma argument android: binder: Change binder_shrinker to static v2: combines the version 1 of two separate patches and reorder the commits so that the first two can be applied to 4.14. drivers/android/binder_alloc.c | 40 +--- drivers/android/binder_alloc.h | 1 - 2 files changed, 17 insertions(+), 24 deletions(-)
[PATCH v2 0/4] android: binder: fixes for memory allocator change
This patchset resolves a potential deadlock between lru lock, task lock and dentry lock reported by lockdep. It also fixes the null pointer dereference in kernel debug message in binder_alloc.c. Unused vma argument is removed, and binder shrinker is made static. android: binder: Don't get mm from task android: binder: Fix null ptr dereference in debug msg android: binder: Remove unused vma argument android: binder: Change binder_shrinker to static v2: combines the version 1 of two separate patches and reorder the commits so that the first two can be applied to 4.14. drivers/android/binder_alloc.c | 40 +--- drivers/android/binder_alloc.h | 1 - 2 files changed, 17 insertions(+), 24 deletions(-)
Re: [PATCH net 5/5] net/ncsi: Fix length of GVI response packet
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:09 +1100 > From: Gavin Shan > > The length of GVI (GetVersionInfo) response packet should be 40 instead > of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats. > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 002 > > With this applied, no error reported on GVI response packets: > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 200 > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 2/5] net/ncsi: Stop monitor if channel times out or is inactive
From: Samuel Mendoza-JonasDate: Thu, 19 Oct 2017 13:43:06 +1100 > ncsi_channel_monitor() misses stopping the channel monitor in several > places that it should, causing a WARN_ON_ONCE() to trigger when the > monitor is re-started later, eg: > > [ 459.04] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 > ncsi_start_channel_monitor+0x7c/0x90 > [ 459.04] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted > 4.10.17-gaca2fdd #140 > [ 459.04] Hardware name: ASpeed SoC > [ 459.04] Workqueue: events ncsi_dev_work > [ 459.04] [<80010094>] (unwind_backtrace) from [<8000d950>] > (show_stack+0x20/0x24) > [ 459.04] [<8000d950>] (show_stack) from [<801dbf70>] > (dump_stack+0x20/0x28) > [ 459.04] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108) > [ 459.04] [<80018d7c>] (__warn) from [<80018e70>] > (warn_slowpath_null+0x30/0x38) > [ 459.04] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] > (ncsi_start_channel_monitor+0x7c/0x90) > [ 459.04] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] > (ncsi_configure_channel+0xdc/0x5fc) > [ 459.04] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] > (ncsi_dev_work+0xac/0x474) > [ 459.04] [<803f8160>] (ncsi_dev_work) from [<8002d244>] > (process_one_work+0x1e0/0x450) > [ 459.04] [<8002d244>] (process_one_work) from [<8002d510>] > (worker_thread+0x5c/0x570) > [ 459.04] [<8002d510>] (worker_thread) from [<80033614>] > (kthread+0x124/0x164) > [ 459.04] [<80033614>] (kthread) from [<8000a5e8>] > (ret_from_fork+0x14/0x2c) > > This also updates the monitor instead of just returning if > ncsi_xmit_cmd() fails to send the get-link-status command so that the > monitor properly times out. > > Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 3/5] net/ncsi: Disable HWA mode when no channels are found
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:07 +1100 > From: Gavin Shan > > When there are no NCSI channels probed, HWA (Hardware Arbitration) > mode is enabled. It's not correct because HWA depends on the fact: > NCSI channels exist and all of them support HWA mode. This disables > HWA when no channels are probed. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 5/5] net/ncsi: Fix length of GVI response packet
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:09 +1100 > From: Gavin Shan > > The length of GVI (GetVersionInfo) response packet should be 40 instead > of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats. > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 002 > > With this applied, no error reported on GVI response packets: > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 200 > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 2/5] net/ncsi: Stop monitor if channel times out or is inactive
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:06 +1100 > ncsi_channel_monitor() misses stopping the channel monitor in several > places that it should, causing a WARN_ON_ONCE() to trigger when the > monitor is re-started later, eg: > > [ 459.04] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 > ncsi_start_channel_monitor+0x7c/0x90 > [ 459.04] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted > 4.10.17-gaca2fdd #140 > [ 459.04] Hardware name: ASpeed SoC > [ 459.04] Workqueue: events ncsi_dev_work > [ 459.04] [<80010094>] (unwind_backtrace) from [<8000d950>] > (show_stack+0x20/0x24) > [ 459.04] [<8000d950>] (show_stack) from [<801dbf70>] > (dump_stack+0x20/0x28) > [ 459.04] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108) > [ 459.04] [<80018d7c>] (__warn) from [<80018e70>] > (warn_slowpath_null+0x30/0x38) > [ 459.04] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] > (ncsi_start_channel_monitor+0x7c/0x90) > [ 459.04] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] > (ncsi_configure_channel+0xdc/0x5fc) > [ 459.04] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] > (ncsi_dev_work+0xac/0x474) > [ 459.04] [<803f8160>] (ncsi_dev_work) from [<8002d244>] > (process_one_work+0x1e0/0x450) > [ 459.04] [<8002d244>] (process_one_work) from [<8002d510>] > (worker_thread+0x5c/0x570) > [ 459.04] [<8002d510>] (worker_thread) from [<80033614>] > (kthread+0x124/0x164) > [ 459.04] [<80033614>] (kthread) from [<8000a5e8>] > (ret_from_fork+0x14/0x2c) > > This also updates the monitor instead of just returning if > ncsi_xmit_cmd() fails to send the get-link-status command so that the > monitor properly times out. > > Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 3/5] net/ncsi: Disable HWA mode when no channels are found
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:07 +1100 > From: Gavin Shan > > When there are no NCSI channels probed, HWA (Hardware Arbitration) > mode is enabled. It's not correct because HWA depends on the fact: > NCSI channels exist and all of them support HWA mode. This disables > HWA when no channels are probed. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 4/5] net/ncsi: Enforce failover on link monitor timeout
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:08 +1100 > From: Gavin Shan > > The NCSI channel has been configured to provide service if its link > monitor timer is enabled, regardless of its state (inactive or active). > So the timeout event on the link monitor indicates the out-of-service > on that channel, for which a failover is needed. > > This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor > timeout, regardless the channel's original state (inactive or active). > Also, the link is put into "down" state to give the failing channel > lowest priority when selecting for the active channel. The state of > failing channel should be set to active in order for deinitialization > and failover to be done. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 4/5] net/ncsi: Enforce failover on link monitor timeout
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:08 +1100 > From: Gavin Shan > > The NCSI channel has been configured to provide service if its link > monitor timer is enabled, regardless of its state (inactive or active). > So the timeout event on the link monitor indicates the out-of-service > on that channel, for which a failover is needed. > > This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor > timeout, regardless the channel's original state (inactive or active). > Also, the link is put into "down" state to give the failing channel > lowest priority when selecting for the active channel. The state of > failing channel should be set to active in order for deinitialization > and failover to be done. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 1/5] net/ncsi: Fix AEN HNCDSC packet length
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:05 +1100 > Correct the value of the HNCDSC AEN packet. > Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 1/5] net/ncsi: Fix AEN HNCDSC packet length
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:05 +1100 > Correct the value of the HNCDSC AEN packet. > Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH 7/7] gpio: brcmstb: implement suspend/resume/shutdown
On Thu, Oct 19, 2017 at 11:39 AM, Doug Berger wrote: >>> +static int brcmstb_gpio_resume(struct device *dev) >>> +{ >>> +struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); >>> +struct brcmstb_gpio_bank *bank; >>> +u32 wake_mask = 0; >> >> This isn't really being used as a mask, contrary to appearances. It's >> just tracking whether any active IRQs were seen. Please change to use a >> bool instead and adjust the name accordingly. >> > > I see your point, but I believe it is cleaner to use this to consolidate > the bit masks returned by each __brcmstb_gpio_get_active_irqs() call. > This allows a single test rather than a test per bank. What about something like this? bool need_wakeup_event = false; list_for_each_entry(bank, >bank_list, node) { need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); brcmstb_gpio_bank_restore(priv, bank); } if (priv->parent_wake_irq && need_wakeup_event) pm_wakeup_event(dev, 0);
Re: [PATCH 7/7] gpio: brcmstb: implement suspend/resume/shutdown
On Thu, Oct 19, 2017 at 11:39 AM, Doug Berger wrote: >>> +static int brcmstb_gpio_resume(struct device *dev) >>> +{ >>> +struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); >>> +struct brcmstb_gpio_bank *bank; >>> +u32 wake_mask = 0; >> >> This isn't really being used as a mask, contrary to appearances. It's >> just tracking whether any active IRQs were seen. Please change to use a >> bool instead and adjust the name accordingly. >> > > I see your point, but I believe it is cleaner to use this to consolidate > the bit masks returned by each __brcmstb_gpio_get_active_irqs() call. > This allows a single test rather than a test per bank. What about something like this? bool need_wakeup_event = false; list_for_each_entry(bank, >bank_list, node) { need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); brcmstb_gpio_bank_restore(priv, bank); } if (priv->parent_wake_irq && need_wakeup_event) pm_wakeup_event(dev, 0);
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 01:24 PM, Jon Masters wrote: > 1). The first thing people do when they get an Arm server is to cat > /proc/cpuinfo. They then come complaining that it's not like x86. They > can't get the output they're looking for and this results in bug filing, > and countless hours on phone calls discussing over and over again. > Worse, there are some parts of the stack that really need this. Within 6 hours of sending this, I get a ping about this week's "Works On Arm" newsletter and...people reporting bugs with not getting CPU capabilities in /proc/cpuinfo. This madness is going to end. Soon. Jon. -- Computer Architect | Sent from my Fedora powered laptop
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 01:24 PM, Jon Masters wrote: > 1). The first thing people do when they get an Arm server is to cat > /proc/cpuinfo. They then come complaining that it's not like x86. They > can't get the output they're looking for and this results in bug filing, > and countless hours on phone calls discussing over and over again. > Worse, there are some parts of the stack that really need this. Within 6 hours of sending this, I get a ping about this week's "Works On Arm" newsletter and...people reporting bugs with not getting CPU capabilities in /proc/cpuinfo. This madness is going to end. Soon. Jon. -- Computer Architect | Sent from my Fedora powered laptop
[PATCH] net: core: rtnetlink: use BUG_ON instead of if condition followed by BUG
Use BUG_ON instead of if condition followed by BUG in do_setlink. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. net/core/rtnetlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04680a5..df8dba9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2274,8 +2274,7 @@ static int do_setlink(const struct sk_buff *skb, rcu_read_lock(); - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) - BUG(); + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af); if (err < 0) { -- 2.7.4
[PATCH] net: core: rtnetlink: use BUG_ON instead of if condition followed by BUG
Use BUG_ON instead of if condition followed by BUG in do_setlink. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. net/core/rtnetlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04680a5..df8dba9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2274,8 +2274,7 @@ static int do_setlink(const struct sk_buff *skb, rcu_read_lock(); - if (!(af_ops = rtnl_af_lookup(nla_type(af - BUG(); + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af; err = af_ops->set_link_af(dev, af); if (err < 0) { -- 2.7.4
Re: [PATCH] MIPS: kernel: proc: Remove spurious white space in cpuinfo
On 10/20/2017 01:47 PM, Maciej W. Rozycki wrote: On Fri, 20 Oct 2017, Aleksandar Markovic wrote: Remove unnecessary space from FPU info segment of /proc/cpuinfo. NAK. As I recall back in Nov 2001 I placed the extra space there to visually separate the CPU part from the FPU part, e.g.: cpu model : R3000A V3.0 FPU V4.0 cpu model : SiByte SB1 V0.2 FPU V0.2 etc. And the motivation behind it still stands. Please remember that /proc/cpuinfo is there for live humans to parse and grouping all these pieces together would make it harder. Which means your change adds no value I'm afraid. I think it is even riskier than that. This is part of the kernel-userspace ABI, many programs parse this file, any gratuitous changes risk breaking something. I don't really have an opinion about the various *printf functions being used, but think the resultant change in what is visible to userspace should not be done. NB regrettably back in those days much of the patch traffic happened off any mailing list, however I have quickly tracked down my archival copy of the original submission of the change introducing this piece of code and I'll be happy to share it with anyone upon request. Maciej
Re: [PATCH] MIPS: kernel: proc: Remove spurious white space in cpuinfo
On 10/20/2017 01:47 PM, Maciej W. Rozycki wrote: On Fri, 20 Oct 2017, Aleksandar Markovic wrote: Remove unnecessary space from FPU info segment of /proc/cpuinfo. NAK. As I recall back in Nov 2001 I placed the extra space there to visually separate the CPU part from the FPU part, e.g.: cpu model : R3000A V3.0 FPU V4.0 cpu model : SiByte SB1 V0.2 FPU V0.2 etc. And the motivation behind it still stands. Please remember that /proc/cpuinfo is there for live humans to parse and grouping all these pieces together would make it harder. Which means your change adds no value I'm afraid. I think it is even riskier than that. This is part of the kernel-userspace ABI, many programs parse this file, any gratuitous changes risk breaking something. I don't really have an opinion about the various *printf functions being used, but think the resultant change in what is visible to userspace should not be done. NB regrettably back in those days much of the patch traffic happened off any mailing list, however I have quickly tracked down my archival copy of the original submission of the change introducing this piece of code and I'll be happy to share it with anyone upon request. Maciej
Re: [PATCH] PCI/AER: update AER status string print to match other AER logs
On Tue, Oct 17, 2017 at 09:42:02AM -0600, Tyler Baicar wrote: > Currently the AER driver uses cper_print_bits() to print the AER status > string. This causes the status string to not include the proper PCI device > name prefix that the other AER prints include. Also, it has a different > print level than all the other AER prints. > > Update the AER driver to print the AER status string with the proper string > prefix and proper print level. > > Previous log example: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > Receiver Error, Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID > > New log: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > e1000e 0003:01:00.1: Receiver Error > e1000e 0003:01:00.1: Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > pcieport 0003:00:00.0: Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID I definitely think it's MUCH better to use dev_err() as you do. I don't like the cper_print_bits() strategy of inserting line breaks to fit in 80 columns. That leads to atomicity issues, e.g., other printk output getting inserted in the middle of a single AER log, and suggests an ordering ("Receiver Error" occurred before "Bad TLP") that isn't real. It'd be ideal if everything fit on one line per event, but that might not be practical. I'm not necessarily attached to the actual strings. These messages are for sophisticated users and maybe could be abbreviated as in lspci output. It might actually be kind of neat if the output here matched up with the output of "lspci -vv" (lspci prints all the bits; here you probably want only the set bits). Or maybe not. 
But even what you have here is a huge improvement. I *hate* unattached things in dmesg like we currently get. There's no reliable way to connect that "Receiver Error, Bad TLP" with the device. > Signed-off-by: Tyler Baicar> --- > drivers/pci/pcie/aer/aerdrv_errprint.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c > b/drivers/pci/pcie/aer/aerdrv_errprint.c > index 54c4b69..b718daa 100644 > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c > @@ -206,6 +206,19 @@ void aer_print_port_info(struct pci_dev *dev, struct > aer_err_info *info) > } > > #ifdef CONFIG_ACPI_APEI_PCIEAER > +void dev_print_bits(struct pci_dev *dev, unsigned int bits, > + const char * const strs[], unsigned int strs_size) > +{ > + unsigned int i; > + > + for (i = 0; i < strs_size; i++) { > + if (!(bits & (1U << i))) > + continue; > + if (strs[i]) > + dev_err(>dev, "%s\n", strs[i]); > + } > +} > + > int cper_severity_to_aer(int cper_severity) > { > switch (cper_severity) { > @@ -243,7 +256,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, > agent = AER_GET_AGENT(aer_severity, status); > > dev_err(>dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, > mask); > - cper_print_bits("", status, status_strs, status_strs_size); > + dev_print_bits(dev, status, status_strs, status_strs_size); > dev_err(>dev, "aer_layer=%s, aer_agent=%s\n", > aer_error_layer[layer], aer_agent_string[agent]); > > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, > a Linux Foundation Collaborative Project. >
Re: [PATCH] PCI/AER: update AER status string print to match other AER logs
On Tue, Oct 17, 2017 at 09:42:02AM -0600, Tyler Baicar wrote: > Currently the AER driver uses cper_print_bits() to print the AER status > string. This causes the status string to not include the proper PCI device > name prefix that the other AER prints include. Also, it has a different > print level than all the other AER prints. > > Update the AER driver to print the AER status string with the proper string > prefix and proper print level. > > Previous log example: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > Receiver Error, Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID > > New log: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > e1000e 0003:01:00.1: Receiver Error > e1000e 0003:01:00.1: Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > pcieport 0003:00:00.0: Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID I definitely think it's MUCH better to use dev_err() as you do. I don't like the cper_print_bits() strategy of inserting line breaks to fit in 80 columns. That leads to atomicity issues, e.g., other printk output getting inserted in the middle of a single AER log, and suggests an ordering ("Receiver Error" occurred before "Bad TLP") that isn't real. It'd be ideal if everything fit on one line per event, but that might not be practical. I'm not necessarily attached to the actual strings. These messages are for sophisticated users and maybe could be abbreviated as in lspci output. It might actually be kind of neat if the output here matched up with the output of "lspci -vv" (lspci prints all the bits; here you probably want only the set bits). Or maybe not. 
But even what you have here is a huge improvement. I *hate* unattached things in dmesg like we currently get. There's no reliable way to connect that "Receiver Error, Bad TLP" with the device. > Signed-off-by: Tyler Baicar > --- > drivers/pci/pcie/aer/aerdrv_errprint.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c > b/drivers/pci/pcie/aer/aerdrv_errprint.c > index 54c4b69..b718daa 100644 > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c > @@ -206,6 +206,19 @@ void aer_print_port_info(struct pci_dev *dev, struct > aer_err_info *info) > } > > #ifdef CONFIG_ACPI_APEI_PCIEAER > +void dev_print_bits(struct pci_dev *dev, unsigned int bits, > + const char * const strs[], unsigned int strs_size) > +{ > + unsigned int i; > + > + for (i = 0; i < strs_size; i++) { > + if (!(bits & (1U << i))) > + continue; > + if (strs[i]) > + dev_err(>dev, "%s\n", strs[i]); > + } > +} > + > int cper_severity_to_aer(int cper_severity) > { > switch (cper_severity) { > @@ -243,7 +256,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, > agent = AER_GET_AGENT(aer_severity, status); > > dev_err(>dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, > mask); > - cper_print_bits("", status, status_strs, status_strs_size); > + dev_print_bits(dev, status, status_strs, status_strs_size); > dev_err(>dev, "aer_layer=%s, aer_agent=%s\n", > aer_error_layer[layer], aer_agent_string[agent]); > > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, > a Linux Foundation Collaborative Project. >
[PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
From: Randy Dunlap Fix build when CONFIG_MODULES is not enabled. Fixes multiple build errors. Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Andi Kleen --- lib/bug.c |2 ++ 1 file changed, 2 insertions(+) --- mmotm-2017-1019-1700.orig/lib/bug.c +++ mmotm-2017-1019-1700/lib/bug.c @@ -206,6 +206,7 @@ static void clear_once_table(struct bug_ void generic_bug_clear_once(void) { +#ifdef CONFIG_MODULES struct module *mod; rcu_read_lock_sched(); @@ -213,6 +214,7 @@ void generic_bug_clear_once(void) clear_once_table(mod->bug_table, mod->bug_table + mod->num_bugs); rcu_read_unlock_sched(); +#endif clear_once_table(__start___bug_table, __stop___bug_table); }
[PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
From: Randy Dunlap Fix build when CONFIG_MODULES is not enabled. Fixes multiple build errors. Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Andi Kleen --- lib/bug.c |2 ++ 1 file changed, 2 insertions(+) --- mmotm-2017-1019-1700.orig/lib/bug.c +++ mmotm-2017-1019-1700/lib/bug.c @@ -206,6 +206,7 @@ static void clear_once_table(struct bug_ void generic_bug_clear_once(void) { +#ifdef CONFIG_MODULES struct module *mod; rcu_read_lock_sched(); @@ -213,6 +214,7 @@ void generic_bug_clear_once(void) clear_once_table(mod->bug_table, mod->bug_table + mod->num_bugs); rcu_read_unlock_sched(); +#endif clear_once_table(__start___bug_table, __stop___bug_table); }
Re: [bug report] regression bisected to "block: Make most scsi_req_init() calls implicit"
On Fri, 2017-10-20 at 16:54 -0600, dann frazier wrote: > hey, > I'm seeing a regression when executing 'dmraid -r -c' in an arm64 > QEMU guest, which I've bisected to the following commit: > > ca18d6f7 "block: Make most scsi_req_init() calls implicit" > > I haven't yet had time to try and debug it yet, but wanted to get > the report out there before the weekend. Here's the crash: > > [ 138.519885] usercopy: kernel memory overwrite attempt detected to > (null) () (6 bytes) > [ 138.521562] kernel BUG at mm/usercopy.c:72! > [ 138.522294] Internal error: Oops - BUG: 0 [#1] SMP > [ 138.523105] Modules linked in: nls_utf8 isofs nls_iso8859_1 qemu_fw_cfg > ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi > scsi_transport_iscsi ip_tables > x_tables autofs4 btrfs zstd_decompress zstd_compress xxhash raid10 raid456 > async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq > libcrc32c raid1 raid0 > multipath linear aes_ce_blk aes_ce_cipher crc32_ce crct10dif_ce ghash_ce > sha2_ce sha256_arm64 sha1_ce virtio_net virtio_blk aes_neon_bs aes_neon_blk > crypto_simd cryptd > aes_arm64 > [ 138.531307] CPU: 62 PID: 2271 Comm: dmraid Not tainted 4.14.0-rc5+ #20 > [ 138.532512] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 > [ 138.533796] task: 8003cba2e900 task.stack: 110e8000 > [ 138.534887] PC is at __check_object_size+0x114/0x200 > [ 138.535800] LR is at __check_object_size+0x114/0x200 > [ 138.536711] pc : [] lr : [] pstate: > 00400145 > [ 138.538073] sp : 110ebb00 > [ 138.538682] x29: 110ebb00 x28: > [ 138.539658] x27: d88e1110 x26: 8003e8d3d800 > [ 138.540633] x25: 0802001d x24: 8003e1131920 > [ 138.541621] x23: 0006 x22: 0006 > [ 138.542596] x21: x20: 0006 > [ 138.543571] x19: x18: > [ 138.544548] x17: 83380ce0 x16: 082dd3b0 > [ 138.545525] x15: 093c8c08 x14: 6c756e2820202020 > [ 138.546511] x13: 202020202020206f x12: 7420646574636574 > [ 138.547489] x11: 093c9658 x10: 086ae800 > [ 138.548466] x9 : 7265766f2079726f x8 : 0017 > 
[ 138.549445] x7 : 6c756e3c2820296c x6 : 8003eeb51c28 > [ 138.550434] x5 : 8003eeb51c28 x4 : > [ 138.551411] x3 : 8003eeb59ec8 x2 : d4a0cd0f45236000 > [ 138.552388] x1 : x0 : 0059 > [ 138.553364] Process dmraid (pid: 2271, stack limit = 0x110e8000) > [ 138.554593] Call trace: > [ 138.555043] Exception stack(0x110eb9c0 to 0x110ebb00) > [ 138.556214] b9c0: 0059 d4a0cd0f45236000 > 8003eeb59ec8 > [ 138.557653] b9e0: 8003eeb51c28 8003eeb51c28 > 6c756e3c2820296c > [ 138.559082] ba00: 0017 7265766f2079726f 086ae800 > 093c9658 > [ 138.560510] ba20: 7420646574636574 202020202020206f 6c756e2820202020 > 093c8c08 > [ 138.561950] ba40: 082dd3b0 83380ce0 > > [ 138.563379] ba60: 0006 0006 > 0006 > [ 138.564805] ba80: 8003e1131920 0802001d 8003e8d3d800 > d88e1110 > [ 138.566238] baa0: 110ebb00 082c0e5c > 110ebb00 > [ 138.567666] bac0: 082c0e5c 00400145 08e25a80 > > [ 138.569090] bae0: 0001 0006 110ebb00 > 082c0e5c > [ 138.570523] [] __check_object_size+0x114/0x200 > [ 138.571628] [] sg_io+0x120/0x438 > [ 138.572507] [] scsi_cmd_ioctl+0x594/0x728 > [ 138.573531] [] scsi_cmd_blk_ioctl+0x50/0x60 > [ 138.574594] [] virtblk_ioctl+0x60/0x80 [virtio_blk] > [ 138.575769] [] blkdev_ioctl+0x5e4/0xb50 > [ 138.576756] [] block_ioctl+0x50/0x68 > [ 138.577698] [] do_vfs_ioctl+0xc4/0x940 > [ 138.578671] [] SyS_ioctl+0x8c/0xa8 > [ 138.579581] Exception stack(0x110ebec0 to 0x110ec000) > [ 138.580752] bec0: 0005 2285 d88e10b8 > 0006 > [ 138.582199] bee0: 0004 83416648 > 0050 > [ 138.583623] bf00: 001d 0003 0012 > 0011 > [ 138.585050] bf20: 83409000 00ff 8309dc70 > 0531 > [ 138.586490] bf40: 8344a360 83380ce0 00dc > 83478948 > [ 138.587918] bf60: 0004 17ee7f90 0005 > 17ede920 > [ 138.589346] bf80: 17ee7f60 0003 83416648 > 17ee7f60 > [ 138.590785] bfa0: d88e1218 d88e1090 834166dc > d88e1090 > [ 138.592215] bfc0: 83380cec 8000 0005 > 001d > [ 138.593649] bfe0:
Re: [bug report] regression bisected to "block: Make most scsi_req_init() calls implicit"
On Fri, 2017-10-20 at 16:54 -0600, dann frazier wrote: > hey, > I'm seeing a regression when executing 'dmraid -r -c' in an arm64 > QEMU guest, which I've bisected to the following commit: > > ca18d6f7 "block: Make most scsi_req_init() calls implicit" > > I haven't yet had time to try and debug it yet, but wanted to get > the report out there before the weekend. Here's the crash: > > [ 138.519885] usercopy: kernel memory overwrite attempt detected to > (null) () (6 bytes) > [ 138.521562] kernel BUG at mm/usercopy.c:72! > [ 138.522294] Internal error: Oops - BUG: 0 [#1] SMP > [ 138.523105] Modules linked in: nls_utf8 isofs nls_iso8859_1 qemu_fw_cfg > ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi > scsi_transport_iscsi ip_tables > x_tables autofs4 btrfs zstd_decompress zstd_compress xxhash raid10 raid456 > async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq > libcrc32c raid1 raid0 > multipath linear aes_ce_blk aes_ce_cipher crc32_ce crct10dif_ce ghash_ce > sha2_ce sha256_arm64 sha1_ce virtio_net virtio_blk aes_neon_bs aes_neon_blk > crypto_simd cryptd > aes_arm64 > [ 138.531307] CPU: 62 PID: 2271 Comm: dmraid Not tainted 4.14.0-rc5+ #20 > [ 138.532512] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 > [ 138.533796] task: 8003cba2e900 task.stack: 110e8000 > [ 138.534887] PC is at __check_object_size+0x114/0x200 > [ 138.535800] LR is at __check_object_size+0x114/0x200 > [ 138.536711] pc : [] lr : [] pstate: > 00400145 > [ 138.538073] sp : 110ebb00 > [ 138.538682] x29: 110ebb00 x28: > [ 138.539658] x27: d88e1110 x26: 8003e8d3d800 > [ 138.540633] x25: 0802001d x24: 8003e1131920 > [ 138.541621] x23: 0006 x22: 0006 > [ 138.542596] x21: x20: 0006 > [ 138.543571] x19: x18: > [ 138.544548] x17: 83380ce0 x16: 082dd3b0 > [ 138.545525] x15: 093c8c08 x14: 6c756e2820202020 > [ 138.546511] x13: 202020202020206f x12: 7420646574636574 > [ 138.547489] x11: 093c9658 x10: 086ae800 > [ 138.548466] x9 : 7265766f2079726f x8 : 0017 > 
[ 138.549445] x7 : 6c756e3c2820296c x6 : 8003eeb51c28 > [ 138.550434] x5 : 8003eeb51c28 x4 : > [ 138.551411] x3 : 8003eeb59ec8 x2 : d4a0cd0f45236000 > [ 138.552388] x1 : x0 : 0059 > [ 138.553364] Process dmraid (pid: 2271, stack limit = 0x110e8000) > [ 138.554593] Call trace: > [ 138.555043] Exception stack(0x110eb9c0 to 0x110ebb00) > [ 138.556214] b9c0: 0059 d4a0cd0f45236000 > 8003eeb59ec8 > [ 138.557653] b9e0: 8003eeb51c28 8003eeb51c28 > 6c756e3c2820296c > [ 138.559082] ba00: 0017 7265766f2079726f 086ae800 > 093c9658 > [ 138.560510] ba20: 7420646574636574 202020202020206f 6c756e2820202020 > 093c8c08 > [ 138.561950] ba40: 082dd3b0 83380ce0 > > [ 138.563379] ba60: 0006 0006 > 0006 > [ 138.564805] ba80: 8003e1131920 0802001d 8003e8d3d800 > d88e1110 > [ 138.566238] baa0: 110ebb00 082c0e5c > 110ebb00 > [ 138.567666] bac0: 082c0e5c 00400145 08e25a80 > > [ 138.569090] bae0: 0001 0006 110ebb00 > 082c0e5c > [ 138.570523] [] __check_object_size+0x114/0x200 > [ 138.571628] [] sg_io+0x120/0x438 > [ 138.572507] [] scsi_cmd_ioctl+0x594/0x728 > [ 138.573531] [] scsi_cmd_blk_ioctl+0x50/0x60 > [ 138.574594] [] virtblk_ioctl+0x60/0x80 [virtio_blk] > [ 138.575769] [] blkdev_ioctl+0x5e4/0xb50 > [ 138.576756] [] block_ioctl+0x50/0x68 > [ 138.577698] [] do_vfs_ioctl+0xc4/0x940 > [ 138.578671] [] SyS_ioctl+0x8c/0xa8 > [ 138.579581] Exception stack(0x110ebec0 to 0x110ec000) > [ 138.580752] bec0: 0005 2285 d88e10b8 > 0006 > [ 138.582199] bee0: 0004 83416648 > 0050 > [ 138.583623] bf00: 001d 0003 0012 > 0011 > [ 138.585050] bf20: 83409000 00ff 8309dc70 > 0531 > [ 138.586490] bf40: 8344a360 83380ce0 00dc > 83478948 > [ 138.587918] bf60: 0004 17ee7f90 0005 > 17ede920 > [ 138.589346] bf80: 17ee7f60 0003 83416648 > 17ee7f60 > [ 138.590785] bfa0: d88e1218 d88e1090 834166dc > d88e1090 > [ 138.592215] bfc0: 83380cec 8000 0005 > 001d > [ 138.593649] bfe0:
Re: [PATCH 09/27] uswsusp: Disable when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > uswsusp allows a user process to dump and then restore kernel state, which > makes it possible to modify the running kernel. Disable this if the kernel > is locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 09/27] uswsusp: Disable when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > uswsusp allows a user process to dump and then restore kernel state, which > makes it possible to modify the running kernel. Disable this if the kernel > is locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 07/27] kexec_file: Disable at runtime if securelevel has been set
On Thu, 19 Oct 2017, David Howells wrote: > From: Chun-Yi Lee > > When KEXEC_VERIFY_SIG is not enabled, kernel should not loads image > through kexec_file systemcall if securelevel has been set. > > This code was showed in Matthew's patch but not in git: > https://lkml.org/lkml/2015/3/13/778 > Reviewed-by: James Morris -- James Morris
Re: [PATCH 07/27] kexec_file: Disable at runtime if securelevel has been set
On Thu, 19 Oct 2017, David Howells wrote: > From: Chun-Yi Lee > > When KEXEC_VERIFY_SIG is not enabled, kernel should not loads image > through kexec_file systemcall if securelevel has been set. > > This code was showed in Matthew's patch but not in git: > https://lkml.org/lkml/2015/3/13/778 > Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 10:10 AM, Mark Rutland wrote: > Hi Al, > > On Mon, Oct 16, 2017 at 05:43:19PM -0600, Al Stone wrote: >> On 10/13/2017 08:27 AM, Mark Rutland wrote: >>> I certainly agree that exposing the information that we have is useful, >>> as I have stated several times. I'm not NAKing exposing this information >>> elsewhere. >>> >>> If you want a consistent cross-architecture interface for this >>> information, then you need to propose a new one. That was we can >>> actually solve the underlying issues, for all architectures, without >>> breaking ABI. >>> >>> I would be *very* interested in such an interface, and would be more >>> than happy to help. >> >> I'm playing with some patches that do very similar things in sysfs, vs >> proc. Is that better :)? > > Exposing data under sysfs is certainly better, yes. :) > >> Obviously, you'll have to see the patches to >> properly answer that, but what I'm playing with at present is placing >> this info in new entries in /sys/devices/cpu and/or /sys/devices/system, >> and generating some of the content based on what's already in header files >> (e.g., in cputype.h). > > My opposition to MIDR -> string mapping applies regardless of > location... Harumph. This is the one thing I get asked for most often, however (second most is frequency). It turns out humans are not nearly as good at indexed lookups as computers, hence the requests. Whatever the root of the opposition is, it needs to get fixed. My fear is that if it doesn't get fixed in the firmware or the kernel, it will get fixed in some far messier, less controllable way somewhere else (more likely several somewhere elses) and just exacerbate the problem. >> The idea of course is to keep this new info from touching any existing >> info so we don't break compatibility -- does that feel like a better >> direction, at least? > > ... but otherwise this sounds good to me! > > Thanks, > Mark. > -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 10:10 AM, Mark Rutland wrote: > Hi Al, > > On Mon, Oct 16, 2017 at 05:43:19PM -0600, Al Stone wrote: >> On 10/13/2017 08:27 AM, Mark Rutland wrote: >>> I certainly agree that exposing the information that we have is useful, >>> as I have stated several times. I'm not NAKing exposing this information >>> elsewhere. >>> >>> If you want a consistent cross-architecture interface for this >>> information, then you need to propose a new one. That was we can >>> actually solve the underlying issues, for all architectures, without >>> breaking ABI. >>> >>> I would be *very* interested in such an interface, and would be more >>> than happy to help. >> >> I'm playing with some patches that do very similar things in sysfs, vs >> proc. Is that better :)? > > Exposing data under sysfs is certainly better, yes. :) > >> Obviously, you'll have to see the patches to >> properly answer that, but what I'm playing with at present is placing >> this info in new entries in /sys/devices/cpu and/or /sys/devices/system, >> and generating some of the content based on what's already in header files >> (e.g., in cputype.h). > > My opposition to MIDR -> string mapping applies regardless of > location... Harumph. This is the one thing I get asked for most often, however (second most is frequency). It turns out humans are not nearly as good at indexed lookups as computers, hence the requests. Whatever the root of the opposition is, it needs to get fixed. My fear is that if it doesn't get fixed in the firmware or the kernel, it will get fixed in some far messier, less controllable way somewhere else (more likely several somewhere elses) and just exacerbate the problem. >> The idea of course is to keep this new info from touching any existing >> info so we don't break compatibility -- does that feel like a better >> direction, at least? > > ... but otherwise this sounds good to me! > > Thanks, > Mark. > -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH 00/23] Hardened usercopy whitelisting
On 21/10/2017 00:40, Paolo Bonzini wrote: > This breaks KVM completely on x86, due to two ioctls > (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct > kvm_vcpu_arch. > > There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is > obsolete and not a big deal at all. > > I can post some patches, but probably not until the beginning of > November due to travelling. Please do not send this too close to the > beginning of the merge window. Sleeping is overrated, sending patches now... Paolo
Re: [PATCH 00/23] Hardened usercopy whitelisting
On 21/10/2017 00:40, Paolo Bonzini wrote: > This breaks KVM completely on x86, due to two ioctls > (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct > kvm_vcpu_arch. > > There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is > obsolete and not a big deal at all. > > I can post some patches, but probably not until the beginning of > November due to travelling. Please do not send this too close to the > beginning of the merge window. Sleeping is overrated, sending patches now... Paolo
[PATCH 2/2] KVM: fix KVM_XEN_HVM_CONFIG ioctl
This ioctl is obsolete (it was used by Xenner as far as I know) but still let's not break it gratuitously... Its handler is copying directly into struct kvm. Go through a bounce buffer instead, with the added benefit that we can actually do something useful with the flags argument---the previous code was exiting with -EINVAL but still doing the copy. This technically is a userspace ABI breakage, but since no one should be using the ioctl, it's a good occasion to see if someone actually complains. Cc: kernel-harden...@lists.openwall.com Cc: Kees CookCc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 415529a78c37..c76d7afa30be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4218,13 +4218,14 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(>lock); break; case KVM_XEN_HVM_CONFIG: { + struct kvm_xen_hvm_config xhc; r = -EFAULT; - if (copy_from_user(>arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) + if (copy_from_user(, argp, sizeof(xhc))) goto out; r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) + if (xhc.flags) goto out; + memcpy(>arch.xen_hvm_config, , sizeof(xhc)); r = 0; break; } -- 2.14.2
[PATCH 2/2] KVM: fix KVM_XEN_HVM_CONFIG ioctl
This ioctl is obsolete (it was used by Xenner as far as I know) but still let's not break it gratuitously... Its handler is copying directly into struct kvm. Go through a bounce buffer instead, with the added benefit that we can actually do something useful with the flags argument---the previous code was exiting with -EINVAL but still doing the copy. This technically is a userspace ABI breakage, but since no one should be using the ioctl, it's a good occasion to see if someone actually complains. Cc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 415529a78c37..c76d7afa30be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4218,13 +4218,14 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(>lock); break; case KVM_XEN_HVM_CONFIG: { + struct kvm_xen_hvm_config xhc; r = -EFAULT; - if (copy_from_user(>arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) + if (copy_from_user(, argp, sizeof(xhc))) goto out; r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) + if (xhc.flags) goto out; + memcpy(>arch.xen_hvm_config, , sizeof(xhc)); r = 0; break; } -- 2.14.2
[PATCH 0/2] KVM: fixes for the kernel-hardening tree
Two KVM ioctls (KVM_GET/SET_CPUID2) directly access the cpuid_entries field of struct kvm_vcpu_arch. Therefore, the new usercopy hardening work in linux-next, which forbids copies from and to slab objects unless they are from kmalloc or explicitly whitelisted, breaks KVM completely. This series fixes it by adding the two new usercopy arguments to kvm_init (more precisely to a new function kvm_init_usercopy, while kvm_init passes zeroes as a default). There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is obsolete and not a big deal at all. I'm Ccing all submaintainers in case they have something similar going on in their kvm_arch and kvm_vcpu_arch structs. KVM has a pretty complex userspace API, so thorough with linux-next is highly recommended. Many thanks to Thomas Gleixner for reporting this to me. Paolo Paolo Bonzini (2): KVM: allow setting a usercopy region in struct kvm_vcpu KVM: fix KVM_XEN_HVM_CONFIG ioctl arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 17 ++--- include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 40 insertions(+), 14 deletions(-) -- 2.14.2
[PATCH 1/2] KVM: allow setting a usercopy region in struct kvm_vcpu
On x86, struct kvm_vcpu has a usercopy region corresponding to the CPUID entries. The area is read and written by the KVM_GET/SET_CPUID2 ioctls. Without this patch, KVM is completely broken on x86 with usercopy hardening enabled. Define kvm_init in terms of a more generic function that allows setting a usercopy region. Because x86 has separate kvm_init callers for Intel and AMD, another variant called kvm_init_x86 passes the region corresponding to the cpuid_entries array. Reported-by: Thomas GleixnerCc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Cc: Christoffer Dall Cc: Marc Zyngier Cc: Christian Borntraeger Cc: Cornelia Huck Cc: James Hogan Cc: Paul Mackerras Signed-off-by: Paolo Bonzini --- The patch is on top of linux-next. arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++ include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b8f937ca398..bb8243d413d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1420,6 +1420,9 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module); + static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff94552f85d0..457433c3a703 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5594,8 +5594,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { static int __init svm_init(void) { - return kvm_init(_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init_x86(_x86_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), 
THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c460b0b439d3..6e78530df6a8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12106,8 +12106,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init_x86(_x86_ops, sizeof(struct vcpu_vmx), +__alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5669af09b732..415529a78c37 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8181,6 +8181,16 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module) +{ + return kvm_init_usercopy(kvm_x86_ops, vcpu_size, vcpu_align, +offsetof(struct kvm_vcpu_arch, cpuid_entries), +sizeof_field(struct kvm_vcpu_arch, cpuid_entries), +module); +} +EXPORT_SYMBOL_GPL(kvm_init_x86); + int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..21e19658b086 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -561,8 +561,17 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); + +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_usercopy_start, unsigned vcpu_usercopy_size, + struct module *module); + +static inline int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + return kvm_init_usercopy(opaque, vcpu_size, vcpu_align, 0, 0, module); +} + void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 
261c782a688f..ac889b28bb54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3959,8 +3959,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } -int kvm_init(void *opaque, unsigned vcpu_size,
[PATCH 0/2] KVM: fixes for the kernel-hardening tree
Two KVM ioctls (KVM_GET/SET_CPUID2) directly access the cpuid_entries field of struct kvm_vcpu_arch. Therefore, the new usercopy hardening work in linux-next, which forbids copies from and to slab objects unless they are from kmalloc or explicitly whitelisted, breaks KVM completely. This series fixes it by adding the two new usercopy arguments to kvm_init (more precisely to a new function kvm_init_usercopy, while kvm_init passes zeroes as a default). There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is obsolete and not a big deal at all. I'm Ccing all submaintainers in case they have something similar going on in their kvm_arch and kvm_vcpu_arch structs. KVM has a pretty complex userspace API, so thorough testing with linux-next is highly recommended. Many thanks to Thomas Gleixner for reporting this to me. Paolo Paolo Bonzini (2): KVM: allow setting a usercopy region in struct kvm_vcpu KVM: fix KVM_XEN_HVM_CONFIG ioctl arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 17 ++--- include/linux/kvm_host.h | 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 40 insertions(+), 14 deletions(-) -- 2.14.2
[PATCH 1/2] KVM: allow setting a usercopy region in struct kvm_vcpu
On x86, struct kvm_vcpu has a usercopy region corresponding to the CPUID entries. The area is read and written by the KVM_GET/SET_CPUID2 ioctls. Without this patch, KVM is completely broken on x86 with usercopy hardening enabled. Define kvm_init in terms of a more generic function that allows setting a usercopy region. Because x86 has separate kvm_init callers for Intel and AMD, another variant called kvm_init_x86 passes the region corresponding to the cpuid_entries array. Reported-by: Thomas Gleixner Cc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Cc: Christoffer Dall Cc: Marc Zyngier Cc: Christian Borntraeger Cc: Cornelia Huck Cc: James Hogan Cc: Paul Mackerras Signed-off-by: Paolo Bonzini --- The patch is on top of linux-next. arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++ include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b8f937ca398..bb8243d413d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1420,6 +1420,9 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module); + static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff94552f85d0..457433c3a703 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5594,8 +5594,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { static int __init svm_init(void) { - return kvm_init(_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init_x86(_x86_ops, sizeof(struct vcpu_svm), + __alignof__(struct 
vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c460b0b439d3..6e78530df6a8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12106,8 +12106,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init_x86(_x86_ops, sizeof(struct vcpu_vmx), +__alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5669af09b732..415529a78c37 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8181,6 +8181,16 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module) +{ + return kvm_init_usercopy(kvm_x86_ops, vcpu_size, vcpu_align, +offsetof(struct kvm_vcpu_arch, cpuid_entries), +sizeof_field(struct kvm_vcpu_arch, cpuid_entries), +module); +} +EXPORT_SYMBOL_GPL(kvm_init_x86); + int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..21e19658b086 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -561,8 +561,17 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); + +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_usercopy_start, unsigned vcpu_usercopy_size, + struct module *module); + +static inline int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + return kvm_init_usercopy(opaque, vcpu_size, vcpu_align, 0, 0, module); +} + void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 
261c782a688f..ac889b28bb54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3959,8 +3959,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_arch_usercopy_start, unsigned
Re: [PATCH 05/27] kexec: Disable at runtime if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > kexec permits the loading and execution of arbitrary code in ring 0, which > is something that lock-down is meant to prevent. It makes sense to disable > kexec in this situation. > > This does not affect kexec_file_load() which can check for a signature on the > image to be booted. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells > Acked-by: Dave Young > cc: ke...@lists.infradead.org Reviewed-by: James Morris -- James Morris
Re: [PATCH 05/27] kexec: Disable at runtime if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > kexec permits the loading and execution of arbitrary code in ring 0, which > is something that lock-down is meant to prevent. It makes sense to disable > kexec in this situation. > > This does not affect kexec_file_load() which can check for a signature on the > image to be booted. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells > Acked-by: Dave Young > cc: ke...@lists.infradead.org Reviewed-by: James Morris -- James Morris
Re: [PATCH 04/27] Restrict /dev/mem and /dev/kmem when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > Allowing users to write to address space makes it possible for the kernel to > be subverted, avoiding module loading restrictions. Prevent this when the > kernel has been locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 04/27] Restrict /dev/mem and /dev/kmem when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > Allowing users to write to address space makes it possible for the kernel to > be subverted, avoiding module loading restrictions. Prevent this when the > kernel has been locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/5] xfs refcount conversions
On Fri, Oct 20, 2017 at 02:07:53PM +0300, Elena Reshetova wrote: > Note: our previous thread didn't finish in any conclusion, so > I am resending this now (rebased at latest linux-next) to revive > the discussion. refcount_t is slowly becoming a standard for > refcounters and we would really like to make all conversions > done where it is applicable. In a separate "replace atomics with refcounts" discussion, the ordering guarantees of refcounts was raised: https://lkml.org/lkml/2017/9/4/206 i.e. refcounts use weak ordering whilst atomics imply a smp_mb() operation was performed. Given these counters in XFS directly define the life cycle states rather than being just an object refcount, I'm pretty sure they rely on the implied smp_mb() that the atomic operations provide to work correctly. Let me put it this way: Documentation/memory-barriers.txt breaks my brain. We know atomics provide the barriers we need for the code to work correctly, but using anything else requires understanding the ordering of the new code and what Documentation/memory-barriers.txt says that means. IMO, that makes it way too hard to review sanely for code that: a) we already know works correctly b) has guards against reference count problems; and c) isn't performance critical, so doesn't need the complexity of tricky memory ordering and/or barriers. So, really, it comes down to the fact that we know refcount_t is not a straight drop in replacement for atomics, and that actually verifying the change is correct requires an in depth understanding of Documentation/memory-barriers.txt. IMO, that's way too much of a long term maintenance and knowledge burden to add to what is a simple set of reference counters... Cheers, Dave. -- Dave Chinner da...@fromorbit.com
Re: [PATCH 03/27] Enforce module signatures if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > If the kernel is locked down, require that all modules have valid > signatures that we can verify. > > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 03/27] Enforce module signatures if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > If the kernel is locked down, require that all modules have valid > signatures that we can verify. > > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/5] xfs refcount conversions
On Fri, Oct 20, 2017 at 02:07:53PM +0300, Elena Reshetova wrote: > Note: our previous thread didn't finish in any conclusion, so > I am resending this now (rebased at latest linux-next) to revive > the discussion. refcount_t is slowly becoming a standard for > refcounters and we would really like to make all conversions > done where it is applicable. In a separate "replace atomics with refcounts" discussion, the ordering guarantees of refcounts was raised: https://lkml.org/lkml/2017/9/4/206 i.e. refcounts use weak ordering whilst atomics imply a smp_mb() operation was performed. Given these counters in XFS directly define the life cycle states rather than being just an object refcount, I'm pretty sure they rely on the implied smp_mb() that the atomic operations provide to work correctly. Let me put it this way: Documentation/memory-barriers.txt breaks my brain. We know atomics provide the barriers we need for the code to work correctly, but using anything else requires understanding the ordering of the new code and what Documentation/memory-barriers.txt says that means. IMO, that makes it way too hard to review sanely for code that: a) we already know works correctly b) has guards against reference count problems; and c) isn't performance critical, so doesn't need the complexity of tricky memory ordering and/or barriers. So, really, it comes down to the fact that we know refcount_t is not a straight drop in replacement for atomics, and that actually verifying the change is correct requires an in depth understanding of Documentation/memory-barriers.txt. IMO, that's way too much of a long term maintenance and knowledge burden to add to what is a simple set of reference counters... Cheers, Dave. -- Dave Chinner da...@fromorbit.com
Re: [PATCH 01/27] Add the ability to lock down access to the running kernel image
On Thu, 19 Oct 2017, David Howells wrote: > Provide a single call to allow kernel code to determine whether the system > should be locked down, thereby disallowing various accesses that might > allow the running kernel image to be changed including the loading of > modules that aren't validly signed with a key we recognise, fiddling with > MSR registers and disallowing hibernation, > > Signed-off-by: David Howells Acked-by: James Morris -- James Morris
Re: [PATCH 01/27] Add the ability to lock down access to the running kernel image
On Thu, 19 Oct 2017, David Howells wrote: > Provide a single call to allow kernel code to determine whether the system > should be locked down, thereby disallowing various accesses that might > allow the running kernel image to be changed including the loading of > modules that aren't validly signed with a key we recognise, fiddling with > MSR registers and disallowing hibernation, > > Signed-off-by: David Howells Acked-by: James Morris -- James Morris