Re: [PATCH] lkdtm: Convert from jprobe to kprobe
On Fri, 20 Oct 2017 06:31:27 -0700 Kees Cookwrote: > The jprobe subsystem is being removed, so convert to using kprobe instead. > Looks good to me:) Acked-by: Masami Hiramatsu Thanks, > Cc: Masami Hiramatsu > Signed-off-by: Kees Cook > --- > drivers/misc/lkdtm_core.c | 154 > ++ > 1 file changed, 45 insertions(+), 109 deletions(-) > > diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c > index 981b3ef71e47..ed7f0c61c59a 100644 > --- a/drivers/misc/lkdtm_core.c > +++ b/drivers/misc/lkdtm_core.c > @@ -56,122 +56,54 @@ static ssize_t direct_entry(struct file *f, const char > __user *user_buf, > size_t count, loff_t *off); > > #ifdef CONFIG_KPROBES > -static void lkdtm_handler(void); > +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); > static ssize_t lkdtm_debugfs_entry(struct file *f, > const char __user *user_buf, > size_t count, loff_t *off); > - > - > -/* jprobe entry point handlers. */ > -static unsigned int jp_do_irq(unsigned int irq) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static irqreturn_t jp_handle_irq_event(unsigned int irq, > -struct irqaction *action) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static void jp_tasklet_action(struct softirq_action *a) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -static void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -struct scan_control; > - > -static unsigned long jp_shrink_inactive_list(unsigned long max_scan, > - struct zone *zone, > - struct scan_control *sc) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, > - const enum hrtimer_mode mode) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -# ifdef 
CONFIG_IDE > -static int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, > - struct block_device *bdev, unsigned int cmd, > - unsigned long arg) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > -# endif > +# define CRASHPOINT_KPROBE(_symbol) \ > + .kprobe = { \ > + .symbol_name = (_symbol), \ > + .pre_handler = lkdtm_kprobe_handler,\ > + }, > +# define CRASHPOINT_WRITE(_symbol) \ > + (_symbol) ? lkdtm_debugfs_entry : direct_entry > +#else > +# define CRASHPOINT_KPROBE(_symbol) > +# define CRASHPOINT_WRITE(_symbol) direct_entry > #endif > > /* Crash points */ > struct crashpoint { > const char *name; > const struct file_operations fops; > - struct jprobe jprobe; > + struct kprobe kprobe; > }; > > -#define CRASHPOINT(_name, _write, _symbol, _entry) \ > +#define CRASHPOINT(_name, _symbol) \ > { \ > .name = _name, \ > .fops = { \ > .read = lkdtm_debugfs_read, \ > .llseek = generic_file_llseek, \ > .open = lkdtm_debugfs_open, \ > - .write = _write, \ > - }, \ > - .jprobe = { \ > - .kp.symbol_name = _symbol, \ > - .entry = (kprobe_opcode_t *)_entry, \ > + .write = CRASHPOINT_WRITE(_symbol) \ > }, \ > + CRASHPOINT_KPROBE(_symbol) \ > } > > /* Define the possible places where we can trigger a crash point. */ > -struct crashpoint crashpoints[] = { > - CRASHPOINT("DIRECT",direct_entry, > -NULL,NULL), > +static struct crashpoint crashpoints[] = { > + CRASHPOINT("DIRECT", NULL), > #ifdef CONFIG_KPROBES > - CRASHPOINT("INT_HARDWARE_ENTRY",lkdtm_debugfs_entry, > -"do_IRQ",
Re: [PATCH] lkdtm: Convert from jprobe to kprobe
On Fri, 20 Oct 2017 06:31:27 -0700 Kees Cook wrote: > The jprobe subsystem is being removed, so convert to using kprobe instead. > Looks good to me:) Acked-by: Masami Hiramatsu Thanks, > Cc: Masami Hiramatsu > Signed-off-by: Kees Cook > --- > drivers/misc/lkdtm_core.c | 154 > ++ > 1 file changed, 45 insertions(+), 109 deletions(-) > > diff --git a/drivers/misc/lkdtm_core.c b/drivers/misc/lkdtm_core.c > index 981b3ef71e47..ed7f0c61c59a 100644 > --- a/drivers/misc/lkdtm_core.c > +++ b/drivers/misc/lkdtm_core.c > @@ -56,122 +56,54 @@ static ssize_t direct_entry(struct file *f, const char > __user *user_buf, > size_t count, loff_t *off); > > #ifdef CONFIG_KPROBES > -static void lkdtm_handler(void); > +static int lkdtm_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); > static ssize_t lkdtm_debugfs_entry(struct file *f, > const char __user *user_buf, > size_t count, loff_t *off); > - > - > -/* jprobe entry point handlers. */ > -static unsigned int jp_do_irq(unsigned int irq) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static irqreturn_t jp_handle_irq_event(unsigned int irq, > -struct irqaction *action) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static void jp_tasklet_action(struct softirq_action *a) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -static void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) > -{ > - lkdtm_handler(); > - jprobe_return(); > -} > - > -struct scan_control; > - > -static unsigned long jp_shrink_inactive_list(unsigned long max_scan, > - struct zone *zone, > - struct scan_control *sc) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, > - const enum hrtimer_mode mode) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -static int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > - > -# ifdef 
CONFIG_IDE > -static int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, > - struct block_device *bdev, unsigned int cmd, > - unsigned long arg) > -{ > - lkdtm_handler(); > - jprobe_return(); > - return 0; > -} > -# endif > +# define CRASHPOINT_KPROBE(_symbol) \ > + .kprobe = { \ > + .symbol_name = (_symbol), \ > + .pre_handler = lkdtm_kprobe_handler,\ > + }, > +# define CRASHPOINT_WRITE(_symbol) \ > + (_symbol) ? lkdtm_debugfs_entry : direct_entry > +#else > +# define CRASHPOINT_KPROBE(_symbol) > +# define CRASHPOINT_WRITE(_symbol) direct_entry > #endif > > /* Crash points */ > struct crashpoint { > const char *name; > const struct file_operations fops; > - struct jprobe jprobe; > + struct kprobe kprobe; > }; > > -#define CRASHPOINT(_name, _write, _symbol, _entry) \ > +#define CRASHPOINT(_name, _symbol) \ > { \ > .name = _name, \ > .fops = { \ > .read = lkdtm_debugfs_read, \ > .llseek = generic_file_llseek, \ > .open = lkdtm_debugfs_open, \ > - .write = _write, \ > - }, \ > - .jprobe = { \ > - .kp.symbol_name = _symbol, \ > - .entry = (kprobe_opcode_t *)_entry, \ > + .write = CRASHPOINT_WRITE(_symbol) \ > }, \ > + CRASHPOINT_KPROBE(_symbol) \ > } > > /* Define the possible places where we can trigger a crash point. */ > -struct crashpoint crashpoints[] = { > - CRASHPOINT("DIRECT",direct_entry, > -NULL,NULL), > +static struct crashpoint crashpoints[] = { > + CRASHPOINT("DIRECT", NULL), > #ifdef CONFIG_KPROBES > - CRASHPOINT("INT_HARDWARE_ENTRY",lkdtm_debugfs_entry, > -"do_IRQ",jp_do_irq), > - CRASHPOINT("INT_HW_IRQ_EN", lkdtm_debugfs_entry, > -
Re: [PATCH 1/2] mm/mmu_notifier: avoid double notification when it is useless v2
On Thu, 2017-10-19 at 12:58 -0400, Jerome Glisse wrote: > On Thu, Oct 19, 2017 at 09:53:11PM +1100, Balbir Singh wrote: > > On Thu, Oct 19, 2017 at 2:28 PM, Jerome Glissewrote: > > > On Thu, Oct 19, 2017 at 02:04:26PM +1100, Balbir Singh wrote: > > > > On Mon, 16 Oct 2017 23:10:02 -0400 > > > > jgli...@redhat.com wrote: > > > > > > > > > From: Jérôme Glisse > > > > > > > > > > + /* > > > > > +* No need to call mmu_notifier_invalidate_range() as we > > > > > are > > > > > +* downgrading page table protection not changing it to > > > > > point > > > > > +* to a new page. > > > > > +* > > > > > +* See Documentation/vm/mmu_notifier.txt > > > > > +*/ > > > > > if (pmdp) { > > > > > #ifdef CONFIG_FS_DAX_PMD > > > > > pmd_t pmd; > > > > > @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pmd = pmd_wrprotect(pmd); > > > > > pmd = pmd_mkclean(pmd); > > > > > set_pmd_at(vma->vm_mm, address, pmdp, pmd); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Could the secondary TLB still see the mapping as dirty and propagate > > > > the dirty bit back? > > > > > > I am assuming hardware does sane thing of setting the dirty bit only > > > when walking the CPU page table when device does a write fault ie > > > once the device get a write TLB entry the dirty is set by the IOMMU > > > when walking the page table before returning the lookup result to the > > > device and that it won't be set again latter (ie propagated back > > > latter). > > > > > > > The other possibility is that the hardware things the page is writable > > and already > > marked dirty. It allows writes and does not set the dirty bit? > > I thought about this some more and the patch can not regress anything > that is not broken today. 
So if we assume that device can propagate > dirty bit because it can cache the write protection than all current > code is broken for two reasons: > > First one is current code clear pte entry, build a new pte value with > write protection and update pte entry with new pte value. So any PASID/ > ATS platform that allows device to cache the write bit and set dirty > bit anytime after that can race during that window and you would loose > the dirty bit of the device. That is not that bad as you are gonna > propagate the dirty bit to the struct page. But they stay consistent with the notifiers, so from the OS perspective it notifies of any PTE changes as they happen. When the ATS platform sees invalidation, it invalidates it's PTE's as well. I was speaking of the case where the ATS platform could assume it has write access and has not seen any invalidation, the OS could return back to user space or the caller with write bit clear, but the ATS platform could still do a write since it's not seen the invalidation. > > Second one is if the dirty bit is propagated back to the new write > protected pte. Quick look at code it seems that when we zap pte or > or mkclean we don't check that the pte has write permission but only > care about the dirty bit. So it should not have any bad consequence. > > After this patch only the second window is bigger and thus more likely > to happen. But nothing sinister should happen from that. > > > > > > > I should probably have spell that out and maybe some of the ATS/PASID > > > implementer did not do that. 
> > > > > > > > > > > > unlock_pmd: > > > > > spin_unlock(ptl); > > > > > #endif > > > > > @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pte = pte_wrprotect(pte); > > > > > pte = pte_mkclean(pte); > > > > > set_pte_at(vma->vm_mm, address, ptep, pte); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Ditto > > > > > > > > > unlock_pte: > > > > > pte_unmap_unlock(ptep, ptl); > > > > > } > > > > > diff --git a/include/linux/mmu_notifier.h > > > > > b/include/linux/mmu_notifier.h > > > > > index 6866e8126982..49c925c96b8a 100644 > > > > > --- a/include/linux/mmu_notifier.h > > > > > +++ b/include/linux/mmu_notifier.h > > > > > @@ -155,7 +155,8 @@ struct mmu_notifier_ops { > > > > > * shared page-tables, it not necessary to implement the > > > > > * invalidate_range_start()/end() notifiers, as > > > > > * invalidate_range() alread catches the points in time when an > > > > > -* external TLB range needs to be flushed. > > > > > +* external TLB range needs to be flushed. For more in depth > > > > > +* discussion on this see Documentation/vm/mmu_notifier.txt > > > > > * > > > > >
Re: [PATCH 1/2] mm/mmu_notifier: avoid double notification when it is useless v2
On Thu, 2017-10-19 at 12:58 -0400, Jerome Glisse wrote: > On Thu, Oct 19, 2017 at 09:53:11PM +1100, Balbir Singh wrote: > > On Thu, Oct 19, 2017 at 2:28 PM, Jerome Glisse wrote: > > > On Thu, Oct 19, 2017 at 02:04:26PM +1100, Balbir Singh wrote: > > > > On Mon, 16 Oct 2017 23:10:02 -0400 > > > > jgli...@redhat.com wrote: > > > > > > > > > From: Jérôme Glisse > > > > > > > > > > + /* > > > > > +* No need to call mmu_notifier_invalidate_range() as we > > > > > are > > > > > +* downgrading page table protection not changing it to > > > > > point > > > > > +* to a new page. > > > > > +* > > > > > +* See Documentation/vm/mmu_notifier.txt > > > > > +*/ > > > > > if (pmdp) { > > > > > #ifdef CONFIG_FS_DAX_PMD > > > > > pmd_t pmd; > > > > > @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pmd = pmd_wrprotect(pmd); > > > > > pmd = pmd_mkclean(pmd); > > > > > set_pmd_at(vma->vm_mm, address, pmdp, pmd); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Could the secondary TLB still see the mapping as dirty and propagate > > > > the dirty bit back? > > > > > > I am assuming hardware does sane thing of setting the dirty bit only > > > when walking the CPU page table when device does a write fault ie > > > once the device get a write TLB entry the dirty is set by the IOMMU > > > when walking the page table before returning the lookup result to the > > > device and that it won't be set again latter (ie propagated back > > > latter). > > > > > > > The other possibility is that the hardware things the page is writable > > and already > > marked dirty. It allows writes and does not set the dirty bit? > > I thought about this some more and the patch can not regress anything > that is not broken today. 
So if we assume that device can propagate > dirty bit because it can cache the write protection than all current > code is broken for two reasons: > > First one is current code clear pte entry, build a new pte value with > write protection and update pte entry with new pte value. So any PASID/ > ATS platform that allows device to cache the write bit and set dirty > bit anytime after that can race during that window and you would loose > the dirty bit of the device. That is not that bad as you are gonna > propagate the dirty bit to the struct page. But they stay consistent with the notifiers, so from the OS perspective it notifies of any PTE changes as they happen. When the ATS platform sees invalidation, it invalidates it's PTE's as well. I was speaking of the case where the ATS platform could assume it has write access and has not seen any invalidation, the OS could return back to user space or the caller with write bit clear, but the ATS platform could still do a write since it's not seen the invalidation. > > Second one is if the dirty bit is propagated back to the new write > protected pte. Quick look at code it seems that when we zap pte or > or mkclean we don't check that the pte has write permission but only > care about the dirty bit. So it should not have any bad consequence. > > After this patch only the second window is bigger and thus more likely > to happen. But nothing sinister should happen from that. > > > > > > > I should probably have spell that out and maybe some of the ATS/PASID > > > implementer did not do that. 
> > > > > > > > > > > > unlock_pmd: > > > > > spin_unlock(ptl); > > > > > #endif > > > > > @@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct > > > > > address_space *mapping, > > > > > pte = pte_wrprotect(pte); > > > > > pte = pte_mkclean(pte); > > > > > set_pte_at(vma->vm_mm, address, ptep, pte); > > > > > - mmu_notifier_invalidate_range(vma->vm_mm, start, > > > > > end); > > > > > > > > Ditto > > > > > > > > > unlock_pte: > > > > > pte_unmap_unlock(ptep, ptl); > > > > > } > > > > > diff --git a/include/linux/mmu_notifier.h > > > > > b/include/linux/mmu_notifier.h > > > > > index 6866e8126982..49c925c96b8a 100644 > > > > > --- a/include/linux/mmu_notifier.h > > > > > +++ b/include/linux/mmu_notifier.h > > > > > @@ -155,7 +155,8 @@ struct mmu_notifier_ops { > > > > > * shared page-tables, it not necessary to implement the > > > > > * invalidate_range_start()/end() notifiers, as > > > > > * invalidate_range() alread catches the points in time when an > > > > > -* external TLB range needs to be flushed. > > > > > +* external TLB range needs to be flushed. For more in depth > > > > > +* discussion on this see Documentation/vm/mmu_notifier.txt > > > > > * > > > > > * The invalidate_range() function is
Re: [PATCH 12/27] x86/msr: Restrict MSR access when the kernel is locked down
On Fri, Oct 20, 2017 at 09:48:16PM +0100, David Howells wrote: > Alan Cox wrote: > > > There are a load of standard tools that use this so I think you are going > > to need a whitelist. Can you at least log *which* MSR in the failing case > > so a whitelist can be built over time ? > [...snip] > > And do you know where wrmsr_safe_regs() might be found? I can see things > using it and exporting it, but no implementation, so I'm guessing it's > macroised somewhere. Looks the definition is in arch/x86/lib/msr-reg.S #ifdef CONFIG_X86_64 /* * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx pushq %r12 ... Regards Joey Lee
Re: [PATCH 12/27] x86/msr: Restrict MSR access when the kernel is locked down
On Fri, Oct 20, 2017 at 09:48:16PM +0100, David Howells wrote: > Alan Cox wrote: > > > There are a load of standard tools that use this so I think you are going > > to need a whitelist. Can you at least log *which* MSR in the failing case > > so a whitelist can be built over time ? > [...snip] > > And do you know where wrmsr_safe_regs() might be found? I can see things > using it and exporting it, but no implementation, so I'm guessing it's > macroised somewhere. Looks the definition is in arch/x86/lib/msr-reg.S #ifdef CONFIG_X86_64 /* * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op ENTRY(\op\()_safe_regs) pushq %rbx pushq %r12 ... Regards Joey Lee
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 8:20 PM, Matthew Wilcox wrote: > On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: >> Ok, I'd also like to kill DAX support in the brd driver. It's a source >> of complexity and maintenance burden for zero benefit. It's the only >> ->direct_access() implementation that sleeps and it's the only >> implementation where there is a non-linear relationship between >> sectors and pfns. Having a 1:1 sector to pfn relationship will help >> with the dma-extent-busy management since we don't need to keep >> calling into the driver to map pfns back to sectors once we know the >> pfn[0] sector[0] relationship. > > But these are important things that other block devices may / will want. > > For example, I think it's entirely sensible to support ->direct_access > for RAID-0. Dell are looking at various different options for having > one pmemX device per DIMM and using RAID to lash them together. > ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes > sense to me. > > Last time we tried to take sleeping out, there were grumblings from people > with network block devices who thought they'd want to bring pages in > across the network. I'm a bit less sympathetic to this because I don't > know anyone actively working on it, but the RAID-0 case is something I > think we should care about. True, good point. In fact we already support device-mapper striping with ->direct_access(). I'd still like to go ahead with the sleeping removal. When those folks come back and add network direct_access they can do the hard work of figuring out cases where we need to call direct_access in atomic contexts.
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 8:20 PM, Matthew Wilcox wrote: > On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: >> Ok, I'd also like to kill DAX support in the brd driver. It's a source >> of complexity and maintenance burden for zero benefit. It's the only >> ->direct_access() implementation that sleeps and it's the only >> implementation where there is a non-linear relationship between >> sectors and pfns. Having a 1:1 sector to pfn relationship will help >> with the dma-extent-busy management since we don't need to keep >> calling into the driver to map pfns back to sectors once we know the >> pfn[0] sector[0] relationship. > > But these are important things that other block devices may / will want. > > For example, I think it's entirely sensible to support ->direct_access > for RAID-0. Dell are looking at various different options for having > one pmemX device per DIMM and using RAID to lash them together. > ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes > sense to me. > > Last time we tried to take sleeping out, there were grumblings from people > with network block devices who thought they'd want to bring pages in > across the network. I'm a bit less sympathetic to this because I don't > know anyone actively working on it, but the RAID-0 case is something I > think we should care about. True, good point. In fact we already support device-mapper striping with ->direct_access(). I'd still like to go ahead with the sleeping removal. When those folks come back and add network direct_access they can do the hard work of figuring out cases where we need to call direct_access in atomic contexts.
[PATCH] net: can: dev: remove unused code in can_restart
Value assigned to variable err is overwritten at line 562: err = priv->do_set_mode(dev, CAN_MODE_START); before it can be used. Also, notice that this code has been there since 2014. Addresses-Coverity-ID: 1227031 Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. drivers/net/can/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 365a8cc..064d0f6 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -543,10 +543,9 @@ static void can_restart(struct net_device *dev) /* send restart message upstream */ skb = alloc_can_err_skb(dev, ); - if (skb == NULL) { - err = -ENOMEM; + if (!skb) goto restart; - } + cf->can_id |= CAN_ERR_RESTARTED; netif_rx(skb); -- 2.7.4
[PATCH] net: can: dev: remove unused code in can_restart
Value assigned to variable err is overwritten at line 562: err = priv->do_set_mode(dev, CAN_MODE_START); before it can be used. Also, notice that this code has been there since 2014. Addresses-Coverity-ID: 1227031 Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. drivers/net/can/dev.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 365a8cc..064d0f6 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -543,10 +543,9 @@ static void can_restart(struct net_device *dev) /* send restart message upstream */ skb = alloc_can_err_skb(dev, ); - if (skb == NULL) { - err = -ENOMEM; + if (!skb) goto restart; - } + cf->can_id |= CAN_ERR_RESTARTED; netif_rx(skb); -- 2.7.4
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: > Ok, I'd also like to kill DAX support in the brd driver. It's a source > of complexity and maintenance burden for zero benefit. It's the only > ->direct_access() implementation that sleeps and it's the only > implementation where there is a non-linear relationship between > sectors and pfns. Having a 1:1 sector to pfn relationship will help > with the dma-extent-busy management since we don't need to keep > calling into the driver to map pfns back to sectors once we know the > pfn[0] sector[0] relationship. But these are important things that other block devices may / will want. For example, I think it's entirely sensible to support ->direct_access for RAID-0. Dell are looking at various different options for having one pmemX device per DIMM and using RAID to lash them together. ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes sense to me. Last time we tried to take sleeping out, there were grumblings from people with network block devices who thought they'd want to bring pages in across the network. I'm a bit less sympathetic to this because I don't know anyone actively working on it, but the RAID-0 case is something I think we should care about.
Re: [PATCH v3 02/13] dax: require 'struct page' for filesystem dax
On Fri, Oct 20, 2017 at 03:29:57PM -0700, Dan Williams wrote: > Ok, I'd also like to kill DAX support in the brd driver. It's a source > of complexity and maintenance burden for zero benefit. It's the only > ->direct_access() implementation that sleeps and it's the only > implementation where there is a non-linear relationship between > sectors and pfns. Having a 1:1 sector to pfn relationship will help > with the dma-extent-busy management since we don't need to keep > calling into the driver to map pfns back to sectors once we know the > pfn[0] sector[0] relationship. But these are important things that other block devices may / will want. For example, I think it's entirely sensible to support ->direct_access for RAID-0. Dell are looking at various different options for having one pmemX device per DIMM and using RAID to lash them together. ->direct_access makes no sense for RAID-5 or RAID-1, but RAID-0 makes sense to me. Last time we tried to take sleeping out, there were grumblings from people with network block devices who thought they'd want to bring pages in across the network. I'm a bit less sympathetic to this because I don't know anyone actively working on it, but the RAID-0 case is something I think we should care about.
Re: [PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
On Fri, Oct 20, 2017 at 04:53:50PM -0700, Randy Dunlap wrote: > From: Randy Dunlap > > Fix build when CONFIG_MODULES is not enabled. > Fixes multiple build errors. Thanks. I already sent a patch earlier. -Andi > > Signed-off-by: Randy Dunlap > Reported-by: kbuild test robot > Cc: Andi Kleen
Re: [PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
On Fri, Oct 20, 2017 at 04:53:50PM -0700, Randy Dunlap wrote: > From: Randy Dunlap > > Fix build when CONFIG_MODULES is not enabled. > Fixes multiple build errors. Thanks. I already sent a patch earlier. -Andi > > Signed-off-by: Randy Dunlap > Reported-by: kbuild test robot > Cc: Andi Kleen
Re: [PATCH 00/23] Hardened usercopy whitelisting
On Fri, Oct 20, 2017 at 4:25 PM, Paolo Bonzini wrote: > On 21/10/2017 00:40, Paolo Bonzini wrote: >> This breaks KVM completely on x86, due to two ioctls >> (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct >> kvm_vcpu_arch. >> >> There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is >> obsolete and not a big deal at all. >> >> I can post some patches, but probably not until the beginning of >> November due to travelling. Please do not send this too close to the >> beginning of the merge window. > > Sleeping is overrated, sending patches now... Oh awesome, thank you very much for tracking this down and building fixes! I'll insert these into the usercopy whitelisting series, and see if I can find any similar cases. Thanks! -Kees -- Kees Cook Pixel Security
Re: [PATCH 00/23] Hardened usercopy whitelisting
On Fri, Oct 20, 2017 at 4:25 PM, Paolo Bonzini wrote: > On 21/10/2017 00:40, Paolo Bonzini wrote: >> This breaks KVM completely on x86, due to two ioctls >> (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct >> kvm_vcpu_arch. >> >> There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is >> obsolete and not a big deal at all. >> >> I can post some patches, but probably not until the beginning of >> November due to travelling. Please do not send this too close to the >> beginning of the merge window. > > Sleeping is overrated, sending patches now... Oh awesome, thank you very much for tracking this down and building fixes! I'll insert these into the usercopy whitelisting series, and see if I can find any similar cases. Thanks! -Kees -- Kees Cook Pixel Security
[PATCH v7] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler BaicarCc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
[PATCH v7] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler Baicar Cc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
Re: [PATCH 2/3] watchdog: hpwdt: SMBIOS check
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Correct test on SMBIOS table 219 Misc Features bits for UEFI supported. Please explain in more detail. There is no table 219 in the SMBIOS specification. There is table 9, BIOS Characteristics Extension Byte 2, which specifies bit 3 as "UEFI Specification is supported.", but nothing that really maps to the other byte, and no "misc features". Maybe this is HP specific, but then we'll need to have much better explanation. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index ef54b03..4c011e8 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -707,7 +707,7 @@ static void dmi_find_icru(const struct dmi_header *dm, void *dummy) smbios_proliant_ptr = (struct smbios_proliant_info *) dm; if (smbios_proliant_ptr->misc_features & 0x01) is_icru = 1; - if (smbios_proliant_ptr->misc_features & 0x408) + if (smbios_proliant_ptr->misc_features & 0x1400) is_uefi = 1; } } Presumably patch 2/3 and 3/3 are bug fixes and should come first so they can be applied to stable releases. Thanks, Guenter
Re: [PATCH 2/3] watchdog: hpwdt: SMBIOS check
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Correct test on SMBIOS table 219 Misc Features bits for UEFI supported. Please explain in more detail. There is no table 219 in the SMBIOS specification. There is table 9, BIOS Characteristics Extension Byte 2, which specifies bit 3 as "UEFI Specification is supported.", but nothing that really maps to the other byte, and no "misc features". Maybe this is HP specific, but then we'll need to have much better explanation. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index ef54b03..4c011e8 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -707,7 +707,7 @@ static void dmi_find_icru(const struct dmi_header *dm, void *dummy) smbios_proliant_ptr = (struct smbios_proliant_info *) dm; if (smbios_proliant_ptr->misc_features & 0x01) is_icru = 1; - if (smbios_proliant_ptr->misc_features & 0x408) + if (smbios_proliant_ptr->misc_features & 0x1400) is_uefi = 1; } } Presumably patch 2/3 and 3/3 are bug fixs and should come first so they can be applied to stable releases. Thanks, Guenter
Re: [PATCH 1/3] watchdog: hpwdt: add ioctl WDIOC_GETPRETIMEOUT
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Add support for WDIOC_GETPRETIMEOUT ioctl so that user applications can determine when the NMI should arrive. Signed-off-by: Jerry Hoemann--- drivers/watchdog/hpwdt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 67fbe35..ef54b03 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -50,6 +50,7 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static char expect_release; static unsigned long hpwdt_is_open; +static const int pretimeout = 9; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_timer_reg; @@ -622,6 +623,12 @@ static long hpwdt_ioctl(struct file *file, unsigned int cmd, } break; + case WDIOC_GETPRETIMEOUT: + ret = copy_to_user(argp, , sizeof(pretimeout)); + if (ret) + ret = -EFAULT; + break; + case WDIOC_SETTIMEOUT: ret = get_user(new_margin, p); if (ret) Can you please convert the driver to use the watchdog subsystem instead ? If there are still improvements needed afterwards, they can still be implemented, but we really should not make improvements which are already supported by the watchdog core. Thanks, Guenter
Re: [PATCH 1/3] watchdog: hpwdt: add ioctl WDIOC_GETPRETIMEOUT
On 10/20/2017 03:54 PM, Jerry Hoemann wrote: Add support for WDIOC_GETPRETIMEOUT ioctl so that user applications can determine when the NMI should arrive. Signed-off-by: Jerry Hoemann --- drivers/watchdog/hpwdt.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 67fbe35..ef54b03 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -50,6 +50,7 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static char expect_release; static unsigned long hpwdt_is_open; +static const int pretimeout = 9; static void __iomem *pci_mem_addr; /* the PCI-memory address */ static unsigned long __iomem *hpwdt_timer_reg; @@ -622,6 +623,12 @@ static long hpwdt_ioctl(struct file *file, unsigned int cmd, } break; + case WDIOC_GETPRETIMEOUT: + ret = copy_to_user(argp, , sizeof(pretimeout)); + if (ret) + ret = -EFAULT; + break; + case WDIOC_SETTIMEOUT: ret = get_user(new_margin, p); if (ret) Can you please convert the driver to use the watchdog subsystem instead ? If there are still improvements needed afterwards, they can still be implemented, but we really should not make improvements which are already supported by the watchdog core. Thanks, Guenter
Re: [RESEND PATCH 1/3] completion: Add support for initializing completion with lockdep_map
On Sat, Oct 21, 2017 at 4:58 AM, Bart Van Asschewrote: > Sorry but I'm not sure that's the best possible answer. In my opinion > avoiding that completion objects have dependencies on other lock objects, > e.g. by avoiding to wait on a completion object while holding a mutex, is a > far superior strategy over adding cross-release checking to completion > objects. The former strategy namely makes it unnecessary to add > cross-release checking to completion objects because that strategy ensures > that these completion objects cannot get involved in a deadlock. The latter It's true if we force it. But do you think it's possible? > strategy can lead to false positive deadlock reports by the lockdep code, What do you think false positives come from? It comes from assigning lock classes falsely where we should more care, rather than lockdep code itself. The same is applicable to cross-release. > something none of us wants. > > A possible alternative strategy could be to enable cross-release checking > only for those completion objects for which waiting occurs inside a critical > section. Of course, it already did. Cross-release doesn't consider any waiting outside of critical sections at all, and it should do. > As explained in another e-mail thread, unlike the lock inversion checking > performed by the <= v4.13 lockdep code, cross-release checking is a heuristic > that does not have a sound theoretical basis. The lock validator is an It's not heuristic but based on the same theoretical basis as <=4.13 lockdep. I mean, the key basis is: 1) What causes deadlock 2) What is a dependency 3) Build a dependency when identified > important tool for kernel developers. It is important that it produces as few > false positives as possible. Since the cross-release checks are enabled > automatically when enabling lockdep, I think it is normal that I, as a kernel > developer, care that the cross-release checks produce as few false positives > as possible. No doubt. 
That's why I proposed these patches.
Re: [RESEND PATCH 1/3] completion: Add support for initializing completion with lockdep_map
On Sat, Oct 21, 2017 at 4:58 AM, Bart Van Assche wrote: > Sorry but I'm not sure that's the best possible answer. In my opinion > avoiding that completion objects have dependencies on other lock objects, > e.g. by avoiding to wait on a completion object while holding a mutex, is a > far superior strategy over adding cross-release checking to completion > objects. The former strategy namely makes it unnecessary to add > cross-release checking to completion objects because that strategy ensures > that these completion objects cannot get involved in a deadlock. The latter It's true if we force it. But do you think it's possible? > strategy can lead to false positive deadlock reports by the lockdep code, What do you think false positives come from? It comes from assigning lock classes falsely where we should more care, rather than lockdep code itself. The same is applicable to cross-release. > something none of us wants. > > A possible alternative strategy could be to enable cross-release checking > only for those completion objects for which waiting occurs inside a critical > section. Of course, it already did. Cross-release doesn't consider any waiting outside of critical sections at all, and it should do. > As explained in another e-mail thread, unlike the lock inversion checking > performed by the <= v4.13 lockdep code, cross-release checking is a heuristic > that does not have a sound theoretical basis. The lock validator is an It's not heuristic but based on the same theoretical basis as <=4.13 lockdep. I mean, the key basis is: 1) What causes deadlock 2) What is a dependency 3) Build a dependency when identified > important tool for kernel developers. It is important that it produces as few > false positives as possible. Since the cross-release checks are enabled > automatically when enabling lockdep, I think it is normal that I, as a kernel > developer, care that the cross-release checks produce as few false positives > as possible. No doubt. 
That's why I proposed these patches.
Re: [PATCH] [v2] tomoyo: fix timestamping for y2038
On Thu, 19 Oct 2017, Arnd Bergmann wrote: > Tomoyo uses an open-coded version of time_to_tm() to create a timestamp > from the current time as read by get_seconds(). This will overflow and > give wrong results on 32-bit systems in 2038. > > To correct this, this changes the code to use ktime_get_real_seconds() > and the generic time64_to_tm() function that are both y2038-safe. > Using the library function avoids adding an expensive 64-bit division > in this code and can benefit from any optimizations we do in common > code. > > Acked-by: Tetsuo Handa > Signed-off-by: Arnd Bergmann > --- > v2: fix year calculation Applied to: git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-general -- James Morris
Re: [PATCH] [v2] tomoyo: fix timestamping for y2038
On Thu, 19 Oct 2017, Arnd Bergmann wrote: > Tomoyo uses an open-coded version of time_to_tm() to create a timestamp > from the current time as read by get_seconds(). This will overflow and > give wrong results on 32-bit systems in 2038. > > To correct this, this changes the code to use ktime_get_real_seconds() > and the generic time64_to_tm() function that are both y2038-safe. > Using the library function avoids adding an expensive 64-bit division > in this code and can benefit from any optimizations we do in common > code. > > Acked-by: Tetsuo Handa > Signed-off-by: Arnd Bergmann > --- > v2: fix year calculation Applied to: git://git.kernel.org/pub/scm/linux/kernel/git/jmorris/linux-security.git next-general -- James Morris
Re: [PATCH 26/27] efi: Add an EFI_SECURE_BOOT flag to indicate secure boot mode
On Thu, 19 Oct 2017, David Howells wrote: > + if (efi_enabled(EFI_BOOT)) { > + switch (mode) { > + case efi_secureboot_mode_disabled: > + pr_info("Secure boot disabled\n"); > + break; > + case efi_secureboot_mode_enabled: > + set_bit(EFI_SECURE_BOOT, ); > + pr_info("Secure boot enabled\n"); > + break; > + default: > + pr_info("Secure boot could not be determined\n"); Perhaps make this pr_warning and include the unknown mode value? -- James Morris
Re: [PATCH 26/27] efi: Add an EFI_SECURE_BOOT flag to indicate secure boot mode
On Thu, 19 Oct 2017, David Howells wrote: > + if (efi_enabled(EFI_BOOT)) { > + switch (mode) { > + case efi_secureboot_mode_disabled: > + pr_info("Secure boot disabled\n"); > + break; > + case efi_secureboot_mode_enabled: > + set_bit(EFI_SECURE_BOOT, ); > + pr_info("Secure boot enabled\n"); > + break; > + default: > + pr_info("Secure boot could not be determined\n"); Perhaps make this pr_warning and include the unknown mode value? -- James Morris
Re: [PATCH 25/27] Lock down /proc/kcore
On Thu, 19 Oct 2017, David Howells wrote: > Disallow access to /proc/kcore when the kernel is locked down to prevent > access to cryptographic data. > > Signed-off-by: David Howells Reviewed-by: James Morris I have to wonder, though, after everything is locked down, how easy will it be for new things to slip in which need to be included in the lockdown, but are not. -- James Morris
Re: [PATCH 25/27] Lock down /proc/kcore
On Thu, 19 Oct 2017, David Howells wrote: > Disallow access to /proc/kcore when the kernel is locked down to prevent > access to cryptographic data. > > Signed-off-by: David Howells Reviewed-by: James Morris I have to wonder, though, after everything is locked down, how easy will it be for new things to slip in which need to be included in the lockdown, but are not. -- James Morris
[PATCH v6] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler BaicarCc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
[PATCH v6] acpi: apei: remove the unused dead-code for SEA/NMI notification type
For the SEA notification, the two functions ghes_sea_add() and ghes_sea_remove() are only called when CONFIG_ACPI_APEI_SEA is defined. If not, it will return errors in the ghes_probe() and not continue. If the probe is failed, the ghes_sea_remove() also has no chance to be called. Hence, remove the unnecessary handling when CONFIG_ACPI_APEI_SEA is not defined. For the NMI notification, it has the same issue as SEA notification, so also remove the unused dead-code for it. Cc: Tyler Baicar Cc: James Morse Signed-off-by: Dongjiu Geng Tested-by: Tyler Baicar Signed-off-by: Borislav Petkov --- drivers/acpi/apei/ghes.c | 33 + 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d661d45..3eee30a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -849,17 +849,8 @@ static void ghes_sea_remove(struct ghes *ghes) synchronize_rcu(); } #else /* CONFIG_ACPI_APEI_SEA */ -static inline void ghes_sea_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", - ghes->generic->header.source_id); -} - -static inline void ghes_sea_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", - ghes->generic->header.source_id); -} +static inline void ghes_sea_add(struct ghes *ghes) { } +static inline void ghes_sea_remove(struct ghes *ghes) { } #endif /* CONFIG_ACPI_APEI_SEA */ #ifdef CONFIG_HAVE_ACPI_APEI_NMI @@ -1061,23 +1052,9 @@ static void ghes_nmi_init_cxt(void) init_irq_work(_proc_irq_work, ghes_proc_in_irq); } #else /* CONFIG_HAVE_ACPI_APEI_NMI */ -static inline void ghes_nmi_add(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to add NMI notification which is not supported!\n", - ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_remove(struct ghes *ghes) -{ - pr_err(GHES_PFX "ID: %d, trying to remove NMI notification which is not supported!\n", - 
ghes->generic->header.source_id); - BUG(); -} - -static inline void ghes_nmi_init_cxt(void) -{ -} +static inline void ghes_nmi_add(struct ghes *ghes) { } +static inline void ghes_nmi_remove(struct ghes *ghes) { } +static inline void ghes_nmi_init_cxt(void) { } #endif /* CONFIG_HAVE_ACPI_APEI_NMI */ static int ghes_probe(struct platform_device *ghes_dev) -- 2.10.1
Re: [PATCH 1/4] mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS
On Fri, Oct 20, 2017 at 12:59 PM, Kirill A. Shutemovwrote: > With boot-time switching between paging mode we will have variable > MAX_PHYSMEM_BITS. > > Let's use the maximum variable possible for CONFIG_X86_5LEVEL=y > configuration to define zsmalloc data structures. > > The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover such case. > It also suits well to handle PAE special case. > I see that with your upcoming patch, MAX_PHYSMEM_BITS is turned into a variable for x86_64 case as: (pgtable_l5_enabled ? 52 : 46). Even with this change, I don't see a need for this new MAX_POSSIBLE_PHYSMEM_BITS constant. > -#ifndef MAX_PHYSMEM_BITS > -#ifdef CONFIG_HIGHMEM64G > -#define MAX_PHYSMEM_BITS 36 > -#else /* !CONFIG_HIGHMEM64G */ > +#ifndef MAX_POSSIBLE_PHYSMEM_BITS > +#ifdef MAX_PHYSMEM_BITS > +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS > +#else This ifdef on HIGHMEM64G is redundant, as x86 already defines MAX_PHYSMEM_BITS = 36 in PAE case. So, all that zsmalloc should do is: #ifndef MAX_PHYSMEM_BITS #define MAX_PHYSMEM_BITS BITS_PER_LONG #endif .. and then no change is needed for rest of derived constants like _PFN_BITS. It is upto every arch to define correct MAX_PHYSMEM_BITS (variable or constant) based on whatever configurations the arch supports. If not defined, zsmalloc picks a reasonable default of BITS_PER_LONG. I will send a patch which makes the change to remove ifdef on CONFIG_HIGHMEM64G. Thanks, Nitin
Re: [PATCH 1/4] mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS
On Fri, Oct 20, 2017 at 12:59 PM, Kirill A. Shutemov wrote: > With boot-time switching between paging mode we will have variable > MAX_PHYSMEM_BITS. > > Let's use the maximum variable possible for CONFIG_X86_5LEVEL=y > configuration to define zsmalloc data structures. > > The patch introduces MAX_POSSIBLE_PHYSMEM_BITS to cover such case. > It also suits well to handle PAE special case. > I see that with your upcoming patch, MAX_PHYSMEM_BITS is turned into a variable for x86_64 case as: (pgtable_l5_enabled ? 52 : 46). Even with this change, I don't see a need for this new MAX_POSSIBLE_PHYSMEM_BITS constant. > -#ifndef MAX_PHYSMEM_BITS > -#ifdef CONFIG_HIGHMEM64G > -#define MAX_PHYSMEM_BITS 36 > -#else /* !CONFIG_HIGHMEM64G */ > +#ifndef MAX_POSSIBLE_PHYSMEM_BITS > +#ifdef MAX_PHYSMEM_BITS > +#define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS > +#else This ifdef on HIGHMEM64G is redundant, as x86 already defines MAX_PHYSMEM_BITS = 36 in PAE case. So, all that zsmalloc should do is: #ifndef MAX_PHYSMEM_BITS #define MAX_PHYSMEM_BITS BITS_PER_LONG #endif .. and then no change is needed for rest of derived constants like _PFN_BITS. It is upto every arch to define correct MAX_PHYSMEM_BITS (variable or constant) based on whatever configurations the arch supports. If not defined, zsmalloc picks a reasonable default of BITS_PER_LONG. I will send a patch which makes the change to remove ifdef on CONFIG_HIGHMEM64G. Thanks, Nitin
Re: [PATCH net] hv_sock: add locking in the open/close/release code paths
From: Dexuan Cui Date: Thu, 19 Oct 2017 03:33:14 + > > Without the patch, when hvs_open_connection() hasn't completely established > a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't > inserted the sock into the connected queue), vsock_stream_connect() may see > the sk_state change and return the connection to the userspace, and next > when the userspace closes the connection quickly, hvs_release() may not see > the connection in the connected queue; finally hvs_open_connection() > inserts the connection into the queue, but we won't be able to purge the > connection for ever. > > Signed-off-by: Dexuan Cui Applied.
Re: [PATCH net] hv_sock: add locking in the open/close/release code paths
From: Dexuan Cui Date: Thu, 19 Oct 2017 03:33:14 + > > Without the patch, when hvs_open_connection() hasn't completely established > a connection (e.g. it has changed sk->sk_state to SS_CONNECTED, but hasn't > inserted the sock into the connected queue), vsock_stream_connect() may see > the sk_state change and return the connection to the userspace, and next > when the userspace closes the connection quickly, hvs_release() may not see > the connection in the connected queue; finally hvs_open_connection() > inserts the connection into the queue, but we won't be able to purge the > connection for ever. > > Signed-off-by: Dexuan Cui Applied.
[PATCH v1] mm: broken deferred calculation
In reset_deferred_meminit we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin--- include/linux/mmzone.h | 3 ++- mm/page_alloc.c| 27 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a6f361931d52..d45ba78c7e42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,7 +699,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97687b38da05..16419cdbbb7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -289,28 +289,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. 
+ */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -337,7 +346,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- 2.14.2
[PATCH v1] mm: broken deferred calculation
In reset_deferred_meminit we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c| 27 ++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a6f361931d52..d45ba78c7e42 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -699,7 +699,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97687b38da05..16419cdbbb7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -289,28 +289,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. 
+ */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -337,7 +346,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; -- 2.14.2
Re: [PATCH 0/12] PM / sleep: Driver flags for system suspend/resume
On Friday, October 20, 2017 10:46:07 PM CEST Bjorn Helgaas wrote: > On Mon, Oct 16, 2017 at 03:12:35AM +0200, Rafael J. Wysocki wrote: > > Hi All, > > > > Well, this took more time than expected, as I tried to cover everything I > > had > > in mind regarding PM flags for drivers. > > For the parts that touch PCI, > > Acked-by: Bjorn Helgaas Thank you! > I doubt there'll be conflicts with changes in my tree, but let me know if > you trip over any so I can watch for them when merging. Well, if there are any conflicts, we'll see them in linux-next I guess. :-) Thanks, Rafael
Re: [PATCH 0/12] PM / sleep: Driver flags for system suspend/resume
On Friday, October 20, 2017 10:46:07 PM CEST Bjorn Helgaas wrote: > On Mon, Oct 16, 2017 at 03:12:35AM +0200, Rafael J. Wysocki wrote: > > Hi All, > > > > Well, this took more time than expected, as I tried to cover everything I > > had > > in mind regarding PM flags for drivers. > > For the parts that touch PCI, > > Acked-by: Bjorn Helgaas Thank you! > I doubt there'll be conflicts with changes in my tree, but let me know if > you trip over any so I can watch for them when merging. Well, if there are any conflicts, we'll see them in linux-next I guess. :-) Thanks, Rafael
Re: swap_info_get: Bad swap offset entry 0200f8a7
On Fri, 20 Oct 2017, huang ying wrote: > > 4 May < Linux version 4.11.2-1-ARCH > > 4 Jun < Linux version 4.11.3-1-ARCH > > 7 Jul < Linux version 4.11.9-1-ARCH > > 4 Aug < Linux version 4.12.8-2-ARCH > > 24 Sep < Linux version 4.12.13-1-ARCH > > 158 Oct < Linux version 4.13.5-1-ARCH > > So you have never seen this before 4.11 like 4.10? Unfortunately the kernel logs for that machine only go back until May 2017 and I cannot tell if that hasn't happened before. I've seen these messages appear since then but didn't bother much. But as it now happens more frequently, I thought I should mention this to the list. > Which operations will trigger this error messages? I'm not able to reproduce it at will, but I suspect that memory pressure triggers these messages. The machine in question is an Lenovo Ideapad S10 notebook running 24x7 and is equipped with 1 GB of RAM. Two Java processes are basically using up all the memory, so usually it tooks like this: $ free -m total used free shared buff/cache available Mem: 99486667 1 6020 Swap:760437 322 $ zramctl NAME ALGORITHM DISKSIZE DATA COMPR TOTAL STREAMS MOUNTPOINT /dev/zram0 lz4 248.7M 247M 92.3M 97.4M 2 [SWAP] I just assumed the message is triggered when the system is really low on memory and maybe zram is too slow to provide the memory requested. 
But that's just my layman's assumption :-) For example, today's message was emitted during the night: Oct 20 01:26:18 len kernel: [638973.207849] \ swap_info_get: Bad swap offset entry 0200f8a7 And here are the sysstat numbers for that time frame: $ sar -r -s 00:00 -e 02:00 Linux 4.13.5-1-ARCH (len) 10/20/2017 _i686_ (2 CPU) 12:00:01 AM kbmemfree kbmemused %memused kbbuffers kbcached kbcommit %commit kbactive kbinact kbdirty 12:10:06 AM 70076948404 93.12 4 19004 1556176 86.58376608379408 220 12:20:02 AM 80488937992 92.10 4180404 1563952 87.01380184327736 5568 12:30:03 AM 83296935184 91.82 4137260 1569776 87.3432951233 280 12:40:03 AM 65188953292 93.60 4 21156 1571048 87.41386644389820 1144 12:50:03 AM 67512950968 93.37 4 33452 1570628 87.38378936381580 1304 01:00:07 AM 65520952960 93.57 4 24996 1573180 87.53385396386152 904 01:10:03 AM 66956951524 93.43 4 35520 1572696 87.50379548379364 172 01:20:02 AM 67440951040 93.38 4 88736 1569864 87.34381764370472 7080 01:30:03 AM 70048948432 93.12 4 29212 1572504 87.49383516381900 1832 01:40:04 AM 71532946948 92.98 4 29220 1570096 87.35380120380284 1000 01:50:03 AM 65828952652 93.54 4 34408 1570604 87.38381040381028 1604 Average:70353948127 93.09 4 57579 1569139 87.30376661371613 1919 == If that is unreadable, here it is again: https://paste.debian.net/991927/ > Is it possible for you to check > whether the error exists for normal swap device (not ZRAM)? I have "normal" (but encrpted) swap configured but with a lower priority: cat /proc/swaps FilenameTypeSizeUsedPriority /dev/dm-0 partition 524284 194348 0 /dev/zram0 partition 254616 253536 32767 I shall disable the zram device and disable encryption too and will report back if the message appears again. > 32bit or 64bit kernel do you use? I'm using an i686 kernel for this Atom N270 processor (with HT enabled). Thanks for your response, Christian. -- BOFH excuse #403: Sysadmin didn't hear pager go off due to loud music from bar-room speakers.
Re: swap_info_get: Bad swap offset entry 0200f8a7
On Fri, 20 Oct 2017, huang ying wrote: > > 4 May < Linux version 4.11.2-1-ARCH > > 4 Jun < Linux version 4.11.3-1-ARCH > > 7 Jul < Linux version 4.11.9-1-ARCH > > 4 Aug < Linux version 4.12.8-2-ARCH > > 24 Sep < Linux version 4.12.13-1-ARCH > > 158 Oct < Linux version 4.13.5-1-ARCH > > So you have never seen this before 4.11 like 4.10? Unfortunately the kernel logs for that machine only go back until May 2017 and I cannot tell if that hasn't happened before. I've seen these messages appear since then but didn't bother much. But as it now happens more frequently, I thought I should mention this to the list. > Which operations will trigger this error messages? I'm not able to reproduce it at will, but I suspect that memory pressure triggers these messages. The machine in question is a Lenovo Ideapad S10 notebook running 24x7 and is equipped with 1 GB of RAM. Two Java processes are basically using up all the memory, so usually it looks like this: $ free -m total used free shared buff/cache available Mem: 99486667 1 6020 Swap:760437 322 $ zramctl NAME ALGORITHM DISKSIZE DATA COMPR TOTAL STREAMS MOUNTPOINT /dev/zram0 lz4 248.7M 247M 92.3M 97.4M 2 [SWAP] I just assumed the message is triggered when the system is really low on memory and maybe zram is too slow to provide the memory requested. 
But that's just my layman's assumption :-) For example, today's message was emitted during the night: Oct 20 01:26:18 len kernel: [638973.207849] \ swap_info_get: Bad swap offset entry 0200f8a7 And here are the sysstat numbers for that time frame: $ sar -r -s 00:00 -e 02:00 Linux 4.13.5-1-ARCH (len) 10/20/2017 _i686_ (2 CPU) 12:00:01 AM kbmemfree kbmemused %memused kbbuffers kbcached kbcommit %commit kbactive kbinact kbdirty 12:10:06 AM 70076948404 93.12 4 19004 1556176 86.58376608379408 220 12:20:02 AM 80488937992 92.10 4180404 1563952 87.01380184327736 5568 12:30:03 AM 83296935184 91.82 4137260 1569776 87.3432951233 280 12:40:03 AM 65188953292 93.60 4 21156 1571048 87.41386644389820 1144 12:50:03 AM 67512950968 93.37 4 33452 1570628 87.38378936381580 1304 01:00:07 AM 65520952960 93.57 4 24996 1573180 87.53385396386152 904 01:10:03 AM 66956951524 93.43 4 35520 1572696 87.50379548379364 172 01:20:02 AM 67440951040 93.38 4 88736 1569864 87.34381764370472 7080 01:30:03 AM 70048948432 93.12 4 29212 1572504 87.49383516381900 1832 01:40:04 AM 71532946948 92.98 4 29220 1570096 87.35380120380284 1000 01:50:03 AM 65828952652 93.54 4 34408 1570604 87.38381040381028 1604 Average:70353948127 93.09 4 57579 1569139 87.30376661371613 1919 == If that is unreadable, here it is again: https://paste.debian.net/991927/ > Is it possible for you to check > whether the error exists for normal swap device (not ZRAM)? I have "normal" (but encrpted) swap configured but with a lower priority: cat /proc/swaps FilenameTypeSizeUsedPriority /dev/dm-0 partition 524284 194348 0 /dev/zram0 partition 254616 253536 32767 I shall disable the zram device and disable encryption too and will report back if the message appears again. > 32bit or 64bit kernel do you use? I'm using an i686 kernel for this Atom N270 processor (with HT enabled). Thanks for your response, Christian. -- BOFH excuse #403: Sysadmin didn't hear pager go off due to loud music from bar-room speakers.
[PATCH v2 2/4] android: binder: Fix null ptr dereference in debug msg
Don't access next->data in kernel debug message when the next buffer is null. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e12072b1d507..c2819a3d58a6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -560,7 +560,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", alloc->pid, buffer->data, - prev->data, next->data); + prev->data, next ? next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE, NULL); -- 2.11.0 (Apple Git-81)
[PATCH v2 2/4] android: binder: Fix null ptr dereference in debug msg
Don't access next->data in kernel debug message when the next buffer is null. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e12072b1d507..c2819a3d58a6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -560,7 +560,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: merge free, buffer %pK do not share page with %pK or %pK\n", alloc->pid, buffer->data, - prev->data, next->data); + prev->data, next ? next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), buffer_start_page(buffer) + PAGE_SIZE, NULL); -- 2.11.0 (Apple Git-81)
[PATCH v2 3/4] android: binder: Remove unused vma argument
The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. Acked-by: Arve HjønnevågSigned-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index c2819a3d58a6..36bcea1fc47c 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + if (need_mm && mmget_not_zero(alloc->vma_vm_mm)) mm = alloc->vma_vm_mm; if (mm) { @@ -437,7 +437,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -478,7 +478,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), -end_page_addr, NULL); +end_page_addr); return ERR_PTR(-ENOMEM); } @@ -562,8 +562,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next ? 
next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), -buffer_start_page(buffer) + PAGE_SIZE, -NULL); +buffer_start_page(buffer) + PAGE_SIZE); } list_del(>entry); kfree(buffer); @@ -600,8 +599,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(>rb_node, >allocated_buffers); buffer->free = 1; -- 2.11.0 (Apple Git-81)
[PATCH v2 3/4] android: binder: Remove unused vma argument
The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index c2819a3d58a6..36bcea1fc47c 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + if (need_mm && mmget_not_zero(alloc->vma_vm_mm)) mm = alloc->vma_vm_mm; if (mm) { @@ -437,7 +437,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -478,7 +478,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), -end_page_addr, NULL); +end_page_addr); return ERR_PTR(-ENOMEM); } @@ -562,8 +562,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next ? 
next->data : NULL); binder_update_page_range(alloc, 0, buffer_start_page(buffer), -buffer_start_page(buffer) + PAGE_SIZE, -NULL); +buffer_start_page(buffer) + PAGE_SIZE); } list_del(>entry); kfree(buffer); @@ -600,8 +599,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(>rb_node, >allocated_buffers); buffer->free = 1; -- 2.11.0 (Apple Git-81)
[PATCH v2 1/4] android: binder: Don't get mm from task
Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. Acked-by: Arve HjønnevågSigned-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 22 +- drivers/android/binder_alloc.h | 1 - 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 064f5e31ec55..e12072b1d507 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,12 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) - mm = get_task_mm(alloc->tsk); + if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + mm = alloc->vma_vm_mm; if (mm) { down_write(>mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -720,6 +715,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + mmgrab(alloc->vma_vm_mm); return 0; @@ -795,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(>mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -889,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -926,9 +923,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + if 
(!mmget_not_zero(alloc->vma_vm_mm)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(>mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -963,7 +960,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(>mutex); err_get_alloc_mutex_failed: @@ -1002,7 +999,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(>mutex); INIT_LIST_HEAD(>buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; -- 2.11.0 (Apple Git-81)
[PATCH v2 4/4] android: binder: Change binder_shrinker to static
binder_shrinker struct is not used anywhere outside of binder_alloc.c and should be static. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 36bcea1fc47c..6f6f745605af 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -982,7 +982,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return ret; } -struct shrinker binder_shrinker = { +static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, -- 2.11.0 (Apple Git-81)
[PATCH v2 1/4] android: binder: Don't get mm from task
Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 22 +- drivers/android/binder_alloc.h | 1 - 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 064f5e31ec55..e12072b1d507 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,12 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) - mm = get_task_mm(alloc->tsk); + if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm)) + mm = alloc->vma_vm_mm; if (mm) { down_write(>mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -720,6 +715,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + mmgrab(alloc->vma_vm_mm); return 0; @@ -795,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(>mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -889,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -926,9 +923,9 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + if 
(!mmget_not_zero(alloc->vma_vm_mm)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(>mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -963,7 +960,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(>mutex); err_get_alloc_mutex_failed: @@ -1002,7 +999,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(>mutex); INIT_LIST_HEAD(>buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; -- 2.11.0 (Apple Git-81)
[PATCH v2 4/4] android: binder: Change binder_shrinker to static
binder_shrinker struct is not used anywhere outside of binder_alloc.c and should be static. Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 36bcea1fc47c..6f6f745605af 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -982,7 +982,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return ret; } -struct shrinker binder_shrinker = { +static struct shrinker binder_shrinker = { .count_objects = binder_shrink_count, .scan_objects = binder_shrink_scan, .seeks = DEFAULT_SEEKS, -- 2.11.0 (Apple Git-81)
[PATCH v2 0/4] android: binder: fixes for memory allocator change
This patchset resolves a potential deadlock between lru lock, task lock and dentry lock reported by lockdep. It also fixes the null pointer dereference in kernel debug message in binder_alloc.c. Unused vma argument is removed, and binder shrinker is made static. android: binder: Don't get mm from task android: binder: Fix null ptr dereference in debug msg android: binder: Remove unused vma argument android: binder: Change binder_shrinker to static v2: combines the version 1 of two separate patches and reorder the commits so that the first two can be applied to 4.14. drivers/android/binder_alloc.c | 40 +--- drivers/android/binder_alloc.h | 1 - 2 files changed, 17 insertions(+), 24 deletions(-)
[PATCH v2 0/4] android: binder: fixes for memory allocator change
This patchset resolves a potential deadlock between lru lock, task lock and dentry lock reported by lockdep. It also fixes the null pointer dereference in kernel debug message in binder_alloc.c. Unused vma argument is removed, and binder shrinker is made static. android: binder: Don't get mm from task android: binder: Fix null ptr dereference in debug msg android: binder: Remove unused vma argument android: binder: Change binder_shrinker to static v2: combines the version 1 of two separate patches and reorder the commits so that the first two can be applied to 4.14. drivers/android/binder_alloc.c | 40 +--- drivers/android/binder_alloc.h | 1 - 2 files changed, 17 insertions(+), 24 deletions(-)
Re: [PATCH net 5/5] net/ncsi: Fix length of GVI response packet
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:09 +1100 > From: Gavin Shan > > The length of GVI (GetVersionInfo) response packet should be 40 instead > of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats. > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 002 > > With this applied, no error reported on GVI response packets: > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 200 > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 2/5] net/ncsi: Stop monitor if channel times out or is inactive
From: Samuel Mendoza-JonasDate: Thu, 19 Oct 2017 13:43:06 +1100 > ncsi_channel_monitor() misses stopping the channel monitor in several > places that it should, causing a WARN_ON_ONCE() to trigger when the > monitor is re-started later, eg: > > [ 459.04] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 > ncsi_start_channel_monitor+0x7c/0x90 > [ 459.04] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted > 4.10.17-gaca2fdd #140 > [ 459.04] Hardware name: ASpeed SoC > [ 459.04] Workqueue: events ncsi_dev_work > [ 459.04] [<80010094>] (unwind_backtrace) from [<8000d950>] > (show_stack+0x20/0x24) > [ 459.04] [<8000d950>] (show_stack) from [<801dbf70>] > (dump_stack+0x20/0x28) > [ 459.04] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108) > [ 459.04] [<80018d7c>] (__warn) from [<80018e70>] > (warn_slowpath_null+0x30/0x38) > [ 459.04] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] > (ncsi_start_channel_monitor+0x7c/0x90) > [ 459.04] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] > (ncsi_configure_channel+0xdc/0x5fc) > [ 459.04] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] > (ncsi_dev_work+0xac/0x474) > [ 459.04] [<803f8160>] (ncsi_dev_work) from [<8002d244>] > (process_one_work+0x1e0/0x450) > [ 459.04] [<8002d244>] (process_one_work) from [<8002d510>] > (worker_thread+0x5c/0x570) > [ 459.04] [<8002d510>] (worker_thread) from [<80033614>] > (kthread+0x124/0x164) > [ 459.04] [<80033614>] (kthread) from [<8000a5e8>] > (ret_from_fork+0x14/0x2c) > > This also updates the monitor instead of just returning if > ncsi_xmit_cmd() fails to send the get-link-status command so that the > monitor properly times out. > > Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 3/5] net/ncsi: Disable HWA mode when no channels are found
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:07 +1100 > From: Gavin Shan > > When there are no NCSI channels probed, HWA (Hardware Arbitration) > mode is enabled. It's not correct because HWA depends on the fact: > NCSI channels exist and all of them support HWA mode. This disables > HWA when no channels are probed. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 5/5] net/ncsi: Fix length of GVI response packet
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:09 +1100 > From: Gavin Shan > > The length of GVI (GetVersionInfo) response packet should be 40 instead > of 36. This issue was found from /sys/kernel/debug/ncsi/eth0/stats. > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 002 > > With this applied, no error reported on GVI response packets: > > # ethtool --ncsi eth0 swstats > : > RESPONSE OK TIMEOUT ERROR > === > GVI 200 > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 2/5] net/ncsi: Stop monitor if channel times out or is inactive
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:06 +1100 > ncsi_channel_monitor() misses stopping the channel monitor in several > places that it should, causing a WARN_ON_ONCE() to trigger when the > monitor is re-started later, eg: > > [ 459.04] WARNING: CPU: 0 PID: 1093 at net/ncsi/ncsi-manage.c:269 > ncsi_start_channel_monitor+0x7c/0x90 > [ 459.04] CPU: 0 PID: 1093 Comm: kworker/0:3 Not tainted > 4.10.17-gaca2fdd #140 > [ 459.04] Hardware name: ASpeed SoC > [ 459.04] Workqueue: events ncsi_dev_work > [ 459.04] [<80010094>] (unwind_backtrace) from [<8000d950>] > (show_stack+0x20/0x24) > [ 459.04] [<8000d950>] (show_stack) from [<801dbf70>] > (dump_stack+0x20/0x28) > [ 459.04] [<801dbf70>] (dump_stack) from [<80018d7c>] (__warn+0xe0/0x108) > [ 459.04] [<80018d7c>] (__warn) from [<80018e70>] > (warn_slowpath_null+0x30/0x38) > [ 459.04] [<80018e70>] (warn_slowpath_null) from [<803f6a08>] > (ncsi_start_channel_monitor+0x7c/0x90) > [ 459.04] [<803f6a08>] (ncsi_start_channel_monitor) from [<803f7664>] > (ncsi_configure_channel+0xdc/0x5fc) > [ 459.04] [<803f7664>] (ncsi_configure_channel) from [<803f8160>] > (ncsi_dev_work+0xac/0x474) > [ 459.04] [<803f8160>] (ncsi_dev_work) from [<8002d244>] > (process_one_work+0x1e0/0x450) > [ 459.04] [<8002d244>] (process_one_work) from [<8002d510>] > (worker_thread+0x5c/0x570) > [ 459.04] [<8002d510>] (worker_thread) from [<80033614>] > (kthread+0x124/0x164) > [ 459.04] [<80033614>] (kthread) from [<8000a5e8>] > (ret_from_fork+0x14/0x2c) > > This also updates the monitor instead of just returning if > ncsi_xmit_cmd() fails to send the get-link-status command so that the > monitor properly times out. > > Fixes: e6f44ed6d04d3 "net/ncsi: Package and channel management" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 3/5] net/ncsi: Disable HWA mode when no channels are found
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:07 +1100 > From: Gavin Shan > > When there are no NCSI channels probed, HWA (Hardware Arbitration) > mode is enabled. It's not correct because HWA depends on the fact: > NCSI channels exist and all of them support HWA mode. This disables > HWA when no channels are probed. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 4/5] net/ncsi: Enforce failover on link monitor timeout
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:08 +1100 > From: Gavin Shan > > The NCSI channel has been configured to provide service if its link > monitor timer is enabled, regardless of its state (inactive or active). > So the timeout event on the link monitor indicates the out-of-service > on that channel, for which a failover is needed. > > This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor > timeout, regardless the channel's original state (inactive or active). > Also, the link is put into "down" state to give the failing channel > lowest priority when selecting for the active channel. The state of > failing channel should be set to active in order for deinitialization > and failover to be done. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 4/5] net/ncsi: Enforce failover on link monitor timeout
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:08 +1100 > From: Gavin Shan > > The NCSI channel has been configured to provide service if its link > monitor timer is enabled, regardless of its state (inactive or active). > So the timeout event on the link monitor indicates the out-of-service > on that channel, for which a failover is needed. > > This sets NCSI_DEV_RESHUFFLE flag to enforce failover on link monitor > timeout, regardless the channel's original state (inactive or active). > Also, the link is put into "down" state to give the failing channel > lowest priority when selecting for the active channel. The state of > failing channel should be set to active in order for deinitialization > and failover to be done. > > Signed-off-by: Gavin Shan > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 1/5] net/ncsi: Fix AEN HNCDSC packet length
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:05 +1100 > Correct the value of the HNCDSC AEN packet. > Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH net 1/5] net/ncsi: Fix AEN HNCDSC packet length
From: Samuel Mendoza-Jonas Date: Thu, 19 Oct 2017 13:43:05 +1100 > Correct the value of the HNCDSC AEN packet. > Fixes: 7a82ecf4cfb85 "net/ncsi: NCSI AEN packet handler" > > Signed-off-by: Samuel Mendoza-Jonas Applied.
Re: [PATCH 7/7] gpio: brcmstb: implement suspend/resume/shutdown
On Thu, Oct 19, 2017 at 11:39 AM, Doug Berger wrote: >>> +static int brcmstb_gpio_resume(struct device *dev) >>> +{ >>> +struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); >>> +struct brcmstb_gpio_bank *bank; >>> +u32 wake_mask = 0; >> >> This isn't really being used as a mask, contrary to appearances. It's >> just tracking whether any active IRQs were seen. Please change to use a >> bool instead and adjust the name accordingly. >> > > I see your point, but I believe it is cleaner to use this to consolidate > the bit masks returned by each __brcmstb_gpio_get_active_irqs() call. > This allows a single test rather than a test per bank. What about something like this? bool need_wakeup_event = false; list_for_each_entry(bank, >bank_list, node) { need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); brcmstb_gpio_bank_restore(priv, bank); } if (priv->parent_wake_irq && need_wakeup_event) pm_wakeup_event(dev, 0);
Re: [PATCH 7/7] gpio: brcmstb: implement suspend/resume/shutdown
On Thu, Oct 19, 2017 at 11:39 AM, Doug Berger wrote: >>> +static int brcmstb_gpio_resume(struct device *dev) >>> +{ >>> +struct brcmstb_gpio_priv *priv = dev_get_drvdata(dev); >>> +struct brcmstb_gpio_bank *bank; >>> +u32 wake_mask = 0; >> >> This isn't really being used as a mask, contrary to appearances. It's >> just tracking whether any active IRQs were seen. Please change to use a >> bool instead and adjust the name accordingly. >> > > I see your point, but I believe it is cleaner to use this to consolidate > the bit masks returned by each __brcmstb_gpio_get_active_irqs() call. > This allows a single test rather than a test per bank. What about something like this? bool need_wakeup_event = false; list_for_each_entry(bank, >bank_list, node) { need_wakeup_event |= !!__brcmstb_gpio_get_active_irqs(bank); brcmstb_gpio_bank_restore(priv, bank); } if (priv->parent_wake_irq && need_wakeup_event) pm_wakeup_event(dev, 0);
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 01:24 PM, Jon Masters wrote: > 1). The first thing people do when they get an Arm server is to cat > /proc/cpuinfo. They then come complaining that it's not like x86. They > can't get the output they're looking for and this results in bug filing, > and countless hours on phone calls discussing over and over again. > Worse, there are some parts of the stack that really need this. Within 6 hours of sending this, I get a ping about this week's "Works On Arm" newsletter and...people reporting bugs with not getting CPU capabilities in /proc/cpuinfo. This madness is going to end. Soon. Jon. -- Computer Architect | Sent from my Fedora powered laptop
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 01:24 PM, Jon Masters wrote: > 1). The first thing people do when they get an Arm server is to cat > /proc/cpuinfo. They then come complaining that it's not like x86. They > can't get the output they're looking for and this results in bug filing, > and countless hours on phone calls discussing over and over again. > Worse, there are some parts of the stack that really need this. Within 6 hours of sending this, I get a ping about this week's "Works On Arm" newsletter and...people reporting bugs with not getting CPU capabilities in /proc/cpuinfo. This madness is going to end. Soon. Jon. -- Computer Architect | Sent from my Fedora powered laptop
[PATCH] net: core: rtnetlink: use BUG_ON instead of if condition followed by BUG
Use BUG_ON instead of if condition followed by BUG in do_setlink. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. net/core/rtnetlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04680a5..df8dba9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2274,8 +2274,7 @@ static int do_setlink(const struct sk_buff *skb, rcu_read_lock(); - if (!(af_ops = rtnl_af_lookup(nla_type(af)))) - BUG(); + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af); if (err < 0) { -- 2.7.4
[PATCH] net: core: rtnetlink: use BUG_ON instead of if condition followed by BUG
Use BUG_ON instead of if condition followed by BUG in do_setlink. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- This code was tested by compilation only. net/core/rtnetlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 04680a5..df8dba9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2274,8 +2274,7 @@ static int do_setlink(const struct sk_buff *skb, rcu_read_lock(); - if (!(af_ops = rtnl_af_lookup(nla_type(af - BUG(); + BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af; err = af_ops->set_link_af(dev, af); if (err < 0) { -- 2.7.4
Re: [PATCH] MIPS: kernel: proc: Remove spurious white space in cpuinfo
On 10/20/2017 01:47 PM, Maciej W. Rozycki wrote: On Fri, 20 Oct 2017, Aleksandar Markovic wrote: Remove unnecessary space from FPU info segment of /proc/cpuinfo. NAK. As I recall back in Nov 2001 I placed the extra space there to visually separate the CPU part from the FPU part, e.g.: cpu model : R3000A V3.0 FPU V4.0 cpu model : SiByte SB1 V0.2 FPU V0.2 etc. And the motivation behind it still stands. Please remember that /proc/cpuinfo is there for live humans to parse and grouping all these pieces together would make it harder. Which means your change adds no value I'm afraid. I think it is even riskier than that. This is part of the kernel-userspace ABI, many programs parse this file, any gratuitous changes risk breaking something. I don't really have an opinion about the various *printf functions being used, but think the resultant change in what is visible to userspace should not be done. NB regrettably back in those days much of the patch traffic happened off any mailing list, however I have quickly tracked down my archival copy of the original submission of the change introducing this piece of code and I'll be happy to share it with anyone upon request. Maciej
Re: [PATCH] MIPS: kernel: proc: Remove spurious white space in cpuinfo
On 10/20/2017 01:47 PM, Maciej W. Rozycki wrote: On Fri, 20 Oct 2017, Aleksandar Markovic wrote: Remove unnecessary space from FPU info segment of /proc/cpuinfo. NAK. As I recall back in Nov 2001 I placed the extra space there to visually separate the CPU part from the FPU part, e.g.: cpu model : R3000A V3.0 FPU V4.0 cpu model : SiByte SB1 V0.2 FPU V0.2 etc. And the motivation behind it still stands. Please remember that /proc/cpuinfo is there for live humans to parse and grouping all these pieces together would make it harder. Which means your change adds no value I'm afraid. I think it is even riskier than that. This is part of the kernel-userspace ABI, many programs parse this file, any gratuitous changes risk breaking something. I don't really have an opinion about the various *printf functions being used, but think the resultant change in what is visible to userspace should not be done. NB regrettably back in those days much of the patch traffic happened off any mailing list, however I have quickly tracked down my archival copy of the original submission of the change introducing this piece of code and I'll be happy to share it with anyone upon request. Maciej
Re: [PATCH] PCI/AER: update AER status string print to match other AER logs
On Tue, Oct 17, 2017 at 09:42:02AM -0600, Tyler Baicar wrote: > Currently the AER driver uses cper_print_bits() to print the AER status > string. This causes the status string to not include the proper PCI device > name prefix that the other AER prints include. Also, it has a different > print level than all the other AER prints. > > Update the AER driver to print the AER status string with the proper string > prefix and proper print level. > > Previous log example: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > Receiver Error, Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID > > New log: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > e1000e 0003:01:00.1: Receiver Error > e1000e 0003:01:00.1: Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > pcieport 0003:00:00.0: Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID I definitely think it's MUCH better to use dev_err() as you do. I don't like the cper_print_bits() strategy of inserting line breaks to fit in 80 columns. That leads to atomicity issues, e.g., other printk output getting inserted in the middle of a single AER log, and suggests an ordering ("Receiver Error" occurred before "Bad TLP") that isn't real. It'd be ideal if everything fit on one line per event, but that might not be practical. I'm not necessarily attached to the actual strings. These messages are for sophisticated users and maybe could be abbreviated as in lspci output. It might actually be kind of neat if the output here matched up with the output of "lspci -vv" (lspci prints all the bits; here you probably want only the set bits). Or maybe not. 
But even what you have here is a huge improvement. I *hate* unattached things in dmesg like we currently get. There's no reliable way to connect that "Receiver Error, Bad TLP" with the device. > Signed-off-by: Tyler Baicar> --- > drivers/pci/pcie/aer/aerdrv_errprint.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c > b/drivers/pci/pcie/aer/aerdrv_errprint.c > index 54c4b69..b718daa 100644 > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c > @@ -206,6 +206,19 @@ void aer_print_port_info(struct pci_dev *dev, struct > aer_err_info *info) > } > > #ifdef CONFIG_ACPI_APEI_PCIEAER > +void dev_print_bits(struct pci_dev *dev, unsigned int bits, > + const char * const strs[], unsigned int strs_size) > +{ > + unsigned int i; > + > + for (i = 0; i < strs_size; i++) { > + if (!(bits & (1U << i))) > + continue; > + if (strs[i]) > + dev_err(>dev, "%s\n", strs[i]); > + } > +} > + > int cper_severity_to_aer(int cper_severity) > { > switch (cper_severity) { > @@ -243,7 +256,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, > agent = AER_GET_AGENT(aer_severity, status); > > dev_err(>dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, > mask); > - cper_print_bits("", status, status_strs, status_strs_size); > + dev_print_bits(dev, status, status_strs, status_strs_size); > dev_err(>dev, "aer_layer=%s, aer_agent=%s\n", > aer_error_layer[layer], aer_agent_string[agent]); > > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, > a Linux Foundation Collaborative Project. >
Re: [PATCH] PCI/AER: update AER status string print to match other AER logs
On Tue, Oct 17, 2017 at 09:42:02AM -0600, Tyler Baicar wrote: > Currently the AER driver uses cper_print_bits() to print the AER status > string. This causes the status string to not include the proper PCI device > name prefix that the other AER prints include. Also, it has a different > print level than all the other AER prints. > > Update the AER driver to print the AER status string with the proper string > prefix and proper print level. > > Previous log example: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > Receiver Error, Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID > > New log: > > e1000e 0003:01:00.1: aer_status: 0x0041, aer_mask: 0x > e1000e 0003:01:00.1: Receiver Error > e1000e 0003:01:00.1: Bad TLP > e1000e 0003:01:00.1: aer_layer=Physical Layer, aer_agent=Receiver ID > pcieport 0003:00:00.0: aer_status: 0x1000, aer_mask: 0xe000 > pcieport 0003:00:00.0: Replay Timer Timeout > pcieport 0003:00:00.0: aer_layer=Data Link Layer, aer_agent=Transmitter ID I definitely think it's MUCH better to use dev_err() as you do. I don't like the cper_print_bits() strategy of inserting line breaks to fit in 80 columns. That leads to atomicity issues, e.g., other printk output getting inserted in the middle of a single AER log, and suggests an ordering ("Receiver Error" occurred before "Bad TLP") that isn't real. It'd be ideal if everything fit on one line per event, but that might not be practical. I'm not necessarily attached to the actual strings. These messages are for sophisticated users and maybe could be abbreviated as in lspci output. It might actually be kind of neat if the output here matched up with the output of "lspci -vv" (lspci prints all the bits; here you probably want only the set bits). Or maybe not. 
But even what you have here is a huge improvement. I *hate* unattached things in dmesg like we currently get. There's no reliable way to connect that "Receiver Error, Bad TLP" with the device. > Signed-off-by: Tyler Baicar > --- > drivers/pci/pcie/aer/aerdrv_errprint.c | 15 ++- > 1 file changed, 14 insertions(+), 1 deletion(-) > > diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c > b/drivers/pci/pcie/aer/aerdrv_errprint.c > index 54c4b69..b718daa 100644 > --- a/drivers/pci/pcie/aer/aerdrv_errprint.c > +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c > @@ -206,6 +206,19 @@ void aer_print_port_info(struct pci_dev *dev, struct > aer_err_info *info) > } > > #ifdef CONFIG_ACPI_APEI_PCIEAER > +void dev_print_bits(struct pci_dev *dev, unsigned int bits, > + const char * const strs[], unsigned int strs_size) > +{ > + unsigned int i; > + > + for (i = 0; i < strs_size; i++) { > + if (!(bits & (1U << i))) > + continue; > + if (strs[i]) > + dev_err(>dev, "%s\n", strs[i]); > + } > +} > + > int cper_severity_to_aer(int cper_severity) > { > switch (cper_severity) { > @@ -243,7 +256,7 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity, > agent = AER_GET_AGENT(aer_severity, status); > > dev_err(>dev, "aer_status: 0x%08x, aer_mask: 0x%08x\n", status, > mask); > - cper_print_bits("", status, status_strs, status_strs_size); > + dev_print_bits(dev, status, status_strs, status_strs_size); > dev_err(>dev, "aer_layer=%s, aer_agent=%s\n", > aer_error_layer[layer], aer_agent_string[agent]); > > -- > Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm > Technologies, Inc. > Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, > a Linux Foundation Collaborative Project. >
[PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
From: Randy Dunlap Fix build when CONFIG_MODULES is not enabled. Fixes multiple build errors. Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Andi Kleen --- lib/bug.c |2 ++ 1 file changed, 2 insertions(+) --- mmotm-2017-1019-1700.orig/lib/bug.c +++ mmotm-2017-1019-1700/lib/bug.c @@ -206,6 +206,7 @@ static void clear_once_table(struct bug_ void generic_bug_clear_once(void) { +#ifdef CONFIG_MODULES struct module *mod; rcu_read_lock_sched(); @@ -213,6 +214,7 @@ void generic_bug_clear_once(void) clear_once_table(mod->bug_table, mod->bug_table + mod->num_bugs); rcu_read_unlock_sched(); +#endif clear_once_table(__start___bug_table, __stop___bug_table); }
[PATCH mmotm] lib/bug.c: fix build when MODULES are not enabled
From: Randy Dunlap Fix build when CONFIG_MODULES is not enabled. Fixes multiple build errors. Signed-off-by: Randy Dunlap Reported-by: kbuild test robot Cc: Andi Kleen --- lib/bug.c |2 ++ 1 file changed, 2 insertions(+) --- mmotm-2017-1019-1700.orig/lib/bug.c +++ mmotm-2017-1019-1700/lib/bug.c @@ -206,6 +206,7 @@ static void clear_once_table(struct bug_ void generic_bug_clear_once(void) { +#ifdef CONFIG_MODULES struct module *mod; rcu_read_lock_sched(); @@ -213,6 +214,7 @@ void generic_bug_clear_once(void) clear_once_table(mod->bug_table, mod->bug_table + mod->num_bugs); rcu_read_unlock_sched(); +#endif clear_once_table(__start___bug_table, __stop___bug_table); }
Re: [bug report] regression bisected to "block: Make most scsi_req_init() calls implicit"
On Fri, 2017-10-20 at 16:54 -0600, dann frazier wrote: > hey, > I'm seeing a regression when executing 'dmraid -r -c' in an arm64 > QEMU guest, which I've bisected to the following commit: > > ca18d6f7 "block: Make most scsi_req_init() calls implicit" > > I haven't yet had time to try and debug it yet, but wanted to get > the report out there before the weekend. Here's the crash: > > [ 138.519885] usercopy: kernel memory overwrite attempt detected to > (null) () (6 bytes) > [ 138.521562] kernel BUG at mm/usercopy.c:72! > [ 138.522294] Internal error: Oops - BUG: 0 [#1] SMP > [ 138.523105] Modules linked in: nls_utf8 isofs nls_iso8859_1 qemu_fw_cfg > ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi > scsi_transport_iscsi ip_tables > x_tables autofs4 btrfs zstd_decompress zstd_compress xxhash raid10 raid456 > async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq > libcrc32c raid1 raid0 > multipath linear aes_ce_blk aes_ce_cipher crc32_ce crct10dif_ce ghash_ce > sha2_ce sha256_arm64 sha1_ce virtio_net virtio_blk aes_neon_bs aes_neon_blk > crypto_simd cryptd > aes_arm64 > [ 138.531307] CPU: 62 PID: 2271 Comm: dmraid Not tainted 4.14.0-rc5+ #20 > [ 138.532512] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 > [ 138.533796] task: 8003cba2e900 task.stack: 110e8000 > [ 138.534887] PC is at __check_object_size+0x114/0x200 > [ 138.535800] LR is at __check_object_size+0x114/0x200 > [ 138.536711] pc : [] lr : [] pstate: > 00400145 > [ 138.538073] sp : 110ebb00 > [ 138.538682] x29: 110ebb00 x28: > [ 138.539658] x27: d88e1110 x26: 8003e8d3d800 > [ 138.540633] x25: 0802001d x24: 8003e1131920 > [ 138.541621] x23: 0006 x22: 0006 > [ 138.542596] x21: x20: 0006 > [ 138.543571] x19: x18: > [ 138.544548] x17: 83380ce0 x16: 082dd3b0 > [ 138.545525] x15: 093c8c08 x14: 6c756e2820202020 > [ 138.546511] x13: 202020202020206f x12: 7420646574636574 > [ 138.547489] x11: 093c9658 x10: 086ae800 > [ 138.548466] x9 : 7265766f2079726f x8 : 0017 > 
[ 138.549445] x7 : 6c756e3c2820296c x6 : 8003eeb51c28 > [ 138.550434] x5 : 8003eeb51c28 x4 : > [ 138.551411] x3 : 8003eeb59ec8 x2 : d4a0cd0f45236000 > [ 138.552388] x1 : x0 : 0059 > [ 138.553364] Process dmraid (pid: 2271, stack limit = 0x110e8000) > [ 138.554593] Call trace: > [ 138.555043] Exception stack(0x110eb9c0 to 0x110ebb00) > [ 138.556214] b9c0: 0059 d4a0cd0f45236000 > 8003eeb59ec8 > [ 138.557653] b9e0: 8003eeb51c28 8003eeb51c28 > 6c756e3c2820296c > [ 138.559082] ba00: 0017 7265766f2079726f 086ae800 > 093c9658 > [ 138.560510] ba20: 7420646574636574 202020202020206f 6c756e2820202020 > 093c8c08 > [ 138.561950] ba40: 082dd3b0 83380ce0 > > [ 138.563379] ba60: 0006 0006 > 0006 > [ 138.564805] ba80: 8003e1131920 0802001d 8003e8d3d800 > d88e1110 > [ 138.566238] baa0: 110ebb00 082c0e5c > 110ebb00 > [ 138.567666] bac0: 082c0e5c 00400145 08e25a80 > > [ 138.569090] bae0: 0001 0006 110ebb00 > 082c0e5c > [ 138.570523] [] __check_object_size+0x114/0x200 > [ 138.571628] [] sg_io+0x120/0x438 > [ 138.572507] [] scsi_cmd_ioctl+0x594/0x728 > [ 138.573531] [] scsi_cmd_blk_ioctl+0x50/0x60 > [ 138.574594] [] virtblk_ioctl+0x60/0x80 [virtio_blk] > [ 138.575769] [] blkdev_ioctl+0x5e4/0xb50 > [ 138.576756] [] block_ioctl+0x50/0x68 > [ 138.577698] [] do_vfs_ioctl+0xc4/0x940 > [ 138.578671] [] SyS_ioctl+0x8c/0xa8 > [ 138.579581] Exception stack(0x110ebec0 to 0x110ec000) > [ 138.580752] bec0: 0005 2285 d88e10b8 > 0006 > [ 138.582199] bee0: 0004 83416648 > 0050 > [ 138.583623] bf00: 001d 0003 0012 > 0011 > [ 138.585050] bf20: 83409000 00ff 8309dc70 > 0531 > [ 138.586490] bf40: 8344a360 83380ce0 00dc > 83478948 > [ 138.587918] bf60: 0004 17ee7f90 0005 > 17ede920 > [ 138.589346] bf80: 17ee7f60 0003 83416648 > 17ee7f60 > [ 138.590785] bfa0: d88e1218 d88e1090 834166dc > d88e1090 > [ 138.592215] bfc0: 83380cec 8000 0005 > 001d > [ 138.593649] bfe0:
Re: [bug report] regression bisected to "block: Make most scsi_req_init() calls implicit"
On Fri, 2017-10-20 at 16:54 -0600, dann frazier wrote: > hey, > I'm seeing a regression when executing 'dmraid -r -c' in an arm64 > QEMU guest, which I've bisected to the following commit: > > ca18d6f7 "block: Make most scsi_req_init() calls implicit" > > I haven't yet had time to try and debug it yet, but wanted to get > the report out there before the weekend. Here's the crash: > > [ 138.519885] usercopy: kernel memory overwrite attempt detected to > (null) () (6 bytes) > [ 138.521562] kernel BUG at mm/usercopy.c:72! > [ 138.522294] Internal error: Oops - BUG: 0 [#1] SMP > [ 138.523105] Modules linked in: nls_utf8 isofs nls_iso8859_1 qemu_fw_cfg > ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi > scsi_transport_iscsi ip_tables > x_tables autofs4 btrfs zstd_decompress zstd_compress xxhash raid10 raid456 > async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq > libcrc32c raid1 raid0 > multipath linear aes_ce_blk aes_ce_cipher crc32_ce crct10dif_ce ghash_ce > sha2_ce sha256_arm64 sha1_ce virtio_net virtio_blk aes_neon_bs aes_neon_blk > crypto_simd cryptd > aes_arm64 > [ 138.531307] CPU: 62 PID: 2271 Comm: dmraid Not tainted 4.14.0-rc5+ #20 > [ 138.532512] Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 > [ 138.533796] task: 8003cba2e900 task.stack: 110e8000 > [ 138.534887] PC is at __check_object_size+0x114/0x200 > [ 138.535800] LR is at __check_object_size+0x114/0x200 > [ 138.536711] pc : [] lr : [] pstate: > 00400145 > [ 138.538073] sp : 110ebb00 > [ 138.538682] x29: 110ebb00 x28: > [ 138.539658] x27: d88e1110 x26: 8003e8d3d800 > [ 138.540633] x25: 0802001d x24: 8003e1131920 > [ 138.541621] x23: 0006 x22: 0006 > [ 138.542596] x21: x20: 0006 > [ 138.543571] x19: x18: > [ 138.544548] x17: 83380ce0 x16: 082dd3b0 > [ 138.545525] x15: 093c8c08 x14: 6c756e2820202020 > [ 138.546511] x13: 202020202020206f x12: 7420646574636574 > [ 138.547489] x11: 093c9658 x10: 086ae800 > [ 138.548466] x9 : 7265766f2079726f x8 : 0017 > 
[ 138.549445] x7 : 6c756e3c2820296c x6 : 8003eeb51c28 > [ 138.550434] x5 : 8003eeb51c28 x4 : > [ 138.551411] x3 : 8003eeb59ec8 x2 : d4a0cd0f45236000 > [ 138.552388] x1 : x0 : 0059 > [ 138.553364] Process dmraid (pid: 2271, stack limit = 0x110e8000) > [ 138.554593] Call trace: > [ 138.555043] Exception stack(0x110eb9c0 to 0x110ebb00) > [ 138.556214] b9c0: 0059 d4a0cd0f45236000 > 8003eeb59ec8 > [ 138.557653] b9e0: 8003eeb51c28 8003eeb51c28 > 6c756e3c2820296c > [ 138.559082] ba00: 0017 7265766f2079726f 086ae800 > 093c9658 > [ 138.560510] ba20: 7420646574636574 202020202020206f 6c756e2820202020 > 093c8c08 > [ 138.561950] ba40: 082dd3b0 83380ce0 > > [ 138.563379] ba60: 0006 0006 > 0006 > [ 138.564805] ba80: 8003e1131920 0802001d 8003e8d3d800 > d88e1110 > [ 138.566238] baa0: 110ebb00 082c0e5c > 110ebb00 > [ 138.567666] bac0: 082c0e5c 00400145 08e25a80 > > [ 138.569090] bae0: 0001 0006 110ebb00 > 082c0e5c > [ 138.570523] [] __check_object_size+0x114/0x200 > [ 138.571628] [] sg_io+0x120/0x438 > [ 138.572507] [] scsi_cmd_ioctl+0x594/0x728 > [ 138.573531] [] scsi_cmd_blk_ioctl+0x50/0x60 > [ 138.574594] [] virtblk_ioctl+0x60/0x80 [virtio_blk] > [ 138.575769] [] blkdev_ioctl+0x5e4/0xb50 > [ 138.576756] [] block_ioctl+0x50/0x68 > [ 138.577698] [] do_vfs_ioctl+0xc4/0x940 > [ 138.578671] [] SyS_ioctl+0x8c/0xa8 > [ 138.579581] Exception stack(0x110ebec0 to 0x110ec000) > [ 138.580752] bec0: 0005 2285 d88e10b8 > 0006 > [ 138.582199] bee0: 0004 83416648 > 0050 > [ 138.583623] bf00: 001d 0003 0012 > 0011 > [ 138.585050] bf20: 83409000 00ff 8309dc70 > 0531 > [ 138.586490] bf40: 8344a360 83380ce0 00dc > 83478948 > [ 138.587918] bf60: 0004 17ee7f90 0005 > 17ede920 > [ 138.589346] bf80: 17ee7f60 0003 83416648 > 17ee7f60 > [ 138.590785] bfa0: d88e1218 d88e1090 834166dc > d88e1090 > [ 138.592215] bfc0: 83380cec 8000 0005 > 001d > [ 138.593649] bfe0:
Re: [PATCH 09/27] uswsusp: Disable when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > uswsusp allows a user process to dump and then restore kernel state, which > makes it possible to modify the running kernel. Disable this if the kernel > is locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 09/27] uswsusp: Disable when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > uswsusp allows a user process to dump and then restore kernel state, which > makes it possible to modify the running kernel. Disable this if the kernel > is locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 07/27] kexec_file: Disable at runtime if securelevel has been set
On Thu, 19 Oct 2017, David Howells wrote: > From: Chun-Yi Lee > > When KEXEC_VERIFY_SIG is not enabled, kernel should not loads image > through kexec_file systemcall if securelevel has been set. > > This code was showed in Matthew's patch but not in git: > https://lkml.org/lkml/2015/3/13/778 > Reviewed-by: James Morris -- James Morris
Re: [PATCH 07/27] kexec_file: Disable at runtime if securelevel has been set
On Thu, 19 Oct 2017, David Howells wrote: > From: Chun-Yi Lee > > When KEXEC_VERIFY_SIG is not enabled, kernel should not loads image > through kexec_file systemcall if securelevel has been set. > > This code was showed in Matthew's patch but not in git: > https://lkml.org/lkml/2015/3/13/778 > Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 10:10 AM, Mark Rutland wrote: > Hi Al, > > On Mon, Oct 16, 2017 at 05:43:19PM -0600, Al Stone wrote: >> On 10/13/2017 08:27 AM, Mark Rutland wrote: >>> I certainly agree that exposing the information that we have is useful, >>> as I have stated several times. I'm not NAKing exposing this information >>> elsewhere. >>> >>> If you want a consistent cross-architecture interface for this >>> information, then you need to propose a new one. That was we can >>> actually solve the underlying issues, for all architectures, without >>> breaking ABI. >>> >>> I would be *very* interested in such an interface, and would be more >>> than happy to help. >> >> I'm playing with some patches that do very similar things in sysfs, vs >> proc. Is that better :)? > > Exposing data under sysfs is certainly better, yes. :) > >> Obviously, you'll have to see the patches to >> properly answer that, but what I'm playing with at present is placing >> this info in new entries in /sys/devices/cpu and/or /sys/devices/system, >> and generating some of the content based on what's already in header files >> (e.g., in cputype.h). > > My opposition to MIDR -> string mapping applies regardless of > location... Harumph. This is the one thing I get asked for most often, however (second most is frequency). It turns out humans are not nearly as good at indexed lookups as computers, hence the requests. Whatever the root of the opposition is, it needs to get fixed. My fear is that if it doesn't get fixed in the firmware or the kernel, it will get fixed in some far messier, less controllable way somewhere else (more likely several somewhere elses) and just exacerbate the problem. >> The idea of course is to keep this new info from touching any existing >> info so we don't break compatibility -- does that feel like a better >> direction, at least? > > ... but otherwise this sounds good to me! > > Thanks, > Mark. > -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH 0/3] arm64: cpuinfo: make /proc/cpuinfo more human-readable
On 10/20/2017 10:10 AM, Mark Rutland wrote: > Hi Al, > > On Mon, Oct 16, 2017 at 05:43:19PM -0600, Al Stone wrote: >> On 10/13/2017 08:27 AM, Mark Rutland wrote: >>> I certainly agree that exposing the information that we have is useful, >>> as I have stated several times. I'm not NAKing exposing this information >>> elsewhere. >>> >>> If you want a consistent cross-architecture interface for this >>> information, then you need to propose a new one. That was we can >>> actually solve the underlying issues, for all architectures, without >>> breaking ABI. >>> >>> I would be *very* interested in such an interface, and would be more >>> than happy to help. >> >> I'm playing with some patches that do very similar things in sysfs, vs >> proc. Is that better :)? > > Exposing data under sysfs is certainly better, yes. :) > >> Obviously, you'll have to see the patches to >> properly answer that, but what I'm playing with at present is placing >> this info in new entries in /sys/devices/cpu and/or /sys/devices/system, >> and generating some of the content based on what's already in header files >> (e.g., in cputype.h). > > My opposition to MIDR -> string mapping applies regardless of > location... Harumph. This is the one thing I get asked for most often, however (second most is frequency). It turns out humans are not nearly as good at indexed lookups as computers, hence the requests. Whatever the root of the opposition is, it needs to get fixed. My fear is that if it doesn't get fixed in the firmware or the kernel, it will get fixed in some far messier, less controllable way somewhere else (more likely several somewhere elses) and just exacerbate the problem. >> The idea of course is to keep this new info from touching any existing >> info so we don't break compatibility -- does that feel like a better >> direction, at least? > > ... but otherwise this sounds good to me! > > Thanks, > Mark. > -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH 00/23] Hardened usercopy whitelisting
On 21/10/2017 00:40, Paolo Bonzini wrote: > This breaks KVM completely on x86, due to two ioctls > (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct > kvm_vcpu_arch. > > There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is > obsolete and not a big deal at all. > > I can post some patches, but probably not until the beginning of > November due to travelling. Please do not send this too close to the > beginning of the merge window. Sleeping is overrated, sending patches now... Paolo
Re: [PATCH 00/23] Hardened usercopy whitelisting
On 21/10/2017 00:40, Paolo Bonzini wrote: > This breaks KVM completely on x86, due to two ioctls > (KVM_GET/SET_CPUID2) accessing the cpuid_entries field of struct > kvm_vcpu_arch. > > There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is > obsolete and not a big deal at all. > > I can post some patches, but probably not until the beginning of > November due to travelling. Please do not send this too close to the > beginning of the merge window. Sleeping is overrated, sending patches now... Paolo
[PATCH 2/2] KVM: fix KVM_XEN_HVM_CONFIG ioctl
This ioctl is obsolete (it was used by Xenner as far as I know) but still let's not break it gratuitously... Its handler is copying directly into struct kvm. Go through a bounce buffer instead, with the added benefit that we can actually do something useful with the flags argument---the previous code was exiting with -EINVAL but still doing the copy. This technically is a userspace ABI breakage, but since no one should be using the ioctl, it's a good occasion to see if someone actually complains. Cc: kernel-harden...@lists.openwall.com Cc: Kees CookCc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 415529a78c37..c76d7afa30be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4218,13 +4218,14 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(>lock); break; case KVM_XEN_HVM_CONFIG: { + struct kvm_xen_hvm_config xhc; r = -EFAULT; - if (copy_from_user(>arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) + if (copy_from_user(, argp, sizeof(xhc))) goto out; r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) + if (xhc.flags) goto out; + memcpy(>arch.xen_hvm_config, , sizeof(xhc)); r = 0; break; } -- 2.14.2
[PATCH 2/2] KVM: fix KVM_XEN_HVM_CONFIG ioctl
This ioctl is obsolete (it was used by Xenner as far as I know) but still let's not break it gratuitously... Its handler is copying directly into struct kvm. Go through a bounce buffer instead, with the added benefit that we can actually do something useful with the flags argument---the previous code was exiting with -EINVAL but still doing the copy. This technically is a userspace ABI breakage, but since no one should be using the ioctl, it's a good occasion to see if someone actually complains. Cc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 415529a78c37..c76d7afa30be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4218,13 +4218,14 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(>lock); break; case KVM_XEN_HVM_CONFIG: { + struct kvm_xen_hvm_config xhc; r = -EFAULT; - if (copy_from_user(>arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) + if (copy_from_user(, argp, sizeof(xhc))) goto out; r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) + if (xhc.flags) goto out; + memcpy(>arch.xen_hvm_config, , sizeof(xhc)); r = 0; break; } -- 2.14.2
[PATCH 0/2] KVM: fixes for the kernel-hardening tree
Two KVM ioctls (KVM_GET/SET_CPUID2) directly access the cpuid_entries field of struct kvm_vcpu_arch. Therefore, the new usercopy hardening work in linux-next, which forbids copies from and to slab objects unless they are from kmalloc or explicitly whitelisted, breaks KVM completely. This series fixes it by adding the two new usercopy arguments to kvm_init (more precisely to a new function kvm_init_usercopy, while kvm_init passes zeroes as a default). There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is obsolete and not a big deal at all. I'm Ccing all submaintainers in case they have something similar going on in their kvm_arch and kvm_vcpu_arch structs. KVM has a pretty complex userspace API, so thorough with linux-next is highly recommended. Many thanks to Thomas Gleixner for reporting this to me. Paolo Paolo Bonzini (2): KVM: allow setting a usercopy region in struct kvm_vcpu KVM: fix KVM_XEN_HVM_CONFIG ioctl arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 17 ++--- include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 40 insertions(+), 14 deletions(-) -- 2.14.2
[PATCH 1/2] KVM: allow setting a usercopy region in struct kvm_vcpu
On x86, struct kvm_vcpu has a usercopy region corresponding to the CPUID entries. The area is read and written by the KVM_GET/SET_CPUID2 ioctls. Without this patch, KVM is completely broken on x86 with usercopy hardening enabled. Define kvm_init in terms of a more generic function that allows setting a usercopy region. Because x86 has separate kvm_init callers for Intel and AMD, another variant called kvm_init_x86 passes the region corresponding to the cpuid_entries array. Reported-by: Thomas GleixnerCc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Cc: Christoffer Dall Cc: Marc Zyngier Cc: Christian Borntraeger Cc: Cornelia Huck Cc: James Hogan Cc: Paul Mackerras Signed-off-by: Paolo Bonzini --- The patch is on top of linux-next. arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++ include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b8f937ca398..bb8243d413d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1420,6 +1420,9 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module); + static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff94552f85d0..457433c3a703 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5594,8 +5594,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { static int __init svm_init(void) { - return kvm_init(_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init_x86(_x86_ops, sizeof(struct vcpu_svm), + __alignof__(struct vcpu_svm), 
THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c460b0b439d3..6e78530df6a8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12106,8 +12106,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init_x86(_x86_ops, sizeof(struct vcpu_vmx), +__alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5669af09b732..415529a78c37 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8181,6 +8181,16 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module) +{ + return kvm_init_usercopy(kvm_x86_ops, vcpu_size, vcpu_align, +offsetof(struct kvm_vcpu_arch, cpuid_entries), +sizeof_field(struct kvm_vcpu_arch, cpuid_entries), +module); +} +EXPORT_SYMBOL_GPL(kvm_init_x86); + int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..21e19658b086 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -561,8 +561,17 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); + +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_usercopy_start, unsigned vcpu_usercopy_size, + struct module *module); + +static inline int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + return kvm_init_usercopy(opaque, vcpu_size, vcpu_align, 0, 0, module); +} + void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 
261c782a688f..ac889b28bb54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3959,8 +3959,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } -int kvm_init(void *opaque, unsigned vcpu_size,
[PATCH 0/2] KVM: fixes for the kernel-hardening tree
Two KVM ioctls (KVM_GET/SET_CPUID2) directly access the cpuid_entries field of struct kvm_vcpu_arch. Therefore, the new usercopy hardening work in linux-next, which forbids copies from and to slab objects unless they are from kmalloc or explicitly whitelisted, breaks KVM completely. This series fixes it by adding the two new usercopy arguments to kvm_init (more precisely to a new function kvm_init_usercopy, while kvm_init passes zeroes as a default). There's also another broken ioctl, KVM_XEN_HVM_CONFIG, but it is obsolete and not a big deal at all. I'm Ccing all submaintainers in case they have something similar going on in their kvm_arch and kvm_vcpu_arch structs. KVM has a pretty complex userspace API, so thorough testing with linux-next is highly recommended. Many thanks to Thomas Gleixner for reporting this to me. Paolo Paolo Bonzini (2): KVM: allow setting a usercopy region in struct kvm_vcpu KVM: fix KVM_XEN_HVM_CONFIG ioctl arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 17 ++--- include/linux/kvm_host.h | 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 40 insertions(+), 14 deletions(-) -- 2.14.2
[PATCH 1/2] KVM: allow setting a usercopy region in struct kvm_vcpu
On x86, struct kvm_vcpu has a usercopy region corresponding to the CPUID entries. The area is read and written by the KVM_GET/SET_CPUID2 ioctls. Without this patch, KVM is completely broken on x86 with usercopy hardening enabled. Define kvm_init in terms of a more generic function that allows setting a usercopy region. Because x86 has separate kvm_init callers for Intel and AMD, another variant called kvm_init_x86 passes the region corresponding to the cpuid_entries array. Reported-by: Thomas Gleixner Cc: kernel-harden...@lists.openwall.com Cc: Kees Cook Cc: Radim Krčmář Cc: Christoffer Dall Cc: Marc Zyngier Cc: Christian Borntraeger Cc: Cornelia Huck Cc: James Hogan Cc: Paul Mackerras Signed-off-by: Paolo Bonzini --- The patch is on top of linux-next. arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm.c | 4 ++-- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++ include/linux/kvm_host.h| 13 +++-- virt/kvm/kvm_main.c | 13 - 6 files changed, 36 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6b8f937ca398..bb8243d413d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1420,6 +1420,9 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module); + static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff94552f85d0..457433c3a703 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5594,8 +5594,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { static int __init svm_init(void) { - return kvm_init(_x86_ops, sizeof(struct vcpu_svm), - __alignof__(struct vcpu_svm), THIS_MODULE); + return kvm_init_x86(_x86_ops, sizeof(struct vcpu_svm), + __alignof__(struct 
vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c460b0b439d3..6e78530df6a8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -12106,8 +12106,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { static int __init vmx_init(void) { - int r = kvm_init(_x86_ops, sizeof(struct vcpu_vmx), - __alignof__(struct vcpu_vmx), THIS_MODULE); + int r = kvm_init_x86(_x86_ops, sizeof(struct vcpu_vmx), +__alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5669af09b732..415529a78c37 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8181,6 +8181,16 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int kvm_init_x86(struct kvm_x86_ops *kvm_x86_ops, unsigned vcpu_size, +unsigned vcpu_align, struct module *module) +{ + return kvm_init_usercopy(kvm_x86_ops, vcpu_size, vcpu_align, +offsetof(struct kvm_vcpu_arch, cpuid_entries), +sizeof_field(struct kvm_vcpu_arch, cpuid_entries), +module); +} +EXPORT_SYMBOL_GPL(kvm_init_x86); + int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6882538eda32..21e19658b086 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -561,8 +561,17 @@ static inline void kvm_irqfd_exit(void) { } #endif -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module); + +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_usercopy_start, unsigned vcpu_usercopy_size, + struct module *module); + +static inline int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + struct module *module) +{ + return kvm_init_usercopy(opaque, vcpu_size, vcpu_align, 0, 0, module); +} + void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 
261c782a688f..ac889b28bb54 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3959,8 +3959,9 @@ static void kvm_sched_out(struct preempt_notifier *pn, kvm_arch_vcpu_put(vcpu); } -int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, - struct module *module) +int kvm_init_usercopy(void *opaque, unsigned vcpu_size, unsigned vcpu_align, + unsigned vcpu_arch_usercopy_start, unsigned
Re: [PATCH 05/27] kexec: Disable at runtime if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > kexec permits the loading and execution of arbitrary code in ring 0, which > is something that lock-down is meant to prevent. It makes sense to disable > kexec in this situation. > > This does not affect kexec_file_load() which can check for a signature on the > image to be booted. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells > Acked-by: Dave Young > cc: ke...@lists.infradead.org Reviewed-by: James Morris -- James Morris
Re: [PATCH 05/27] kexec: Disable at runtime if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > kexec permits the loading and execution of arbitrary code in ring 0, which > is something that lock-down is meant to prevent. It makes sense to disable > kexec in this situation. > > This does not affect kexec_file_load() which can check for a signature on the > image to be booted. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells > Acked-by: Dave Young > cc: ke...@lists.infradead.org Reviewed-by: James Morris -- James Morris
Re: [PATCH 04/27] Restrict /dev/mem and /dev/kmem when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > Allowing users to write to address space makes it possible for the kernel to > be subverted, avoiding module loading restrictions. Prevent this when the > kernel has been locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 04/27] Restrict /dev/mem and /dev/kmem when the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > From: Matthew Garrett > > Allowing users to write to address space makes it possible for the kernel to > be subverted, avoiding module loading restrictions. Prevent this when the > kernel has been locked down. > > Signed-off-by: Matthew Garrett > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/5] xfs refcount conversions
On Fri, Oct 20, 2017 at 02:07:53PM +0300, Elena Reshetova wrote: > Note: our previous thread didn't finish in any conclusion, so > I am resending this now (rebased at latest linux-next) to revive > the discussion. refcount_t is slowly becoming a standard for > refcounters and we would really like to make all conversions > done where it is applicable. In a separate "replace atomics with refcounts" discussion, the ordering guarantees of refcounts was raised: https://lkml.org/lkml/2017/9/4/206 i.e. refcounts use weak ordering whilst atomics imply a smp_mb() operation was performed. Given these counters in XFS directly define the life cycle states rather than being just an object refcount, I'm pretty sure they rely on the implied smp_mb() that the atomic operations provide to work correctly. Let me put it this way: Documentation/memory-barriers.txt breaks my brain. We know atomics provide the barriers we need for the code to work correctly, but using anything else requires understanding the ordering of the new code and what Documentation/memory-barriers.txt says that means. IMO, that makes it way too hard to review sanely for code that: a) we already know works correctly b) has guards against reference count problems; and c) isn't performance critical, so doesn't need the complexity of tricky memory ordering and/or barriers. So, really, it comes down to the fact that we know refcount_t is not a straight drop in replacement for atomics, and that actually verifying the change is correct requires an in depth understanding of Documentation/memory-barriers.txt. IMO, that's way too much of a long term maintenance and knowledge burden to add to what is a simple set of reference counters... Cheers, Dave. -- Dave Chinner da...@fromorbit.com
Re: [PATCH 03/27] Enforce module signatures if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > If the kernel is locked down, require that all modules have valid > signatures that we can verify. > > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 03/27] Enforce module signatures if the kernel is locked down
On Thu, 19 Oct 2017, David Howells wrote: > If the kernel is locked down, require that all modules have valid > signatures that we can verify. > > Signed-off-by: David Howells Reviewed-by: James Morris -- James Morris
Re: [PATCH 0/5] xfs refcount conversions
On Fri, Oct 20, 2017 at 02:07:53PM +0300, Elena Reshetova wrote: > Note: our previous thread didn't finish in any conclusion, so > I am resending this now (rebased at latest linux-next) to revive > the discussion. refcount_t is slowly becoming a standard for > refcounters and we would really like to make all conversions > done where it is applicable. In a separate "replace atomics with refcounts" discussion, the ordering guarantees of refcounts was raised: https://lkml.org/lkml/2017/9/4/206 i.e. refcounts use weak ordering whilst atomics imply a smp_mb() operation was performed. Given these counters in XFS directly define the life cycle states rather than being just an object refcount, I'm pretty sure they rely on the implied smp_mb() that the atomic operations provide to work correctly. Let me put it this way: Documentation/memory-barriers.txt breaks my brain. We know atomics provide the barriers we need for the code to work correctly, but using anything else requires understanding the ordering of the new code and what Documentation/memory-barriers.txt says that means. IMO, that makes it way too hard to review sanely for code that: a) we already know works correctly b) has guards against reference count problems; and c) isn't performance critical, so doesn't need the complexity of tricky memory ordering and/or barriers. So, really, it comes down to the fact that we know refcount_t is not a straight drop in replacement for atomics, and that actually verifying the change is correct requires an in depth understanding of Documentation/memory-barriers.txt. IMO, that's way too much of a long term maintenance and knowledge burden to add to what is a simple set of reference counters... Cheers, Dave. -- Dave Chinner da...@fromorbit.com
Re: [PATCH 01/27] Add the ability to lock down access to the running kernel image
On Thu, 19 Oct 2017, David Howells wrote: > Provide a single call to allow kernel code to determine whether the system > should be locked down, thereby disallowing various accesses that might > allow the running kernel image to be changed including the loading of > modules that aren't validly signed with a key we recognise, fiddling with > MSR registers and disallowing hibernation, > > Signed-off-by: David Howells Acked-by: James Morris -- James Morris
Re: [PATCH 01/27] Add the ability to lock down access to the running kernel image
On Thu, 19 Oct 2017, David Howells wrote: > Provide a single call to allow kernel code to determine whether the system > should be locked down, thereby disallowing various accesses that might > allow the running kernel image to be changed including the loading of > modules that aren't validly signed with a key we recognise, fiddling with > MSR registers and disallowing hibernation, > > Signed-off-by: David Howells Acked-by: James Morris -- James Morris