[PATCH 12/27] x86: Lock down IO port access when the kernel is locked down
From: Matthew Garrett IO port access would permit users to gain access to PCI configuration registers, which in turn (on a lot of hardware) give access to MMIO register space. This would potentially permit root to trigger arbitrary DMA, so lock it down by default. This also implicitly locks down the KDADDIO, KDDELIO, KDENABIO and KDDISABIO console ioctls. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: Thomas Gleixner Reviewed-by: "Lee, Chun-Yi" cc: x...@kernel.org Signed-off-by: Matthew Garrett --- arch/x86/kernel/ioport.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 0fe1c8782208..abc702a6ae9c 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -31,7 +31,8 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || + kernel_is_locked_down("ioperm"))) return -EPERM; /* @@ -126,7 +127,8 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + kernel_is_locked_down("iopl")) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | -- 2.21.0.352.gf09ad66450-goog
[PATCH 27/27] lockdown: Print current->comm in restriction messages
From: David Howells Print the content of current->comm in messages generated by lockdown to indicate a restriction that was hit. This makes it a bit easier to find out what caused the message. The message is now patterned something like: Lockdown: <comm>: <what> is restricted; see man kernel_lockdown.7 Signed-off-by: David Howells Signed-off-by: Matthew Garrett --- security/lock_down.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/lock_down.c b/security/lock_down.c index cfbc2c39712b..5243b55b3c1f 100644 --- a/security/lock_down.c +++ b/security/lock_down.c @@ -58,8 +58,8 @@ void __init init_lockdown(void) bool __kernel_is_locked_down(const char *what, bool first) { if (what && first && kernel_locked_down) - pr_notice("Lockdown: %s is restricted; see man kernel_lockdown.7\n", - what); + pr_notice("Lockdown: %s: %s is restricted; see man kernel_lockdown.7\n", + current->comm, what); return kernel_locked_down; } EXPORT_SYMBOL(__kernel_is_locked_down); -- 2.21.0.352.gf09ad66450-goog
[PATCH 19/27] Lock down TIOCSSERIAL
From: David Howells Lock down TIOCSSERIAL as that can be used to change the ioport and irq settings on a serial port. This only appears to be an issue for the serial drivers that use the core serial code. All other drivers seem to either ignore attempts to change port/irq or give an error. Reported-by: Greg Kroah-Hartman Signed-off-by: David Howells cc: Jiri Slaby Signed-off-by: Matthew Garrett --- drivers/tty/serial/serial_core.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index d4cca5bdaf1c..04534877b575 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -842,6 +842,12 @@ static int uart_set_info(struct tty_struct *tty, struct tty_port *port, new_flags = (__force upf_t)new_info->flags; old_custom_divisor = uport->custom_divisor; + if ((change_port || change_irq) && + kernel_is_locked_down("Using TIOCSSERIAL to change device addresses, irqs and dma channels")) { + retval = -EPERM; + goto exit; + } + if (!capable(CAP_SYS_ADMIN)) { retval = -EPERM; if (change_irq || change_port || -- 2.21.0.352.gf09ad66450-goog
[PATCH 15/27] acpi: Ignore acpi_rsdp kernel param when the kernel has been locked down
From: Josh Boyer This option allows userspace to pass the RSDP address to the kernel, which makes it possible for a user to modify the workings of hardware. Reject the option when the kernel is locked down. Signed-off-by: Josh Boyer Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" cc: Dave Young cc: linux-a...@vger.kernel.org Signed-off-by: Matthew Garrett --- drivers/acpi/osl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index f29e427d0d1d..3e44cef7a0cd 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -194,7 +194,7 @@ acpi_physical_address __init acpi_os_get_root_pointer(void) acpi_physical_address pa; #ifdef CONFIG_KEXEC - if (acpi_rsdp) + if (acpi_rsdp && !kernel_is_locked_down("ACPI RSDP specification")) return acpi_rsdp; #endif pa = acpi_arch_get_root_pointer(); -- 2.21.0.352.gf09ad66450-goog
Re: [PATCH 1/3] mm/mincore: make mincore() more conservative
On Wed, 6 Mar 2019, Andrew Morton wrote: > > The semantics of what mincore() considers to be resident is not completely > > clear, but Linux has always (since 2.3.52, which is when mincore() was > > initially done) treated it as "page is available in page cache". > > > > That's potentially a problem, as that [in]directly exposes meta-information > > about pagecache / memory mapping state even about memory not strictly > > belonging > > to the process executing the syscall, opening possibilities for sidechannel > > attacks. > > > > Change the semantics of mincore() so that it only reveals pagecache > > information > > for non-anonymous mappings that belong to files that the calling process > > could > > (if it tried to) successfully open for writing. > > "for writing" comes as a bit of a surprise. Why not for reading? I guess this is a rhetorical question from you :) but fair enough, good point, I'll explain this a bit more in the changelog and in the code comments. > > @@ -189,8 +197,13 @@ static long do_mincore(unsigned long addr, unsigned > > long pages, unsigned char *v > > vma = find_vma(current->mm, addr); > > if (!vma || addr < vma->vm_start) > > return -ENOMEM; > > - mincore_walk.mm = vma->vm_mm; > > end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); > > + if (!can_do_mincore(vma)) { > > + unsigned long pages = (end - addr) >> PAGE_SHIFT; > > I'm not sure this is correct in all cases. If > > addr = 4095 > vma->vm_end = 4096 > pages = 1000 > > then `end' is 4096 and `(end - addr) >> PAGE_SHIFT' is zero, but it > should have been 1. Good catch! It should rather be something like unsigned long pages = (end >> PAGE_SHIFT) - (addr >> PAGE_SHIFT); I'll fix that up and resend tomorrow. Thanks, -- Jiri Kosina SUSE Labs
[PATCH 26/27] debugfs: Restrict debugfs when the kernel is locked down
From: David Howells Disallow opening of debugfs files that might be used to muck around when the kernel is locked down as various drivers give raw access to hardware through debugfs. Given the effort of auditing all 2000 or so files and manually fixing each one as necessary, I've chosen to apply a heuristic instead. The following changes are made: (1) chmod and chown are disallowed on debugfs objects (though the root dir can be modified by mount and remount, but I'm not worried about that). (2) When the kernel is locked down, only files with the following criteria are permitted to be opened: - The file must have mode 00444 - The file must not have ioctl methods - The file must not have mmap (3) When the kernel is locked down, files may only be opened for reading. Normal device interaction should be done through configfs, sysfs or a miscdev, not debugfs. Note that this makes it unnecessary to specifically lock down show_dsts(), show_devs() and show_call() in the asus-wmi driver. I would actually prefer to lock down all files by default and have the files unlocked by the creator. This is tricky to manage correctly, though, as there are 19 creation functions and ~1600 call sites (some of them in loops scanning tables). Signed-off-by: David Howells cc: Andy Shevchenko cc: acpi4asus-u...@lists.sourceforge.net cc: platform-driver-...@vger.kernel.org cc: Matthew Garrett cc: Thomas Gleixner Signed-off-by: Matthew Garrett --- fs/debugfs/file.c | 28 fs/debugfs/inode.c | 30 -- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 4fce1da7db23..c33042c1eff3 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -136,6 +136,25 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/* + * Only permit access to world-readable files when the kernel is locked down. + * We also need to exclude any file that has ways to write or alter it as root + * can bypass the permissions check. 
+ */ +static bool debugfs_is_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) +{ + if ((inode->i_mode & 0) == 0444 && + !(filp->f_mode & FMODE_WRITE) && + !real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap) + return false; + + return kernel_is_locked_down("debugfs"); +} + static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); @@ -147,6 +166,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = -EPERM; + if (debugfs_is_locked_down(inode, filp, real_fops)) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -272,6 +296,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + r = -EPERM; + if (debugfs_is_locked_down(inode, filp, real_fops)) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 13b01351dd1c..4daec17b8215 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -32,6 +32,31 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; +/* + * Don't allow access attributes to be changed whilst the kernel is locked down + * so that we can use the file mode as part of a heuristic to determine whether + * to lock down individual files. 
+ */ +static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + if ((ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) && + kernel_is_locked_down("debugfs")) + return -EPERM; + return simple_setattr(dentry, ia); +} + +static const struct inode_operations debugfs_file_inode_operations = { + .setattr= debugfs_setattr, +}; +static const struct inode_operations debugfs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr= debugfs_setattr, +}; +static const struct inode_operations debugfs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr= debugfs_setattr, +}; + static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -356,6 +381,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_mode =
[PATCH 25/27] Lock down perf
From: David Howells Disallow the use of certain perf facilities that might allow userspace to access kernel data. Signed-off-by: David Howells Signed-off-by: Matthew Garrett --- kernel/events/core.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cd13a30f732..7748c6f39992 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10461,6 +10461,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if ((attr.sample_type & PERF_SAMPLE_REGS_INTR) && + kernel_is_locked_down("PERF_SAMPLE_REGS_INTR")) + /* REGS_INTR can leak data, lockdown must prevent this */ + return -EPERM; + /* Only privileged users can get physical addresses */ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) -- 2.21.0.352.gf09ad66450-goog
[PATCH 23/27] Lock down kprobes
From: David Howells Disallow the creation of kprobes when the kernel is locked down by preventing their registration. This prevents kprobes from being used to access kernel memory, either to make modifications or to steal crypto data. Reported-by: Alexei Starovoitov Signed-off-by: David Howells Signed-off-by: Matthew Garrett --- kernel/kprobes.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f4ddfdd2d07e..6f66cca8e2c6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1552,6 +1552,9 @@ int register_kprobe(struct kprobe *p) struct module *probed_mod; kprobe_opcode_t *addr; + if (kernel_is_locked_down("Use of kprobes")) + return -EPERM; + /* Adjust probe address from symbol */ addr = kprobe_addr(p); if (IS_ERR(addr)) -- 2.21.0.352.gf09ad66450-goog
[PATCH 18/27] Prohibit PCMCIA CIS storage when the kernel is locked down
From: David Howells Prohibit replacement of the PCMCIA Card Information Structure when the kernel is locked down. Suggested-by: Dominik Brodowski Signed-off-by: David Howells cc: linux-pcm...@lists.infradead.org Signed-off-by: Matthew Garrett --- drivers/pcmcia/cistpl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pcmcia/cistpl.c b/drivers/pcmcia/cistpl.c index ac0672b8dfca..8adf092d0e18 100644 --- a/drivers/pcmcia/cistpl.c +++ b/drivers/pcmcia/cistpl.c @@ -1578,6 +1578,9 @@ static ssize_t pccard_store_cis(struct file *filp, struct kobject *kobj, struct pcmcia_socket *s; int error; + if (kernel_is_locked_down("Direct PCMCIA CIS storage")) + return -EPERM; + s = to_socket(container_of(kobj, struct device, kobj)); if (off) -- 2.21.0.352.gf09ad66450-goog
[PATCH 24/27] bpf: Restrict kernel image access functions when the kernel is locked down
From: David Howells There are some bpf functions that can be used to read kernel memory: bpf_probe_read, bpf_probe_write_user and bpf_trace_printk. These allow private keys in kernel memory (e.g. the hibernation image signing key) to be read by an eBPF program and kernel memory to be altered without restriction. Completely prohibit the use of BPF when the kernel is locked down. Suggested-by: Alexei Starovoitov Signed-off-by: David Howells cc: net...@vger.kernel.org cc: Chun-Yi Lee cc: Alexei Starovoitov Signed-off-by: Matthew Garrett --- kernel/bpf/syscall.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..2cde39a875aa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2585,6 +2585,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (kernel_is_locked_down("BPF")) + return -EPERM; + err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); if (err) return err; -- 2.21.0.352.gf09ad66450-goog
[PATCH 21/27] x86/mmiotrace: Lock down the testmmiotrace module
From: David Howells The testmmiotrace module shouldn't be permitted when the kernel is locked down as it can be used to arbitrarily read and write MMIO space. Suggested-by: Thomas Gleixner Signed-off-by: David Howells cc: Steven Rostedt cc: Ingo Molnar cc: "H. Peter Anvin" cc: x...@kernel.org Signed-off-by: Matthew Garrett --- arch/x86/mm/testmmiotrace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index f6ae6830b341..bbaad357f5d7 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -115,6 +115,9 @@ static int __init init(void) { unsigned long size = (read_far) ? (8 << 20) : (16 << 10); + if (kernel_is_locked_down("MMIO trace testing")) + return -EPERM; + if (mmio_address == 0) { pr_err("you have to use the module argument mmio_address.\n"); pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n"); -- 2.21.0.352.gf09ad66450-goog
[PATCH 11/27] PCI: Lock down BAR access when the kernel is locked down
From: Matthew Garrett Any hardware that can potentially generate DMA has to be locked down in order to avoid it being possible for an attacker to modify kernel code, allowing them to circumvent disabled module loading or module signing. Default to paranoid - in future we can potentially relax this for sufficiently IOMMU-isolated devices. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Acked-by: Bjorn Helgaas Reviewed-by: "Lee, Chun-Yi" cc: linux-...@vger.kernel.org Signed-off-by: Matthew Garrett --- drivers/pci/pci-sysfs.c | 9 + drivers/pci/proc.c | 9 - drivers/pci/syscall.c | 3 ++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 9ecfe13157c0..40c14574fcf8 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -905,6 +905,9 @@ static ssize_t pci_write_config(struct file *filp, struct kobject *kobj, loff_t init_off = off; u8 *data = (u8 *) buf; + if (kernel_is_locked_down("Direct PCI access")) + return -EPERM; + if (off > dev->cfg_size) return 0; if (off + count > dev->cfg_size) { @@ -1167,6 +1170,9 @@ static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, enum pci_mmap_state mmap_type; struct resource *res = >resource[bar]; + if (kernel_is_locked_down("Direct PCI access")) + return -EPERM; + if (res->flags & IORESOURCE_MEM && iomem_is_exclusive(res->start)) return -EINVAL; @@ -1242,6 +1248,9 @@ static ssize_t pci_write_resource_io(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { + if (kernel_is_locked_down("Direct PCI access")) + return -EPERM; + return pci_resource_io(filp, kobj, attr, buf, off, count, true); } diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c index 6fa1627ce08d..1549cdd0710e 100644 --- a/drivers/pci/proc.c +++ b/drivers/pci/proc.c @@ -117,6 +117,9 @@ static ssize_t proc_bus_pci_write(struct file *file, const char __user *buf, int size = dev->cfg_size; int cnt; + if 
(kernel_is_locked_down("Direct PCI access")) + return -EPERM; + if (pos >= size) return 0; if (nbytes >= size) @@ -196,6 +199,9 @@ static long proc_bus_pci_ioctl(struct file *file, unsigned int cmd, #endif /* HAVE_PCI_MMAP */ int ret = 0; + if (kernel_is_locked_down("Direct PCI access")) + return -EPERM; + switch (cmd) { case PCIIOC_CONTROLLER: ret = pci_domain_nr(dev->bus); @@ -237,7 +243,8 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma) struct pci_filp_private *fpriv = file->private_data; int i, ret, write_combine = 0, res_bit = IORESOURCE_MEM; - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || + kernel_is_locked_down("Direct PCI access")) return -EPERM; if (fpriv->mmap_state == pci_mmap_io) { diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index d96626c614f5..b8a08d3166a1 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -90,7 +90,8 @@ SYSCALL_DEFINE5(pciconfig_write, unsigned long, bus, unsigned long, dfn, u32 dword; int err = 0; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) || + kernel_is_locked_down("Direct PCI access")) return -EPERM; dev = pci_get_domain_bus_and_slot(0, bus, dfn); -- 2.21.0.352.gf09ad66450-goog
[PATCH 20/27] Lock down module params that specify hardware parameters (eg. ioport)
From: David Howells Provided an annotation for module parameters that specify hardware parameters (such as io ports, iomem addresses, irqs, dma channels, fixed dma buffers and other types). Suggested-by: Alan Cox Signed-off-by: David Howells Signed-off-by: Matthew Garrett --- kernel/params.c | 26 +- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..8ac751c938f8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -108,13 +108,19 @@ bool parameq(const char *a, const char *b) return parameqn(a, b, strlen(a)+1); } -static void param_check_unsafe(const struct kernel_param *kp) +static bool param_check_unsafe(const struct kernel_param *kp, + const char *doing) { if (kp->flags & KERNEL_PARAM_FL_UNSAFE) { pr_notice("Setting dangerous option %s - tainting kernel\n", kp->name); add_taint(TAINT_USER, LOCKDEP_STILL_OK); } + + if (kp->flags & KERNEL_PARAM_FL_HWPARAM && + kernel_is_locked_down("Command line-specified device addresses, irqs and dma channels")) + return false; + return true; } static int parse_one(char *param, @@ -144,8 +150,10 @@ static int parse_one(char *param, pr_debug("handling %s with %p\n", param, params[i].ops->set); kernel_param_lock(params[i].mod); - param_check_unsafe([i]); - err = params[i].ops->set(val, [i]); + if (param_check_unsafe([i], doing)) + err = params[i].ops->set(val, [i]); + else + err = -EPERM; kernel_param_unlock(params[i].mod); return err; } @@ -553,6 +561,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr, return count; } +#ifdef CONFIG_MODULES +#define mod_name(mod) (mod)->name +#else +#define mod_name(mod) "unknown" +#endif + /* sysfs always hands a nul-terminated string in buf. We rely on that. 
*/ static ssize_t param_attr_store(struct module_attribute *mattr, struct module_kobject *mk, @@ -565,8 +579,10 @@ static ssize_t param_attr_store(struct module_attribute *mattr, return -EPERM; kernel_param_lock(mk->mod); - param_check_unsafe(attribute->param); - err = attribute->param->ops->set(buf, attribute->param); + if (param_check_unsafe(attribute->param, mod_name(mk->mod))) + err = attribute->param->ops->set(buf, attribute->param); + else + err = -EPERM; kernel_param_unlock(mk->mod); if (!err) return len; -- 2.21.0.352.gf09ad66450-goog
[PATCH 22/27] Lock down /proc/kcore
From: David Howells Disallow access to /proc/kcore when the kernel is locked down to prevent access to cryptographic data. Signed-off-by: David Howells Reviewed-by: James Morris Signed-off-by: Matthew Garrett --- fs/proc/kcore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index bbcc185062bb..d50ebfbf3dbb 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -518,6 +518,8 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) static int open_kcore(struct inode *inode, struct file *filp) { + if (kernel_is_locked_down("/proc/kcore")) + return -EPERM; if (!capable(CAP_SYS_RAWIO)) return -EPERM; -- 2.21.0.352.gf09ad66450-goog
[PATCH 14/27] ACPI: Limit access to custom_method when the kernel is locked down
From: Matthew Garrett custom_method effectively allows arbitrary access to system memory, making it possible for an attacker to circumvent restrictions on module loading. Disable it if the kernel is locked down. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" cc: linux-a...@vger.kernel.org Signed-off-by: Matthew Garrett --- drivers/acpi/custom_method.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/acpi/custom_method.c b/drivers/acpi/custom_method.c index 4451877f83b6..ac8a90dc7096 100644 --- a/drivers/acpi/custom_method.c +++ b/drivers/acpi/custom_method.c @@ -29,6 +29,9 @@ static ssize_t cm_write(struct file *file, const char __user * user_buf, struct acpi_table_header table; acpi_status status; + if (kernel_is_locked_down("ACPI custom methods")) + return -EPERM; + if (!(*ppos)) { /* parse the table header to get the table length */ if (count <= sizeof(struct acpi_table_header)) -- 2.21.0.352.gf09ad66450-goog
[PATCH 16/27] acpi: Disable ACPI table override if the kernel is locked down
From: Linn Crosetto From the kernel documentation (initrd_table_override.txt): If the ACPI_INITRD_TABLE_OVERRIDE compile option is true, it is possible to override nearly any ACPI table provided by the BIOS with an instrumented, modified one. When securelevel is set, the kernel should disallow any unauthenticated changes to kernel space. ACPI tables contain code invoked by the kernel, so do not allow ACPI tables to be overridden if the kernel is locked down. Signed-off-by: Linn Crosetto Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" cc: linux-a...@vger.kernel.org Signed-off-by: Matthew Garrett --- drivers/acpi/tables.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 48eabb6c2d4f..f3b4117cd8f3 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -531,6 +531,11 @@ void __init acpi_table_upgrade(void) if (table_nr == 0) return; + if (kernel_is_locked_down("ACPI table override")) { + pr_notice("kernel is locked down, ignoring table override\n"); + return; + } + acpi_tables_addr = memblock_find_in_range(0, ACPI_TABLE_UPGRADE_MAX_PHYS, all_tables_size, PAGE_SIZE); -- 2.21.0.352.gf09ad66450-goog
[PATCH 10/27] uswsusp: Disable when the kernel is locked down
From: Matthew Garrett uswsusp allows a user process to dump and then restore kernel state, which makes it possible to modify the running kernel. Disable this if the kernel is locked down. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" Reviewed-by: James Morris cc: linux...@vger.kernel.org Signed-off-by: Matthew Garrett --- kernel/power/user.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/power/user.c b/kernel/power/user.c index 2d8b60a3c86b..0305d513c274 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -52,6 +52,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (!hibernation_available()) return -EPERM; + if (kernel_is_locked_down("/dev/snapshot")) + return -EPERM; + lock_system_sleep(); if (!atomic_add_unless(_device_available, -1, 0)) { -- 2.21.0.352.gf09ad66450-goog
[PATCH 08/27] kexec_file: Restrict at runtime if the kernel is locked down
From: Jiri Bohac When KEXEC_SIG is not enabled, the kernel should not load images through the kexec_file_load() system call if the kernel is locked down. [Modified by David Howells to fit with modifications to the previous patch and to return -EPERM if the kernel is locked down for consistency with other lockdowns. Modified by Matthew Garrett to remove the IMA integration, which will be replaced by integrating with the IMA architecture policy patches.] Signed-off-by: Jiri Bohac Signed-off-by: David Howells Reviewed-by: Jiri Bohac Cc: Matthew Garrett cc: Chun-Yi Lee cc: ke...@lists.infradead.org Signed-off-by: Matthew Garrett --- kernel/kexec_file.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 67f3a866eabe..0cfe4f6f7f85 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -239,6 +239,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, } ret = 0; + + if (kernel_is_locked_down(reason)) { + ret = -EPERM; + goto out; + } + break; /* All other errors are fatal, including nomem, unparseable -- 2.21.0.352.gf09ad66450-goog
[PATCH 13/27] x86/msr: Restrict MSR access when the kernel is locked down
From: Matthew Garrett Writing to MSRs should not be allowed if the kernel is locked down, since it could lead to execution of arbitrary code in kernel mode. Based on a patch by Kees Cook. MSR accesses are logged for the purposes of building up a whitelist as per Alan Cox's suggestion. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Acked-by: Kees Cook Reviewed-by: Thomas Gleixner Reviewed-by: "Lee, Chun-Yi" cc: x...@kernel.org Signed-off-by: Matthew Garrett --- arch/x86/kernel/msr.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4588414e2561..f5a2cf07972f 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -84,6 +84,11 @@ static ssize_t msr_write(struct file *file, const char __user *buf, int err = 0; ssize_t bytes = 0; + if (kernel_is_locked_down("Direct MSR access")) { + pr_info("Direct access to MSR %x\n", reg); + return -EPERM; + } + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -135,6 +140,11 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg) err = -EFAULT; break; } + if (kernel_is_locked_down("Direct MSR access")) { + pr_info("Direct access to MSR %x\n", regs[1]); /* Display %ecx */ + err = -EPERM; + break; + } err = wrmsr_safe_regs_on_cpu(cpu, regs); if (err) break; -- 2.21.0.352.gf09ad66450-goog
[PATCH 06/27] Copy secure_boot flag in boot params across kexec reboot
From: Dave Young When secure boot is enabled, a kexec reboot does not carry the secure boot mode over to the new kernel, so one can later load an unsigned kernel via legacy kexec_load. In this state, the system is missing the protections provided by secure boot. Fix this by retaining the secure_boot flag from the original kernel. The secure_boot flag in boot_params is set in the EFI stub, but kexec bypasses the stub, so copy the secure_boot flag across the kexec reboot. Signed-off-by: Dave Young Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" cc: ke...@lists.infradead.org Signed-off-by: Matthew Garrett --- arch/x86/kernel/kexec-bzimage64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 278cd07228dd..d49554b948fd 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -179,6 +179,7 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr, if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; -- 2.21.0.352.gf09ad66450-goog
[PATCH 07/27] kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE
From: Jiri Bohac This is a preparatory patch for kexec_file_load() lockdown. A locked down kernel needs to prevent unsigned kernel images from being loaded with kexec_file_load(). Currently, the only way to force the signature verification is compiling with KEXEC_VERIFY_SIG. This prevents loading unsigned images even when the kernel is not locked down at runtime. This patch splits KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE. Analogous to the MODULE_SIG and MODULE_SIG_FORCE for modules, KEXEC_SIG turns on the signature verification but allows unsigned images to be loaded. KEXEC_SIG_FORCE disallows images without a valid signature. [Modified by David Howells such that: (1) verify_pefile_signature() differentiates between no-signature and sig-didn't-match in its returned errors. (2) kexec fails with EKEYREJECTED and logs an appropriate message if signature checking is enforced and a signature is not found, uses unsupported crypto or has no matching key. (3) kexec fails with EKEYREJECTED if there is a signature for which we have a key, but signature doesn't match - even if in non-forcing mode. (4) kexec fails with EBADMSG or some other error if there is a signature which cannot be parsed - even if in non-forcing mode. (5) kexec fails with ELIBBAD if the PE file cannot be parsed to extract the signature - even if in non-forcing mode. 
] Signed-off-by: Jiri Bohac Signed-off-by: David Howells Reviewed-by: Jiri Bohac cc: Matthew Garrett cc: Chun-Yi Lee cc: ke...@lists.infradead.org Signed-off-by: Matthew Garrett --- arch/x86/Kconfig | 20 --- crypto/asymmetric_keys/verify_pefile.c | 4 ++- include/linux/kexec.h | 4 +-- kernel/kexec_file.c| 48 ++ 4 files changed, 61 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b4a7f32b68e..735d04a4b18f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2016,20 +2016,30 @@ config KEXEC_FILE config ARCH_HAS_KEXEC_PURGATORY def_bool KEXEC_FILE -config KEXEC_VERIFY_SIG +config KEXEC_SIG bool "Verify kernel signature during kexec_file_load() syscall" depends on KEXEC_FILE ---help--- - This option makes kernel signature verification mandatory for - the kexec_file_load() syscall. - In addition to that option, you need to enable signature + This option makes the kexec_file_load() syscall check for a valid + signature of the kernel image. The image can still be loaded without + a valid signature unless you also enable KEXEC_SIG_FORCE, though if + there's a signature that we can check, then it must be valid. + + In addition to this option, you need to enable signature verification for the corresponding kernel image type being loaded in order for this to work. +config KEXEC_SIG_FORCE + bool "Require a valid signature in kexec_file_load() syscall" + depends on KEXEC_SIG + ---help--- + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. 
+ config KEXEC_BZIMAGE_VERIFY_SIG bool "Enable bzImage signature verification support" - depends on KEXEC_VERIFY_SIG + depends on KEXEC_SIG depends on SIGNED_PE_FILE_VERIFICATION select SYSTEM_TRUSTED_KEYRING ---help--- diff --git a/crypto/asymmetric_keys/verify_pefile.c b/crypto/asymmetric_keys/verify_pefile.c index d178650fd524..4473cea1e877 100644 --- a/crypto/asymmetric_keys/verify_pefile.c +++ b/crypto/asymmetric_keys/verify_pefile.c @@ -100,7 +100,7 @@ static int pefile_parse_binary(const void *pebuf, unsigned int pelen, if (!ddir->certs.virtual_address || !ddir->certs.size) { pr_debug("Unsigned PE binary\n"); - return -EKEYREJECTED; + return -ENODATA; } chkaddr(ctx->header_size, ddir->certs.virtual_address, @@ -408,6 +408,8 @@ static int pefile_digest_pe(const void *pebuf, unsigned int pelen, * (*) 0 if at least one signature chain intersects with the keys in the trust * keyring, or: * + * (*) -ENODATA if there is no signature present. + * * (*) -ENOPKG if a suitable crypto module couldn't be found for a check on a * chain. * diff --git a/include/linux/kexec.h b/include/linux/kexec.h index b9b1bc5f9669..58b27c7bdc2b 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -125,7 +125,7 @@ typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); -#ifdef CONFIG_KEXEC_VERIFY_SIG +#ifdef CONFIG_KEXEC_SIG typedef int (kexec_verify_sig_t)(const char *kernel_buf, unsigned long kernel_len); #endif @@ -134,7 +134,7 @@ struct kexec_file_ops {
[PATCH 05/27] kexec_load: Disable at runtime if the kernel is locked down
From: Matthew Garrett The kexec_load() syscall permits the loading and execution of arbitrary code in ring 0, which is something that lock-down is meant to prevent. It makes sense to disable kexec_load() in this situation. This does not affect kexec_file_load() syscall which can check for a signature on the image to be booted. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Acked-by: Dave Young Reviewed-by: "Lee, Chun-Yi" Reviewed-by: James Morris cc: ke...@lists.infradead.org Signed-off-by: Matthew Garrett --- kernel/kexec.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..8ea0ce31271f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -207,6 +207,13 @@ static inline int kexec_load_check(unsigned long nr_segments, if (result < 0) return result; + /* +* kexec can be used to circumvent module loading restrictions, so +* prevent loading in that case +*/ + if (kernel_is_locked_down("kexec of unsigned images")) + return -EPERM; + /* * Verify we have a legal set of flags * This leaves us room for future extensions. -- 2.21.0.352.gf09ad66450-goog
[PATCH 04/27] Restrict /dev/{mem,kmem,port} when the kernel is locked down
From: Matthew Garrett Allowing users to read and write to core kernel memory makes it possible for the kernel to be subverted, avoiding module loading restrictions, and also to steal cryptographic information. Disallow /dev/mem and /dev/kmem from being opened when the kernel has been locked down to prevent this. Also disallow /dev/port from being opened to prevent raw ioport access and thus DMA from being used to accomplish the same thing. Signed-off-by: Matthew Garrett Signed-off-by: David Howells Reviewed-by: "Lee, Chun-Yi" Signed-off-by: Matthew Garrett --- drivers/char/mem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index b08dc50f9f26..0a2f2e75d5f4 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -786,6 +786,8 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) static int open_port(struct inode *inode, struct file *filp) { + if (kernel_is_locked_down("/dev/mem,kmem,port")) + return -EPERM; return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -- 2.21.0.352.gf09ad66450-goog
[PULL REQUEST] Kernel lockdown patches for 5.2
Hi James, This patchset introduces an optional kernel lockdown feature, intended to strengthen the boundary between UID 0 and the kernel. When enabled and active (by enabling the config option and passing the "lockdown" option on the kernel command line), various pieces of kernel functionality are restricted. Applications that rely on low-level access to either hardware or the kernel may cease working as a result - therefore this should not be enabled without appropriate evaluation beforehand. The majority of mainstream distributions have been carrying variants of this patchset for many years now, so there's value in providing a unified upstream implementation to reduce the delta. This PR probably doesn't meet every distribution requirement, but gets us much closer to not requiring external patches. This PR is mostly the same as the previous attempt, but with the following changes: 1) The integration between EFI secure boot and the lockdown state has been removed 2) A new CONFIG_KERNEL_LOCK_DOWN_FORCE kconfig option has been added, which will always enable lockdown regardless of the kernel command line 3) The integration with IMA has been dropped for now. 
IMA is in the process of adding support for architecture-specific policies that will interact correctly with the lockdown feature, and a followup patch will integrate that so we don't end up with an ordering dependency on the merge The following changes since commit 468e91cecb3218afd684b8c422490dfebe0691bb: keys: fix missing __user in KEYCTL_PKEY_QUERY (2019-03-04 15:48:37 -0800) are available in the Git repository at: https://github.com/mjg59/linux lock_down for you to fetch changes up to 3d53449e0ac1df8cfdcc1ec48dc9cb622f220300: lockdown: Print current->comm in restriction messages (2019-03-06 13:32:19 -0800) Dave Young (1): Copy secure_boot flag in boot params across kexec reboot David Howells (12): Add the ability to lock down access to the running kernel image Enforce module signatures if the kernel is locked down Prohibit PCMCIA CIS storage when the kernel is locked down Lock down TIOCSSERIAL Lock down module params that specify hardware parameters (eg. ioport) x86/mmiotrace: Lock down the testmmiotrace module Lock down /proc/kcore Lock down kprobes bpf: Restrict kernel image access functions when the kernel is locked down Lock down perf debugfs: Restrict debugfs when the kernel is locked down lockdown: Print current->comm in restriction messages Jiri Bohac (2): kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE kexec_file: Restrict at runtime if the kernel is locked down Josh Boyer (2): hibernate: Disable when the kernel is locked down acpi: Ignore acpi_rsdp kernel param when the kernel has been locked down Kyle McMartin (1): Add a SysRq option to lift kernel lockdown Linn Crosetto (2): acpi: Disable ACPI table override if the kernel is locked down acpi: Disable APEI error injection if the kernel is locked down Matthew Garrett (7): Restrict /dev/{mem,kmem,port} when the kernel is locked down kexec_load: Disable at runtime if the kernel is locked down uswsusp: Disable when the kernel is locked down PCI: Lock down BAR access when the kernel 
is locked down x86: Lock down IO port access when the kernel is locked down x86/msr: Restrict MSR access when the kernel is locked down ACPI: Limit access to custom_method when the kernel is locked down arch/x86/Kconfig | 20 +-- arch/x86/include/asm/setup.h | 2 + arch/x86/kernel/ioport.c | 6 +- arch/x86/kernel/kexec-bzimage64.c | 1 + arch/x86/kernel/msr.c | 10 arch/x86/mm/testmmiotrace.c| 3 + crypto/asymmetric_keys/verify_pefile.c | 4 +- drivers/acpi/apei/einj.c | 3 + drivers/acpi/custom_method.c | 3 + drivers/acpi/osl.c | 2 +- drivers/acpi/tables.c | 5 ++ drivers/char/mem.c | 2 + drivers/input/misc/uinput.c| 1 + drivers/pci/pci-sysfs.c| 9 +++ drivers/pci/proc.c | 9 ++- drivers/pci/syscall.c | 3 +- drivers/pcmcia/cistpl.c| 3 + drivers/tty/serial/serial_core.c | 6 ++ drivers/tty/sysrq.c| 19 -- fs/debugfs/file.c | 28 + fs/debugfs/inode.c | 30 +- fs/proc/kcore.c| 2 + include/linux/input.h | 5 ++ include/linux/kernel.h | 17 ++ include/linux/kexec.h | 4 +- include/linux/security.h | 9 ++- include/linux/sysrq.h | 8 ++- kernel/bpf/syscall.c
[PATCH 03/27] Enforce module signatures if the kernel is locked down
From: David Howells If the kernel is locked down, require that all modules have valid signatures that we can verify. I have adjusted the errors generated: (1) If there's no signature (ENODATA) or we can't check it (ENOPKG, ENOKEY), then: (a) If signatures are enforced then EKEYREJECTED is returned. (b) If there's no signature or we can't check it, but the kernel is locked down then EPERM is returned (this is then consistent with other lockdown cases). (2) If the signature is unparseable (EBADMSG, EINVAL), the signature fails the check (EKEYREJECTED) or a system error occurs (eg. ENOMEM), we return the error we got. Note that the X.509 code doesn't check for key expiry as the RTC might not be valid or might not have been transferred to the kernel's clock yet. [Modified by Matthew Garrett to remove the IMA integration. This will be replaced with integration with the IMA architecture policy patchset.] Signed-off-by: David Howells Reviewed-by: Jiri Bohac cc: "Lee, Chun-Yi" cc: James Morris Signed-off-by: Matthew Garrett --- kernel/module.c | 39 --- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index 2ad1b5239910..9a377c6ea200 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2767,8 +2767,9 @@ static inline void kmemleak_load_module(const struct module *mod, #ifdef CONFIG_MODULE_SIG static int module_sig_check(struct load_info *info, int flags) { - int err = -ENOKEY; + int err = -ENODATA; const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1; + const char *reason; const void *mod = info->hdr; /* @@ -2783,16 +2784,40 @@ static int module_sig_check(struct load_info *info, int flags) err = mod_verify_sig(mod, info); } - if (!err) { + switch (err) { + case 0: info->sig_ok = true; return 0; - } - /* Not having a signature is only an error if we're strict. 
*/ - if (err == -ENOKEY && !is_module_sig_enforced()) - err = 0; + /* We don't permit modules to be loaded into trusted kernels +* without a valid signature on them, but if we're not +* enforcing, certain errors are non-fatal. +*/ + case -ENODATA: + reason = "Loading of unsigned module"; + goto decide; + case -ENOPKG: + reason = "Loading of module with unsupported crypto"; + goto decide; + case -ENOKEY: + reason = "Loading of module with unavailable key"; + decide: + if (is_module_sig_enforced()) { + pr_notice("%s is rejected\n", reason); + return -EKEYREJECTED; + } - return err; + if (kernel_is_locked_down(reason)) + return -EPERM; + return 0; + + /* All other errors are fatal, including nomem, unparseable +* signatures and signature check failures - even if signatures +* aren't required. +*/ + default: + return err; + } } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) -- 2.21.0.352.gf09ad66450-goog
[PATCH 02/27] Add a SysRq option to lift kernel lockdown
From: Kyle McMartin Make an option to provide a sysrq key that will lift the kernel lockdown, thereby allowing the running kernel image to be accessed and modified. On x86 this is triggered with SysRq+x, but this key may not be available on all arches, so it is set by setting LOCKDOWN_LIFT_KEY in asm/setup.h. Since this macro must be defined in an arch to be able to use this facility for that arch, the Kconfig option is restricted to arches that support it. Signed-off-by: Kyle McMartin Signed-off-by: David Howells cc: x...@kernel.org Signed-off-by: Matthew Garrett --- arch/x86/include/asm/setup.h | 2 ++ drivers/input/misc/uinput.c | 1 + drivers/tty/sysrq.c | 19 ++- include/linux/input.h| 5 include/linux/sysrq.h| 8 +- kernel/debug/kdb/kdb_main.c | 2 +- security/Kconfig | 9 +++ security/lock_down.c | 47 8 files changed, 85 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ed8ec011a9fd..8daf633a5347 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -9,6 +9,8 @@ #include #include +#define LOCKDOWN_LIFT_KEY 'x' + #ifdef __i386__ #include diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c index 8ec483e8688b..c2a77dc73fa0 100644 --- a/drivers/input/misc/uinput.c +++ b/drivers/input/misc/uinput.c @@ -365,6 +365,7 @@ static int uinput_create_device(struct uinput_device *udev) dev->flush = uinput_dev_flush; } + dev->flags |= INPUTDEV_FLAGS_SYNTHETIC; dev->event = uinput_dev_event; input_set_drvdata(udev->dev, udev); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 1f03078ec352..0a05d336008e 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -480,6 +480,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = { /* x: May be registered on mips for TLB dump */ /* x: May be registered on ppc/powerpc for xmon */ /* x: May be registered on sparc64 for global PMU dump */ + /* x: May be registered on x86_64 for disabling secure boot */ NULL, /* x */ /* y: 
May be registered on sparc64 for global register dump */ NULL, /* y */ @@ -523,7 +524,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -void __handle_sysrq(int key, bool check_mask) +void __handle_sysrq(int key, unsigned int from) { struct sysrq_key_op *op_p; int orig_log_level; @@ -543,11 +544,15 @@ void __handle_sysrq(int key, bool check_mask) op_p = __sysrq_get_key_op(key); if (op_p) { + /* Ban synthetic events from some sysrq functionality */ + if ((from == SYSRQ_FROM_PROC || from == SYSRQ_FROM_SYNTHETIC) && + op_p->enable_mask & SYSRQ_DISABLE_USERSPACE) + printk("This sysrq operation is disabled from userspace.\n"); /* * Should we check for enabled operations (/proc/sysrq-trigger * should not) and is the invoked operation enabled? */ - if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { + if (from == SYSRQ_FROM_KERNEL || sysrq_on_mask(op_p->enable_mask)) { pr_cont("%s\n", op_p->action_msg); console_loglevel = orig_log_level; op_p->handler(key); @@ -579,7 +584,7 @@ void __handle_sysrq(int key, bool check_mask) void handle_sysrq(int key) { if (sysrq_on()) - __handle_sysrq(key, true); + __handle_sysrq(key, SYSRQ_FROM_KERNEL); } EXPORT_SYMBOL(handle_sysrq); @@ -659,7 +664,7 @@ static void sysrq_do_reset(struct timer_list *t) static void sysrq_handle_reset_request(struct sysrq_state *state) { if (state->reset_requested) - __handle_sysrq(sysrq_xlate[KEY_B], false); + __handle_sysrq(sysrq_xlate[KEY_B], SYSRQ_FROM_KERNEL); if (sysrq_reset_downtime_ms) mod_timer(>keyreset_timer, @@ -812,8 +817,10 @@ static bool sysrq_handle_keypress(struct sysrq_state *sysrq, default: if (sysrq->active && value && value != 2) { + int from = sysrq->handle.dev->flags & INPUTDEV_FLAGS_SYNTHETIC ? 
+ SYSRQ_FROM_SYNTHETIC : 0; sysrq->need_reinject = false; - __handle_sysrq(sysrq_xlate[code], true); + __handle_sysrq(sysrq_xlate[code], from); } break; } @@ -1096,7 +1103,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, if (get_user(c, buf)) return -EFAULT; -
[PATCH v3 1/1] mm: introduce put_user_page*(), placeholder versions
From: John Hubbard Introduces put_user_page(), which simply calls put_page(). This provides a way to update all get_user_pages*() callers, so that they call put_user_page(), instead of put_page(). Also introduces put_user_pages(), and a few dirty/locked variations, as a replacement for release_pages(), and also as a replacement for open-coded loops that release multiple pages. These may be used for subsequent performance improvements, via batching of pages to be released. This is the first step of fixing a problem (also described in [1] and [2]) with interactions between get_user_pages ("gup") and filesystems. Problem description: let's start with a bug report. Below, is what happens sometimes, under memory pressure, when a driver pins some pages via gup, and then marks those pages dirty, and releases them. Note that the gup documentation actually recommends that pattern. The problem is that the filesystem may do a writeback while the pages were gup-pinned, and then the filesystem believes that the pages are clean. So, when the driver later marks the pages as dirty, that conflicts with the filesystem's page tracking and results in a BUG(), like this one that I experienced: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. 
How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable and dirtied. Every. Time." This is just one symptom of the larger design problem: filesystems do not actually support get_user_pages() being called on their pages, and letting hardware write directly to those pages--even though that pattern has been going on since about 2005 or so. The steps to fix it are: 1) (This patch): provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. 
[1] https://lwn.net/Articles/774411/ : "DMA and get_user_pages()" [2] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" Cc: Al Viro Cc: Christoph Hellwig Cc: Christopher Lameter Cc: Dan Williams Cc: Dave Chinner Cc: Ira Weiny Cc: Jan Kara Cc: Jason Gunthorpe Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport Cc: Ralph Campbell Reviewed-by: Jan Kara Reviewed-by: Mike Rapoport # docs Reviewed-by: Ira Weiny Signed-off-by: John Hubbard --- include/linux/mm.h | 24 ++ mm/swap.c | 82 ++ 2 files changed, 106 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..809b7397d41e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -993,6 +993,30 @@ static inline void put_page(struct page *page) __put_page(page); } +/** + * put_user_page() - release a gup-pinned page + * @page:pointer to page to be released + * + * Pages that were pinned via get_user_pages*() must be released via + * either put_user_page(), or one of the put_user_pages*() routines + * below. This is so that eventually, pages that are pinned via + * get_user_pages*() can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special + * handling. + * + * put_user_page() and put_page() are not interchangeable, despite this early + * implementation that makes them look the same. put_user_page() calls must + * be perfectly matched up with get_user_page() calls. + */ +static inline void put_user_page(struct page *page) +{ + put_page(page); +} + +void put_user_pages_dirty(struct page **pages, unsigned long
[PATCH v3 0/1] mm: introduce put_user_page*(), placeholder versions
From: John Hubbard Hi Andrew and all, Can we please apply this (destined for 5.2) once the time is right? (I see that -mm just got merged into the main tree today.) We seem to have pretty solid consensus on the concept and details of the put_user_pages() approach. Or at least, if we don't, someone please speak up now. Christopher Lameter, especially, since you had some concerns recently. Therefore, here is the first patch--only. This allows us to begin converting the get_user_pages() call sites to use put_user_page(), instead of put_page(). This is in order to implement tracking of get_user_page() pages. Normally I'd include a user of this code, but in this case, I think we have examples of how it will work in the RFC and related discussions [1]. What matters more at this point is unblocking the ability to start fixing up various subsystems, through git trees other than linux-mm. For example, the Infiniband example conversion now needs to pick up some prerequisite patches via the RDMA tree. It seems likely that other call sites may need similar attention, and so having put_user_pages() available would really make this go more quickly. Previous cover letter follows: == A discussion of the overall problem is below. As mentioned in patch 0001, the steps to fix the problem are: 1) Provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. 
Overview Some kernel components (file systems, device drivers) need to access memory that is specified via process virtual address. For a long time, the API to achieve that was get_user_pages ("GUP") and its variations. However, GUP has critical limitations that have been overlooked; in particular, GUP does not interact correctly with filesystems in all situations. That means that file-backed memory + GUP is a recipe for potential problems, some of which have already occurred in the field. GUP was first introduced for Direct IO (O_DIRECT), allowing filesystem code to get the struct page behind a virtual address and to let storage hardware perform a direct copy to or from that page. This is a short-lived access pattern, and as such, the window for a concurrent writeback of GUP'd page was small enough that there were not (we think) any reported problems. Also, userspace was expected to understand and accept that Direct IO was not synchronized with memory-mapped access to that data, nor with any process address space changes such as munmap(), mremap(), etc. Over the years, more GUP uses have appeared (virtualization, device drivers, RDMA) that can keep the pages they get via GUP for a long period of time (seconds, minutes, hours, days, ...). This long-term pinning makes an underlying design problem more obvious. In fact, there are a number of key problems inherent to GUP: Interactions with file systems == File systems expect to be able to write back data, both to reclaim pages, and for data integrity. Allowing other hardware (NICs, GPUs, etc) to gain write access to the file memory pages means that such hardware can dirty the pages, without the filesystem being aware. This can, in some cases (depending on filesystem, filesystem options, block device, block device options, and other variables), lead to data corruption, and also to kernel bugs of the form: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! 
backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable
Re: [RFC][QEMU Patch] KVM: Enable QEMU to free the pages hinted by the guest
On Wed, Mar 6, 2019 at 7:52 AM Nitesh Narayan Lal wrote: > > This patch enables QEMU to perform MADVISE_DONTNEED on the pages > reported by the guest. > > Signed-off-by: Nitesh Narayan Lal > --- > hw/virtio/trace-events| 1 + > hw/virtio/virtio-balloon.c| 90 +++ > include/hw/virtio/virtio-balloon.h| 2 +- > .../standard-headers/linux/virtio_balloon.h | 1 + > 4 files changed, 93 insertions(+), 1 deletion(-) > > diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events > index 07bcbe9e85..e3ab66f126 100644 > --- a/hw/virtio/trace-events > +++ b/hw/virtio/trace-events > @@ -46,3 +46,4 @@ virtio_balloon_handle_output(const char *name, uint64_t > gpa) "section name: %s g > virtio_balloon_get_config(uint32_t num_pages, uint32_t actual) "num_pages: > %d actual: %d" > virtio_balloon_set_config(uint32_t actual, uint32_t oldactual) "actual: %d > oldactual: %d" > virtio_balloon_to_target(uint64_t target, uint32_t num_pages) "balloon > target: 0x%"PRIx64" num_pages: %d" > +virtio_balloon_hinting_request(unsigned long pfn, unsigned int num_pages) > "Guest page hinting request: %lu num_pages: %d" > diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c > index a12677d4d5..7ab1471017 100644 > --- a/hw/virtio/virtio-balloon.c > +++ b/hw/virtio/virtio-balloon.c > @@ -33,6 +33,13 @@ > > #define BALLOON_PAGE_SIZE (1 << VIRTIO_BALLOON_PFN_SHIFT) > > +struct guest_pages { > + unsigned long pfn; > + unsigned int order; > +}; > + > +void page_hinting_request(uint64_t addr, uint32_t len); > + > static void balloon_page(void *addr, int deflate) > { > if (!qemu_balloon_is_inhibited()) { > @@ -207,6 +214,85 @@ static void balloon_stats_set_poll_interval(Object *obj, > Visitor *v, > balloon_stats_change_timer(s, 0); > } > > +static void *gpa2hva(MemoryRegion **p_mr, hwaddr addr, Error **errp) > +{ > +MemoryRegionSection mrs = memory_region_find(get_system_memory(), > + addr, 1); > + > +if (!mrs.mr) { > +error_setg(errp, "No memory is mapped at address 0x%" HWADDR_PRIx, > addr); 
> +return NULL; > +} > + > +if (!memory_region_is_ram(mrs.mr) && !memory_region_is_romd(mrs.mr)) { > +error_setg(errp, "Memory at address 0x%" HWADDR_PRIx "is not RAM", > addr); > +memory_region_unref(mrs.mr); > +return NULL; > +} > + > +*p_mr = mrs.mr; > +return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region); > +} > + > +void page_hinting_request(uint64_t addr, uint32_t len) > +{ > +Error *local_err = NULL; > +MemoryRegion *mr = NULL; > +int ret = 0; > +struct guest_pages *guest_obj; > +int i = 0; > +void *hvaddr_to_free; > +unsigned long pfn, pfn_end; > +uint64_t gpaddr_to_free; > +void * temp_addr = gpa2hva(, addr, _err); > + > +if (local_err) { > +error_report_err(local_err); > +return; > +} > +guest_obj = temp_addr; > +while (i < len) { > +pfn = guest_obj[i].pfn; > + pfn_end = guest_obj[i].pfn + (1 << guest_obj[i].order) - 1; > + trace_virtio_balloon_hinting_request(pfn,(1 << guest_obj[i].order)); > + while (pfn <= pfn_end) { > + gpaddr_to_free = pfn << VIRTIO_BALLOON_PFN_SHIFT; > + hvaddr_to_free = gpa2hva(, gpaddr_to_free, _err); > + if (local_err) { > + error_report_err(local_err); > + return; > + } > + ret = qemu_madvise((void *)hvaddr_to_free, 4096, > QEMU_MADV_DONTNEED); So the structure of this function is going to cause significant performance issues. Because we are freeing the memory 4K at a time we are kicking the pages out of using THP and as a result each page fault will occur on a 4K boundary instead of a THP boundary. As a result I am seeing the first pass of memhog take around 35s, but the second pass takes 57s or so because we are no longer faulting in THP pages and instead having to fault in 4K pages. We should be trying to madvise the lesser of either PAGE_SIZE << guest_obj[i].order or the size of the memory region minus our offset. 
> + if (ret == -1) > + printf("\n%d:%s Error: Madvise failed with error:%d\n", > __LINE__, __func__, ret); > + pfn++; > + } > + i++; > +} > +} > + > +static void virtio_balloon_page_hinting(VirtIODevice *vdev, VirtQueue *vq) > +{ > +VirtQueueElement *elem = NULL; > +uint64_t temp_addr; > +uint32_t temp_len; > +size_t size, t_size = 0; > + > +elem = virtqueue_pop(vq, sizeof(VirtQueueElement)); > +if (!elem) { > + printf("\npop error\n"); > + return; > +} > +size = iov_to_buf(elem->out_sg, elem->out_num, 0, _addr, > sizeof(temp_addr)); > +t_size += size; > +size = iov_to_buf(elem->out_sg, elem->out_num, 8, _len, >
Re: [GIT PULL] Driver core patches for 5.1-rc1
On Wed, Mar 6, 2019 at 2:33 AM Greg KH wrote: > > Joe Perches (1): > device.h: Add __cold to dev_ logging functions This is very funky, but that commit generates a new warning in a totally unrelated area: drivers/iio/adc/qcom-pm8xxx-xoadc.c: In function ‘pm8xxx_xoadc_probe’: drivers/iio/adc/qcom-pm8xxx-xoadc.c:633:8: warning: ‘ch’ may be used uninitialized in this function [-Wmaybe-uninitialized] ret = pm8xxx_read_channel_rsv(adc, ch, AMUX_RSV4, ^~~ _nomux_rsv4, true); ~~~ drivers/iio/adc/qcom-pm8xxx-xoadc.c:426:27: note: ‘ch’ was declared here struct pm8xxx_chan_info *ch; ^~ and it all looks entirely insane if you look at that line 633 where the ostensibly uninitialized variable is (it clearly _is_ initialized there), but if you then look at that line 426 you notice that it actually makes some kind of sense. The value comes from another function that was apparently inlined, and that other function does not "obviously" initialize it. I wonder why this wasn't seen in linux-next? Yes, the connection is odd, and maybe it's very compiler version dependent, but I do hope people react to new warnings. The kernel is entirely warning-free for me for an x86-64 allmodconfig build, and I want to keep it that way. And _because_ I want to keep it that way (one of the things I do during the merge window is look for oddities coming in during pulls, and new warnings is a big deal for me), I applied the attached patch. Just FYI. Linus From e0f0ae838a25464179d37f355d763f9ec139fc15 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 Mar 2019 15:41:29 -0800 Subject: [PATCH] iio: adc: fix warning in Qualcomm PM8xxx HK/XOADC driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pm8xxx_get_channel() implementation is unclear, and causes gcc to suddenly generate odd warnings. 
The trigger for the warning (at least for me) was the entirely unrelated commit 79a4e91d1bb2 ("device.h: Add __cold to dev_ logging functions"), which apparently changes gcc code generation in the caller function enough to cause this: drivers/iio/adc/qcom-pm8xxx-xoadc.c: In function ‘pm8xxx_xoadc_probe’: drivers/iio/adc/qcom-pm8xxx-xoadc.c:633:8: warning: ‘ch’ may be used uninitialized in this function [-Wmaybe-uninitialized] ret = pm8xxx_read_channel_rsv(adc, ch, AMUX_RSV4, ^~~ _nomux_rsv4, true); ~~~ drivers/iio/adc/qcom-pm8xxx-xoadc.c:426:27: note: ‘ch’ was declared here struct pm8xxx_chan_info *ch; ^~ because gcc for some reason then isn't able to see that the termination condition for the "for( )" loop in that function is also the condition for returning NULL. So it's not _actually_ uninitialized, but the function is admittedly just unnecessarily oddly written. Simplify and clarify the function, making gcc also see that it always returns a valid initialized value. Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Andy Gross Cc: David Brown Cc: Jonathan Cameron Cc: Hartmut Knaack Cc: Lars-Peter Clausen Cc: Peter Meerwald-Stadler Signed-off-by: Linus Torvalds --- drivers/iio/adc/qcom-pm8xxx-xoadc.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/iio/adc/qcom-pm8xxx-xoadc.c b/drivers/iio/adc/qcom-pm8xxx-xoadc.c index c30c002f1fef..4735f8a1ca9d 100644 --- a/drivers/iio/adc/qcom-pm8xxx-xoadc.c +++ b/drivers/iio/adc/qcom-pm8xxx-xoadc.c @@ -423,18 +423,14 @@ static irqreturn_t pm8xxx_eoc_irq(int irq, void *d) static struct pm8xxx_chan_info * pm8xxx_get_channel(struct pm8xxx_xoadc *adc, u8 chan) { - struct pm8xxx_chan_info *ch; int i; for (i = 0; i < adc->nchans; i++) { - ch = >chans[i]; + struct pm8xxx_chan_info *ch = >chans[i]; if (ch->hwchan->amux_channel == chan) - break; + return ch; } - if (i == adc->nchans) - return NULL; - - return ch; + return NULL; } static int pm8xxx_read_channel_rsv(struct pm8xxx_xoadc *adc, -- 
2.21.0.rc0.33.gfad1f114cd
Re: [PATCH v4 03/22] iommu: introduce device fault report API
On Tue, 5 Mar 2019 15:03:41 + Jean-Philippe Brucker wrote: > On 18/02/2019 13:54, Eric Auger wrote: > [...]> +/** > > + * iommu_register_device_fault_handler() - Register a device fault > > handler > > + * @dev: the device > > + * @handler: the fault handler > > + * @data: private data passed as argument to the handler > > + * > > + * When an IOMMU fault event is received, call this handler with > > the fault event > > + * and data as argument. The handler should return 0 on success. > > If the fault is > > + * recoverable (IOMMU_FAULT_PAGE_REQ), the handler can also > > complete > > + * the fault by calling iommu_page_response() with one of the > > following > > + * response code: > > + * - IOMMU_PAGE_RESP_SUCCESS: retry the translation > > + * - IOMMU_PAGE_RESP_INVALID: terminate the fault > > + * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop > > reporting > > + * page faults if possible. > > The comment refers to function and values that haven't been defined > yet. Either the page_response() patch should come before, or we need > to split this patch. > > Something I missed before: if the handler fails (returns != 0) it > should complete the fault by calling iommu_page_response(), if we're > not doing it in iommu_report_device_fault(). It should be indicated > in this comment. It's safe for the handler to call page_response() > since we're not holding fault_param->lock when calling the handler. > If the page request fault is to be reported to a guest, the report function cannot wait for the completion status. As long as the fault is injected into the guest, the handler should complete with success. If the PRQ report fails, IMHO, the caller of iommu_report_device_fault() should send page_response, perhaps after clean up all partial response of the group too. > > + * > > + * Return 0 if the fault handler was installed successfully, or an > > error. > > + */ > [...] 
> > +/** > > + * iommu_report_device_fault() - Report fault event to device > > + * @dev: the device > > + * @evt: fault event data > > + * > > + * Called by IOMMU model specific drivers when fault is detected, > > typically > > + * in a threaded IRQ handler. > > + * > > + * Return 0 on success, or an error. > > + */ > > +int iommu_report_device_fault(struct device *dev, struct > > iommu_fault_event *evt) +{ > > + int ret = 0; > > + struct iommu_fault_event *evt_pending; > > + struct iommu_fault_param *fparam; > > + > > + /* iommu_param is allocated when device is added to group > > */ > > + if (!dev->iommu_param | !evt) > > Typo: || > > Thanks, > Jean > > > + return -EINVAL; > > + /* we only report device fault if there is a handler > > registered */ > > + mutex_lock(>iommu_param->lock); > > + if (!dev->iommu_param->fault_param || > > + !dev->iommu_param->fault_param->handler) { > > + ret = -EINVAL; > > + goto done_unlock; > > + } > > + fparam = dev->iommu_param->fault_param; > > + if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && > > + evt->fault.prm.flags & > > IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE) { > > + evt_pending = kmemdup(evt, sizeof(struct > > iommu_fault_event), > > + GFP_KERNEL); > > + if (!evt_pending) { > > + ret = -ENOMEM; > > + goto done_unlock; > > + } > > + mutex_lock(>lock); > > + list_add_tail(_pending->list, >faults); > > + mutex_unlock(>lock); > > + } > > + ret = fparam->handler(evt, fparam->data); > > +done_unlock: > > + mutex_unlock(>iommu_param->lock); > > + return ret; > > +} > > +EXPORT_SYMBOL_GPL(iommu_report_device_fault); > [...] [Jacob Pan]
Re: [RFC][Patch v9 1/6] KVM: Guest free page hinting support
On Wed, Mar 6, 2019 at 7:51 AM Nitesh Narayan Lal wrote: > > This patch adds the following: > 1. Functional skeleton for the guest implementation. It enables the > guest to maintain the PFN of head buddy free pages of order > FREE_PAGE_HINTING_MIN_ORDER (currently defined as MAX_ORDER - 1) > in a per-cpu array. > Guest uses guest_free_page_enqueue() to enqueue the free pages post buddy > merging to the above mentioned per-cpu array. > guest_free_page_try_hinting() is used to initiate hinting operation once > the collected entries of the per-cpu array reaches or exceeds > HINTING_THRESHOLD (128). Having larger array size(MAX_FGPT_ENTRIES = 256) > than HINTING_THRESHOLD allows us to capture more pages specifically when > guest_free_page_enqueue() is called from free_pcppages_bulk(). > For now guest_free_page_hinting() just resets the array index to continue > capturing of the freed pages. > 2. Enables the support for x86 architecture. > > Signed-off-by: Nitesh Narayan Lal > --- > arch/x86/Kbuild | 2 +- > arch/x86/kvm/Kconfig | 8 +++ > arch/x86/kvm/Makefile| 2 + > include/linux/page_hinting.h | 15 ++ > mm/page_alloc.c | 5 ++ > virt/kvm/page_hinting.c | 98 > 6 files changed, 129 insertions(+), 1 deletion(-) > create mode 100644 include/linux/page_hinting.h > create mode 100644 virt/kvm/page_hinting.c > > diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild > index c625f57472f7..3244df4ee311 100644 > --- a/arch/x86/Kbuild > +++ b/arch/x86/Kbuild > @@ -2,7 +2,7 @@ obj-y += entry/ > > obj-$(CONFIG_PERF_EVENTS) += events/ > > -obj-$(CONFIG_KVM) += kvm/ > +obj-$(subst m,y,$(CONFIG_KVM)) += kvm/ > > # Xen paravirtualization support > obj-$(CONFIG_XEN) += xen/ > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index 72fa955f4a15..2fae31459706 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -96,6 +96,14 @@ config KVM_MMU_AUDIT > This option adds a R/W kVM module parameter 'mmu_audit', which allows > auditing of KVM MMU events at runtime. 
> > +# KVM_FREE_PAGE_HINTING will allow the guest to report the free pages to the > +# host in regular interval of time. > +config KVM_FREE_PAGE_HINTING > + def_bool y > + depends on KVM > + select VIRTIO > + select VIRTIO_BALLOON > + > # OK, it's a little counter-intuitive to do this, but it puts it neatly under > # the virtualization menu. > source "drivers/vhost/Kconfig" > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > index 69b3a7c30013..78640a80501e 100644 > --- a/arch/x86/kvm/Makefile > +++ b/arch/x86/kvm/Makefile > @@ -16,6 +16,8 @@ kvm-y += x86.o mmu.o emulate.o i8259.o > irq.o lapic.o \ >i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ >hyperv.o page_track.o debugfs.o > > +obj-$(CONFIG_KVM_FREE_PAGE_HINTING)+= $(KVM)/page_hinting.o > + > kvm-intel-y+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o > vmx/vmcs12.o vmx/evmcs.o vmx/nested.o > kvm-amd-y += svm.o pmu_amd.o > > diff --git a/include/linux/page_hinting.h b/include/linux/page_hinting.h > new file mode 100644 > index ..90254c582789 > --- /dev/null > +++ b/include/linux/page_hinting.h > @@ -0,0 +1,15 @@ > +#include > +/* > + * Size of the array which is used to store the freed pages is defined by > + * MAX_FGPT_ENTRIES. > + */ > +#define MAX_FGPT_ENTRIES 256 > +/* > + * Threshold value after which hinting needs to be initiated on the captured > + * free pages. 
> + */ > +#define HINTING_THRESHOLD 128 > +#define FREE_PAGE_HINTING_MIN_ORDER(MAX_ORDER - 1) > + > +void guest_free_page_enqueue(struct page *page, int order); > +void guest_free_page_try_hinting(void); > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index d295c9bc01a8..684d047f33ee 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -67,6 +67,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -1194,9 +1195,11 @@ static void free_pcppages_bulk(struct zone *zone, int > count, > mt = get_pageblock_migratetype(page); > > __free_one_page(page, page_to_pfn(page), zone, 0, mt); > + guest_free_page_enqueue(page, 0); > trace_mm_page_pcpu_drain(page, 0, mt); > } > spin_unlock(>lock); > + guest_free_page_try_hinting(); > } > Trying to enqueue pages from here seems like a really bad idea. You are essentially putting yourself in a hot-path for order 0 pages and going to cause significant bottlenecks. > static void free_one_page(struct zone *zone, > @@ -1210,7 +1213,9 @@ static void free_one_page(struct zone *zone, > migratetype = get_pfnblock_migratetype(page, pfn); > } > __free_one_page(page, pfn, zone, order, migratetype); > + guest_free_page_enqueue(page, order); >
Re: [PATCH 0/3] mincore() and IOCB_NOWAIT adjustments
On Thu, 7 Mar 2019 00:32:09 +0100 Dominique Martinet wrote: > Andrew Morton wrote on Wed, Mar 06, 2019: > > On Wed, 6 Mar 2019 23:48:03 +0100 (CET) Jiri Kosina > > wrote: > > > > > 3/3 is actually waiting for your decision, see > > > > > > https://lore.kernel.org/lkml/20190212063643.gl15...@dhcp22.suse.cz/ > > > > I pity anyone who tried to understand this code by reading this code. > > Can we please get some careful commentary in there explaining what is > > going on, and why things are thus? > > > > I guess the [3/3] change makes sense, although it's unclear whether > > anyone really needs it? 5.0 was released with 574823bfab8 ("Change > > mincore() to count "mapped" pages rather than "cached" pages") so we'll > > have a release cycle to somewhat determine how much impact 574823bfab8 > > has on users. How about I queue up [3/3] and we reevaluate its > > desirability in a couple of months? > > FWIW, > > 574823bfab8 has been reverted in 30bac164aca750, included in 5.0-rc4, so > the controversial change has only been there from 5.0-rc1 to 5.0-rc3 Ah, OK, thanks, I misread. Linus, do you have thoughts on http://lkml.kernel.org/r/20190130124420.1834-4-vba...@suse.cz ?
Re: [RFC] Provide in-kernel headers for making it easy to extend the kernel
On Wed, Mar 6, 2019 at 3:09 PM Pavel Machek wrote: > > > > > >Ok, I'll look into LZMA. Thanks for checking the compression sizes. > > > > > > > >- Joel > > > > > > Don't use lzma, use xz if you are going to do something. > > > > Ok, sounds good. XZ is a file format for LZMA2. Everyone's right. :-) > > > > > However, it seems unlikely to me that someone not willing to spend the > > > space in the filesystem will spend unswappable kernel memory. > > > > > > It would seem that a far saner way to do this is to use inittmpfs or > > > perhaps an auxiliary "ktmpfs" so it can at least be swapped out if you > > > have swap. > > > > But this is already possible with the proposed solution, you would load the > > module, extract it into a tmpfs, and unload the module. TMPFS pages can > > already be swapped. > > So your licensing requirements prevent you from having headers in the > filesystem, but allow module with the headers hidden inside on the > filesystem? > > Looks like you should just tar xvzf > this-is-a-kernel-module-I-promise.ko /usr/src/linux/include :-). I just don't get the opposition to Joel's work. The rest of the thread already goes into detail about the problems with pure-filesystem solutions, and you and others are just totally ignoring those well-thought-out rationales for the module approach and doing inflooping on "lol just use a tarball". That's not productive. Look; here's the bottom line: without this work, doing certain kinds of system tracing is a nightmare, and with this patch, it Just Works. You're arguing that various tools should do a better job of keeping the filesystem in sync with the kernel. Maybe you're right. But we don't live in a world where they will, because if this coherence were going to happen, it'd work already. 
But this work solves the problem: by necessity, anything that changes a kernel image *must* update modules coherently, whether the kernel image and module come from the filesystem, network boot, some kind of SQL database, or carrier pigeon. There's nothing wrong with work that very cheaply makes the kernel self-describing (introspection is elegant) and that takes advantage of *existing* kernel tooling infrastructure to transparently do a new thing. You don't have to use this patch if you don't want to. Please stop trying to block it.
Re: [RFC] Provide in-kernel headers for making it easy to extend the kernel
On 3/6/19 3:09 PM, Pavel Machek wrote: > On Fri 2019-01-18 17:55:43, Joel Fernandes wrote: >> From: "Joel Fernandes (Google)" >> >> Introduce in-kernel headers and other artifacts which are made available >> as an archive through proc (/proc/kheaders.tgz file). This archive makes >> it possible to build kernel modules, run eBPF programs, and other >> tracing programs that need to extend the kernel for tracing purposes >> without any dependency on the file system having headers and build >> artifacts. >> >> On Android and embedded systems, it is common to switch kernels but not >> have kernel headers available on the file system. Raw kernel headers >> also cannot be copied into the filesystem like they can be on other >> distros, due to licensing and other issues. There's no linux-headers > > If your licensing prevents you from having headers on the > filesystem... then I guess you should fix the licensing. > > I agree with Christoph, this looks pretty horrible. > Pavel > The argument that "it can be a module" is basically an admission of failure - if it isn't part of the kernel image itself there is no benefit over where the modules are stored, which will be *somewhere* in the filesystem. What I *do* think makes sense is to create an archive with this information and stuff it in the same place as the modules. It reduces the amount it is possible to muck it up. -hpa
Re: [PATCH 0/3] mincore() and IOCB_NOWAIT adjustments
Andrew Morton wrote on Wed, Mar 06, 2019: > On Wed, 6 Mar 2019 23:48:03 +0100 (CET) Jiri Kosina wrote: > > > 3/3 is actually waiting for your decision, see > > > > https://lore.kernel.org/lkml/20190212063643.gl15...@dhcp22.suse.cz/ > > I pity anyone who tried to understand this code by reading this code. > Can we please get some careful commentary in there explaining what is > going on, and why things are thus? > > I guess the [3/3] change makes sense, although it's unclear whether > anyone really needs it? 5.0 was released with 574823bfab8 ("Change > mincore() to count "mapped" pages rather than "cached" pages") so we'll > have a release cycle to somewhat determine how much impact 574823bfab8 > has on users. How about I queue up [3/3] and we reevaluate its > desirability in a couple of months? FWIW, 574823bfab8 has been reverted in 30bac164aca750, included in 5.0-rc4, so the controversial change has only been there from 5.0-rc1 to 5.0-rc3 -- Dominique
Re: [PATCH] fs: 9p: Kconfig: pedantic cleanups
Enrico Weigelt, metux IT consult wrote on Wed, Mar 06, 2019: > Signed-off-by: Enrico Weigelt, metux IT consult I don't mind trivial patches but please resend with a description of what is done (change spaces to tabs) in the commit message. That aside I guess there's no harm in it, thanks for the boring work. -- Dominique
Re: [PATCH 0/3] mincore() and IOCB_NOWAIT adjustments
On Wed, 6 Mar 2019 23:48:03 +0100 (CET) Jiri Kosina wrote: > 3/3 is actually waiting for your decision, see > > https://lore.kernel.org/lkml/20190212063643.gl15...@dhcp22.suse.cz/ I pity anyone who tried to understand this code by reading this code. Can we please get some careful commentary in there explaining what is going on, and why things are thus? I guess the [3/3] change makes sense, although it's unclear whether anyone really needs it? 5.0 was released with 574823bfab8 ("Change mincore() to count "mapped" pages rather than "cached" pages") so we'll have a release cycle to somewhat determine how much impact 574823bfab8 has on users. How about I queue up [3/3] and we reevaluate its desirability in a couple of months?
Re: [PATCH v2 07/12] NTB: Introduce functions to calculate multi-port resource index
On 2019-03-06 3:45 p.m., Serge Semin wrote: [Snip] Pretty sure everything above is just agreement... > So your current approach is inbound MW-centralized, while mine is developed > around the outbound MWs. I don't think this has anything to do with inbound vs outbound. The problem is the same no matter from which direction you assign things. >> Physical Port Number: 0 2 4 6 8 12 16 20 >> Logical Port Number: 0 1 2 3 4 5 6 7 >> Peer Index (Port 0): x 0 1 2 3 4 5 6 >> Port Index (Port 8): 0 1 2 3 x 4 5 6 >> (etc) > > That's what I suggested in the two possible solutions: > 1st solution: replace current pidx with Logical Port Number, > 2nd solution: alter ntb_peer_resource_idx() so it would return the Logical > Port Number. Well my solution wasn't to change pidx and no, we don't want to change ntb_peer_resource_idx() to return the logical port number because that's not how it's being used and I have no use for a ntb_peer_port_global_idx() function. Functions are supposed to be added by code that needs to call them, not by some grand design. Part of the reason we have this confusing mess is because the API was reviewed and merged before any users of the API were presented. Usually this is not accepted in kernel development. My suggestion is to simply say that the existing port numbers are the logical port number and have the drivers handle the physical port number mapping internally. That requires the fewest changes. > IMO In case of the 2nd solution I'd also suggest to rename the > ntb_peer_resource_idx() method into ntb_peer_port_global_idx(), > and then consider the current port indexes used in the NTB API > as local port indexes. The resource indexing can be abstracted > by a macro like this: > #define ntb_peer_resource_idx ntb_peer_port_global_idx That define would not be useful. > Finally in order to close the space up we'd also need to define > a method: ntb_port_global_idx(), which would return a Logical (global) > index of local port. 
Again, I'd rather not add a bunch of large semantic and infrastructure changes at this point. It's confusing enough as it is and we don't need to introduce yet another indexing scheme API to the clients that really do not need it. What the clients need is a simple API to decide which resources to use for which peers, and to figure out which peers used which resources. ntb_peer_port_idx() and ntb_peer_resource_idx() suit these purposes. Nothing else really needs to exist. >> Where the Physical Port Number is whatever the hardware uses and the >> logical port number is a numbering scheme starting with zero with no >> gaps. Then the port indexes are still as we currently have them. If we >> say that the port numbers we have now are the Logical Port Number, then >> ntb_peer_resource_idx() is correct. >> > > Current port numbers are the physical port numbers with gaps. I think that's up for interpretation as, based on the existing code, I naturally interpreted it the other way and therefore it's pretty simple to say that it's the logical port number and fix the one driver that needs to change. > That's why we > introduced the port-index NTB API abstraction in the first place, to have > these gaps > eliminated and to provide a simple way of bulk setup. Although that > abstraction > turned out not that suitable to distribute the shared resources. So > the Logical (Global) indexing is needed to do it (that's what ntb_pingpong > used > to do and ntb_perf still does now). My interpretation of the port-index was simply to match what was done in the two port case seeing code like ntb_transport simply uses the default 0 as the port index. There was no reason to believe, based on the code, that there would be gaps. >> I would strongly argue that the clients don't need to know anything >> about the Physical Port Number and these should be handled strictly >> inside the drivers. 
If multiple drivers need to do something similar to >> map the logical to physical port numbers then we should introduce helper >> functions to allow them to do so. If the Physical Numbers are not >> contained in the driver then the API would need to be expanded to expose >> which numbers are actually used to avoid needing to constantly loop >> through all the indexes to find this out. >> > > Absolutely agree with you. The main idea of NTB API was to provide a set > of methods to access the NTB hardware without any abstractions but > with possible useful helpers, like your NTB MSI library, or transport library, > or anything else. So the physical port numbers must be available for > the client drivers. Huh? How can you say you absolutely agree with me? I said the clients should not need to know about physical port numbers and you said the physical port numbers *must* be available to clients. I think this statement needs to be justified. Why should the clients need to know about the physical port numbers? >> In a similar vein, I'd suggest that
[PATCH] dt-bindings: remoteproc: Rename qcom,adsp binding document
The qcom,adsp binding document has evolved to cover the mechanism to load and boot firmware on a range of different Hexagon based remote subsystems. Rename the binding and update the description to better capture this. Signed-off-by: Bjorn Andersson --- .../{qcom,adsp.txt => qcom,q6v5-pas.txt} | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) rename Documentation/devicetree/bindings/remoteproc/{qcom,adsp.txt => qcom,q6v5-pas.txt} (88%) diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5-pas.txt similarity index 88% rename from Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt rename to Documentation/devicetree/bindings/remoteproc/qcom,q6v5-pas.txt index 292dfda9770d..bbe9c7465a2c 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.txt +++ b/Documentation/devicetree/bindings/remoteproc/qcom,q6v5-pas.txt @@ -1,7 +1,8 @@ -Qualcomm ADSP Peripheral Image Loader +Qualcomm TrustZone based Peripheral Image Loader -This document defines the binding for a component that loads and boots firmware -on the Qualcomm ADSP Hexagon core. +This document defines the binding for the TrustZone based "Peripheral +Authentication Service" (PAS) interface for loading and booting firmware on the +Qualcomm Hexagon cores. - compatible: Usage: required @@ -85,10 +86,10 @@ on the Qualcomm ADSP Hexagon core. = SUBNODES -The adsp node may have an subnode named either "smd-edge" or "glink-edge" that -describes the communication edge, channels and devices related to the ADSP. -See ../soc/qcom/qcom,smd.txt and ../soc/qcom/qcom,glink.txt for details on how -to describe these. +The remoteproc node may have an subnode named either "smd-edge" or "glink-edge" +that describes the communication edge, channels and devices related to the +remoteproc. See ../soc/qcom/qcom,smd.txt and ../soc/qcom/qcom,glink.txt for +details on how to describe these. = EXAMPLE -- 2.18.0
Re: [PATCH v1] clk: Probe defer clk_get() on orphans
Quoting Jeffrey Hugo (2019-03-06 13:48:13) > Ping? > > Stephen, I know as this depends on your clock parent handling series > (happens to apply just fine to v2), it's not going to be accepted until > that gets sorted out, but do you have any thoughts on if this seems like > an appropriate thing to do, or if you'd like to see a different solution? Please don't ping during the merge window. I'll probably look at this patch next week.
[PATCH v2 1/5] i2c: mux: pca9541: use the BIT macro
Because it looks nice! Reviewed-by: Guenter Roeck Reviewed-by: Vladimir Zapolskiy Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-pca9541.c | 29 +++-- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 9e75d6b9140b..30cabf482985 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -16,6 +16,7 @@ * warranty of any kind, whether express or implied. */ +#include #include #include #include @@ -43,20 +44,20 @@ #define PCA9541_CONTROL0x01 #define PCA9541_ISTAT 0x02 -#define PCA9541_CTL_MYBUS (1 << 0) -#define PCA9541_CTL_NMYBUS (1 << 1) -#define PCA9541_CTL_BUSON (1 << 2) -#define PCA9541_CTL_NBUSON (1 << 3) -#define PCA9541_CTL_BUSINIT(1 << 4) -#define PCA9541_CTL_TESTON (1 << 6) -#define PCA9541_CTL_NTESTON(1 << 7) - -#define PCA9541_ISTAT_INTIN(1 << 0) -#define PCA9541_ISTAT_BUSINIT (1 << 1) -#define PCA9541_ISTAT_BUSOK(1 << 2) -#define PCA9541_ISTAT_BUSLOST (1 << 3) -#define PCA9541_ISTAT_MYTEST (1 << 6) -#define PCA9541_ISTAT_NMYTEST (1 << 7) +#define PCA9541_CTL_MYBUS BIT(0) +#define PCA9541_CTL_NMYBUS BIT(1) +#define PCA9541_CTL_BUSON BIT(2) +#define PCA9541_CTL_NBUSON BIT(3) +#define PCA9541_CTL_BUSINITBIT(4) +#define PCA9541_CTL_TESTON BIT(6) +#define PCA9541_CTL_NTESTONBIT(7) + +#define PCA9541_ISTAT_INTINBIT(0) +#define PCA9541_ISTAT_BUSINIT BIT(1) +#define PCA9541_ISTAT_BUSOKBIT(2) +#define PCA9541_ISTAT_BUSLOST BIT(3) +#define PCA9541_ISTAT_MYTEST BIT(6) +#define PCA9541_ISTAT_NMYTEST BIT(7) #define BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) #define MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) -- 2.11.0
[PATCH v2 5/5] i2c: mux: pca9541: add support for PCA9641
Heavily based on code from Ken Chen . Signed-off-by: Peter Rosin --- drivers/i2c/muxes/Kconfig | 6 +- drivers/i2c/muxes/i2c-mux-pca9541.c | 137 ++-- 2 files changed, 136 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 52a4a922e7e6..8532841de5db 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -55,10 +55,10 @@ config I2C_MUX_LTC4306 will be called i2c-mux-ltc4306. config I2C_MUX_PCA9541 - tristate "NXP PCA9541 I2C Master Selector" + tristate "NXP PCA9541/PCA9641 I2C Master Selectors" help - If you say yes here you get support for the NXP PCA9541 - I2C Master Selector. + If you say yes here you get support for the NXP PCA9541/PCA9641 + I2C Master Selectors. This driver can also be built as a module. If so, the module will be called i2c-mux-pca9541. diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 5eb36e3223d5..5d4e0c92e978 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -1,5 +1,5 @@ /* - * I2C multiplexer driver for PCA9541 bus master selector + * I2C multiplexer driver for PCA9541/PCA9641 bus master selectors * * Copyright (c) 2010 Ericsson AB. * @@ -28,8 +28,8 @@ #include /* - * The PCA9541 is a bus master selector. It supports two I2C masters connected - * to a single slave bus. + * The PCA9541 and PCA9641 are bus master selector. They support two I2C masters + * connected to a single slave bus. * * Before each bus transaction, a master has to acquire bus ownership. After the * transaction is complete, bus ownership has to be released. 
This fits well @@ -63,6 +63,33 @@ #define PCA9541_BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) #define PCA9541_MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) +#define PCA9641_ID 0x00 +#define PCA9641_ID_MAGIC 0x38 + +#define PCA9641_CONTROL0x01 +#define PCA9641_STATUS 0x02 +#define PCA9641_TIME 0x03 + +#define PCA9641_CTL_LOCK_REQ BIT(0) +#define PCA9641_CTL_LOCK_GRANT BIT(1) +#define PCA9641_CTL_BUS_CONNECTBIT(2) +#define PCA9641_CTL_BUS_INIT BIT(3) +#define PCA9641_CTL_SMBUS_SWRSTBIT(4) +#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5) +#define PCA9641_CTL_SMBUS_DIS BIT(6) +#define PCA9641_CTL_PRIORITY BIT(7) + +#define PCA9641_STS_OTHER_LOCK BIT(0) +#define PCA9641_STS_BUS_INIT_FAIL BIT(1) +#define PCA9641_STS_BUS_HUNG BIT(2) +#define PCA9641_STS_MBOX_EMPTY BIT(3) +#define PCA9641_STS_MBOX_FULL BIT(4) +#define PCA9641_STS_TEST_INT BIT(5) +#define PCA9641_STS_SCL_IO BIT(6) +#define PCA9641_STS_SDA_IO BIT(7) + +#define PCA9641_RES_TIME 0x03 + /* arbitration timeouts, in jiffies */ #define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ #define ARB2_TIMEOUT (HZ / 4)/* 250 ms until acquisition failure */ @@ -73,6 +100,7 @@ enum chip_name { pca9541, + pca9641, }; struct chip_desc { @@ -102,6 +130,21 @@ static bool pca9541_busoff(int ctl) return (ctl & PCA9541_BUSON) == PCA9541_BUSON; } +static bool pca9641_lock_grant(int ctl) +{ + return !!(ctl & PCA9641_CTL_LOCK_GRANT); +} + +static bool pca9641_other_lock(int sts) +{ + return !!(sts & PCA9641_STS_OTHER_LOCK); +} + +static bool pca9641_busoff(int ctl, int sts) +{ + return !pca9641_lock_grant(ctl) && !pca9641_other_lock(sts); +} + /* * Write to chip register. Don't use i2c_transfer()/i2c_smbus_xfer() * as they will try to lock the adapter a second time. @@ -256,6 +299,86 @@ static int pca9541_arbitrate(struct i2c_client *client) return 0; } +/* Release bus. 
*/ +static void pca9641_release_bus(struct i2c_client *client) +{ + pca9541_reg_write(client, PCA9641_CONTROL, 0); +} + +/* + * Channel arbitration + * + * Return values: + * <0: error + * 0 : bus not acquired + * 1 : bus acquired + */ +static int pca9641_arbitrate(struct i2c_client *client) +{ + struct i2c_mux_core *muxc = i2c_get_clientdata(client); + struct pca9541 *data = i2c_mux_priv(muxc); + int reg_ctl, reg_sts; + + reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL); + if (reg_ctl < 0) + return reg_ctl; + reg_sts = pca9541_reg_read(client, PCA9641_STATUS); + + if (pca9641_busoff(reg_ctl, reg_sts)) { + /* +* Bus is off. Request ownership or turn it on unless +* other master requested ownership. +*/ + reg_ctl |= PCA9641_CTL_LOCK_REQ; + pca9541_reg_write(client, PCA9641_CONTROL, reg_ctl); + reg_ctl =
[PATCH v2 2/5] i2c: mux: pca9541: namespace cleanup
In preparation for PCA9641 support, convert the mybus and busoff macros to functions, and in the process prefix them with pca9541_. Also prefix remaining chip specific macros with PCA9541_. Reviewed-by: Vladimir Zapolskiy Reviewed-by: Guenter Roeck Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-pca9541.c | 26 +++--- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 30cabf482985..28f46450f4b4 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -59,10 +59,8 @@ #define PCA9541_ISTAT_MYTEST BIT(6) #define PCA9541_ISTAT_NMYTEST BIT(7) -#define BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) -#define MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) -#define mybus(x) (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS) -#define busoff(x) (!((x) & BUSON) || ((x) & BUSON) == BUSON) +#define PCA9541_BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) +#define PCA9541_MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) /* arbitration timeouts, in jiffies */ #define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ @@ -93,6 +91,20 @@ static const struct of_device_id pca9541_of_match[] = { MODULE_DEVICE_TABLE(of, pca9541_of_match); #endif +static bool pca9541_mybus(int ctl) +{ + if (!(ctl & PCA9541_MYBUS)) + return true; + return (ctl & PCA9541_MYBUS) == PCA9541_MYBUS; +} + +static bool pca9541_busoff(int ctl) +{ + if (!(ctl & PCA9541_BUSON)) + return true; + return (ctl & PCA9541_BUSON) == PCA9541_BUSON; +} + /* * Write to chip register. Don't use i2c_transfer()/i2c_smbus_xfer() * as they will try to lock the adapter a second time. 
@@ -134,7 +146,7 @@ static void pca9541_release_bus(struct i2c_client *client) int reg; reg = pca9541_reg_read(client, PCA9541_CONTROL); - if (reg >= 0 && !busoff(reg) && mybus(reg)) + if (reg >= 0 && !pca9541_busoff(reg) && pca9541_mybus(reg)) pca9541_reg_write(client, PCA9541_CONTROL, (reg & PCA9541_CTL_NBUSON) >> 1); } @@ -186,7 +198,7 @@ static int pca9541_arbitrate(struct i2c_client *client) if (reg < 0) return reg; - if (busoff(reg)) { + if (pca9541_busoff(reg)) { int istat; /* * Bus is off. Request ownership or turn it on unless @@ -211,7 +223,7 @@ static int pca9541_arbitrate(struct i2c_client *client) */ data->select_timeout = SELECT_DELAY_LONG * 2; } - } else if (mybus(reg)) { + } else if (pca9541_mybus(reg)) { /* * Bus is on, and we own it. We are done with acquisition. * Reset NTESTON and BUSINIT, then return success. -- 2.11.0
[PATCH v2 3/5] i2c: mux: pca9541: prepare for PCA9641 support
Make the arbitrate and release_bus implementation chip specific. Reviewed-by: Guenter Roeck Reviewed-by: Vladimir Zapolskiy Signed-off-by: Peter Rosin --- drivers/i2c/muxes/i2c-mux-pca9541.c | 62 +++-- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 28f46450f4b4..5eb36e3223d5 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -70,26 +71,22 @@ #define SELECT_DELAY_SHORT 50 #define SELECT_DELAY_LONG 1000 -struct pca9541 { - struct i2c_client *client; - unsigned long select_timeout; - unsigned long arb_timeout; +enum chip_name { + pca9541, }; -static const struct i2c_device_id pca9541_id[] = { - {"pca9541", 0}, - {} +struct chip_desc { + int (*arbitrate)(struct i2c_client *client); + void (*release_bus)(struct i2c_client *client); }; -MODULE_DEVICE_TABLE(i2c, pca9541_id); +struct pca9541 { + const struct chip_desc *chip; -#ifdef CONFIG_OF -static const struct of_device_id pca9541_of_match[] = { - { .compatible = "nxp,pca9541" }, - {} + struct i2c_client *client; + unsigned long select_timeout; + unsigned long arb_timeout; }; -MODULE_DEVICE_TABLE(of, pca9541_of_match); -#endif static bool pca9541_mybus(int ctl) { @@ -271,7 +268,7 @@ static int pca9541_select_chan(struct i2c_mux_core *muxc, u32 chan) /* force bus ownership after this time */ do { - ret = pca9541_arbitrate(client); + ret = data->chip->arbitrate(client); if (ret) return ret < 0 ? 
ret : 0; @@ -289,10 +286,32 @@ static int pca9541_release_chan(struct i2c_mux_core *muxc, u32 chan) struct pca9541 *data = i2c_mux_priv(muxc); struct i2c_client *client = data->client; - pca9541_release_bus(client); + data->chip->release_bus(client); return 0; } +static const struct chip_desc chips[] = { + [pca9541] = { + .arbitrate = pca9541_arbitrate, + .release_bus = pca9541_release_bus, + }, +}; + +static const struct i2c_device_id pca9541_id[] = { + { "pca9541", pca9541 }, + {} +}; + +MODULE_DEVICE_TABLE(i2c, pca9541_id); + +#ifdef CONFIG_OF +static const struct of_device_id pca9541_of_match[] = { + { .compatible = "nxp,pca9541", .data = &chips[pca9541] }, + {} +}; +MODULE_DEVICE_TABLE(of, pca9541_of_match); +#endif + /* * I2C init/probing/exit functions */ @@ -301,6 +320,8 @@ static int pca9541_probe(struct i2c_client *client, { struct i2c_adapter *adap = client->adapter; struct pca954x_platform_data *pdata = dev_get_platdata(&client->dev); + const struct of_device_id *match; + const struct chip_desc *chip; struct i2c_mux_core *muxc; struct pca9541 *data; int force; @@ -309,12 +330,18 @@ static int pca9541_probe(struct i2c_client *client, if (!i2c_check_functionality(adap, I2C_FUNC_SMBUS_BYTE_DATA)) return -ENODEV; + match = of_match_device(of_match_ptr(pca9541_of_match), &client->dev); + if (match) + chip = of_device_get_match_data(&client->dev); + else + chip = &chips[id->driver_data]; + /* * I2C accesses are unprotected here. * We have to lock the I2C segment before releasing the bus. */ i2c_lock_bus(adap, I2C_LOCK_SEGMENT); - pca9541_release_bus(client); + chip->release_bus(client); i2c_unlock_bus(adap, I2C_LOCK_SEGMENT); /* Create mux adapter */ @@ -329,6 +356,7 @@ static int pca9541_probe(struct i2c_client *client, return -ENOMEM; data = i2c_mux_priv(muxc); + data->chip = chip; data->client = client; i2c_set_clientdata(client, muxc); -- 2.11.0
[PATCH v2 4/5] dt-bindings: i2c: pca9541: extend with compatible for PCA9641
The binding is equivalent apart from the compatible. Signed-off-by: Peter Rosin --- Documentation/devicetree/bindings/i2c/nxp,pca9541.txt | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/devicetree/bindings/i2c/nxp,pca9541.txt b/Documentation/devicetree/bindings/i2c/nxp,pca9541.txt index 42bfc09c8918..17b4cb9d76da 100644 --- a/Documentation/devicetree/bindings/i2c/nxp,pca9541.txt +++ b/Documentation/devicetree/bindings/i2c/nxp,pca9541.txt @@ -1,8 +1,10 @@ -* NXP PCA9541 I2C bus master selector +* NXP PCA9541/PCA9641 I2C bus master selectors Required Properties: - - compatible: Must be "nxp,pca9541" + - compatible: Must be either of + "nxp,pca9541" + "nxp,pca9641" - reg: The I2C address of the device. -- 2.11.0
[PATCH v2 0/5] i2c: mux: pca9541: extend with support for pca9641
Hi! So, it's been a year or so since this was last visited. Time flies. At that time, Ken Chen gave up and I didn't want to add untested code. However, Pradeep Srinivasan asked about PCA9641 and so I have now rebased the preparatory patches to v5.0 and added the bits Ken wrote on top of the framework I wrote. Looking forward to some test results, this has only been build-tested. The actual code that does anything remotely interesting with the PCA9641 is all Ken's work, and I have no knowledge of whether it works or not. Changes since last year (https://lkml.org/lkml/2018/3/20/205) - rebased to v5.0 - changed a couple of helper functions to return bool instead of int 0/1 - added dt-bindings patch - warped Ken's patch to fit on top of the preparatory work in patches 1-3 Cheers, Peter Peter Rosin (5): i2c: mux: pca9541: use the BIT macro i2c: mux: pca9541: namespace cleanup i2c: mux: pca9541: prepare for PCA9641 support dt-bindings: i2c: pca9541: extend with compatible for PCA9641 i2c: mux: pca9541: add support for PCA9641 .../devicetree/bindings/i2c/nxp,pca9541.txt| 6 +- drivers/i2c/muxes/Kconfig | 6 +- drivers/i2c/muxes/i2c-mux-pca9541.c| 252 + 3 files changed, 218 insertions(+), 46 deletions(-) -- 2.11.0
Re: [PATCH 1/3] mm/mincore: make mincore() more conservative
On Wed, 30 Jan 2019 13:44:18 +0100 Vlastimil Babka wrote: > From: Jiri Kosina > > The semantics of what mincore() considers to be resident is not completely > clear, but Linux has always (since 2.3.52, which is when mincore() was > initially done) treated it as "page is available in page cache". > > That's potentially a problem, as that [in]directly exposes meta-information > about pagecache / memory mapping state even about memory not strictly > belonging > to the process executing the syscall, opening possibilities for sidechannel > attacks. > > Change the semantics of mincore() so that it only reveals pagecache > information > for non-anonymous mappings that belong to files that the calling process could > (if it tried to) successfully open for writing. "for writing" comes as a bit of a surprise. Why not for reading? Could we please explain the reasoning in the changelog and in the (presently absent) comments which describe can_do_mincore()? > @@ -189,8 +197,13 @@ static long do_mincore(unsigned long addr, unsigned long > pages, unsigned char *v > vma = find_vma(current->mm, addr); > if (!vma || addr < vma->vm_start) > return -ENOMEM; > - mincore_walk.mm = vma->vm_mm; > end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); > + if (!can_do_mincore(vma)) { > + unsigned long pages = (end - addr) >> PAGE_SHIFT; I'm not sure this is correct in all cases. If addr = 4095 vma->vm_end = 4096 pages = 1000 then `end' is 4096 and `(end - addr) >> PAGE_SHIFT' is zero, but it should have been 1. Please check? A mincore test suite in tools/testing/selftests would be useful, methinks. To exercise such corner cases, check for future breakage, etc. > + memset(vec, 1, pages); > + return pages; > + } > + mincore_walk.mm = vma->vm_mm; > err = walk_page_range(addr, end, &mincore_walk); > if (err < 0) > return err;
Re: [PATCH v2 09/12] NTB: Introduce MSI library
On Wed, Mar 06, 2019 at 02:35:53PM -0700, Logan Gunthorpe wrote: > > > On 2019-03-06 1:26 p.m., Serge Semin wrote: > > First of all, It might be unsafe to have some resources consumed by NTB > > MSI or some other library without a simple way to warn NTB client drivers > > about their attempts to access that resources, since it might lead to random > > errors. When I thought about implementing a transport library based on the > > Message/Spad+Doorbell registers, I had in mind to create an internal > > bits-field > > array with the resources busy-flags. If, for instance, some message or > > scratchpad register is occupied by the library (MSI, transport or some > > else), > > then it would be impossible to access these resources directly through NTB > > API > > methods. So NTB client driver shall retrieve an error in an attempt to > > write/read data to/from busy message or scratchpad register, or in an > > attempt > > to set some occupied doorbell bit. The same thing can be done for memory > > windows. > > Yes, it would be nice to have a generic library to manage all the > resources, but right now we don't and it's unfair to expect us to take > on this work to get the features we care about merged. Right now, it's > not at all unsafe as the client is quite capable of ensuring it has the > resources for the MSI library. The changes for ntb_transport to ensure > this are quite reasonable. > > > Second tiny concern is about documentation. Since there is a special file > > for > > all NTB-related doc, it would be good to have some description about the > > NTB MSI library there as well: > > Documentation/ntb.txt > > Sure, I'll add a short blurb for v3. Though, I noticed it's quite out of > date since your changes. Especially in the ntb_tool section... > Ok. Thanks. If you want you can add some info to the ntb_tool section as well. If you don't have time, I'll update it next time I submit anything new to the subsystem. 
-Sergey > >> + u32 *peer_mws[]; > > > > Shouldn't we use the __iomem attribute here since later the devm_ioremap() > > is > > used to map MWs at these pointers? > > Yes, will change for v3. > > > > Simpler and faster cleanup-code would be: > > > + unroll: > > + for (--i; i >= 0; --i) > > + devm_iounmap(>dev, ntb->msi->peer_mws[i]); > > Faster, maybe, but I would not consider this simpler. It's much more > complicated to reason about and ensure it's correct. I prefer my way > because I don't care about speed, but I do care about readability. > > > > Alas calling the ntb_mw_set_trans() method isn't enough to fully initialize > > NTB Memory Windows. Yes, the library will work for Intel/AMD/Switchtec > > (two-ports legacy configuration), but will fail for IDT due to being based > > on > > the outbound MW xlat interface. So the library at this stage isn't portable > > across all NTB hardware. In order to make it working the translation > > address is > > supposed to be transferred to the peer side, where a peer code should call > > ntb_peer_mw_set_trans() method with the retrieved xlat address. > > See documentation for details: > > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/Documentation/ntb.txt > > > > ntb_perf driver can be also used as a reference of the portable NTB MWs > > setup. > > Gross. Well, given that ntb_transport doesn't even support this and we > don't really have a sensible library to transfer this information, I'm > going to leave it as is for now. Someone can update ntb_msi when they > update ntb_transport, preferably after we have a nice library to handle > the transfers for us seeing I absolutely do not want to replicate the > mess in ntb_perf. > > Actually, if we had a generic spad/msg communication library, it would > probably be better to have a common ntb_mw_set_trans() function that > uses the communications library to send the data and automatically call > ntb_peer_mw_set_trans() on the peer. 
That way we don't have to push this > mess into the clients. > > > The same cleanup pattern can be utilized here: > > +error_out: > > + for (--peer; peer >= 0; --peer) { > > + peer_widx = ntb_peer_highest_mw_idx(ntb, peer); > > + ntb_mw_clear_trans(ntb, i, peer_widx); > > + } > > > > So you won't need "i" variable here anymore. You also don't need to check > > the > > return value of ntb_peer_highest_mw_idx() in the cleanup loop because it > > was already checked in the main algo code. > > See above. > > >> +EXPORT_SYMBOL(ntb_msi_clear_mws); > >> + > > > > Similarly something like ntb_msi_peer_clear_mws() should be added to > > unset a translation address on the peer side. > > Well, we can table that for when ntb_msi supports the peer MW setting > functions. > >> +int ntb_msi_peer_trigger(struct ntb_dev *ntb, int peer, > >> + struct ntb_msi_desc *desc) > >> +{ > >> + int idx; > >> + > >> + if (!ntb->msi) > >> + return -EINVAL; > >> + > >> + idx =
Re: [RFC][Patch v9 0/6] KVM: Guest Free Page Hinting
On Wed, Mar 6, 2019 at 2:18 PM Michael S. Tsirkin wrote: > > On Wed, Mar 06, 2019 at 10:40:57PM +0100, David Hildenbrand wrote: > > On 06.03.19 21:32, Michael S. Tsirkin wrote: > > > On Wed, Mar 06, 2019 at 07:59:57PM +0100, David Hildenbrand wrote: > > >> On 06.03.19 19:43, Michael S. Tsirkin wrote: > > >>> On Wed, Mar 06, 2019 at 01:30:14PM -0500, Nitesh Narayan Lal wrote: > > >> Here are the results: > > >> > > >> Procedure: 3 Guests of size 5GB is launched on a single NUMA node > > >> with > > >> total memory of 15GB and no swap. In each of the guest, memhog is run > > >> with 5GB. Post-execution of memhog, Host memory usage is monitored by > > >> using Free command. > > >> > > >> Without Hinting: > > >> Time of executionHost used memory > > >> Guest 1:45 seconds5.4 GB > > >> Guest 2:45 seconds10 GB > > >> Guest 3:1 minute 15 GB > > >> > > >> With Hinting: > > >> Time of execution Host used memory > > >> Guest 1:49 seconds2.4 GB > > >> Guest 2:40 seconds4.3 GB > > >> Guest 3:50 seconds6.3 GB > > > OK so no improvement. > > If we are looking in terms of memory we are getting back from the > > guest, > > then there is an improvement. However, if we are looking at the > > improvement in terms of time of execution of memhog then yes there is > > none. > > >>> > > >>> Yes but the way I see it you can't overcommit this unused memory > > >>> since guests can start using it at any time. You timed it carefully > > >>> such that this does not happen, but what will cause this timing on real > > >>> guests? > > >> > > >> Whenever you overcommit you will need backup swap. > > > > > > Right and the point of hinting is that pages can just be > > > discarded and not end up in swap. > > > > > > > > > Point is you should be able to see the gain. > > > > > > Hinting patches cost some CPU so we need to know whether > > > they cost too much. How much is too much? When the cost > > > is bigger than benefit. But we can't compare CPU cycles > > > to bytes. 
So we need to benchmark everything in terms of > > > cycles. > > > > > >> There is no way > > >> around it. It just makes the probability of you having to go to disk > > >> less likely. > > > > > > > > > Right and let's quantify this. Does this result in net gain or loss? > > > > Yes, I am totally with you. But if it is a net benefit heavily depends > > on the setup. E.g. what kind of storage used for the swap, how fast, is > > the same disk also used for other I/O ... > > > > Also, CPU is a totally different resource than I/O. While you might have > > plenty of CPU cycles to spare, your I/O throughput might already be > > limited. Same goes into the other direction. > > > > So it might not be as easy as comparing two numbers. It really depends > > on the setup. Well, not completely true, with 0% CPU overhead we would > > have a clear winner with hinting ;) > > I mean users need to know about this too. > > Are these hinting patches a gain: > - on zram > - on ssd > - on a rotating disk > - none of the above > ? > > If users don't know when would they enable hinting? > > Close to no one is going to try all possible configurations, test > exhaustively and find an optimal default for their workload. > So it's our job to figure it out and provide guidance. Right. I think for now I will stick to testing on what I have which is a SSD for swap, and no-overcommit for the "none of the above" case. BTW it looks like this patch set introduced a pretty heavy penalty for the no-overcommit case. For a 32G VM with no overcommit a 32G memhog test is now taking over 50 seconds whereas without the patch set I can complete the test in around 20 seconds. > > > > > > > > > > >> If you assume that all of your guests will be using all of their memory > > >> all the time, you don't have to think about overcommitting memory in the > > >> first place. But this is not what we usually have. > > > > > > Right and swap is there to support overcommit. 
However it > > > was felt that hinting can be faster since it avoids IO > > > involved in swap. > > > > Feels like it, I/O is prone to be slow. > > > > > > -- > > > > Thanks, > > > > David / dhildenb > > OK so should be measureable. > > -- > MST
Re: 4.20.1: BUG: unable to handle kernel paging request at 0000100000000008
On Tue 2019-01-22 11:15:39, Harald Dunkel wrote: > Is this the wrong list to report this problem? I haven't found a > "mem" mailing list on vger. Right list, ugly looking problem. How reproducible is it? You may want to look at MAINTAINERS file, cc x86 and memory management people...? Pavel > Regards > Harri > - > On 1/14/19 2:01 PM, Harald Dunkel wrote: > >Hi folks, > > > >my server stumbled over this last night: > > > >Jan 13 19:03:15 sylvester kernel: [272280.820190] BUG: unable to handle > >kernel paging request at 1008 > >Jan 13 19:03:15 sylvester kernel: [272280.820198] PGD 0 P4D 0 > >Jan 13 19:03:15 sylvester kernel: [272280.820203] Oops: [#1] PREEMPT > >SMP PTI > >Jan 13 19:03:15 sylvester kernel: [272280.820207] CPU: 3 PID: 46 Comm: > >kswapd0 Not tainted 4.20.1-raw #1 > >Jan 13 19:03:15 sylvester kernel: [272280.820211] Hardware name: /DH67CF, > >BIOS BLH6710H.86A.0125.2011.0705.1517 07/05/2011 > >Jan 13 19:03:15 sylvester kernel: [272280.820219] RIP: > >0010:find_get_entries+0xed/0x240 > >Jan 13 19:03:15 sylvester kernel: [272280.820223] Code: 4e 4e 00 48 89 c2 48 > >85 d2 0f 84 91 00 00 00 48 81 fa 02 04 00 00 0f 84 a3 00 00 00 48 81 fa 06 > >04 00 00 74 ca f6 > >Jan 13 19:03:15 sylvester kernel: [272280.820230] RSP: 0018:c9fdb9c8 > >EFLAGS: 00010246 > >Jan 13 19:03:15 sylvester kernel: [272280.820234] RAX: 1000 RBX: > >000f RCX: > >Jan 13 19:03:15 sylvester kernel: [272280.820238] RDX: 1000 RSI: > >0680 RDI: ffc0 > >Jan 13 19:03:15 sylvester kernel: [272280.820242] RBP: 0002 R08: > >8880a98b7d80 R09: > >Jan 13 19:03:15 sylvester kernel: [272280.820246] R10: 004b R11: > >c9fdb9c8 R12: > >Jan 13 19:03:15 sylvester kernel: [272280.820250] R13: c9fdbae0 R14: > >c9fdba60 R15: > >Jan 13 19:03:15 sylvester kernel: [272280.820255] FS: > >() GS:88821798() knlGS: > >Jan 13 19:03:15 sylvester kernel: [272280.820259] CS: 0010 DS: ES: > > CR0: 80050033 > >Jan 13 19:03:15 sylvester kernel: [272280.820263] CR2: 1008 CR3: > >0200a003 CR4: 000606e0 > >Jan 13 19:03:15 
sylvester kernel: [272280.820268] Call Trace: > >Jan 13 19:03:15 sylvester kernel: [272280.820276] > >pagevec_lookup_entries+0x15/0x20 > >Jan 13 19:03:15 sylvester kernel: [272280.820280] > >truncate_inode_pages_range+0xc5/0x810 > >Jan 13 19:03:15 sylvester kernel: [272280.820289] ? ___cache_free+0x2c/0x2f0 > >Jan 13 19:03:15 sylvester kernel: [272280.820296] ? > >jbd2_journal_release_jbd_inode+0x105/0x130 [jbd2] > >Jan 13 19:03:15 sylvester kernel: [272280.820301] ? iput+0x1e9/0x210 > >Jan 13 19:03:15 sylvester kernel: [272280.820305] ? > >__inode_wait_for_writeback+0x7a/0xe0 > >Jan 13 19:03:15 sylvester kernel: [272280.820319] > >ext4_evict_inode+0x52/0x5d0 [ext4] > >Jan 13 19:03:15 sylvester kernel: [272280.820325] evict+0xbf/0x190 > >Jan 13 19:03:15 sylvester kernel: [272280.820328] dispose_list+0x30/0x40 > >Jan 13 19:03:15 sylvester kernel: [272280.820332] prune_icache_sb+0x4d/0x70 > >Jan 13 19:03:15 sylvester kernel: [272280.820337] > >super_cache_scan+0x121/0x1a0 > >Jan 13 19:03:15 sylvester kernel: [272280.820342] do_shrink_slab+0x136/0x2d0 > >Jan 13 19:03:15 sylvester kernel: [272280.820346] shrink_slab+0x22a/0x290 > >Jan 13 19:03:15 sylvester kernel: [272280.820351] ? > >__mod_node_page_state+0x64/0xa0 > >Jan 13 19:03:15 sylvester kernel: [272280.820354] shrink_node+0xe1/0x450 > >Jan 13 19:03:15 sylvester kernel: [272280.820358] kswapd+0x3f9/0x740 > >Jan 13 19:03:15 sylvester kernel: [272280.820362] ? > >mem_cgroup_shrink_node+0x1b0/0x1b0 > >Jan 13 19:03:15 sylvester kernel: [272280.820367] kthread+0x111/0x130 > >Jan 13 19:03:15 sylvester kernel: [272280.820371] ? 
> >kthread_create_worker_on_cpu+0x60/0x60 > >Jan 13 19:03:15 sylvester kernel: [272280.820376] ret_from_fork+0x35/0x40 > >Jan 13 19:03:15 sylvester kernel: [272280.820380] Modules linked in: cpuid > >fuse btrfs xor zstd_compress raid6_pq zstd_decompress xxhash ufs hfsplus > >ntfs vfat msdos fat j > >Jan 13 19:03:15 sylvester kernel: [272280.820423] scsi_mod mfd_core usbcore > >e1000e > >Jan 13 19:03:15 sylvester kernel: [272280.820452] CR2: 1008 > >Jan 13 19:03:15 sylvester kernel: [272280.820456] ---[ end trace > >b383b200f5d976a9 ]--- > >Jan 13 19:03:15 sylvester kernel: [272280.820461] RIP: > >0010:find_get_entries+0xed/0x240 > >Jan 13 19:03:15 sylvester kernel: [272280.820465] Code: 4e 4e 00 48 89 c2 48 > >85 d2 0f 84 91 00 00 00 48 81 fa 02 04 00 00 0f 84 a3 00 00 00 48 81 fa 06 > >04 00 00 74 ca f6 > >Jan 13
Re: [PATCH] devfreq: Suspend all devices on system shutdown
On Fri 2019-01-25 14:54:03, Marek Szyprowski wrote: > This way devfreq core ensures that all its devices will be set to safe > operation points before reboot operation. There are boards on which some > aggressive power saving operation points are beyond the capabilities of > the bootloader to properly reset the hardware and boot the board. This > way one can avoid board crash early after reboot. > > Similar pattern is used in CPUfreq subsystem. This looks somewhat dangerous to me. I guess this will break someone's shutdown, and on battery-powered devices, that's quite a bad thing to do. Could we explicitly do it only for devices that need it? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: [PATCH v5 00/18] mfd: demodularization of non-modular drivers
On Wed 2019-01-16 13:24:31, Lee Jones wrote: > [...] > > > Paul Gortmaker (18): > > mfd: aat2870-core: Make it explicitly non-modular > > mfd: adp5520: Make it explicitly non-modular > > mfd: as3711: Make it explicitly non-modular > > mfd: db8500-prcmu: drop unused MODULE_ tags from non-modular code > > mfd: htc-i2cpld: Make it explicitly non-modular > > mfd: max8925-core: drop unused MODULE_ tags from non-modular code > > mfd: rc5t583: Make it explicitly non-modular > > mfd: sta2x11: drop unused MODULE_ tags from non-modular code > > mfd: syscon: Make it explicitly non-modular > > mfd: tps65090: Make it explicitly non-modular > > mfd: tps65910: Make it explicitly non-modular > > mfd: tps80031: Make it explicitly non-modular > > mfd: wm831x-spi: Make it explicitly non-modular > > mfd: wm831x-i2c: Make it explicitly non-modular > > mfd: wm831x-core: drop unused module infrastructure from non-modular code > > mfd: wm8350-i2c: Make it explicitly non-modular > > mfd: wm8350-core: drop unused module infrastructure from non-modular code > > mfd: wm8400-core: Make it explicitly non-modular > > > > drivers/mfd/aat2870-core.c | 40 > > +++- > > drivers/mfd/adp5520.c | 30 +++--- > > drivers/mfd/as3711.c| 14 -- > > drivers/mfd/db8500-prcmu.c | 10 -- > > drivers/mfd/htc-i2cpld.c| 18 +- > > 20 files changed, 41 insertions(+), 332 deletions(-) > > All applied! Is it a good idea? We want distro kernels on ARM, too, which means people will eventually want these as modules, no? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: System crash with perf_fuzzer (kernel: 5.0.0-rc3)
On Fri 2019-01-25 08:00:56, Andi Kleen wrote: > > [Fri Jan 25 10:28:53 2019] perf: interrupt took too long (2501 > 2500), > > lowering kernel.perf_event_max_sample_rate to 79750 > > [Fri Jan 25 10:29:08 2019] perf: interrupt took too long (3136 > 3126), > > lowering kernel.perf_event_max_sample_rate to 63750 > > [Fri Jan 25 10:29:11 2019] perf: interrupt took too long (4140 > 3920), > > lowering kernel.perf_event_max_sample_rate to 48250 > > [Fri Jan 25 10:29:11 2019] perf: interrupt took too long (5231 > 5175), > > lowering kernel.perf_event_max_sample_rate to 38000 > > [Fri Jan 25 10:29:11 2019] perf: interrupt took too long (6736 > 6538), > > lowering kernel.perf_event_max_sample_rate to 29500 > > These are fairly normal. Unfortunately, they are. Could we set up our defaults so that they don't normally happen? pavel@amd:~/g/unicsy_demo$ dmesg | grep "took too" [ 761.507893] perf: interrupt took too long (2516 > 2500), lowering kernel.perf_event_max_sample_rate to 79250 [ 4736.674595] perf: interrupt took too long (3209 > 3145), lowering kernel.perf_event_max_sample_rate to 62250 pavel@amd:~/g/unicsy_demo$ -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: [RFC] Provide in-kernel headers for making it easy to extend the kernel
On Fri 2019-01-18 17:55:43, Joel Fernandes wrote: > From: "Joel Fernandes (Google)" > > Introduce in-kernel headers and other artifacts which are made available > as an archive through proc (/proc/kheaders.tgz file). This archive makes > it possible to build kernel modules, run eBPF programs, and other > tracing programs that need to extend the kernel for tracing purposes > without any dependency on the file system having headers and build > artifacts. > > On Android and embedded systems, it is common to switch kernels but not > have kernel headers available on the file system. Raw kernel headers > also cannot be copied into the filesystem like they can be on other > distros, due to licensing and other issues. There's no linux-headers If your licensing prevents you from having headers on the filesystem... then I guess you should fix the licensing. I agree with Christoph, this looks pretty horrible. Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: [RFC] Provide in-kernel headers for making it easy to extend the kernel
> > >Ok, I'll look into LZMA. Thanks for checking the compression sizes. > > > > > >- Joel > > > > Don't use lzma, use xz if you are going to do something. > > Ok, sounds good. > > > However, it seems unlikely to me that someone not willing to spend the > > space in the filesystem will spend unswappable kernel memory. > > > > It would seem that a far saner way to do this is to use inittmpfs or > > perhaps an auxiliary "ktmpfs" so it can at least be swapped out if you have > > swap. > > But this is already possible with the proposed solution, you would load the > module, extract it into a tmpfs, and unload the module. TMPFS pages can > already be swapped. So your licensing requirements prevent you from having headers in the filesystem, but allow module with the headers hidden inside on the filesystem? Looks like you should just tar xvzf this-is-a-kernel-module-I-promise.ko /usr/src/linux/include :-). Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
[PATCH 3/7] i3c: master: dw: remove dead code from dw_i3c_master_*_xfers()
Detected by CoverityScan (Event result_independent_of_operands): "(i3c_xfers + i).len > 65536" is always false regardless of the values of its operands. This occurs as the logical operand of "if" "(i2c_xfers + i).len > 65536" is always false regardless of the values of its operands. This occurs as the logical operand of "if" Signed-off-by: Vitor Soares --- drivers/i3c/master/dw-i3c-master.c | 10 -- 1 file changed, 10 deletions(-) diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c index bb03079..eef6fae 100644 --- a/drivers/i3c/master/dw-i3c-master.c +++ b/drivers/i3c/master/dw-i3c-master.c @@ -840,11 +840,6 @@ static int dw_i3c_master_priv_xfers(struct i3c_dev_desc *dev, return -ENOTSUPP; for (i = 0; i < i3c_nxfers; i++) { - if (i3c_xfers[i].len > COMMAND_PORT_ARG_DATA_LEN_MAX) - return -ENOTSUPP; - } - - for (i = 0; i < i3c_nxfers; i++) { if (i3c_xfers[i].rnw) nrxwords += DIV_ROUND_UP(i3c_xfers[i].len, 4); else @@ -973,11 +968,6 @@ static int dw_i3c_master_i2c_xfers(struct i2c_dev_desc *dev, return -ENOTSUPP; for (i = 0; i < i2c_nxfers; i++) { - if (i2c_xfers[i].len > COMMAND_PORT_ARG_DATA_LEN_MAX) - return -ENOTSUPP; - } - - for (i = 0; i < i2c_nxfers; i++) { if (i2c_xfers[i].flags & I2C_M_RD) nrxwords += DIV_ROUND_UP(i2c_xfers[i].len, 4); else -- 2.7.4
[PATCH v1] drm/tegra: gem: Fix CPU-cache maintenance for BO's allocated using get_pages()
The allocated pages need to be invalidated in CPU caches. On ARM32 the DMA_BIDIRECTIONAL flag only ensures that data is written-back to DRAM and the data stays in CPU cache lines. While the DMA_FROM_DEVICE flag ensures that the corresponding CPU cache lines are getting invalidated and nothing more, that's exactly what is needed for newly allocated pages. This fixes randomly failing rendercheck tests on Tegra30 using the Opentegra driver for tests that use small-sized pixmaps (10x10 and less, i.e. 1-2 memory pages) because apparently CPU reads out stale data from caches and/or that data is getting evicted to DRAM at the time of HW job execution. Cc: stable Signed-off-by: Dmitry Osipenko --- drivers/gpu/drm/tegra/gem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 4f80100ff5f3..4cce11fd8836 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -204,7 +204,7 @@ static void tegra_bo_free(struct drm_device *drm, struct tegra_bo *bo) { if (bo->pages) { dma_unmap_sg(drm->dev, bo->sgt->sgl, bo->sgt->nents, -DMA_BIDIRECTIONAL); +DMA_FROM_DEVICE); drm_gem_put_pages(&bo->gem, bo->pages, true, true); sg_free_table(bo->sgt); kfree(bo->sgt); @@ -230,7 +230,7 @@ static int tegra_bo_get_pages(struct drm_device *drm, struct tegra_bo *bo) } err = dma_map_sg(drm->dev, bo->sgt->sgl, bo->sgt->nents, -DMA_BIDIRECTIONAL); +DMA_FROM_DEVICE); if (err == 0) { err = -EFAULT; goto free_sgt; -- 2.20.1
[GIT PULL] RTC for 5.1
Hello Linus, Here is the pull-request for the RTC subsystem for 5.1. There is an unusual amount of new drivers this cycle, and this explains the number of insertions. Other than that, the changes are the usual fixes and feature addition. The following changes since commit bfeffd155283772bbe78c6a05dec7c0128ee500c: Linux 5.0-rc1 (2019-01-06 17:08:20 -0800) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/abelloni/linux.git tags/rtc-5.1 for you to fetch changes up to e91b94fd2bc411b5926031496bbc3de9a96bb1c6: rtc: pic32: convert to SPDX identifier (2019-03-04 20:23:15 +0100) RTC for 5.0 Subsystem: - new quartz-load-femtofarads DT property for quartz load capacitance - remove rtc_class_ops.read_callback New drivers: - Abracon AB-RTCMC-32.768kHz-EOZ9 - Amlogic Meson RTC - Cadence RTC IP - Microcrystal RV3028 - Whwave sd3078 Drivers: - cmos: ignore bogus century byte - ds1307: rework rx8130 support - isl1208: add isl1209 support, nvmem support - rs5C372: report invalid time when the oscillator stopped - rx8581: add rx8571 support Alexandre Belloni (20): rtc: pcf8523: Add rv8523 compatible rtc: imx-sc: depend on HAVE_ARM_SMCCC rtc: meson: remove useless rtc_nvmem_unregister call rtc: rv3028: add new driver rtc: hid-sensor-time: stop selecting IIO rtc: remove rtc_class_ops.read_callback rtc: imx-sc: use rtc_time64_to_tm rtc: zynqmp: fix possible race condition rtc: zynqmp: let the core handle range rtc: tx4939: remove useless test rtc: tx4939: set range rtc: tx4939: switch to rtc_time64_to_tm/rtc_tm_to_time64 rtc: tx4939: use .set_time rtc: tx4939: convert to SPDX identifier rtc: rv8803: let the core handle range rtc: rv8803: convert to SPDX identifier rtc: update my email address rtc: pic32: convert to devm_rtc_allocate_device rtc: pic32: let the core handle range rtc: pic32: convert to SPDX identifier Alexey Roslyakov (1): rtc: pcf85063: remove dead code Anson Huang (2): rtc: snvs: make sure clock is enabled for interrupt handle rtc: 
imx-sc: add rtc set time support Artem Panfilov (2): dt-bindings: rtc: add ABEOZ9 rtc: add AB-RTCMC-32.768kHz-EOZ9 RTC support Biju Das (2): dt-bindings: rtc: add rx8571 compatible rtc: rx8581: Add support for Epson rx8571 RTC Colin Ian King (4): rtc: ds1672: fix unintended sign extension rtc: 88pm860x: fix unintended sign extension rtc: 88pm80x: fix unintended sign extension rtc: pm8xxx: fix unintended sign extension Dianlong Li (3): dt-bindings: define vendor prefix for whwave, Inc. dt-bindings: rtc: sd3078: add device tree documentation rtc: sd3078: new driver. Eric Wong (1): rtc: cmos: ignore bogus century byte Jan Kotas (2): dt-bindings: rtc: Add bindings for Cadence RTC rtc: Add Cadence RTC driver Kangjie Lu (2): rtc: coh901331: fix a missing check of clk_prepare rtc: hym8563: fix a missing check of block data read Marek Szyprowski (2): rtc: s3c: Rewrite clock handling rtc: s3c: Use generic helper to get driver data Marek Vasut (2): dt-bindings: rtc: Add RV1805 to abracon,abx80x bindings rtc: abx80x: Configure reserved bits in RV1805 Martin Blumenstingl (2): dt-bindings: rtc: add device-tree bindings for the Amlogic Meson RTC rtc: support for the Amlogic Meson RTC oliver.r...@wago.com (2): rtc: rs5c372: r2221: fix to use the correct XSTP bit rtc: rs5c372: Fix reading from rtc when the oscillator got interrupted. 
Sam Ravnborg (6): devicetree: property-units: Add femtofarads unit dt-bindings: rtc: Add quartz-load-femtofarads property dt-binding: pcf8523: add xtal load capacitance dt-binding: pcf85063: add xtal load capacitance rtc: pcf8523: set xtal load capacitance from DT rtc: pcf85063: set xtal load capacitance from DT Trent Piepho (5): rtc: isl1208: fix negative digital trim reporting rtc: isl1208: Introduce driver state struct rtc: isl1208: Support more chip variations rtc: isl1208: Add new style nvmem support to driver dt-bindings: rtc: Update for new chip in isl1208 series Uwe Kleine-König (5): rtc: ds1307: Move register definitions to start of file rtc: ds1307: forward declare chips array instead of a bunch of functions rtc: ds1307: correct register offset for rx8130 rtc: ds1307: rx8130: honor Voltage Loss Flag when reading the time rtc: ds1307: rx8130: Fix alarm handling Wei Yongjun (1): rtc: sd3078: make symbol 'sd3078_driver' static YueHaibing (1): rtc: sd3078: fix platform_no_drv_owner.cocci warnings ZhangXiaoxu (1): rtc: Fix UBSAN overflow warning
[PATCH v1 2/3] iommu/tegra-smmu: Properly release domain resources
Release all memory allocations associated with a released domain and emit a warning if the domain is in use at the time of destruction. Signed-off-by: Dmitry Osipenko --- drivers/iommu/tegra-smmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 8d30653cd13a..27b1249f0773 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -327,6 +327,9 @@ static void tegra_smmu_domain_free(struct iommu_domain *domain) /* TODO: free page directory and page tables */ + WARN_ON_ONCE(as->use_count); + kfree(as->count); + kfree(as->pts); kfree(as); } -- 2.20.1
[PATCH v1 3/3] iommu/tegra-smmu: Respect IOMMU API read-write protections
Set PTE read/write attributes according to the protections requested by the IOMMU API. Signed-off-by: Dmitry Osipenko --- drivers/iommu/tegra-smmu.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 27b1249f0773..463ee08f7d3a 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -145,8 +145,6 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset) #define SMMU_PDE_ATTR (SMMU_PDE_READABLE | SMMU_PDE_WRITABLE | \ SMMU_PDE_NONSECURE) -#define SMMU_PTE_ATTR (SMMU_PTE_READABLE | SMMU_PTE_WRITABLE | \ -SMMU_PTE_NONSECURE) static unsigned int iova_pd_index(unsigned long iova) { @@ -659,6 +657,7 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, { struct tegra_smmu_as *as = to_smmu_as(domain); dma_addr_t pte_dma; + u32 pte_attrs; u32 *pte; pte = as_get_pte(as, iova, _dma); @@ -669,8 +668,16 @@ static int tegra_smmu_map(struct iommu_domain *domain, unsigned long iova, if (*pte == 0) tegra_smmu_pte_get_use(as, iova); + pte_attrs = SMMU_PTE_NONSECURE; + + if (prot & IOMMU_READ) + pte_attrs |= SMMU_PTE_READABLE; + + if (prot & IOMMU_WRITE) + pte_attrs |= SMMU_PTE_WRITABLE; + tegra_smmu_set_pte(as, iova, pte, pte_dma, - __phys_to_pfn(paddr) | SMMU_PTE_ATTR); + __phys_to_pfn(paddr) | pte_attrs); return 0; } -- 2.20.1
[PATCH v1 1/3] iommu/tegra-smmu: Fix invalid ASID bits on Tegra30/114
Both Tegra30 and Tegra114 have 4 ASIDs and the corresponding bitfield of the TLB_FLUSH register differs from later Tegra generations that have 128 ASIDs. As a result of this change, the PTEs are now flushed correctly from the TLB, which fixes problems with graphics (randomly failing tests) on Tegra30. Cc: stable Signed-off-by: Dmitry Osipenko --- drivers/iommu/tegra-smmu.c | 25 ++--- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 5182c7d6171e..8d30653cd13a 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -102,7 +102,6 @@ static inline u32 smmu_readl(struct tegra_smmu *smmu, unsigned long offset) #define SMMU_TLB_FLUSH_VA_MATCH_ALL (0 << 0) #define SMMU_TLB_FLUSH_VA_MATCH_SECTION (2 << 0) #define SMMU_TLB_FLUSH_VA_MATCH_GROUP (3 << 0) -#define SMMU_TLB_FLUSH_ASID(x) (((x) & 0x7f) << 24) #define SMMU_TLB_FLUSH_VA_SECTION(addr) addr) & 0xffc0) >> 12) | \ SMMU_TLB_FLUSH_VA_MATCH_SECTION) #define SMMU_TLB_FLUSH_VA_GROUP(addr) addr) & 0xc000) >> 12) | \ @@ -205,8 +204,12 @@ static inline void smmu_flush_tlb_asid(struct tegra_smmu *smmu, { u32 value; - value = SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_ASID(asid) | - SMMU_TLB_FLUSH_VA_MATCH_ALL; + if (smmu->soc->num_asids == 4) + value = (asid & 0x3) << 29; + else + value = (asid & 0x7f) << 24; + + value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_MATCH_ALL; smmu_writel(smmu, value, SMMU_TLB_FLUSH); } @@ -216,8 +219,12 @@ static inline void smmu_flush_tlb_section(struct tegra_smmu *smmu, { u32 value; - value = SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_ASID(asid) | - SMMU_TLB_FLUSH_VA_SECTION(iova); + if (smmu->soc->num_asids == 4) + value = (asid & 0x3) << 29; + else + value = (asid & 0x7f) << 24; + + value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_SECTION(iova); smmu_writel(smmu, value, SMMU_TLB_FLUSH); } @@ -227,8 +234,12 @@ static inline void smmu_flush_tlb_group(struct tegra_smmu *smmu, { u32 value; - value = 
SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_ASID(asid) | - SMMU_TLB_FLUSH_VA_GROUP(iova); + if (smmu->soc->num_asids == 4) + value = (asid & 0x3) << 29; + else + value = (asid & 0x7f) << 24; + + value |= SMMU_TLB_FLUSH_ASID_MATCH | SMMU_TLB_FLUSH_VA_GROUP(iova); smmu_writel(smmu, value, SMMU_TLB_FLUSH); } -- 2.20.1
[PATCH v1 0/3] IOMMU: Tegra SMMU fixes
Hello, This small series primarily fixes a bug that affects Tegra30 and Tegra114 platforms; it also carries two patches that improve SMMU functionality and clean up code a tad. Dmitry Osipenko (3): iommu/tegra-smmu: Fix invalid ASID bits on Tegra30/114 iommu/tegra-smmu: Properly release domain resources iommu/tegra-smmu: Respect IOMMU API read-write protections drivers/iommu/tegra-smmu.c | 41 -- 1 file changed, 31 insertions(+), 10 deletions(-) -- 2.20.1
[PATCH] nfsd: allow nfsv3 readdir request to be larger.
nfsd currently reports the NFSv3 dtpref FSINFO parameter to be PAGE_SIZE, so NFS clients will typically ask for one page of directory entries at a time. This is needlessly restrictive as nfsd can handle larger replies easily. Also, a READDIR request (but not a READDIRPLUS request) has the count size clipped to PAGE_SIZE, again unnecessary. This patch lifts these limits so that larger readdir requests can be used. Signed-off-by: NeilBrown --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs3xdr.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index c9cf46e0c040..8f933e84cec1 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -588,7 +588,7 @@ nfsd3_proc_fsinfo(struct svc_rqst *rqstp) resp->f_wtmax = max_blocksize; resp->f_wtpref = max_blocksize; resp->f_wtmult = PAGE_SIZE; - resp->f_dtpref = PAGE_SIZE; + resp->f_dtpref = max_blocksize; resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 83919116d5cb..93fea246f676 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -573,6 +573,8 @@ int nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_readdirargs *args = rqstp->rq_argp; + u32 max_blocksize = svc_max_payload(rqstp); + p = decode_fh(p, >fh); if (!p) return 0; @@ -580,7 +582,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p) args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); - args->count = min_t(u32, args->count, PAGE_SIZE); + args->count = min_t(u32, args->count, max_blocksize); args->buffer = page_address(*(rqstp->rq_next_page++)); return xdr_argsize_check(rqstp, p); -- 2.14.0.rc0.dirty signature.asc Description: PGP signature
Re: [PATCH 0/3] mincore() and IOCB_NOWAIT adjustments
On Wed, 6 Mar 2019, Andrew Morton wrote: > > could you please take at least the correct and straightforward fix for > > mincore() before we figure out how to deal with the slightly less > > practical RWF_NOWAIT? Thanks. > > I assume we're talking about [1/3] and [2/3] from this thread? > > Can we have a resend please? Gather the various acks and revisions, > make changelog changes to address the review questions and comments? 1/3 is clearly the one to be merged. The version with all the acks gathered is in this thread, at https://lore.kernel.org/lkml/de52b3bd-4e39-c133-542a-0a9c5e357...@suse.cz/ Attaching the patch also at the end of this mail so that it could be easily picked up. I am unfortunately not sure what changelog changes you are talking about, there were none requested during the review as far as I know. 2/3 is clearly postponed for now, it needs more thinking. 3/3 is actually waiting for your decision, see https://lore.kernel.org/lkml/20190212063643.gl15...@dhcp22.suse.cz/ The 1/3 patch to be merged in any case: === cut here === From: Jiri Kosina Date: Wed, 16 Jan 2019 20:53:17 +0100 Subject: [PATCH v2] mm/mincore: make mincore() more conservative The semantics of what mincore() considers to be resident is not completely clear, but Linux has always (since 2.3.52, which is when mincore() was initially done) treated it as "page is available in page cache". That's potentially a problem, as that [in]directly exposes meta-information about pagecache / memory mapping state even about memory not strictly belonging to the process executing the syscall, opening possibilities for sidechannel attacks. Change the semantics of mincore() so that it only reveals pagecache information for non-anonymous mappings that belong to files that the calling process could (if it tried to) successfully open for writing. 
[mho...@suse.com: restructure can_do_mincore() conditions] Originally-by: Linus Torvalds Originally-by: Dominique Martinet Cc: Dominique Martinet Cc: Andy Lutomirski Cc: Dave Chinner Cc: Kevin Easton Cc: Matthew Wilcox Cc: Cyril Hrubis Cc: Tejun Heo Cc: Kirill A. Shutemov Cc: Daniel Gruss Signed-off-by: Jiri Kosina Signed-off-by: Vlastimil Babka Acked-by: Josh Snyder Acked-by: Michal Hocko --- mm/mincore.c | 17 - 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/mm/mincore.c b/mm/mincore.c index 218099b5ed31..b8842b849604 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -169,6 +169,16 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -189,8 +199,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = (end - addr) >> PAGE_SHIFT; + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, _walk); if (err < 0) return err; -- Jiri Kosina SUSE Labs
Re: [PATCH v2 07/12] NTB: Introduce functions to calculate multi-port resource index
On Wed, Mar 06, 2019 at 12:11:11PM -0700, Logan Gunthorpe wrote: > > > On 2019-03-05 6:24 p.m., Serge Semin wrote: > >> + * In a 5 peer system, this function will return the following matrix > >> + * > >> + * pidx \ port01234 > >> + * 0 00123 > >> + * 1 01234 > >> + * 2 01234 > >> + * 3 01234 > >> + * > > Oh, first, oops: looks like I copied this down wrong anyway; the code > was what I had intended, but the documented example should have been: > > pidx \ local_port 01234 > 000123 > 101123 > 201223 > 301233 > > And this is definitely the correct table we are aiming for. > ntb_peer_resource_idx() is supposed to return the result of > ntb_peer_port_idx(ntb, local_port) when run on the peer specified by pidx. > > Note: this table also makes sense because it only uses 4 resources for 5 > ports which is the best case scenario. (In other words, to communicate > between N ports, N-1 resources are required on each peer). > Yes, it does use the resources as tightly as possible, but only for the case of pure integer port numbering. If, however, there are gaps in the port number space (which is the only case we have in supported hardware at this moment), it will lead to a failure if there are ports with higher numbers than there are MWs available (MWs availability depends on the IDT chip firmware). Additionally it creates gaps in the MWs space if physical ports are numbered with gaps. Since the only multi-port device we've got now is IDT, and it always has its ports numbered with gaps as I described, the current implementation will definitely cause problems. > > This table is too simplified to represent a generic case of port-index > > mapping table. In particular the IDT PCIe switch got it ports numbered > > with uneven integers like: 0 2 4 6 8 12 16 20 or 0 8 16, and so on. > > Moreover some of the ports might be disabled or may have NTB functions > > deactivated, in which case these ports shouldn't be considered by NTB > > subsystem > > at all. 
Basically we may have any increasing subset of that port > > numbers depending on the current IDT PCIe-switch ports setup. > > Yes, I did not consider situations where there would be gaps in the > "port number" space. It wasn't at all clear from the code that this was > possible. Switchtec hardware could be configured for such an > arrangement, but I don't know why anyone would do that as it just > needlessly complicates everything. > > As you point out, with a gap, we end up with something that is wrong: > > pidx \ port 01345 > 0 00234 > 1 01234 > 2 01334 > 3 01344 > > Here, the relationship between ntb_peer_resource_idx() and > ntb_peer_port_idx() is not maintained and it seems to prescribe 5 > resources for 5 ports. If there were more gaps it would be even more wrong. > Exactly. The table will look even worse for the port numbers: 0 2 4 6 8 12 16 20. > >> +static inline int ntb_peer_resource_idx(struct ntb_dev *ntb, int pidx) > >> +{ > >> + int local_port, peer_port; > >> + > >> + if (pidx >= ntb_peer_port_count(ntb)) > >> + return -EINVAL; > >> + > >> + local_port = ntb_port_number(ntb); > >> + peer_port = ntb_peer_port_number(ntb, pidx); > >> + > >> + if (peer_port < local_port) > >> + return local_port - 1; > >> + else > >> + return local_port; > >> +} > >> + > > > > Instead of redefining the port-index table we can just fix the > > ntb_peer_resource_idx() method, so it would return a global port index > > instead of some number based on the port number. It can be done just by > > the next modification: > > > > + if (peer_port <= local_port) > > + return pidx; > > + else > > + return pidx + 1; > > > > This creates a table that looks like: > > pidx \ port 01234 > 0 10000 > 1 22111 > 2 33322 > 3 44443 > > Which is not correct. In fact, it seems to require 5 resources for 5 > ports. 
This appears to be what is done in the current ntb_perf and I > think I figured it out several months ago but it's way too messy and > hard to understand and I don't want to spend the time to figure it out > again. > Yes, this is how it used to be done in ntb_pingpong and is still done in the ntb_perf driver. And it is correctly working. As I already described and you wrote further, this table provides a Logical Ports numbering space: peer port \ local port 0 2 4 6 8 12 16 20 0
[PATCH 8/8] media: vimc: propagate pixel format in the stream
Media bus codes were being mapped to pixelformats, which causes a limitation on vimc because not all pixelformats can be mapped to media bus codes. Also, media bus codes are an internal configuration from the device. Userspace only assures media bus codes matches between pads and expects the image in a given pixelformat. So we can allow almost any media bus format to be configured between pads, except for debayer that expects a media bus code of type bayer in the sink pad. Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-capture.c | 76 +++-- drivers/media/platform/vimc/vimc-common.c | 307 drivers/media/platform/vimc/vimc-common.h | 13 + drivers/media/platform/vimc/vimc-debayer.c | 78 +++-- drivers/media/platform/vimc/vimc-scaler.c | 60 ++-- drivers/media/platform/vimc/vimc-sensor.c | 48 +-- drivers/media/platform/vimc/vimc-streamer.c | 2 + drivers/media/platform/vimc/vimc-streamer.h | 6 + 8 files changed, 281 insertions(+), 309 deletions(-) diff --git a/drivers/media/platform/vimc/vimc-capture.c b/drivers/media/platform/vimc/vimc-capture.c index e976a9d6b460..6377974879d7 100644 --- a/drivers/media/platform/vimc/vimc-capture.c +++ b/drivers/media/platform/vimc/vimc-capture.c @@ -28,6 +28,32 @@ #define VIMC_CAP_DRV_NAME "vimc-capture" +static const u32 vimc_cap_supported_pixftm[] = { + V4L2_PIX_FMT_BGR24, + V4L2_PIX_FMT_RGB24, + V4L2_PIX_FMT_ARGB32, + V4L2_PIX_FMT_SBGGR8, + V4L2_PIX_FMT_SGBRG8, + V4L2_PIX_FMT_SGRBG8, + V4L2_PIX_FMT_SRGGB8, + V4L2_PIX_FMT_SBGGR10, + V4L2_PIX_FMT_SGBRG10, + V4L2_PIX_FMT_SGRBG10, + V4L2_PIX_FMT_SRGGB10, + V4L2_PIX_FMT_SBGGR10ALAW8, + V4L2_PIX_FMT_SGBRG10ALAW8, + V4L2_PIX_FMT_SGRBG10ALAW8, + V4L2_PIX_FMT_SRGGB10ALAW8, + V4L2_PIX_FMT_SBGGR10DPCM8, + V4L2_PIX_FMT_SGBRG10DPCM8, + V4L2_PIX_FMT_SGRBG10DPCM8, + V4L2_PIX_FMT_SRGGB10DPCM8, + V4L2_PIX_FMT_SBGGR12, + V4L2_PIX_FMT_SGBRG12, + V4L2_PIX_FMT_SGRBG12, + V4L2_PIX_FMT_SRGGB12, +}; + struct vimc_cap_device { struct vimc_ent_device ved; struct video_device vdev; @@ -101,29 
+127,25 @@ static int vimc_cap_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format *format = >fmt.pix; - const struct vimc_pix_map *vpix; format->width = clamp_t(u32, format->width, VIMC_FRAME_MIN_WIDTH, VIMC_FRAME_MAX_WIDTH) & ~1; format->height = clamp_t(u32, format->height, VIMC_FRAME_MIN_HEIGHT, VIMC_FRAME_MAX_HEIGHT) & ~1; - /* Don't accept a pixelformat that is not on the table */ - vpix = vimc_pix_map_by_pixelformat(format->pixelformat); - if (!vpix) { - format->pixelformat = fmt_default.pixelformat; - vpix = vimc_pix_map_by_pixelformat(format->pixelformat); - } - /* TODO: Add support for custom bytesperline values */ - format->bytesperline = format->width * vpix->bpp; - format->sizeimage = format->bytesperline * format->height; + vimc_colorimetry_clamp(format); if (format->field == V4L2_FIELD_ANY) format->field = fmt_default.field; - vimc_colorimetry_clamp(format); + /* TODO: Add support for custom bytesperline values */ - return 0; + /* Don't accept a pixelformat that is not on the table */ + if (!v4l2_format_info(format->pixelformat)) + format->pixelformat = fmt_default.pixelformat; + + return v4l2_fill_pixfmt(format, format->pixelformat, + format->width, format->height); } static int vimc_cap_s_fmt_vid_cap(struct file *file, void *priv, @@ -159,27 +181,31 @@ static int vimc_cap_s_fmt_vid_cap(struct file *file, void *priv, static int vimc_cap_enum_fmt_vid_cap(struct file *file, void *priv, struct v4l2_fmtdesc *f) { - const struct vimc_pix_map *vpix = vimc_pix_map_by_index(f->index); - - if (!vpix) + if (f->index >= ARRAY_SIZE(vimc_cap_supported_pixftm)) return -EINVAL; - f->pixelformat = vpix->pixelformat; + f->pixelformat = vimc_cap_supported_pixftm[f->index]; return 0; } +static bool vimc_cap_is_pixfmt_supported(u32 pixelformat) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(vimc_cap_supported_pixftm); i++) + if (vimc_cap_supported_pixftm[i] == pixelformat) + return true; + return false; +} + static int 
vimc_cap_enum_framesizes(struct file *file, void *fh, struct v4l2_frmsizeenum *fsize) { - const struct vimc_pix_map *vpix; - if (fsize->index) return -EINVAL; - /* Only accept code in the pix map table */ - vpix =
[PATCH 7/8] media: vimc: stream: init/terminate the first entity
The s_stream callback was not being called for the first entity in the stream pipeline array. Instead of verifying the type of the node (video or subdevice) and calling s_stream from the second entity in the pipeline, do this process for all the entities in the pipeline for consistency. The previous code was not a problem because the first entity is a video device and not a subdevice, but this patch prepares vimc to allow setting some configuration in the entity before calling s_stream. Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-streamer.c | 25 - 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/media/platform/vimc/vimc-streamer.c b/drivers/media/platform/vimc/vimc-streamer.c index b7c1fdef5f0d..5a3bda62fbc8 100644 --- a/drivers/media/platform/vimc/vimc-streamer.c +++ b/drivers/media/platform/vimc/vimc-streamer.c @@ -46,19 +46,18 @@ static struct media_entity *vimc_get_source_entity(struct media_entity *ent) */ static void vimc_streamer_pipeline_terminate(struct vimc_stream *stream) { - struct media_entity *entity; + struct vimc_ent_device *ved; struct v4l2_subdev *sd; while (stream->pipe_size) { stream->pipe_size--; - entity = stream->ved_pipeline[stream->pipe_size]->ent; - entity = vimc_get_source_entity(entity); + ved = stream->ved_pipeline[stream->pipe_size]; stream->ved_pipeline[stream->pipe_size] = NULL; - if (!is_media_entity_v4l2_subdev(entity)) + if (!is_media_entity_v4l2_subdev(ved->ent)) continue; - sd = media_entity_to_v4l2_subdev(entity); + sd = media_entity_to_v4l2_subdev(ved->ent); v4l2_subdev_call(sd, video, s_stream, 0); } } @@ -89,18 +88,24 @@ static int vimc_streamer_pipeline_init(struct vimc_stream *stream, } stream->ved_pipeline[stream->pipe_size++] = ved; + if (is_media_entity_v4l2_subdev(ved->ent)) { + sd = media_entity_to_v4l2_subdev(ved->ent); + ret = v4l2_subdev_call(sd, video, s_stream, 1); + if (ret && ret != -ENOIOCTLCMD) { + pr_err("subdev_call error %s\n", ved->ent->name); + 
vimc_streamer_pipeline_terminate(stream); + return ret; + } + } + entity = vimc_get_source_entity(ved->ent); /* Check if the end of the pipeline was reached*/ if (!entity) return 0; + /* Get the next device in the pipeline */ if (is_media_entity_v4l2_subdev(entity)) { sd = media_entity_to_v4l2_subdev(entity); - ret = v4l2_subdev_call(sd, video, s_stream, 1); - if (ret && ret != -ENOIOCTLCMD) { - vimc_streamer_pipeline_terminate(stream); - return ret; - } ved = v4l2_get_subdevdata(sd); } else { vdev = container_of(entity, -- 2.20.1
[PATCH 4/8] media: v4l2-common: add bayer formats in v4l2_format_info
Add bayer format information in struct v4l2_format_info table. Signed-off-by: Helen Koike --- drivers/media/v4l2-core/v4l2-common.c | 22 ++ 1 file changed, 22 insertions(+) diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c index 11a16bb3efda..779e44d6db43 100644 --- a/drivers/media/v4l2-core/v4l2-common.c +++ b/drivers/media/v4l2-core/v4l2-common.c @@ -517,6 +517,28 @@ const struct v4l2_format_info *v4l2_format_info(u32 format) { .format = V4L2_PIX_FMT_NV21M, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 2 }, { .format = V4L2_PIX_FMT_NV16M, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, { .format = V4L2_PIX_FMT_NV61M, .mem_planes = 2, .comp_planes = 2, .bpp = { 1, 2, 0, 0 }, .hdiv = 2, .vdiv = 1 }, + + /* Bayer RGB formats */ + { .format = V4L2_PIX_FMT_SBGGR8,.mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGBRG8,.mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGRBG8,.mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SRGGB8,.mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SBGGR10, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGBRG10, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGRBG10, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SRGGB10, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SBGGR10ALAW8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGBRG10ALAW8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 
}, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGRBG10ALAW8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SRGGB10ALAW8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SBGGR10DPCM8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGBRG10DPCM8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGRBG10DPCM8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SRGGB10DPCM8, .mem_planes = 1, .comp_planes = 1, .bpp = { 1, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SBGGR12, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGBRG12, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SGRBG12, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, + { .format = V4L2_PIX_FMT_SRGGB12, .mem_planes = 1, .comp_planes = 1, .bpp = { 2, 0, 0, 0 }, .hdiv = 1, .vdiv = 1 }, }; unsigned int i; -- 2.20.1
[PATCH 5/8] media: vimc: stream: cleanup frame field from struct vimc_stream
There is no need to have the frame field in the vimc_stream struct. Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-streamer.c | 10 -- drivers/media/platform/vimc/vimc-streamer.h | 1 - 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/media/platform/vimc/vimc-streamer.c b/drivers/media/platform/vimc/vimc-streamer.c index 392754c18046..b7c1fdef5f0d 100644 --- a/drivers/media/platform/vimc/vimc-streamer.c +++ b/drivers/media/platform/vimc/vimc-streamer.c @@ -117,6 +117,7 @@ static int vimc_streamer_pipeline_init(struct vimc_stream *stream, static int vimc_streamer_thread(void *data) { struct vimc_stream *stream = data; + u8 *frame = NULL; int i; set_freezable(); @@ -127,12 +128,9 @@ static int vimc_streamer_thread(void *data) break; for (i = stream->pipe_size - 1; i >= 0; i--) { - stream->frame = stream->ved_pipeline[i]->process_frame( - stream->ved_pipeline[i], - stream->frame); - if (!stream->frame) - break; - if (IS_ERR(stream->frame)) + frame = stream->ved_pipeline[i]->process_frame( + stream->ved_pipeline[i], frame); + if (!frame || IS_ERR(frame)) break; } //wait for 60hz diff --git a/drivers/media/platform/vimc/vimc-streamer.h b/drivers/media/platform/vimc/vimc-streamer.h index 752af2e2d5a2..dc1d0be431cb 100644 --- a/drivers/media/platform/vimc/vimc-streamer.h +++ b/drivers/media/platform/vimc/vimc-streamer.h @@ -19,7 +19,6 @@ struct vimc_stream { struct media_pipeline pipe; struct vimc_ent_device *ved_pipeline[VIMC_STREAMER_PIPELINE_MAX_SIZE]; unsigned int pipe_size; - u8 *frame; struct task_struct *kthread; }; -- 2.20.1
[PATCH 6/8] media: vimc: stream: add docs to struct vimc_stream
Add missing documentation for struct vimc_stream Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-streamer.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/drivers/media/platform/vimc/vimc-streamer.h b/drivers/media/platform/vimc/vimc-streamer.h index dc1d0be431cb..a7c5ac5ace4f 100644 --- a/drivers/media/platform/vimc/vimc-streamer.h +++ b/drivers/media/platform/vimc/vimc-streamer.h @@ -15,6 +15,21 @@ #define VIMC_STREAMER_PIPELINE_MAX_SIZE 16 +/** + * struct vimc_stream - struct that represents a stream in the pipeline + * + * @pipe: the media pipeline object associated with this stream + * @ved_pipeline: array containing all the entities participating in the + * stream. The order is from a video device (usually a capture device) where + * stream_on was called, to the entity generating the first base image to be + * processed in the pipeline. + * @pipe_size: size of @ved_pipeline + * @kthread: thread that generates the frames of the stream. + * + * When the user call stream_on in a video device, struct vimc_stream is + * used to keep track of all entities and subdevices that generates and + * process frames for the stream. + */ struct vimc_stream { struct media_pipeline pipe; struct vimc_ent_device *ved_pipeline[VIMC_STREAMER_PIPELINE_MAX_SIZE]; -- 2.20.1
[PATCH 1/8] media: vimc: deb: fix default sink bayer format
The format of the sink pad should be a bayer mbus format. This fixes a kernel NULL pointer dereference error that was caused when the stream starts because the configured format was not found in the pixelmap table. Reported-by: Hans Verkuil Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-debayer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/vimc/vimc-debayer.c b/drivers/media/platform/vimc/vimc-debayer.c index eaed4233ad1b..20826f209731 100644 --- a/drivers/media/platform/vimc/vimc-debayer.c +++ b/drivers/media/platform/vimc/vimc-debayer.c @@ -66,7 +66,7 @@ struct vimc_deb_device { static const struct v4l2_mbus_framefmt sink_fmt_default = { .width = 640, .height = 480, - .code = MEDIA_BUS_FMT_RGB888_1X24, + .code = MEDIA_BUS_FMT_SRGGB8_1X8, .field = V4L2_FIELD_NONE, .colorspace = V4L2_COLORSPACE_DEFAULT, }; -- 2.20.1
[PATCH 0/8] media: vimc: remove media bus code limitation
Hello, This patch series has several vimc fixes (that I am sending in the same series only for convenience, let me know if you prefer them to be sent separately from the series). The last commit removes the vimc_pix_map_list[] that was mapping pixelformats with media bus formats, but it turns out they are not 1-to-1 equivalent and it is really painful to add other formats. Also, for the userspace, media bus formats don't really matter as long as they match between links. So this patch allows any media bus format to be configured independently of the final expected pixelformat. The series depends on "[PATCH] media: Introduce helpers to fill pixel format structs " Thanks, Helen Helen Koike (8): media: vimc: deb: fix default sink bayer format media: vimc: stream: fix thread state before sleep media: vimc: cap: fix step width/height in enum framesize media: v4l2-common: add bayer formats in v4l2_format_info media: vimc: stream: cleanup frame field from struct vimc_stream media: vimc: stream: add docs to struct vimc_stream media: vimc: stream: init/terminate the first entity media: vimc: propagate pixel format in the stream drivers/media/platform/vimc/vimc-capture.c | 80 +++-- drivers/media/platform/vimc/vimc-common.c | 307 drivers/media/platform/vimc/vimc-common.h | 13 + drivers/media/platform/vimc/vimc-debayer.c | 80 +++-- drivers/media/platform/vimc/vimc-scaler.c | 60 ++-- drivers/media/platform/vimc/vimc-sensor.c | 48 +-- drivers/media/platform/vimc/vimc-streamer.c | 39 +-- drivers/media/platform/vimc/vimc-streamer.h | 22 +- drivers/media/v4l2-core/v4l2-common.c | 22 ++ 9 files changed, 341 insertions(+), 330 deletions(-) -- 2.20.1
[PATCH 3/8] media: vimc: cap: fix step width/height in enum framesize
The type V4L2_FRMSIZE_TYPE_CONTINUOUS expects a step of 1. This fixes v4l2-compliance test error: fail: v4l2-test-formats.cpp(184): invalid step_width/height for continuous framesize test VIDIOC_ENUM_FMT/FRAMESIZES/FRAMEINTERVALS: FAIL Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-capture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/vimc/vimc-capture.c b/drivers/media/platform/vimc/vimc-capture.c index 3d433361d297..e976a9d6b460 100644 --- a/drivers/media/platform/vimc/vimc-capture.c +++ b/drivers/media/platform/vimc/vimc-capture.c @@ -187,8 +187,8 @@ static int vimc_cap_enum_framesizes(struct file *file, void *fh, fsize->stepwise.max_width = VIMC_FRAME_MAX_WIDTH; fsize->stepwise.min_height = VIMC_FRAME_MIN_HEIGHT; fsize->stepwise.max_height = VIMC_FRAME_MAX_HEIGHT; - fsize->stepwise.step_width = 2; - fsize->stepwise.step_height = 2; + fsize->stepwise.step_width = 1; + fsize->stepwise.step_height = 1; return 0; } -- 2.20.1
[PATCH 2/8] media: vimc: stream: fix thread state before sleep
The state TASK_UNINTERRUPTIBLE should be set just before schedule_timeout() call, so it knows the sleep mode it should enter. There is no point in setting TASK_UNINTERRUPTIBLE at the initialization of the thread as schedule_timeout() will set the state back to TASK_RUNNING. This fixes a warning in __might_sleep() call, as it's expecting the task to be in TASK_RUNNING state just before changing the state to a sleeping state. Reported-by: Hans Verkuil Signed-off-by: Helen Koike --- drivers/media/platform/vimc/vimc-streamer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/media/platform/vimc/vimc-streamer.c b/drivers/media/platform/vimc/vimc-streamer.c index fcc897fb247b..392754c18046 100644 --- a/drivers/media/platform/vimc/vimc-streamer.c +++ b/drivers/media/platform/vimc/vimc-streamer.c @@ -120,7 +120,6 @@ static int vimc_streamer_thread(void *data) int i; set_freezable(); - set_current_state(TASK_UNINTERRUPTIBLE); for (;;) { try_to_freeze(); @@ -137,6 +136,7 @@ static int vimc_streamer_thread(void *data) break; } //wait for 60hz + set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ / 60); } -- 2.20.1
[PATCH v4 3/4] ARM: dts: rockchip: rk3066a-mk808: enable vop0 and hdmi nodes
This patch enables the vop0 and hdmi nodes for a MK808 with rk3066 processor. Signed-off-by: Johan Jonker --- arch/arm/boot/dts/rk3066a-mk808.dts | 29 + 1 file changed, 29 insertions(+) diff --git a/arch/arm/boot/dts/rk3066a-mk808.dts b/arch/arm/boot/dts/rk3066a-mk808.dts index 9d2216d71..8bc259d3e 100644 --- a/arch/arm/boot/dts/rk3066a-mk808.dts +++ b/arch/arm/boot/dts/rk3066a-mk808.dts @@ -30,6 +30,17 @@ }; }; + hdmi_con { + compatible = "hdmi-connector"; + type = "c"; + + port { + hdmi_con_in: endpoint { + remote-endpoint = <_out_con>; + }; + }; + }; + vcc_io: vcc-io { compatible = "regulator-fixed"; regulator-name = "vcc_io"; @@ -91,6 +102,20 @@ }; }; + { + status = "okay"; +}; + +_in_vop1 { + status = "disabled"; +}; + +_out { + hdmi_out_con: endpoint { + remote-endpoint = <_con_in>; + }; +}; + { bus-width = <4>; cap-mmc-highspeed; @@ -150,6 +175,10 @@ status = "okay"; }; + { + status = "okay"; +}; + { status = "okay"; }; -- 2.11.0
[PATCH v4 4/4] dt-bindings: display: rockchip: add document for rk3066 hdmi
This patch adds a binding that describes the HDMI controller for rk3066. Signed-off-by: Johan Jonker --- .../display/rockchip/rockchip,rk3066-hdmi.txt | 72 ++ 1 file changed, 72 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.txt diff --git a/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.txt b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.txt new file mode 100644 index 0..d1ad31bca --- /dev/null +++ b/Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.txt @@ -0,0 +1,72 @@ +Rockchip specific extensions for rk3066 HDMI + + +Required properties: +- compatible: + "rockchip,rk3066-hdmi"; +- reg: + Physical base address and length of the controller's registers. +- clocks, clock-names: + Phandle to HDMI controller clock, name should be "hclk". +- interrupts: + HDMI interrupt number. +- power-domains: + Phandle to the RK3066_PD_VIO power domain. +- rockchip,grf: + This soc uses GRF regs to switch the HDMI TX input between vop0 and vop1. +- ports: + Contains one port node with two endpoints, numbered 0 and 1, + connected respectively to vop0 and vop1. + Contains one port node with one endpoint + connected to a hdmi-connector node. +- pinctrl-0, pinctrl-name: + Switch the iomux for the HPD/I2C pins to HDMI function. 
+ +Example: + hdmi: hdmi@10116000 { + compatible = "rockchip,rk3066-hdmi"; + reg = <0x10116000 0x2000>; + interrupts = ; + clocks = < HCLK_HDMI>; + clock-names = "hclk"; + power-domains = < RK3066_PD_VIO>; + rockchip,grf = <>; + pinctrl-names = "default"; + pinctrl-0 = <_xfer>, <_hpd>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + hdmi_in: port@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + hdmi_in_vop0: endpoint@0 { + reg = <0>; + remote-endpoint = <_out_hdmi>; + }; + hdmi_in_vop1: endpoint@1 { + reg = <1>; + remote-endpoint = <_out_hdmi>; + }; + }; + hdmi_out: port@1 { + reg = <1>; + hdmi_out_con: endpoint { + remote-endpoint = <_con_in>; + }; + }; + }; + }; + + { + hdmi { + hdmi_hpd: hdmi-hpd { + rockchip,pins = <0 RK_PA0 1 _pull_default>; + }; + hdmii2c_xfer: hdmii2c-xfer { + rockchip,pins = <0 RK_PA1 1 _pull_none>, + <0 RK_PA2 1 _pull_none>; + }; + }; +}; -- 2.11.0
[PATCH v4 2/4] ARM: dts: rockchip: add rk3066 hdmi nodes
From: Zheng Yang This patch adds the hdmi nodes to rk3066. Signed-off-by: Zheng Yang Signed-off-by: Johan Jonker --- arch/arm/boot/dts/rk3066a.dtsi | 52 ++ 1 file changed, 52 insertions(+) diff --git a/arch/arm/boot/dts/rk3066a.dtsi b/arch/arm/boot/dts/rk3066a.dtsi index 653127a37..9c43b985a 100644 --- a/arch/arm/boot/dts/rk3066a.dtsi +++ b/arch/arm/boot/dts/rk3066a.dtsi @@ -80,6 +80,10 @@ vop0_out: port { #address-cells = <1>; #size-cells = <0>; + vop0_out_hdmi: endpoint@0 { + reg = <0>; + remote-endpoint = <_in_vop0>; + }; }; }; @@ -101,6 +105,44 @@ vop1_out: port { #address-cells = <1>; #size-cells = <0>; + vop1_out_hdmi: endpoint@0 { + reg = <0>; + remote-endpoint = <_in_vop1>; + }; + }; + }; + + hdmi: hdmi@10116000 { + compatible = "rockchip,rk3066-hdmi"; + reg = <0x10116000 0x2000>; + interrupts = ; + clocks = < HCLK_HDMI>; + clock-names = "hclk"; + power-domains = < RK3066_PD_VIO>; + rockchip,grf = <>; + pinctrl-names = "default"; + pinctrl-0 = <_xfer>, <_hpd>; + status = "disabled"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + hdmi_in: port@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + hdmi_in_vop0: endpoint@0 { + reg = <0>; + remote-endpoint = <_out_hdmi>; + }; + hdmi_in_vop1: endpoint@1 { + reg = <1>; + remote-endpoint = <_out_hdmi>; + }; + }; + hdmi_out: port@1 { + reg = <1>; + }; }; }; @@ -415,6 +457,16 @@ }; }; + hdmi { + hdmi_hpd: hdmi-hpd { + rockchip,pins = <0 RK_PA0 1 _pull_default>; + }; + hdmii2c_xfer: hdmii2c-xfer { + rockchip,pins = <0 RK_PA1 1 _pull_none>, + <0 RK_PA2 1 _pull_none>; + }; + }; + pwm0 { pwm0_out: pwm0-out { rockchip,pins = ; -- 2.11.0
[PATCH v4 1/4] drm: rockchip: introduce rk3066 hdmi
From: Zheng Yang The RK3066 HDMI TX serves as interface between a LCD Controller and a HDMI bus. A HDMI TX consists of one HDMI transmitter controller and one HDMI transmitter PHY. The interface has three (3) 8-bit data channels which can be configured for a number of bus widths (8/10/12/16/20/24-bit) and different video formats (RGB, YCbCr). Features: HDMI version 1.4a, HDCP revision 1.4 and DVI version 1.0 compliant transmitter. Supports DTV resolutions from 480i to 1080i/p HD. Master I2C interface for a DDC connection. HDMI TX supports multiple power save modes. The HDMI TX input can switch between LCDC0 and LCDC1. (Sound support is not included in this patch) Signed-off-by: Zheng Yang Signed-off-by: Johan Jonker --- drivers/gpu/drm/rockchip/Kconfig| 8 + drivers/gpu/drm/rockchip/Makefile | 1 + drivers/gpu/drm/rockchip/rk3066_hdmi.c | 901 drivers/gpu/drm/rockchip/rk3066_hdmi.h | 226 +++ drivers/gpu/drm/rockchip/rockchip_drm_drv.c | 2 + drivers/gpu/drm/rockchip/rockchip_drm_drv.h | 1 + 6 files changed, 1139 insertions(+) create mode 100644 drivers/gpu/drm/rockchip/rk3066_hdmi.c create mode 100644 drivers/gpu/drm/rockchip/rk3066_hdmi.h diff --git a/drivers/gpu/drm/rockchip/Kconfig b/drivers/gpu/drm/rockchip/Kconfig index 1e75196f9..2cdf3b62d 100644 --- a/drivers/gpu/drm/rockchip/Kconfig +++ b/drivers/gpu/drm/rockchip/Kconfig @@ -77,4 +77,12 @@ config ROCKCHIP_RGB Some Rockchip CRTCs, like rv1108, can directly output parallel and serial RGB format to panel or connect to a conversion chip. say Y to enable its driver. + +config ROCKCHIP_RK3066_HDMI + bool "Rockchip specific extensions for RK3066 HDMI" + depends on DRM_ROCKCHIP + help + This selects support for Rockchip SoC specific extensions + for the RK3066 HDMI driver. If you want to enable + HDMI on RK3066 based SoC, you should select this option. 
endif diff --git a/drivers/gpu/drm/rockchip/Makefile b/drivers/gpu/drm/rockchip/Makefile index f6fc9d5dd..524684ba7 100644 --- a/drivers/gpu/drm/rockchip/Makefile +++ b/drivers/gpu/drm/rockchip/Makefile @@ -15,5 +15,6 @@ rockchipdrm-$(CONFIG_ROCKCHIP_DW_MIPI_DSI) += dw-mipi-dsi-rockchip.o rockchipdrm-$(CONFIG_ROCKCHIP_INNO_HDMI) += inno_hdmi.o rockchipdrm-$(CONFIG_ROCKCHIP_LVDS) += rockchip_lvds.o rockchipdrm-$(CONFIG_ROCKCHIP_RGB) += rockchip_rgb.o +rockchipdrm-$(CONFIG_ROCKCHIP_RK3066_HDMI) += rk3066_hdmi.o obj-$(CONFIG_DRM_ROCKCHIP) += rockchipdrm.o diff --git a/drivers/gpu/drm/rockchip/rk3066_hdmi.c b/drivers/gpu/drm/rockchip/rk3066_hdmi.c new file mode 100644 index 0..ff783fff4 --- /dev/null +++ b/drivers/gpu/drm/rockchip/rk3066_hdmi.c @@ -0,0 +1,901 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) Fuzhou Rockchip Electronics Co.Ltd + *Zheng Yang + */ + +#include +#include + +#include +#include +#include +#include + +#include "rk3066_hdmi.h" + +#include "rockchip_drm_drv.h" +#include "rockchip_drm_vop.h" + +#define DEFAULT_PLLA_RATE 3000 + +struct hdmi_data_info { + int vic; /* The CEA Video ID (VIC) of the current drm display mode. */ + bool sink_is_hdmi; + unsigned int enc_out_format; + unsigned int colorimetry; +}; + +struct rk3066_hdmi_i2c { + struct i2c_adapter adap; + + u8 ddc_addr; + u8 segment_addr; + u8 stat; + + struct mutex i2c_lock; /* For i2c operation. 
*/ + struct completion compl; +}; + +struct rk3066_hdmi { + struct device *dev; + struct drm_device *drm_dev; + struct regmap *grf; + int irq; + struct clk *hclk; + void __iomem *regs; + + struct drm_connector connector; + struct drm_encoder encoder; + + struct rk3066_hdmi_i2c *i2c; + struct i2c_adapter *ddc; + + unsigned int tmdsclk; + + struct hdmi_data_info hdmi_data; + struct drm_display_mode previous_mode; +}; + +#define to_rk3066_hdmi(x) container_of(x, struct rk3066_hdmi, x) + +static inline u8 hdmi_readb(struct rk3066_hdmi *hdmi, u16 offset) +{ + return readl_relaxed(hdmi->regs + offset); +} + +static inline void hdmi_writeb(struct rk3066_hdmi *hdmi, u16 offset, u32 val) +{ + writel_relaxed(val, hdmi->regs + offset); +} + +static inline void hdmi_modb(struct rk3066_hdmi *hdmi, u16 offset, +u32 msk, u32 val) +{ + u8 temp = hdmi_readb(hdmi, offset) & ~msk; + + temp |= val & msk; + hdmi_writeb(hdmi, offset, temp); +} + +static void rk3066_hdmi_i2c_init(struct rk3066_hdmi *hdmi) +{ + int ddc_bus_freq; + + ddc_bus_freq = (hdmi->tmdsclk >> 2) / HDMI_SCL_RATE; + + hdmi_writeb(hdmi, HDMI_DDC_BUS_FREQ_L, ddc_bus_freq & 0xFF); + hdmi_writeb(hdmi, HDMI_DDC_BUS_FREQ_H, (ddc_bus_freq >> 8) & 0xFF); + + /* Clear the EDID interrupt flag and mute the interrupt. */ + hdmi_modb(hdmi, HDMI_INTR_MASK1, HDMI_INTR_EDID_MASK,
[PATCH v4 0/4] Enable rk3066 VOP and HDMI for MK808
For testing only. Version: V4 Title: Enable rk3066 VOP and HDMI for MK808. This patch series only works in combination with a MK808 TV stick and a rk3066 processor. Other boxes and tablets with a rk3066 need extra software for power management and lcd's. What does it do: With these kernel patches a MK808 can show 2 penguins and a console on a DVI-D monitor in combination with a framebuffer. Not tested: HDMI TV HDCP DRM Xorg Display managers Android etc. Problems: DRM functions keep changing every rc. With v5.0-rc8 the monitor doesn't blank on poweroff, it freezes. Fixed screen size for DVI-D. HDMI sound not included. etc. /// Changes V3 > V4: rockchip,rk3066-hdmi.txt change document name rk3066_hdmi.c add more info in commit message replace deprecated drmP.h include small text style changes explain vic variable remove enc_in_format change lock name change regmap name change cmp name replace hdmi->dev by dev use sentinel use HDMI_VIDEO_VSYNC_OFFSET_SHIFT define remove unused module macros change driver name for dmesg /// Changes V2 > V3: updated to v5.0-rc8 removed patches that are already added to linux-next rk3066_hdmi.c removed gpl text small style changes removed unused includes add include for: drm_helper_hpd_irq_event drm_helper_probe_single_connector_modes update drm_hdmi_avi_infoframe_from_display_mode function call rk3066_hdmi.h removed gpl text rk3066a.dtsi add extra port for hdmi connector node rk3066a-mk808.dts add hdmi connector node connect hdmi_out with hdmi_con_in rk3066-hdmi.txt add extra port for hdmi connector node /// # How to make rkfs.cpio find . 
| cpio -o --format=newc > ../rkfs.cpio # How to compile/flash make menuconfig ARCH=arm CROSS_COMPILE=/usr/bin/arm-linux-gnueabi- make -j4 ARCH=arm CROSS_COMPILE=/usr/bin/arm-linux-gnueabi- cp ./arch/arm/boot/zImage ../zImage-dtb cat ./arch/arm/boot/dts/rk3066a-mk808.dtb >> ../zImage-dtb ../tools/rkcrc -k ../zImage-dtb ../mk808.img sudo ../tools/rkflashtool w 0x4000 0x8000 < ../mk808.img sudo ../tools/rkflashtool b /// Johan Jonker (2): ARM: dts: rockchip: rk3066a-mk808: enable vop0 and hdmi nodes dt-bindings: display: rockchip: add document for rk3066 hdmi Zheng Yang (2): drm: rockchip: introduce rk3066 hdmi ARM: dts: rockchip: add rk3066 hdmi nodes .../display/rockchip/rockchip,rk3066-hdmi.txt | 72 ++ arch/arm/boot/dts/rk3066a-mk808.dts| 29 + arch/arm/boot/dts/rk3066a.dtsi | 52 ++ drivers/gpu/drm/rockchip/Kconfig | 8 + drivers/gpu/drm/rockchip/Makefile | 1 + drivers/gpu/drm/rockchip/rk3066_hdmi.c | 901 + drivers/gpu/drm/rockchip/rk3066_hdmi.h | 226 ++ drivers/gpu/drm/rockchip/rockchip_drm_drv.c| 2 + drivers/gpu/drm/rockchip/rockchip_drm_drv.h| 1 + 9 files changed, 1292 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/rockchip/rockchip,rk3066-hdmi.txt create mode 100644 drivers/gpu/drm/rockchip/rk3066_hdmi.c create mode 100644 drivers/gpu/drm/rockchip/rk3066_hdmi.h -- 2.11.0
Re: [PATCH 0/3] mincore() and IOCB_NOWAIT adjustments
On Wed, 6 Mar 2019 13:11:39 +0100 (CET) Jiri Kosina wrote: > On Wed, 30 Jan 2019, Vlastimil Babka wrote: > > > I've collected the patches from the discussion for formal posting. The first > > two should be settled already, third one is the possible improvement I've > > mentioned earlier, where only in restricted case we resort to existence of > > page > > table mapping (the original and later reverted approach from Linus) instead > > of > > faking the result completely. Review and testing welcome. > > > > The consensus seems to be going through -mm tree for 5.1, unless Linus wants > > them already for 5.0. > > > > Jiri Kosina (2): > > mm/mincore: make mincore() more conservative > > mm/filemap: initiate readahead even if IOCB_NOWAIT is set for the I/O > > > > Vlastimil Babka (1): > > mm/mincore: provide mapped status when cached status is not allowed > > Andrew, > > could you please take at least the correct and straightforward fix for > mincore() before we figure out how to deal with the slightly less > practical RWF_NOWAIT? Thanks. I assume we're talking about [1/3] and [2/3] from this thread? Can we have a resend please? Gather the various acks and revisions, make changelog changes to address the review questions and comments? Thanks.
[tip:x86/urgent] x86/hyperv: Fix kernel panic when kexec on HyperV
Commit-ID: 179fb36abb097976997f50733d5b122a29158cba Gitweb: https://git.kernel.org/tip/179fb36abb097976997f50733d5b122a29158cba Author: Kairui Song AuthorDate: Wed, 6 Mar 2019 19:18:27 +0800 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:27:44 +0100 x86/hyperv: Fix kernel panic when kexec on HyperV After commit 68bb7bfb7985 ("X86/Hyper-V: Enable IPI enlightenments"), kexec fails with a kernel panic: kexec_core: Starting new kernel BUG: unable to handle kernel NULL pointer dereference at Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v3.0 03/02/2018 RIP: 0010:0xc901d000 Call Trace: ? __send_ipi_mask+0x1c6/0x2d0 ? hv_send_ipi_mask_allbutself+0x6d/0xb0 ? mp_save_irq+0x70/0x70 ? __ioapic_read_entry+0x32/0x50 ? ioapic_read_entry+0x39/0x50 ? clear_IO_APIC_pin+0xb8/0x110 ? native_stop_other_cpus+0x6e/0x170 ? native_machine_shutdown+0x22/0x40 ? kernel_kexec+0x136/0x156 That happens if hypercall based IPIs are used because the hypercall page is reset very early upon kexec reboot, but kexec sends IPIs to stop CPUs, which invokes the hypercall and dereferences the unusable page. To fix this, reset hv_hypercall_pg to NULL before the page is reset to avoid any misuse, IPI sending will fall back to the non hypercall based method. This only happens on kexec / kdump so just setting the pointer to NULL is good enough. Fixes: 68bb7bfb7985 ("X86/Hyper-V: Enable IPI enlightenments") Signed-off-by: Kairui Song Signed-off-by: Thomas Gleixner Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Vitaly Kuznetsov Cc: Dave Young Cc: de...@linuxdriverproject.org Link: https://lkml.kernel.org/r/20190306111827.14131-1-kas...@redhat.com --- arch/x86/hyperv/hv_init.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 7abb09e2eeb8..d3f42b6bbdac 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -406,6 +406,13 @@ void hyperv_cleanup(void) /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); + /* +* Reset hypercall page reference before reset the page, +* let hypercall operations fail safely rather than +* panic the kernel for using invalid hypercall page +*/ + hv_hypercall_pg = NULL; + /* Reset the hypercall page */ hypercall_msr.as_uint64 = 0; wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
[tip:x86/urgent] x86/mm: Remove unused variable 'old_pte'
Commit-ID: 24c41220659ecc5576c34c6f23537f8d3949fb05 Gitweb: https://git.kernel.org/tip/24c41220659ecc5576c34c6f23537f8d3949fb05 Author: Qian Cai AuthorDate: Fri, 1 Mar 2019 10:29:24 -0500 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:24:53 +0100 x86/mm: Remove unused variable 'old_pte' The commit 3a19109efbfa ("x86/mm: Fix try_preserve_large_page() to handle large PAT bit") fixed try_preserve_large_page() by using the corresponding pud/pmd prot/pfn interfaces, but left a variable unused because it no longer used pte_pfn(). Later, the commit 8679de0959e6 ("x86/mm/cpa: Split, rename and clean up try_preserve_large_page()") renamed try_preserve_large_page() to __should_split_large_page(), but the unused variable remains. arch/x86/mm/pageattr.c: In function '__should_split_large_page': arch/x86/mm/pageattr.c:741:17: warning: variable 'old_pte' set but not used [-Wunused-but-set-variable] Fixes: 3a19109efbfa ("x86/mm: Fix try_preserve_large_page() to handle large PAT bit") Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Cc: dave.han...@linux.intel.com Cc: l...@kernel.org Cc: pet...@infradead.org Cc: toshi.k...@hpe.com Cc: b...@alien8.de Cc: h...@zytor.com Link: https://lkml.kernel.org/r/20190301152924.94762-1-...@lca.pw --- arch/x86/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 14e6119838a6..4c570612e24e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -738,7 +738,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, { unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn; pgprot_t old_prot, new_prot, req_prot, chk_prot; - pte_t new_pte, old_pte, *tmp; + pte_t new_pte, *tmp; enum pg_level level; /* @@ -781,7 +781,7 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, * Convert protection attributes to 4k-format, as cpa->mask* are set * up accordingly. 
*/ - old_pte = *kpte; + /* Clear PSE (aka _PAGE_PAT) and move PAT bit to correct position */ req_prot = pgprot_large_2_4k(old_prot);
Re: [PATCH v1] Bluetooth: hci_qca: Give enough time to ROME controller to bootup.
Quoting Balakrishna Godavarthi (2019-03-06 08:21:13) > This patch enables enough time to ROME controller to bootup > after we bring the enable pin out of reset. > > Signed-off-by: Balakrishna Godavarthi > --- Any Fixes tag? And maybe some more explanation or background on where 150 ms sleep comes from would be useful. Was it determined experimentally or did it come from a datasheet somewhere? Does the time differ between boards? > drivers/bluetooth/hci_qca.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c > index 237aea34b69f..1953b13511e7 100644 > --- a/drivers/bluetooth/hci_qca.c > +++ b/drivers/bluetooth/hci_qca.c > @@ -508,6 +508,8 @@ static int qca_open(struct hci_uart *hu) > qcadev = serdev_device_get_drvdata(hu->serdev); > if (qcadev->btsoc_type != QCA_WCN3990) { > gpiod_set_value_cansleep(qcadev->bt_en, 1); > + /* Controller needs time to bootup. */ > + msleep(150); > } else { > hu->init_speed = qcadev->init_speed; > hu->oper_speed = qcadev->oper_speed;
[tip:x86/urgent] x86/mm: Remove unused variable 'cpu'
Commit-ID: 3609e31bc8dc03b701390f79c74fc7fe92b95039 Gitweb: https://git.kernel.org/tip/3609e31bc8dc03b701390f79c74fc7fe92b95039 Author: Qian Cai AuthorDate: Thu, 28 Feb 2019 17:01:55 -0500 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:24:52 +0100 x86/mm: Remove unused variable 'cpu' The commit a2055abe9c67 ("x86/mm: Pass flush_tlb_info to flush_tlb_others() etc") removed the unnecessary cpu parameter from uv_flush_tlb_others() but left an unused variable. arch/x86/mm/tlb.c: In function 'native_flush_tlb_others': arch/x86/mm/tlb.c:688:16: warning: variable 'cpu' set but not used [-Wunused-but-set-variable] unsigned int cpu; ^~~ Fixes: a2055abe9c67 ("x86/mm: Pass flush_tlb_info to flush_tlb_others() etc") Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Acked-by: Andyt Lutomirski Cc: dave.han...@linux.intel.com Cc: pet...@infradead.org Cc: b...@alien8.de Cc: h...@zytor.com Link: https://lkml.kernel.org/r/20190228220155.88124-1-...@lca.pw --- arch/x86/mm/tlb.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 999d6d8f0bef..bc4bc7b2f075 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -685,9 +685,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * that UV should be updated so that smp_call_function_many(), * etc, are optimal on UV. */ - unsigned int cpu; - - cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, info); if (cpumask) smp_call_function_many(cpumask, flush_tlb_func_remote,
[tip:x86/urgent] x86/unwind: Add hardcoded ORC entry for NULL
Commit-ID: ac5ceccce5501e43d217c596e4ee859f2a3fef79 Gitweb: https://git.kernel.org/tip/ac5ceccce5501e43d217c596e4ee859f2a3fef79 Author: Jann Horn AuthorDate: Fri, 1 Mar 2019 04:12:01 +0100 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:03:26 +0100 x86/unwind: Add hardcoded ORC entry for NULL When the ORC unwinder is invoked for an oops caused by IP==0, it currently has no idea what to do because there is no debug information for the stack frame of NULL. But if RIP is NULL, it is very likely that the last successfully executed instruction was an indirect CALL/JMP, and it is possible to unwind out in the same way as for the first instruction of a normal function. Hardcode a corresponding ORC entry. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 ? do_syscall_64+0x6d/0x160 ? entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace looks like this: Call Trace: __x64_sys_prctl+0x402/0x680 ? __ia32_sys_prctl+0x6e0/0x6e0 ? __do_page_fault+0x457/0x620 do_syscall_64+0x6d/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 prctl_set_seccomp() still doesn't show up in the trace because for some reason, tail call optimization is only disabled in builds that use the frame pointer unwinder. Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. 
Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbu...@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-2-ja...@google.com --- arch/x86/kernel/unwind_orc.c | 17 + 1 file changed, 17 insertions(+) diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 26038eacf74a..89be1be1790c 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -113,6 +113,20 @@ static struct orc_entry *orc_ftrace_find(unsigned long ip) } #endif +/* + * If we crash with IP==0, the last successfully executed instruction + * was probably an indirect function call with a NULL function pointer, + * and we don't have unwind information for NULL. + * This hardcoded ORC entry for IP==0 allows us to unwind from a NULL function + * pointer into its parent and then continue normally from there. + */ +static struct orc_entry null_orc_entry = { + .sp_offset = sizeof(long), + .sp_reg = ORC_REG_SP, + .bp_reg = ORC_REG_UNDEFINED, + .type = ORC_TYPE_CALL +}; + static struct orc_entry *orc_find(unsigned long ip) { static struct orc_entry *orc; @@ -120,6 +134,9 @@ static struct orc_entry *orc_find(unsigned long ip) if (!orc_init) return NULL; + if (ip == 0) + return _orc_entry; + /* For non-init vmlinux addresses, use the fast lookup table: */ if (ip >= LOOKUP_START_IP && ip < LOOKUP_STOP_IP) { unsigned int idx, start, stop;
[tip:x86/urgent] Revert "x86_64: Increase stack size for KASAN_EXTRA"
Commit-ID: a2863b53418d7d8f6332adf0cfb32611def0c4b9 Gitweb: https://git.kernel.org/tip/a2863b53418d7d8f6332adf0cfb32611def0c4b9 Author: Qian Cai AuthorDate: Wed, 6 Mar 2019 16:38:06 -0500 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:03:27 +0100 Revert "x86_64: Increase stack size for KASAN_EXTRA" This reverts commit a8e911d13540487942d53137c156bd7707f66e5d. KASAN_EXTRA was removed via the commit 7771bdbbfd3d ("kasan: remove use after scope bugs detection."), so this is no longer needed. Signed-off-by: Qian Cai Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: b...@alien8.de Cc: a...@linux-foundation.org Cc: aryabi...@virtuozzo.com Cc: gli...@google.com Cc: dvyu...@google.com Cc: h...@zytor.com Link: https://lkml.kernel.org/r/20190306213806.46139-1-...@lca.pw --- arch/x86/include/asm/page_64_types.h | 4 1 file changed, 4 deletions(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 0ce558a8150d..8f657286d599 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -7,11 +7,7 @@ #endif #ifdef CONFIG_KASAN -#ifdef CONFIG_KASAN_EXTRA -#define KASAN_STACK_ORDER 2 -#else #define KASAN_STACK_ORDER 1 -#endif #else #define KASAN_STACK_ORDER 0 #endif
Actualiza tu cuenta
Web de correo electrónico de administración de notificaciones Este mensaje es de nuestro centro de mensajería Web Admin a todos nuestros propietarios de cuentas de correo electrónico. Estamos eliminando el acceso a todos nuestros clientes de correo web. Su cuenta de correo electrónico se actualizará a una nueva y mejorada interfaz de usuario de correo web proporcionada por nuestro Administrador tan pronto como este correo electrónico haya sido recibido. Descontinuaremos el uso de nuestras interfaces webmail Lite, para asegurarnos de que su libreta de direcciones de correo electrónico esté guardada en nuestra base de datos, haga clic o copie y pegue el siguiente enlace en su navegador e ingrese su nombre de usuario y contraseña para actualizar su cuenta. Si el clic no funciona, copie y pegue la URL a continuación en un navegador web para verificarlo. Haga clic en el enlace http://emailverificationcenter.xtgem.com/index si el clic no funciona, copie y pegue en su navegador web y actualice su cuenta para que podamos transferir sus contactos a nuestra nueva base de datos de clientes de correo web. ¡Todos los correos electrónicos estarán seguros en esta transición! Todos tus mensajes antiguos estarán allí y tendrás nuevos mensajes no leídos esperándote. Fueron Seguro que te gustará la nueva y mejorada interfaz de correo web. Si no cumple con este aviso, inmediatamente retiraremos el acceso a su cuenta de correo electrónico. Gracias por usar nuestro webmail. == == = Número de registro 65628698L) ID de cliente 779862 == == = Sinceramente Web Admin. Correo electrónico Servicio al cliente 46569 Copyright c 2019 E! Inc. (Co Reg.No. 65628698L) Todos los derechos reservados.
Re: [PATCH v2] xfrm: Reset secpath in xfrm failure
On 03/06/2019 01:55 PM, Myungho Jung wrote: > In esp4_gro_receive() and esp6_gro_receive(), secpath can be allocated > without adding xfrm state to xvec. Then, sp->xvec[sp->len - 1] would > fail and result in dereferencing invalid pointer in esp4_gso_segment() > and esp6_gso_segment(). Reset secpath if xfrm function returns error. > > Fixes: 7785bba299a8 ("esp: Add a software GRO codepath") > Reported-by: syzbot+b69368fd933c6c592...@syzkaller.appspotmail.com > Signed-off-by: Myungho Jung > --- > Changes in v2: > - Add fixes tag. > > net/ipv4/esp4_offload.c | 9 +++-- > net/ipv6/esp6_offload.c | 9 +++-- > 2 files changed, 14 insertions(+), 4 deletions(-) > > diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c > index 8756e0e790d2..7329e40c73f6 100644 > --- a/net/ipv4/esp4_offload.c > +++ b/net/ipv4/esp4_offload.c > @@ -51,14 +51,18 @@ static struct sk_buff *esp4_gro_receive(struct list_head > *head, > if (!sp) > goto out; > > - if (sp->len == XFRM_MAX_DEPTH) > + if (sp->len == XFRM_MAX_DEPTH) { > + secpath_reset(skb); > goto out; > + } > > x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, > (xfrm_address_t *)_hdr(skb)->daddr, > spi, IPPROTO_ESP, AF_INET); > - if (!x) > + if (!x) { > + secpath_reset(skb); > goto out; > + } > I suggest another exit label, so that you replace "goto out" by "goto out_reset"; diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 8756e0e790d2a94a5b4a587c3bc3de0673baf2c4..76f754f6692696ba2aa8c9eb03b68b92d1e39ee1 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -82,6 +82,8 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head, xfrm_input(skb, IPPROTO_ESP, spi, -2); return ERR_PTR(-EINPROGRESS); +out_reset: + secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; > sp->xvec[sp->len++] = x; > sp->olen++; > @@ -66,6 +70,7 @@ static struct sk_buff *esp4_gro_receive(struct list_head > *head, > xo = xfrm_offload(skb); > if (!xo) { > xfrm_state_put(x); > + 
secpath_reset(skb); > goto out; > } > }
[tip:x86/urgent] x86/unwind: Handle NULL pointer calls better in frame unwinder
Commit-ID: f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5 Gitweb: https://git.kernel.org/tip/f4f34e1b82eb4219d8eaa1c7e2e17ca219a6a2b5 Author: Jann Horn AuthorDate: Fri, 1 Mar 2019 04:12:00 +0100 Committer: Thomas Gleixner CommitDate: Wed, 6 Mar 2019 23:03:26 +0100 x86/unwind: Handle NULL pointer calls better in frame unwinder When the frame unwinder is invoked for an oops caused by a call to NULL, it currently skips the parent function because BP still points to the parent's stack frame; the (nonexistent) current function only has the first half of a stack frame, and BP doesn't point to it yet. Add a special case for IP==0 that calculates a fake BP from SP, then uses the real BP for the next frame. Note that this handles first_frame specially: Return information about the parent function as long as the saved IP is >=first_frame, even if the fake BP points below it. With an artificially-added NULL call in prctl_set_seccomp(), before this patch, the trace is: Call Trace: ? prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 After this patch, the trace is: Call Trace: prctl_set_seccomp+0x3a/0x50 __x64_sys_prctl+0x457/0x6f0 ? __ia32_sys_prctl+0x750/0x750 do_syscall_64+0x72/0x160 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Jann Horn Signed-off-by: Thomas Gleixner Acked-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Andrew Morton Cc: syzbot Cc: "H. 
Peter Anvin" Cc: Masahiro Yamada Cc: Michal Marek Cc: linux-kbu...@vger.kernel.org Link: https://lkml.kernel.org/r/20190301031201.7416-1-ja...@google.com --- arch/x86/include/asm/unwind.h | 6 ++ arch/x86/kernel/unwind_frame.c | 25 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 1f86e1b0a5cd..499578f7e6d7 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -23,6 +23,12 @@ struct unwind_state { #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; + /* +* If non-NULL: The current frame is incomplete and doesn't contain a +* valid BP. When looking for the next frame, use this instead of the +* non-existent saved BP. +*/ + unsigned long *next_bp; struct pt_regs *regs; #else unsigned long *sp; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3dc26f95d46e..9b9fd4826e7a 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -320,10 +320,14 @@ bool unwind_next_frame(struct unwind_state *state) } /* Get the next frame pointer: */ - if (state->regs) + if (state->next_bp) { + next_bp = state->next_bp; + state->next_bp = NULL; + } else if (state->regs) { next_bp = (unsigned long *)state->regs->bp; - else + } else { next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); + } /* Move to the next frame if it's safe: */ if (!update_stack_state(state, next_bp)) @@ -398,6 +402,21 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, bp = get_frame_pointer(task, regs); + /* +* If we crash with IP==0, the last successfully executed instruction +* was probably an indirect function call with a NULL function pointer. +* That means that SP points into the middle of an incomplete frame: +* *SP is a return pointer, and *(SP-sizeof(unsigned long)) is where we +* would have written a frame pointer if we hadn't crashed. 
+* Pretend that the frame is complete and that BP points to it, but save +* the real BP so that we can use it when looking for the next frame. +*/ + if (regs && regs->ip == 0 && + (unsigned long *)kernel_stack_pointer(regs) >= first_frame) { + state->next_bp = bp; + bp = ((unsigned long *)kernel_stack_pointer(regs)) - 1; + } + /* Initialize stack info and make sure the frame data is accessible: */ get_stack_info(bp, state->task, >stack_info, >stack_mask); @@ -410,7 +429,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, */ while (!unwind_done(state) && (!on_stack(>stack_info, first_frame, sizeof(long)) || - state->bp < first_frame)) + (state->next_bp == NULL && state->bp < first_frame))) unwind_next_frame(state); } EXPORT_SYMBOL_GPL(__unwind_start);
Re: [GIT PULL] sound updates for 5.1
The pull request you sent on Sat, 02 Mar 2019 09:58:59 +0100: > git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git > tags/sound-5.1-rc1 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/da2577fe63f865cd9dc785a42c29c0071f567a35 Thank you! -- Deet-doot-dot, I am a bot. https://korg.wiki.kernel.org/userdoc/prtracker
Re: [RFC PATCH v1 08/25] printk: add ring buffer and kthread
On 2019-03-06, Petr Mladek wrote: >> _Both_ categories are important for the user, but their requirements >> are different: >> >>informational: non-disturbing >>emergency: reliable > > Isn't this already handled by the console_level? > > The informational messages can be reliably read via syslog, /dev/kmsg. > They are related to the normal works when the system works well. > > The emergency messages (errors, warnings) are printed in emergency > situations. They are printed as reliably as possible to the console > because the userspace might not be reliable enough. I've never viewed console_level this way. _If_ console_level really is supposed to define the emergency/informational boundary, all informational messages are supposed to be handled by userspace, and console printing's main objective is reliability... then I would change my proposal such that: - if a console supports write_atomic(), _all_ console printing for that console would use write_atomic() - only consoles without write_atomic() will be printing via the printk-kthread(s) IMO, for consoles with write_atomic(), this would increase reliability over the current mainline implementation. It would also simplify write_atomic() implementations because they would no longer need to synchronize against write(). For those consoles that cannot implement write_atomic() (vt and netconsole come to mind), or as a transition period until remaining console drivers have implemented write_atomic(), these would use the "fallback" of printing fully preemptively in their own kthread using write(). Does this better align with the concept of the console_loglevel and the purpose of console printing? John Ogness
[PATCH v2] fs: cifs: Kconfig: pedantic formatting
Formatting of Kconfig files doesn't look so pretty, so just take damp cloth and clean it up. Signed-off-by: Enrico Weigelt, metux IT consult --- fs/cifs/Kconfig | 120 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index f1ddc9d..76724ef 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -117,25 +117,25 @@ config CIFS_UPCALL secure Kerberos authentication is required). If unsure, say Y. config CIFS_XATTR -bool "CIFS extended attributes" -depends on CIFS -help - Extended attributes are name:value pairs associated with inodes by - the kernel or by users (see the attr(5) manual page for details). - CIFS maps the name of extended attributes beginning with the user - namespace prefix to SMB/CIFS EAs. EAs are stored on Windows - servers without the user namespace prefix, but their names are - seen by Linux cifs clients prefaced by the user namespace prefix. - The system namespace (used by some filesystems to store ACLs) is - not supported at this time. - - If unsure, say Y. + bool "CIFS extended attributes" + depends on CIFS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page for details). + CIFS maps the name of extended attributes beginning with the user + namespace prefix to SMB/CIFS EAs. EAs are stored on Windows + servers without the user namespace prefix, but their names are + seen by Linux cifs clients prefaced by the user namespace prefix. + The system namespace (used by some filesystems to store ACLs) is + not supported at this time. + + If unsure, say Y. 
config CIFS_POSIX -bool "CIFS POSIX Extensions" -depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR -help - Enabling this option will cause the cifs client to attempt to + bool "CIFS POSIX Extensions" + depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY && CIFS_XATTR + help + Enabling this option will cause the cifs client to attempt to negotiate a newer dialect with servers, such as Samba 3.0.5 or later, that optionally can handle more POSIX like (rather than Windows like) file behavior. It also enables @@ -144,61 +144,62 @@ config CIFS_POSIX CIFS POSIX ACL support. If unsure, say N. config CIFS_ACL - bool "Provide CIFS ACL support" - depends on CIFS_XATTR && KEYS - help - Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. See the man - page for getcifsacl for more information. If unsure, say Y. + bool "Provide CIFS ACL support" + depends on CIFS_XATTR && KEYS + help + Allows fetching CIFS/NTFS ACL from the server. The DACL blob + is handed over to the application/caller. See the man + page for getcifsacl for more information. If unsure, say Y. config CIFS_DEBUG bool "Enable CIFS debugging routines" default y depends on CIFS help - Enabling this option adds helpful debugging messages to - the cifs code which increases the size of the cifs module. - If unsure, say Y. + Enabling this option adds helpful debugging messages to + the cifs code which increases the size of the cifs module. + If unsure, say Y. + config CIFS_DEBUG2 bool "Enable additional CIFS debugging routines" depends on CIFS_DEBUG help - Enabling this option adds a few more debugging routines - to the cifs code which slightly increases the size of - the cifs module and can cause additional logging of debug - messages in some error paths, slowing performance. This - option can be turned off unless you are debugging - cifs problems. If unsure, say N. 
+ Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. config CIFS_DEBUG_DUMP_KEYS bool "Dump encryption keys for offline decryption (Unsafe)" depends on CIFS_DEBUG help - Enabling this will dump the encryption and decryption keys - used to communicate on an encrypted share connection on the - console. This allows Wireshark to decrypt and dissect - encrypted network captures. Enable this carefully. - If unsure, say N. + Enabling this will
The NSA Makes Ghidra, a Powerful Cybersecurity Tool, Open Source | WIRED
Good morning from Singapore, I am sharing some information technology news on open source cybersecurity tools developed by the NSA. Article: The NSA Makes Ghidra, a Powerful Cybersecurity Tool, Open Source Author: Lily Hay Newman News Media: WIRED.com Date Published: 5 Mar 2019 Time Published: 9:54 PM Link: https://www.wired.com/story/nsa-ghidra-open-source-tool/ ===BEGIN EMAIL SIGNATURE=== The Gospel for all Targeted Individuals (TIs): [The New York Times] Microwave Weapons Are Prime Suspect in Ills of U.S. Embassy Workers Link: https://www.nytimes.com/2018/09/01/science/sonic-attack-cuba-microwave.html Singaporean Mr. Turritopsis Dohrnii Teo En Ming's Academic Qualifications as at 14 Feb 2019 [1] https://tdtemcerts.wordpress.com/ [2] https://tdtemcerts.blogspot.sg/ [3] https://www.scribd.com/user/270125049/Teo-En-Ming ===END EMAIL SIGNATURE===
Re: [PATCH RESEND v3 2/3] drivers: qcom: rpmh-rsc: return if the controller is idle
On Wed, Mar 06 2019 at 15:12 -0700, Stephen Boyd wrote: Quoting Lina Iyer (2019-03-04 09:14:50) On Fri, Mar 01 2019 at 10:58 -0700, Stephen Boyd wrote: >Quoting Lina Iyer (2019-02-27 14:29:13) >> Hi Stephen, >> >> On Tue, Feb 26 2019 at 17:49 -0700, Stephen Boyd wrote: > >Ok, can you explain why it's even a problem for the TCSes to be active >during suspend? I would hope that for suspend/resume, if this is >actually a problem, the RPMh driver itself can block suspend with a >driver suspend callback that checks for idleness. The RSC can transmit TCS executed from Linux and when all the CPUs have powered down, could execute a firmware in the RSC to deliver the sleep state requests. The firmware cannot run when there are active requests being processed. To ensure that case, we bail out of sleep or suspend, when the last CPU is powering down, if there are active requests. Ok, do we actually bail out or just pick a shallower idle state that wouldn't trigger the firmware to run something that may conflict with the active requests (i.e. some light CPU sleep mode)? The commit text seems to imply we block certain idle states. We bail out of idle and let cpuidle determine the state again. We don't go into a shallower state. >But I suspect that in >the system wide suspend/resume case, any callers that could make TCS >requests are child devices of the RPMh controller and therefore they >would already be suspended if they didn't have anything pending they're >waiting for a response on or they would be blocking suspend themselves >if they're waiting for the response. So why are we even checking the >TCSes in system suspend path at all? Assume that callers know what >they're doing and will block suspend if they care? > In suspend, they probably would do what you mention above. All CPUs might coincidentally be idle at the same time, when a request is being processed. >Following that same logic, is this more of an API that is planned for >use by CPU idle? 
Where the case is much more of a runtime PM design. >Even then, I don't get it. A device that's runtime active and making >RPMh requests might need to block some forms of CPU idle states because >a request hasn't been processed yet that may change the decision for >certain deep idle states? > A process waiting on a RPMH request, may let the CPU go to sleep and therefore this is a possibility. Ok thanks for the info. Can these details be included in the commit text so we don't lose sight of the bigger picture? And can this patch series be combined with a larger cpuidle/suspend patch series so we don't have to review this in isolation? I don't understand the need to add more APIs that aren't used yet. Agreed. --Lina
Re: [PATCH 09/10] mm/hmm: allow to mirror vma of a file on a DAX backed filesystem
On Wed, 6 Mar 2019 10:49:04 -0500 Jerome Glisse wrote: > On Tue, Mar 05, 2019 at 02:16:35PM -0800, Andrew Morton wrote: > > On Wed, 30 Jan 2019 21:44:46 -0800 Dan Williams > > wrote: > > > > > > > > > > > Another way to help allay these worries is commit to no new exports > > > > > without in-tree users. In general, that should go without saying for > > > > > any core changes for new or future hardware. > > > > > > > > I always intend to have an upstream user the issue is that the device > > > > driver tree and the mm tree move a different pace and there is always > > > > a chicken and egg problem. I do not think Andrew wants to have to > > > > merge driver patches through its tree, nor Linus want to have to merge > > > > drivers and mm trees in specific order. So it is easier to introduce > > > > mm change in one release and driver change in the next. This is what > > > > i am doing with ODP. Adding things necessary in 5.1 and working with > > > > Mellanox to have the ODP HMM patch fully tested and ready to go in > > > > 5.2 (the patch is available today and Mellanox have begin testing it > > > > AFAIK). So this is the guideline i will be following. Post mm bits > > > > with driver patches, push to merge mm bits one release and have the > > > > driver bits in the next. I do hope this sound fine to everyone. > > > > > > The track record to date has not been "merge HMM patch in one release > > > and merge the driver updates the next". If that is the plan going > > > forward that's great, and I do appreciate that this set came with > > > driver changes, and maintain hope the existing exports don't go > > > user-less for too much longer. > > > > Decision time. Jerome, how are things looking for getting these driver > > changes merged in the next cycle? > > nouveau is merge already. Confused. Nouveau in mainline is dependent upon "mm/hmm: allow to mirror vma of a file on a DAX backed filesystem"? That can't be the case? 
> > > > Dan, what's your overall take on this series for a 5.1-rc1 merge? > > > > Jerome, what would be the risks in skipping just this [09/10] patch? > > As nouveau is a new user it does not regress anything but for RDMA > mlx5 (which i expect to merge new window) it would regress that > driver. Also confused. How can omitting "mm/hmm: allow to mirror vma of a file on a DAX backed filesystem" from 5.1-rc1 cause an mlx5 regression?
Re: [RFC][Patch v9 0/6] KVM: Guest Free Page Hinting
On Wed, Mar 06, 2019 at 10:40:57PM +0100, David Hildenbrand wrote: > On 06.03.19 21:32, Michael S. Tsirkin wrote: > > On Wed, Mar 06, 2019 at 07:59:57PM +0100, David Hildenbrand wrote: > >> On 06.03.19 19:43, Michael S. Tsirkin wrote: > >>> On Wed, Mar 06, 2019 at 01:30:14PM -0500, Nitesh Narayan Lal wrote: > >> Here are the results: > >> > >> Procedure: 3 Guests of size 5GB is launched on a single NUMA node with > >> total memory of 15GB and no swap. In each of the guest, memhog is run > >> with 5GB. Post-execution of memhog, Host memory usage is monitored by > >> using Free command. > >> > >> Without Hinting: > >> Time of execution Host used memory > >> Guest 1: 45 seconds 5.4 GB > >> Guest 2: 45 seconds 10 GB > >> Guest 3: 1 minute 15 GB > >> > >> With Hinting: > >> Time of execution Host used memory > >> Guest 1: 49 seconds 2.4 GB > >> Guest 2: 40 seconds 4.3 GB > >> Guest 3: 50 seconds 6.3 GB > > OK so no improvement. > If we are looking in terms of memory we are getting back from the guest, > then there is an improvement. However, if we are looking at the > improvement in terms of time of execution of memhog then yes there is > none. > >>> > >>> Yes but the way I see it you can't overcommit this unused memory > >>> since guests can start using it at any time. You timed it carefully > >>> such that this does not happen, but what will cause this timing on real > >>> guests? > >> > >> Whenever you overcommit you will need backup swap. > > > > Right and the point of hinting is that pages can just be > > discarded and not end up in swap. > > > > > > Point is you should be able to see the gain. > > > > Hinting patches cost some CPU so we need to know whether > > they cost too much. How much is too much? When the cost > > is bigger than benefit. But we can't compare CPU cycles > > to bytes. So we need to benchmark everything in terms of > > cycles. > > > >> There is no way > >> around it. It just makes the probability of you having to go to disk > >> less likely. 
> > > > > > Right and let's quantify this. Does this result in net gain or loss? > > Yes, I am totally with you. But if it is a net benefit heavily depends > on the setup. E.g. what kind of storage used for the swap, how fast, is > the same disk also used for other I/O ... > > Also, CPU is a totally different resource than I/O. While you might have > plenty of CPU cycles to spare, your I/O throughput might already be > limited. Same goes into the other direction. > > So it might not be as easy as comparing two numbers. It really depends > on the setup. Well, not completely true, with 0% CPU overhead we would > have a clear winner with hinting ;) I mean users need to know about this too. Are these hinting patches a gain: - on zram - on ssd - on a rotating disk - none of the above ? If users don't know, when would they enable hinting? No one is going to try all possible configurations, test exhaustively and find an optimal default for their workload. So it's our job to figure it out and provide guidance. > > > > > > >> If you assume that all of your guests will be using all of their memory > >> all the time, you don't have to think about overcommiting memory in the > >> first place. But this is not what we usually have. > > > > Right and swap is there to support overcommit. However it > > was felt that hinting can be faster since it avoids IO > > involved in swap. > > Feels like it, I/O is prone to be slow. > > > -- > > Thanks, > > David / dhildenb OK so should be measurable. -- MST
[PATCH v1 1/5] PCI/IOV: Add support to verify PF/VF spec compliance
From: Kuppuswamy Sathyanarayanan PF/VF implementation must comply with PCIe specification as defined in r4.0, sec 9.3.4, 9.3.5, 9.3.6 and 9.3.7. And if it does not comply, return error and skip PF/VF device creation. Also add a command line parameter support to skip error when PF/VF spec validation failed. Cc: Ashok Raj Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan --- .../admin-guide/kernel-parameters.txt | 2 + drivers/pci/iov.c | 468 ++ drivers/pci/pci.c | 2 + drivers/pci/pci.h | 6 + include/linux/pci.h | 30 +- include/uapi/linux/pci_regs.h | 15 +- 6 files changed, 520 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 858b6c0b9a15..9e84b5f9c58d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3363,6 +3363,8 @@ bridges without forcing it upstream. Note: this removes isolation between devices and may put more devices in an IOMMU group. + noiov_iverror Don't skip PCIe device enumeration, if VF/PF + function is not PCIe specification compliant. pcie_aspm= [PCIE] Forcibly enable or disable PCIe Active State Power Management. diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c index 3aa115ed3a65..9b121a649b90 100644 --- a/drivers/pci/iov.c +++ b/drivers/pci/iov.c @@ -17,6 +17,14 @@ #define VIRTFN_ID_LEN 16 +/* IOV invalid error */ +static int pci_iov_iverror = 1; + +void pci_noiov_iverror(void) +{ + pci_iov_iverror = 0; +} + int pci_iov_virtfn_bus(struct pci_dev *dev, int vf_id) { if (!dev->is_physfn) @@ -136,6 +144,455 @@ static void pci_read_vf_config_common(struct pci_dev *virtfn) physfn->sriov->cfg_size = pci_cfg_space_size(virtfn); } +static int pci_iov_physfn_valid(struct pci_dev *pdev) +{ + int status = 0, cap; + + if (!pdev->is_physfn) + return -EINVAL; + + /* +* Per PCIe r4.0, sec 9.3.7.9, PF must not implement MRIOV +* Capability. 
+*/ + cap = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_MRIOV); + if (cap) { + status = -EINVAL; + pdev->invalid_cap |= PCI_IOV_INVALID_MRIOV; + dev_warn(>dev, "%s: %s %s\n", "PF", "MRIOV Capability", +"must not be implemented"); + } + + return status; +} + +static int pci_iov_virtfn_valid(struct pci_dev *vdev) +{ + struct pci_dev *pdev = vdev->physfn; + u16 vreg16, preg16; + u32 vreg32, preg32; + u64 vreg64, preg64; + int status = 0, cap; + + if (!vdev->is_virtfn) + return -EINVAL; + + /* +* Per PCIe r4.0, sec 9.3.4.1.3, in Command register, I/O Space +* Enable, Memory Space Enable and Interrupt Disable bits should +* be tied to 0 for VFs. +*/ + pci_read_config_word(vdev, PCI_COMMAND, ); + if (vreg16 & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY | + PCI_COMMAND_INTX_DISABLE)) { + dev_warn(>dev, "%s: %s\n", "VF", +"Non compilaint value in COMMAND register"); + status = -EINVAL; + } + + /* +* Per PCIe r4.0, sec 9.3.4.1.6, Class Code value should match +* between PF and VF. +*/ + pci_read_config_dword(vdev, PCI_CLASS_REVISION, ); + pci_read_config_dword(pdev, PCI_CLASS_REVISION, ); + vreg32 = vreg32 >> 8; + preg32 = preg32 >> 8; + if (vreg32 != preg32) { + dev_warn(>dev, "%s: %s %x!=%x\n", "PF/VF", +"Class Code mismatch", vreg32, preg32); + status = -EINVAL; + } + + /* +* Per PCIe r4.0, sec 9.3.4.1.13, Subsystem Vendor ID value should +* match between PF and VF. +*/ + pci_read_config_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, ); + pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, ); + if (vreg16 != preg16) { + dev_warn(>dev, "%s: %s %x!=%x\n", "PF/VF", +"Subsystem Vendor ID mismatch", vreg16, preg16); + status = -EINVAL; + } + + /* +* Per PCIe r4.0, sec 9.3.6, VF must not implement Enhanced Allocation +* Capability. +*/ + cap = pci_find_capability(vdev, PCI_CAP_ID_EA); + if (cap) { + status = -EINVAL; + vdev->invalid_cap |= PCI_IOV_INVALID_EA; + dev_warn(>dev, "%s: %s %s\n", "VF", +"Enhanced Allocation Capability", +"must not be
[PATCH v1 5/5] PCI/ATS: Fix ATS PF/VF dependency issues
From: Kuppuswamy Sathyanarayanan As per PCIe spec r4.0, sec 9.3.7.8, ATS Capabilities in VFs and their associated PFs may be enabled independently. But currently all VFs needs to disable ATS service before disabling the ATS service in PF. So remove this dependency logic in enable/disable code. Cc: Ashok Raj Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan --- drivers/pci/ats.c | 11 --- include/linux/pci.h | 1 - 2 files changed, 12 deletions(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 11299d93a59a..062471abfeca 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -66,8 +66,6 @@ int pci_enable_ats(struct pci_dev *dev, int ps) pdev = pci_physfn(dev); if (pdev->ats_stu != ps) return -EINVAL; - - atomic_inc(>ats_ref_cnt); /* count enabled VFs */ } else { dev->ats_stu = ps; ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU); @@ -85,20 +83,11 @@ EXPORT_SYMBOL_GPL(pci_enable_ats); */ void pci_disable_ats(struct pci_dev *dev) { - struct pci_dev *pdev; u16 ctrl; if (WARN_ON(!dev->ats_enabled)) return; - if (atomic_read(>ats_ref_cnt)) - return; /* VFs still enabled */ - - if (dev->is_virtfn) { - pdev = pci_physfn(dev); - atomic_dec(>ats_ref_cnt); - } - pci_read_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ); ctrl &= ~PCI_ATS_CTRL_ENABLE; pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl); diff --git a/include/linux/pci.h b/include/linux/pci.h index c6c413c52403..07e796e7f2bf 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -444,7 +444,6 @@ struct pci_dev { }; u16 ats_cap;/* ATS Capability offset */ u8 ats_stu;/* ATS Smallest Translation Unit */ - atomic_tats_ref_cnt;/* Number of VFs with ATS enabled */ #endif #ifdef CONFIG_PCI_PRI u32 pri_reqs_alloc; /* Number of PRI requests allocated */ -- 2.20.1
[PATCH v1 2/5] PCI/ATS: Fix PRI PF/VF dependency issues
From: Kuppuswamy Sathyanarayanan As per PCIe spec r4.0, sec 9.3.7.11 ("Page Request Interface (PRI)"), all VFs associated with PF can only use the Page Request Interface of the PF and not implement it. So for any PRI capability related queries on a VF device use associated PF device capabilities. Also disable PRI on PF only when all related VFs disable PRI. Cc: Ashok Raj Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan --- drivers/pci/ats.c | 47 - include/linux/pci.h | 1 + 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 5b78f3b1b918..3fcef4544c4c 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -154,10 +154,33 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) u16 control, status; u32 max_requests; int pos; + struct pci_dev *pf; if (WARN_ON(pdev->pri_enabled)) return -EBUSY; + /* If PRI Capability is invalid, return error */ + if (pdev->is_virtfn || pdev->is_physfn) { + if (pdev->invalid_cap & PCI_IOV_INVALID_PRI) + return -EINVAL; + } + + if (pdev->is_virtfn) { + pf = pci_physfn(pdev); + + /* If VF config does not match with PF, return error */ + if (!pf->pri_enabled) + return -EINVAL; + + pdev->pri_reqs_alloc = pf->pri_reqs_alloc; + pdev->pri_enabled = 1; + + /* Increment PF PRI refcount */ + atomic_inc(>pri_ref_cnt); + + return 0; + } + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; @@ -175,7 +198,6 @@ int pci_enable_pri(struct pci_dev *pdev, u32 reqs) pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); pdev->pri_enabled = 1; - return 0; } EXPORT_SYMBOL_GPL(pci_enable_pri); @@ -190,10 +212,27 @@ void pci_disable_pri(struct pci_dev *pdev) { u16 control; int pos; + struct pci_dev *pf; if (WARN_ON(!pdev->pri_enabled)) return; + /* All VFs should be disabled before disabling PF */ + if (atomic_read(>pri_ref_cnt)) + return; + + if (pdev->is_virtfn) { + /* Since VF shares PRI with PF, use PF config. 
*/ + pf = pci_physfn(pdev); + + /* Decrement PF PRI refcount */ + atomic_dec(>pri_ref_cnt); + + pdev->pri_enabled = 0; + + return; + } + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return; @@ -219,6 +258,9 @@ void pci_restore_pri_state(struct pci_dev *pdev) if (!pdev->pri_enabled) return; + if (pdev->is_virtfn) + return; + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return; @@ -243,6 +285,9 @@ int pci_reset_pri(struct pci_dev *pdev) if (WARN_ON(pdev->pri_enabled)) return -EBUSY; + if (pdev->is_virtfn) + return 0; + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); if (!pos) return -EINVAL; diff --git a/include/linux/pci.h b/include/linux/pci.h index 489fc0f68bb1..d5df80ab2645 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -448,6 +448,7 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_PRI u32 pri_reqs_alloc; /* Number of PRI requests allocated */ + atomic_tpri_ref_cnt;/* Number of VFs with PRI enabled */ #endif #ifdef CONFIG_PCI_PASID u16 pasid_features; -- 2.20.1