[patch net-next 3/3] net/sched: Change act_api and act_xxx modules to use IDR
Typically, each TC filter has its own action. All the actions of the same type are saved in its hash table. But the hash buckets are too small that it degrades to a list. And the performance is greatly affected. For example, it takes about 0m11.914s to insert 64K rules. If we convert the hash table to IDR, it only takes about 0m1.500s. The improvement is huge. But please note that the test result is based on previous patch that cls_flower uses IDR. Signed-off-by: Chris MiSigned-off-by: Jiri Pirko --- include/net/act_api.h | 76 +- net/sched/act_api.c| 249 ++--- net/sched/act_bpf.c| 17 ++-- net/sched/act_connmark.c | 16 ++- net/sched/act_csum.c | 16 ++- net/sched/act_gact.c | 16 ++- net/sched/act_ife.c| 20 ++-- net/sched/act_ipt.c| 26 +++-- net/sched/act_mirred.c | 19 ++-- net/sched/act_nat.c| 16 ++- net/sched/act_pedit.c | 18 ++-- net/sched/act_police.c | 18 ++-- net/sched/act_sample.c | 17 ++-- net/sched/act_simple.c | 20 ++-- net/sched/act_skbedit.c| 18 ++-- net/sched/act_skbmod.c | 18 ++-- net/sched/act_tunnel_key.c | 20 ++-- net/sched/act_vlan.c | 22 ++-- 18 files changed, 277 insertions(+), 345 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index 26ffd83..8f3d5d8 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -10,12 +10,9 @@ #include #include - -struct tcf_hashinfo { - struct hlist_head *htab; - unsigned inthmask; - spinlock_t lock; - u32 index; +struct tcf_idrinfo { + spinlock_t lock; + struct idr action_idr; }; struct tc_action_ops; @@ -25,9 +22,8 @@ struct tc_action { __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ __u32 order; struct list_headlist; - struct tcf_hashinfo *hinfo; + struct tcf_idrinfo *idrinfo; - struct hlist_node tcfa_head; u32 tcfa_index; int tcfa_refcnt; int tcfa_bindcnt; @@ -44,7 +40,6 @@ struct tc_action { struct tc_cookie*act_cookie; struct tcf_chain*goto_chain; }; -#define tcf_head common.tcfa_head #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt #define tcf_bindcntcommon.tcfa_bindcnt @@ -57,27 +52,6 @@ struct tc_action { #define tcf_lock common.tcfa_lock #define tcf_rcucommon.tcfa_rcu -static inline unsigned int tcf_hash(u32 index, unsigned int hmask) -{ - return index & hmask; -} - -static inline int tcf_hashinfo_init(struct tcf_hashinfo *hf, unsigned int mask) -{ - int i; - - spin_lock_init(>lock); - hf->index = 0; - hf->hmask = mask; - hf->htab = kzalloc((mask + 1) * sizeof(struct hlist_head), - GFP_KERNEL); - if (!hf->htab) - return -ENOMEM; - for (i = 0; i < mask + 1; i++) - INIT_HLIST_HEAD(>htab[i]); - return 0; -} - /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. */ @@ -126,53 +100,51 @@ struct tc_action_ops { }; struct tc_action_net { - struct tcf_hashinfo *hinfo; + struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; }; static inline int tc_action_net_init(struct tc_action_net *tn, - const struct tc_action_ops *ops, unsigned int mask) + const struct tc_action_ops *ops) { int err = 0; - tn->hinfo = kmalloc(sizeof(*tn->hinfo), GFP_KERNEL); - if (!tn->hinfo) + tn->idrinfo = kmalloc(sizeof(*tn->idrinfo), GFP_KERNEL); + if (!tn->idrinfo) return -ENOMEM; tn->ops = ops; - err = tcf_hashinfo_init(tn->hinfo, mask); - if (err) - kfree(tn->hinfo); + spin_lock_init(>idrinfo->lock); + idr_init(>idrinfo->action_idr); return err; } -void tcf_hashinfo_destroy(const struct tc_action_ops *ops, - struct tcf_hashinfo *hinfo); +void tcf_idrinfo_destroy(const struct tc_action_ops *ops, +struct tcf_idrinfo *idrinfo); static inline void tc_action_net_exit(struct tc_action_net *tn) { - tcf_hashinfo_destroy(tn->ops, tn->hinfo); - kfree(tn->hinfo); + tcf_idrinfo_destroy(tn->ops, tn->idrinfo); + kfree(tn->idrinfo); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops); -int
[patch net-next 2/3] net/sched: Change cls_flower to use IDR
Currently, all filters with the same priority are linked in a doubly linked list. Every filter should have a unique handle. To make the handle unique, we need to iterate the list every time to see if the handle exists or not when inserting a new filter. It is time-consuming. For example, it takes about 5m3.169s to insert 64K rules. This patch changes cls_flower to use IDR. With this patch, it takes about 0m1.127s to insert 64K rules. The improvement is huge. But please note that in this testing, all filters share the same action. If every filter has a unique action, that is another bottleneck. Follow-up patch in this patchset addresses that. Signed-off-by: Chris MiSigned-off-by: Jiri Pirko --- net/sched/cls_flower.c | 55 +- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 052e902..071f0ef 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -68,7 +68,6 @@ struct cls_fl_head { struct rhashtable ht; struct fl_flow_mask mask; struct flow_dissector dissector; - u32 hgen; bool mask_assigned; struct list_head filters; struct rhashtable_params ht_params; @@ -76,6 +75,7 @@ struct cls_fl_head { struct work_struct work; struct rcu_head rcu; }; + struct idr handle_idr; }; struct cls_fl_filter { @@ -210,6 +210,7 @@ static int fl_init(struct tcf_proto *tp) INIT_LIST_HEAD_RCU(>filters); rcu_assign_pointer(tp->root, head); + idr_init(>handle_idr); return 0; } @@ -295,6 +296,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) { + struct cls_fl_head *head = rtnl_dereference(tp->root); + + idr_remove(>handle_idr, f->handle); list_del_rcu(>list); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f); @@ -327,6 +331,7 @@ static void fl_destroy(struct tcf_proto *tp) list_for_each_entry_safe(f, next, >filters, list) __fl_delete(tp, f); + idr_destroy(>handle_idr); __module_get(THIS_MODULE); call_rcu(>rcu, fl_destroy_rcu); @@ -335,12 +340,8 @@ static void fl_destroy(struct tcf_proto *tp) static void *fl_get(struct tcf_proto *tp, u32 handle) { struct cls_fl_head *head = rtnl_dereference(tp->root); - struct cls_fl_filter *f; - list_for_each_entry(f, >filters, list) - if (f->handle == handle) - return f; - return NULL; + return idr_find(>handle_idr, handle); } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { @@ -859,27 +860,6 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp, return 0; } -static u32 fl_grab_new_handle(struct tcf_proto *tp, - struct cls_fl_head *head) -{ - unsigned int i = 0x8000; - u32 handle; - - do { - if (++head->hgen == 0x7FFF) - head->hgen = 1; - } while (--i > 0 && fl_get(tp, head->hgen)); - - if (unlikely(i == 0)) { - pr_err("Insufficient number of handles\n"); - handle = 0; - } else { - handle = head->hgen; - } - - return handle; -} - static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -890,6 +870,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_filter *fnew; struct nlattr **tb; struct fl_flow_mask mask = {}; + unsigned long idr_index; int err; if (!tca[TCA_OPTIONS]) @@ -920,13 +901,21 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, goto errout; if (!handle) { - handle = fl_grab_new_handle(tp, head); - if (!handle) { - err = -EINVAL; + err = idr_alloc(>handle_idr, fnew, _index, + 1, 0x8000, GFP_KERNEL); + if (err) goto errout; - } + fnew->handle = idr_index; + } + + /* user specifies a handle and it doesn't exist */ + if (handle && !fold) { + err = idr_alloc(>handle_idr, fnew, _index, + handle, handle + 1, GFP_KERNEL); + if (err) + goto errout; + fnew->handle = idr_index; } - fnew->handle = handle; if (tb[TCA_FLOWER_FLAGS]) { fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); @@ -980,6 +969,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
[patch net-next 0/3] net/sched: Improve getting objects by indexes
Using current TC code, it is very slow to insert a lot of rules. In order to improve the rules update rate in TC, we introduced the following two changes: 1) changed cls_flower to use IDR to manage the filters. 2) changed all act_xxx modules to use IDR instead of a small hash table But IDR has a limitation that it uses int. TC handle uses u32. To make sure there is no regression, we also changed IDR to use unsigned long. All clients of IDR are changed to use new IDR API. Chris Mi (3): idr: Use unsigned long instead of int net/sched: Change cls_flower to use IDR net/sched: Change act_api and act_xxx modules to use IDR block/bsg.c | 8 +- block/genhd.c | 12 +- drivers/atm/nicstar.c | 11 +- drivers/block/drbd/drbd_main.c | 31 +-- drivers/block/drbd/drbd_nl.c| 22 ++- drivers/block/drbd/drbd_proc.c | 3 +- drivers/block/drbd/drbd_receiver.c | 15 +- drivers/block/drbd/drbd_state.c | 34 ++-- drivers/block/drbd/drbd_worker.c| 6 +- drivers/block/loop.c| 17 +- drivers/block/nbd.c | 20 +- drivers/block/zram/zram_drv.c | 9 +- drivers/char/tpm/tpm-chip.c | 10 +- drivers/char/tpm/tpm.h | 2 +- drivers/dca/dca-sysfs.c | 9 +- drivers/firewire/core-cdev.c| 18 +- drivers/firewire/core-device.c | 15 +- drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 2 +- drivers/gpu/drm/drm_auth.c | 9 +- drivers/gpu/drm/drm_connector.c | 10 +- drivers/gpu/drm/drm_context.c | 20 +- drivers/gpu/drm/drm_dp_aux_dev.c| 11 +- drivers/gpu/drm/drm_drv.c | 6 +- drivers/gpu/drm/drm_gem.c | 19 +- drivers/gpu/drm/drm_info.c | 2 +- drivers/gpu/drm/drm_mode_object.c | 11 +- drivers/gpu/drm/drm_syncobj.c | 18 +- drivers/gpu/drm/exynos/exynos_drm_ipp.c | 25 ++- drivers/gpu/drm/i915/gvt/display.c | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c| 2 +- drivers/gpu/drm/i915/gvt/vgpu.c | 9 +- drivers/gpu/drm/i915/i915_debugfs.c | 6 +- drivers/gpu/drm/i915/i915_gem_context.c | 9 +- drivers/gpu/drm/qxl/qxl_cmd.c | 8 +- drivers/gpu/drm/qxl/qxl_release.c | 14 +- drivers/gpu/drm/sis/sis_mm.c| 8 +- drivers/gpu/drm/tegra/drm.c | 10 +- drivers/gpu/drm/tilcdc/tilcdc_slave_compat.c| 3 +- drivers/gpu/drm/vgem/vgem_fence.c | 12 +- drivers/gpu/drm/via/via_mm.c| 8 +- drivers/gpu/drm/virtio/virtgpu_kms.c| 5 +- drivers/gpu/drm/virtio/virtgpu_vq.c | 5 +- drivers/gpu/drm/vmwgfx/vmwgfx_resource.c| 9 +- drivers/i2c/i2c-core-base.c | 19 +- drivers/infiniband/core/cm.c| 8 +- drivers/infiniband/core/cma.c | 12 +- drivers/infiniband/core/rdma_core.c | 9 +- drivers/infiniband/core/sa_query.c | 23 +-- drivers/infiniband/core/ucm.c | 7 +- drivers/infiniband/core/ucma.c | 14 +- drivers/infiniband/hw/cxgb3/iwch.c | 4 +- drivers/infiniband/hw/cxgb3/iwch.h | 4 +- drivers/infiniband/hw/cxgb4/device.c| 18 +- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 +- drivers/infiniband/hw/hfi1/init.c | 9 +- drivers/infiniband/hw/hfi1/vnic_main.c | 6 +- drivers/infiniband/hw/mlx4/cm.c | 13 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 7 +- drivers/infiniband/hw/qib/qib_init.c| 9 +- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 10 +- drivers/iommu/intel-svm.c | 9 +- drivers/md/dm.c | 13 +- drivers/memstick/core/memstick.c| 10 +- drivers/memstick/core/ms_block.c| 9 +- drivers/memstick/core/mspro_block.c | 12 +- drivers/mfd/rtsx_pcr.c | 9 +- drivers/misc/c2port/core.c | 7 +- drivers/misc/cxl/context.c | 8 +- drivers/misc/cxl/main.c | 15 +- drivers/misc/mei/main.c | 8 +- drivers/misc/mic/scif/scif_api.c| 11 +- drivers/misc/mic/scif/scif_ports.c
Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?
On Tue, 15 Aug 2017 08:47:43 -0700 "Paul E. McKenney"wrote: > On Wed, Aug 02, 2017 at 05:25:55PM +0100, Jonathan Cameron wrote: > > On Tue, 1 Aug 2017 11:46:46 -0700 > > "Paul E. McKenney" wrote: > > > > > On Mon, Jul 31, 2017 at 04:27:57PM +0100, Jonathan Cameron wrote: > > > > On Mon, 31 Jul 2017 08:04:11 -0700 > > > > "Paul E. McKenney" wrote: > > > > > > > > > On Mon, Jul 31, 2017 at 12:08:47PM +0100, Jonathan Cameron wrote: > > > > > > On Fri, 28 Jul 2017 12:03:50 -0700 > > > > > > "Paul E. McKenney" wrote: > > > > > > > > > > > > > On Fri, Jul 28, 2017 at 06:27:05PM +0100, Jonathan Cameron wrote: > > > > > > > > > > > > > > > On Fri, 28 Jul 2017 09:55:29 -0700 > > > > > > > > "Paul E. McKenney" wrote: > > > > > > > > > > > > > > > > > On Fri, Jul 28, 2017 at 02:24:03PM +0100, Jonathan Cameron > > > > > > > > > wrote: > > > > > > > > > > On Fri, 28 Jul 2017 08:44:11 +0100 > > > > > > > > > > Jonathan Cameron wrote: > > > > > > > > > > > > > > > > > > > > > > > > > > > > [ . . . ] > > > > > > > > > > > > > > > > > > > Ok. Some info. I disabled a few driver (usb and SAS) in > > > > > > > > > > the interest of having > > > > > > > > > > fewer timer events. Issue became much easier to trigger > > > > > > > > > > (on some runs before > > > > > > > > > > I could get tracing up and running) > > > > > > > > > >e > > > > > > > > > > So logs are large enough that pastebin doesn't like them - > > > > > > > > > > please shoet if > > > > > > > > > >>e another timer period is of interest. > > > > > > > > > > > > > > > > > > > > https://pastebin.com/iUZDfQGM for the timer trace. > > > > > > > > > > https://pastebin.com/3w1F7amH for dmesg. > > > > > > > > > > > > > > > > > > > > The relevant timeout on the RCU stall detector was 8 > > > > > > > > > > seconds. Event is > > > > > > > > > > detected around 835. > > > > > > > > > > > > > > > > > > > > It's a lot of logs, so I haven't identified a smoking gun > > > > > > > > > > yet but there > > > > > > > > > > may well be one in there. > > > > > > > > > > > > > > > > > > The dmesg says: > > > > > > > > > > > > > > > > > > rcu_preempt kthread starved for 2508 jiffies! g112 c111 f0x0 > > > > > > > > > RCU_GP_WAIT_FQS(3) ->state=0x1 > > > > > > > > > > > > > > > > > > So I look for "rcu_preempt" timer events and find these: > > > > > > > > > > > > > > > > > > rcu_preempt-9 [019] 827.579114: timer_init: > > > > > > > > > timer=8017d5fc7da0 > > > > > > > > > rcu_preempt-9 [019] d..1 827.579115: timer_start: > > > > > > > > > timer=8017d5fc7da0 function=process_timeout > > > > > > > > > > > > > > > > > > Next look for "8017d5fc7da0" and I don't find anything > > > > > > > > > else. > > > > > > > > It does show up off the bottom of what would fit in pastebin... > > > > > > > > > > > > > > > > rcu_preempt-9 [001] d..1 837.681077: timer_cancel: > > > > > > > > timer=8017d5fc7da0 > > > > > > > > rcu_preempt-9 [001] 837.681086: timer_init: > > > > > > > > timer=8017d5fc7da0 > > > > > > > > rcu_preempt-9 [001] d..1 837.681087: timer_start: > > > > > > > > timer=8017d5fc7da0 function=process_timeout > > > > > > > > expires=4295101298 [timeout=1] cpu=1 idx=0 flags= > > > > > > > > > > > > > > Odd. I would expect an expiration... And ten seconds is way > > > > > > > longer > > > > > > > than the requested one jiffy! > > > > > > > > > > > > > > > > The timeout was one jiffy, and more than a second later, no > > > > > > > > > expiration. > > > > > > > > > Is it possible that this event was lost? I am not seeing any > > > > > > > > > sign of > > > > > > > > > this is the trace. > > > > > > > > > > > > > > > > > > I don't see any sign of CPU hotplug (and I test with lots of > > > > > > > > > that in > > > > > > > > > any case). > > > > > > > > > > > > > > > > > > The last time we saw something like this it was a timer > > > > > > > > > HW/driver problem, > > > > > > > > > but it is a bit hard to imagine such a problem affecting both > > > > > > > > > ARM64 > > > > > > > > > and SPARC. ;-) > > > > > > > > Could be different issues, both of which were hidden by that > > > > > > > > lockup detector. > > > > > > > > > > > > > > > > There is an errata work around for the timers on this > > > > > > > > particular board. > > > > > > > > I'm only vaguely aware of it, so may be unconnected. > > > > > > > > > > > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/clocksource/arm_arch_timer.c?h=v4.13-rc2=bb42ca47401010fc02901b5e8f79e40a26f208cb > > > > > > > > > > > > > > > > Seems unlikely though! + we've not yet seen it on the other
Re: [RFC PATCH v5 0/5] vfio-pci: Add support for mmapping MSI-X table
On Tue, 2017-08-15 at 10:37 -0600, Alex Williamson wrote: > Of course I don't think either of those are worth imposing a > performance penalty where we don't otherwise need one. However, if we > look at a VM scenario where the guest is following the PCI standard for > programming MSI-X interrupts (ie. not POWER), we need some mechanism to > intercept those MMIO writes to the vector table and configure the host > interrupt domain of the device rather than allowing the guest direct > access. This is simply part of virtualizing the device to the guest. > So even if the kernel allows mmap'ing the vector table, the hypervisor > needs to trap it, so the mmap isn't required or used anyway. It's only > when you define a non-PCI standard for your guest to program > interrupts, as POWER has done, and can therefore trust that the > hypervisor does not need to trap on the vector table that having that > mmap'able vector table becomes fully useful. AIUI, ARM supports 64k > pages too... does ARM have any strategy that would actually make it > possible to make use of an mmap covering the vector table? Thanks, WTF Alex, can you stop once and for all with all that "POWER is not standard" bullshit please ? It's completely wrong. This has nothing to do with PCIe standard ! The PCIe standard says strictly *nothing* whatsoever about how an OS obtains the magic address/values to put in the device and how the PCIe host bridge may do appropriate fitering. There is nothing on POWER that prevents the guest from writing the MSI- X address/data by hand. The problem isn't who writes the values or even how. The problem breaks down into these two things that are NOT covered by any aspect of the PCIe standard: 1- The OS needs to obtain address/data values for an MSI that will "work" for the device. 2- The HW+HV needs to prevent collateral damage caused by a device issuing stores to incorrect address or with incorrect data. Now *this* is necessary for *ANY* kind of DMA whether it's an MSI or something else anyway. Now, the filtering done by qemu is NOT a reasonable way to handle 2) and whatever excluse about "making it harder" doesn't fly a meter when it comes to security. Making it "harder to break accidentally" I also don't buy, people don't just randomly put things in their MSI-X tables "accidentally", that stuff works or doesn't. That leaves us with 1). Now this is purely a platform specific matters, not a spec matter. Once the HW has a way to enforce you can only generate "allowed" MSIs it becomes a matter of having some FW mechanism that can be used to informed the OS what address/values to use for a given interrupts. This is provided on POWER by a combination of device-tree and RTAS. It could be that x86/ARM64 doesn't provide good enough mechanisms via ACPI but this is no way a problem of standard compliance, just inferior firmware interfaces. So again, for the 234789246th time in years, can we get that 1-bit-of- information sorted one way or another so we can fix our massive performance issue instead of adding yet another dozen layers of paint on that shed ? Ben.
Re: [PATCH v2 8/9] powerpc/64: Use a table of paca pointers and allocate pacas individually
Hi Nicholas, [auto build test ERROR on v4.13-rc5] [also build test ERROR on next-20170815] [cannot apply to powerpc/next scottwood/next] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Nicholas-Piggin/KVM-PPC-Book3S-HV-Fix-H_REGISTER_VPA-VPA-size-validation/20170815-221907 config: powerpc-defconfig (attached as .config) compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705 reproduce: wget https://raw.githubusercontent.com/01org/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=powerpc All errors (new ones prefixed by >>): arch/powerpc/kernel/head_64.o: In function `pmac_secondary_start': >> (.text+0x3762): undefined reference to `paca' --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
Re: [RFC PATCH v5 0/5] vfio-pci: Add support for mmapping MSI-X table
On Mon, 14 Aug 2017 14:12:33 +0100 Robin Murphywrote: > On 14/08/17 10:45, Alexey Kardashevskiy wrote: > > Folks, > > > > Is there anything to change besides those compiler errors and David's > > comment in 5/5? Or the while patchset is too bad? Thanks. > > While I now understand it's not the low-level thing I first thought it > was, so my reasoning has changed, personally I don't like this approach > any more than the previous one - it still smells of abusing external > APIs to pass information from one part of VFIO to another (and it has > the same conceptual problem of attributing something to interrupt > sources that is actually a property of the interrupt target). > > Taking a step back, though, why does vfio-pci perform this check in the > first place? If a malicious guest already has control of a device, any > kind of interrupt spoofing it could do by fiddling with the MSI-X > message address/data it could simply do with a DMA write anyway, so the > security argument doesn't stand up in general (sure, not all PCIe > devices may be capable of arbitrary DMA, but that seems like more of a > tenuous security-by-obscurity angle to me). Besides, with Type1 IOMMU > the fact that we've let a device be assigned at all means that this is > already a non-issue (because either the hardware provides isolation or > the user has explicitly accepted the consequences of an unsafe > configuration) - from patch #4 that's apparently the same for SPAPR TCE, > in which case it seems this flag doesn't even need to be propagated and > could simply be assumed always. > > On the other hand, if the check is not so much to mitigate malicious > guests attacking the system as to prevent dumb guests breaking > themselves (e.g. if some or all of the MSI-X capability is actually > emulated), then allowing things to sometimes go wrong on the grounds of > an irrelevant hardware feature doesn't seem correct :/ While the theoretical security provided by preventing direct access to the MSI-X vector table may be mostly a matter of obfuscation, in practice, I think it changes the problem of creating arbitrary DMA writes from a generic, trivial, PCI spec based exercise to a more device specific challenge. I do however have evidence that there are consumers of the vfio API who would have attempted to program device interrupts by directly manipulating the vector table had they not been prevented from doing so and contacting me to learn about the SET_IRQ ioctl. Therefore I think the behavior also contributes to making the overall API more difficult to use incorrectly. Of course I don't think either of those are worth imposing a performance penalty where we don't otherwise need one. However, if we look at a VM scenario where the guest is following the PCI standard for programming MSI-X interrupts (ie. not POWER), we need some mechanism to intercept those MMIO writes to the vector table and configure the host interrupt domain of the device rather than allowing the guest direct access. This is simply part of virtualizing the device to the guest. So even if the kernel allows mmap'ing the vector table, the hypervisor needs to trap it, so the mmap isn't required or used anyway. It's only when you define a non-PCI standard for your guest to program interrupts, as POWER has done, and can therefore trust that the hypervisor does not need to trap on the vector table that having that mmap'able vector table becomes fully useful. AIUI, ARM supports 64k pages too... does ARM have any strategy that would actually make it possible to make use of an mmap covering the vector table? Thanks, Alex
Re: [PATCH v2 8/9] powerpc/64: Use a table of paca pointers and allocate pacas individually
On Sun, 13 Aug 2017 11:33:45 +1000 Nicholas Pigginwrote: > Change the paca array into an array of pointers to pacas. Allocate > pacas individually. > > This allows flexibility in where the PACAs are allocated. Future work > will allocate them node-local. Platforms that don't have address limits > on PACAs would be able to defer PACA allocations until later in boot > rather than allocate all possible ones up-front then freeing unused. > > This is slightly more overhead (one additional indirection) for cross > CPU paca references, but those aren't too common. > > Signed-off-by: Nicholas Piggin Incremental fix required for this. A !SMP typo bug, and pmac init code that was not switched from paca array to paca_ptrs array. In the first case I just got rid of the r13 thing completely. I think the idea of the code was to give secondaries a different r13 so they wouldn't overwrite CPU0's paca if they accidentally used it. --- arch/powerpc/kernel/head_64.S | 13 ++--- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index f71f468ebe7f..9db29719fc32 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -386,12 +386,11 @@ generic_secondary_common_init: * physical cpu id in r24, we need to search the pacas to find * which logical id maps to our physical one. */ - LOAD_REG_ADDR(r8, paca_ptrs)/* Load paca_ptrs pointe */ - ld r8,0(r8)/* Get base vaddr of array */ #ifndef CONFIG_SMP - li r13,r13,0 /* kill r13 if used accidentally */ b kexec_wait /* wait for next kernel if !SMP */ #else + LOAD_REG_ADDR(r8, paca_ptrs)/* Load paca_ptrs pointe */ + ld r8,0(r8)/* Get base vaddr of array */ LOAD_REG_ADDR(r7, nr_cpu_ids) /* Load nr_cpu_ids address */ lwz r7,0(r7)/* also the max paca allocated */ li r5,0/* logical cpu id*/ @@ -752,10 +751,10 @@ _GLOBAL(pmac_secondary_start) mtmsrd r3 /* RI on */ /* Set up a paca value for this processor. */ - LOAD_REG_ADDR(r4,paca) /* Load paca pointer*/ - ld r4,0(r4)/* Get base vaddr of paca array */ - mulli r13,r24,PACA_SIZE /* Calculate vaddr of right paca */ - add r13,r13,r4 /* for this processor. */ + LOAD_REG_ADDR(r4,paca_ptrs) /* Load paca pointer*/ + ld r4,0(r4)/* Get base vaddr of paca_ptrs array */ + sldir5,r24,3/* get paca_ptrs[] index from cpu id */ + ldx r13,r4,r5 /* r13 = paca_ptrs[cpu id] */ SET_PACA(r13) /* Save vaddr of paca in an SPRG*/ /* Mark interrupts soft and hard disabled (they might be enabled -- 2.13.3
Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?
On Wed, Aug 02, 2017 at 05:25:55PM +0100, Jonathan Cameron wrote: > On Tue, 1 Aug 2017 11:46:46 -0700 > "Paul E. McKenney"wrote: > > > On Mon, Jul 31, 2017 at 04:27:57PM +0100, Jonathan Cameron wrote: > > > On Mon, 31 Jul 2017 08:04:11 -0700 > > > "Paul E. McKenney" wrote: > > > > > > > On Mon, Jul 31, 2017 at 12:08:47PM +0100, Jonathan Cameron wrote: > > > > > On Fri, 28 Jul 2017 12:03:50 -0700 > > > > > "Paul E. McKenney" wrote: > > > > > > > > > > > On Fri, Jul 28, 2017 at 06:27:05PM +0100, Jonathan Cameron wrote: > > > > > > > > > > > > > On Fri, 28 Jul 2017 09:55:29 -0700 > > > > > > > "Paul E. McKenney" wrote: > > > > > > > > > > > > > > > On Fri, Jul 28, 2017 at 02:24:03PM +0100, Jonathan Cameron > > > > > > > > wrote: > > > > > > > > > On Fri, 28 Jul 2017 08:44:11 +0100 > > > > > > > > > Jonathan Cameron wrote: > > > > > > > > > > > > > > > > [ . . . ] > > > > > > > > > > > > > > > > > Ok. Some info. I disabled a few driver (usb and SAS) in the > > > > > > > > > interest of having > > > > > > > > > fewer timer events. Issue became much easier to trigger (on > > > > > > > > > some runs before > > > > > > > > > I could get tracing up and running) > > > > > > > > >e > > > > > > > > > So logs are large enough that pastebin doesn't like them - > > > > > > > > > please shoet if > > > > > > > > >>e another timer period is of interest. > > > > > > > > > > > > > > > > > > https://pastebin.com/iUZDfQGM for the timer trace. > > > > > > > > > https://pastebin.com/3w1F7amH for dmesg. > > > > > > > > > > > > > > > > > > The relevant timeout on the RCU stall detector was 8 seconds. > > > > > > > > > Event is > > > > > > > > > detected around 835. > > > > > > > > > > > > > > > > > > It's a lot of logs, so I haven't identified a smoking gun yet > > > > > > > > > but there > > > > > > > > > may well be one in there. > > > > > > > > > > > > > > > > The dmesg says: > > > > > > > > > > > > > > > > rcu_preempt kthread starved for 2508 jiffies! g112 c111 f0x0 > > > > > > > > RCU_GP_WAIT_FQS(3) ->state=0x1 > > > > > > > > > > > > > > > > So I look for "rcu_preempt" timer events and find these: > > > > > > > > > > > > > > > > rcu_preempt-9 [019] 827.579114: timer_init: > > > > > > > > timer=8017d5fc7da0 > > > > > > > > rcu_preempt-9 [019] d..1 827.579115: timer_start: > > > > > > > > timer=8017d5fc7da0 function=process_timeout > > > > > > > > > > > > > > > > Next look for "8017d5fc7da0" and I don't find anything > > > > > > > > else. > > > > > > > It does show up off the bottom of what would fit in pastebin... > > > > > > > > > > > > > > rcu_preempt-9 [001] d..1 837.681077: timer_cancel: > > > > > > > timer=8017d5fc7da0 > > > > > > > rcu_preempt-9 [001] 837.681086: timer_init: > > > > > > > timer=8017d5fc7da0 > > > > > > > rcu_preempt-9 [001] d..1 837.681087: timer_start: > > > > > > > timer=8017d5fc7da0 function=process_timeout > > > > > > > expires=4295101298 [timeout=1] cpu=1 idx=0 flags= > > > > > > > > > > > > Odd. I would expect an expiration... And ten seconds is way longer > > > > > > than the requested one jiffy! > > > > > > > > > > > > > > The timeout was one jiffy, and more than a second later, no > > > > > > > > expiration. > > > > > > > > Is it possible that this event was lost? I am not seeing any > > > > > > > > sign of > > > > > > > > this is the trace. > > > > > > > > > > > > > > > > I don't see any sign of CPU hotplug (and I test with lots of > > > > > > > > that in > > > > > > > > any case). > > > > > > > > > > > > > > > > The last time we saw something like this it was a timer > > > > > > > > HW/driver problem, > > > > > > > > but it is a bit hard to imagine such a problem affecting both > > > > > > > > ARM64 > > > > > > > > and SPARC. ;-) > > > > > > > Could be different issues, both of which were hidden by that > > > > > > > lockup detector. > > > > > > > > > > > > > > There is an errata work around for the timers on this particular > > > > > > > board. > > > > > > > I'm only vaguely aware of it, so may be unconnected. > > > > > > > > > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/clocksource/arm_arch_timer.c?h=v4.13-rc2=bb42ca47401010fc02901b5e8f79e40a26f208cb > > > > > > > > > > > > > > Seems unlikely though! + we've not yet seen it on the other chips > > > > > > > that > > > > > > > errata effects (not that that means much). > > > > > > > > > > > > If you can reproduce quickly, might be worth trying anyway... > > > > > > > > > > > > Thanx, Paul > > > > > Errata fix is running already and was for all those tests. > > > > > >
RE: [RFC PATCH v5 0/5] vfio-pci: Add support for mmapping MSI-X table
From: Benjamin Herrenschmidt > Sent: 15 August 2017 02:34 > On Tue, 2017-08-15 at 09:16 +0800, Jike Song wrote: > > > Taking a step back, though, why does vfio-pci perform this check in the > > > first place? If a malicious guest already has control of a device, any > > > kind of interrupt spoofing it could do by fiddling with the MSI-X > > > message address/data it could simply do with a DMA write anyway, so the > > > security argument doesn't stand up in general (sure, not all PCIe > > > devices may be capable of arbitrary DMA, but that seems like more of a > > > tenuous security-by-obscurity angle to me). > > I tried to make that point for years, thanks for re-iterating it :-) Indeed, we have an FPGA based PCIe card where the MSI-X table is just a piece of PCIe accessible memory. The device driver has to read the MSI-X table and write the address+data values to other registers which are then used to raise the interrupt. (Ok, I've written a better interrupt generator so we don't do that any more.) David
Re: [PATCH 05/11] serial: uuc_uart: constify uart_ops structures
On 8/13/17 1:21 AM, Julia Lawall wrote: These uart_ops structures are only stored in the ops field of a uart_port structure and this fields is const, so the uart_ops structures can also be const. Done with the help of Coccinelle. Signed-off-by: Julia LawallAcked-by: Timur Tabi
Re: [PATCH v7 7/9] mm: Add address parameter to arch_validate_prot()
On 08/14/2017 11:02 PM, Michael Ellerman wrote: Khalid Azizwrites: On 08/10/2017 07:20 AM, Michael Ellerman wrote: Khalid Aziz writes: A protection flag may not be valid across entire address space and hence arch_validate_prot() might need the address a protection bit is being set on to ensure it is a valid protection flag. For example, sparc processors support memory corruption detection (as part of ADI feature) flag on memory addresses mapped on to physical RAM but not on PFN mapped pages or addresses mapped on to devices. This patch adds address to the parameters being passed to arch_validate_prot() so protection bits can be validated in the relevant context. Signed-off-by: Khalid Aziz Cc: Khalid Aziz --- v7: - new patch arch/powerpc/include/asm/mman.h | 2 +- arch/powerpc/kernel/syscalls.c | 2 +- include/linux/mman.h| 2 +- mm/mprotect.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h index 30922f699341..bc74074304a2 100644 --- a/arch/powerpc/include/asm/mman.h +++ b/arch/powerpc/include/asm/mman.h @@ -40,7 +40,7 @@ static inline bool arch_validate_prot(unsigned long prot) return false; return true; } -#define arch_validate_prot(prot) arch_validate_prot(prot) +#define arch_validate_prot(prot, addr) arch_validate_prot(prot) This can be simpler, as just: #define arch_validate_prot arch_validate_prot Hi Michael, Thanks for reviewing! My patch expands parameter list for arch_validate_prot() from one to two parameters. Existing powerpc version of arch_validate_prot() is written with one parameter. If I use the above #define, compilation fails with: mm/mprotect.c: In function ‘do_mprotect_pkey’: mm/mprotect.c:399: error: too many arguments to function ‘arch_validate_prot’ Another way to solve it would be to add the new addr parameter to powerpc version of arch_validate_prot() but I chose the less disruptive solution of tackling it through #define and expanded the existing #define to include the new parameter. Make sense? Yes, it makes sense. But it's a bit gross. At first glance it looks like our arch_validate_prot() has an incorrect signature. I'd prefer you just updated it to have the correct signature, I think you'll have to change one more line in do_mmap2(). So it's not very intrusive. Thanks, Michael. I can do that. -- Khalid
Re: [PATCH 2/2] of: Restrict DMA configuration
On Tue, Aug 15, 2017 at 5:18 AM, Robin Murphywrote: > On 14/08/17 21:08, Rob Herring wrote: >> +linuxppc-dev >> >> On Fri, Aug 11, 2017 at 11:29 AM, Robin Murphy wrote: >>> Moving DMA configuration to happen later at driver probe time had the >>> unnoticed side-effect that we now perform DMA configuration for *every* >>> device represented in DT, rather than only those explicitly created by >>> the of_platform and PCI code. >>> >>> As Christoph points out, this is not really the best thing to do. Whilst >>> there may well be other DMA-capable buses that can benefit from having >>> their children automatically configured after the bridge has probed, >>> there are also plenty of others like USB, MDIO, etc. that definitely do >>> not support DMA and should not be indiscriminately processed. >>> >>> The good news is that DT already gives us the ammunition to do the right >>> thing - anything lacking a "dma-ranges" property should be considered >>> not to have a mapping of DMA address space from its children to its >>> parent, thus anything for which of_dma_get_range() does not succeed does >>> not need DMA configuration. >>> >>> The bad news is that strictly enforcing that would likely break just >>> about every FDT platform out there, since most authors have either not >>> considered the property at all or have mistakenly assumed that omitting >>> "dma-ranges" is equivalent to including the empty property. Thus we have >>> little choice but to special-case platform, AMBA and PCI devices so they >>> continue to receive configuration unconditionally as before. At least >>> anything new will have to get it right in future... >> >> By "anything new", you mean new buses, not new platforms, right? >> What's a platform bus device today could be a different kernel bus >> type tomorrow with no DT change. So this isn't really enforceable. > > Indeed, it would be virtually impossible to do anything on a > per-platform basis, but I think per-bus is a workable compromise - if > someone changes their hypothetical bus driver in a way that would affect > deployed DTs that can't be updated, at worst they can still add their > new bus type to the special case list at the same time. > >> I don't completely agree that omitting dma-ranges is wrong and that >> new DTs have to have dma-ranges simply because there is much precedent >> of DTs with dma-ranges omitted (just go look at PPC). If a bus has no >> bus to cpu address translation nor size restrictions, then no >> dma-ranges should be allowed. > > Sure, I agree that that genie is never going back in the bottle, but > people seem to manage to get empty "ranges" right to differentiate > between memory-mapped vs. non-memory-mapped buses, so it would be nice > to encourage getting the other direction right as well. Well, they get ranges right because they don't get an address otherwise. The problem on dma-ranges is the bus we describe is the slave side, not the master side. So a given bus may have a mixture of DMA capable devices and non-DMA capable devices. If we're still setting DMA masks on all the devices, what have we gained? Maybe we could get stricter on PCI buses at least. > For the immediate issue at hand, I guess the alternative plan of attack > would be to stick a flag in struct bus_type for the bus drivers > themselves to opt into generic DMA configuration. That at least keeps > everything within the kernel (and come to think of it probably works > neatly for modular bus types as well). I'm fine with the change as is, it's really just the commit text I'm commenting on. Rob
should "linux-phandle" be "linux,phandle"?
pedantic nitpickery but, in arch/powerpc/kernel/prom_init.c, line 2426, should that diagnostic message print "" and not ""? rday
Re: [PATCH v4 1/3] mm/hugetlb: Allow arch to override and call the weak function
"Aneesh Kumar K.V"writes: > When running in guest mode ppc64 supports a different mechanism for hugetlb > allocation/reservation. The LPAR management application called HMC can > be used to reserve a set of hugepages and we pass the details of > reserved pages via device tree to the guest. (more details in > htab_dt_scan_hugepage_blocks()) . We do the memblock_reserve of the range > and later in the boot sequence, we add the reserved range to huge_boot_pages. > > But to enable 16G hugetlb on baremetal config (when we are not running as > guest) > we want to do memblock reservation during boot. Generic code already does this > > Signed-off-by: Aneesh Kumar K.V > --- > include/linux/hugetlb.h | 1 + > mm/hugetlb.c| 4 +++- > 2 files changed, 4 insertions(+), 1 deletion(-) I'm planning to take this and the rest of the series in the powerpc tree. Unless someone on linux-mm yells at me :) cheers > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h > index 0ed8e41aaf11..8bbbd37ab105 100644 > --- a/include/linux/hugetlb.h > +++ b/include/linux/hugetlb.h > @@ -358,6 +358,7 @@ int huge_add_to_page_cache(struct page *page, struct > address_space *mapping, > pgoff_t idx); > > /* arch callback */ > +int __init __alloc_bootmem_huge_page(struct hstate *h); > int __init alloc_bootmem_huge_page(struct hstate *h); > > void __init hugetlb_bad_size(void); > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index bc48ee783dd9..b97e6494d74d 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct > vm_area_struct *vma, > return page; > } > > -int __weak alloc_bootmem_huge_page(struct hstate *h) > +int alloc_bootmem_huge_page(struct hstate *h) > + __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); > +int __alloc_bootmem_huge_page(struct hstate *h) > { > struct huge_bootmem_page *m; > int nr_nodes, node; > -- > 2.13.3
Re: [PATCH v2 3/9] powerpc/powernv: Remove real mode access limit for early allocations
On Tue, 15 Aug 2017 22:48:22 +1000 Benjamin Herrenschmidtwrote: > On Tue, 2017-08-15 at 22:10 +1000, Nicholas Piggin wrote: > > On Mon, 14 Aug 2017 23:13:07 +1000 > > Benjamin Herrenschmidt wrote: > > > > > On Mon, 2017-08-14 at 22:49 +1000, Michael Ellerman wrote: > > > > > - /* > > > > > - * We limit the allocation that depend on ppc64_rma_size > > > > > - * to first_memblock_size. We also clamp it to 1GB to > > > > > - * avoid some funky things such as RTAS bugs. > > > > > > > > That comment about RTAS is 7 years old, and I'm pretty sure it was a > > > > historical note when it was written. > > > > > > > > I'm inclined to drop it and if we discover new bugs with RTAS on Power9 > > > > then we can always put it back. > > > > > > Arent' we using a 32-bit RTAS ? (Afaik there's a 64-bit one, we just > > > never used it ..). In this case we need to at least clamp to 2G (no > > > trust RTAS doing unsigned properly). > > > > Is there any allocation not covered by RTAS_INSTANTIATE_MAX? > > Not sure, we have to audit. Okay. > Talking about all this with mpe today, I > think we just need to make sure that anything that has a restriction > uses a specific identifier for *that* restriction rather than just > blindly "rma". For example, seg0_limit for segment 0 in HPT. In the > case of PACAs, we would create a specific limit that is > min(seg0_limit,rma) for pseries and -1 for powernv. etc.. > > The RMA limit can then become either strictly a pseries thing, or be > initialized to -1 on powernv (or max mem). Right, I'm trying to get there with the patch. Well really it breaks into two different things -- RMA for real mode (which is unlimited for powernv), and non faulting (of any kind, SLB or TLB on booke) for virtual mode. RTAS is still squished in there with RMA, but we do have that RTAS limit so we should try to move it out to there. Thanks, Nick
Re: [PATCH v2 3/9] powerpc/powernv: Remove real mode access limit for early allocations
On Tue, 2017-08-15 at 22:10 +1000, Nicholas Piggin wrote: > On Mon, 14 Aug 2017 23:13:07 +1000 > Benjamin Herrenschmidtwrote: > > > On Mon, 2017-08-14 at 22:49 +1000, Michael Ellerman wrote: > > > > - /* > > > > - * We limit the allocation that depend on ppc64_rma_size > > > > - * to first_memblock_size. We also clamp it to 1GB to > > > > - * avoid some funky things such as RTAS bugs. > > > > > > That comment about RTAS is 7 years old, and I'm pretty sure it was a > > > historical note when it was written. > > > > > > I'm inclined to drop it and if we discover new bugs with RTAS on Power9 > > > then we can always put it back. > > > > Arent' we using a 32-bit RTAS ? (Afaik there's a 64-bit one, we just > > never used it ..). In this case we need to at least clamp to 2G (no > > trust RTAS doing unsigned properly). > > Is there any allocation not covered by RTAS_INSTANTIATE_MAX? Not sure, we have to audit. Talking about all this with mpe today, I think we just need to make sure that anything that has a restriction uses a specific identifier for *that* restriction rather than just blindly "rma". For example, seg0_limit for segment 0 in HPT. In the case of PACAs, we would create a specific limit that is min(seg0_limit,rma) for pseries and -1 for powernv. etc.. The RMA limit can then become either strictly a pseries thing, or be initialized to -1 on powernv (or max mem). Ben.
Re: [PATCH v2 3/9] powerpc/powernv: Remove real mode access limit for early allocations
On Mon, 14 Aug 2017 23:13:07 +1000 Benjamin Herrenschmidtwrote: > On Mon, 2017-08-14 at 22:49 +1000, Michael Ellerman wrote: > > > - /* > > > - * We limit the allocation that depend on ppc64_rma_size > > > - * to first_memblock_size. We also clamp it to 1GB to > > > - * avoid some funky things such as RTAS bugs. > > > > That comment about RTAS is 7 years old, and I'm pretty sure it was a > > historical note when it was written. > > > > I'm inclined to drop it and if we discover new bugs with RTAS on Power9 > > then we can always put it back. > > Arent' we using a 32-bit RTAS ? (Afaik there's a 64-bit one, we just > never used it ..). In this case we need to at least clamp to 2G (no > trust RTAS doing unsigned properly). Is there any allocation not covered by RTAS_INSTANTIATE_MAX? Thanks, Nick
Re: [PATCH v2 1/9] KVM: PPC: Book3S HV: Fix H_REGISTER_VPA VPA size validation
Nicholas Pigginwrites: > KVM currently validates the size of the VPA registered by the client > against sizeof(struct lppaca), however we align (and therefore size) > that struct to 1kB to avoid crossing a 4kB boundary in the client. > > PAPR calls for sizes >= 640 bytes to be accepted. Hard code this with > a comment. > > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/kvm/book3s_hv.c | 8 +++- > 1 file changed, 7 insertions(+), 1 deletion(-) This one should go via Paul. Hopefully he can just pick it up. cheers > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 359c79cdf0cc..1182cfd79857 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -485,7 +485,13 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu > *vcpu, > > switch (subfunc) { > case H_VPA_REG_VPA: /* register VPA */ > - if (len < sizeof(struct lppaca)) > + /* > + * The size of our lppaca is 1kB because of the way we align > + * it for the guest to avoid crossing a 4kB boundary. We only > + * use 640 bytes of the structure though, so we should accept > + * clients that set a size of 640. > + */ > + if (len < 640) > break; > vpap = >arch.vpa; > err = 0; > -- > 2.13.3
Re: [PATCH] powerpc: store the intended structure
Julia Lawallwrites: > Normally the values in the resource field and the argument to ARRAY_SIZE > in the num_resources are the same. In this case, the value in the reousrce > field is the same as the one in the previous platform_device structure, and > appears to be a copy-paste error. Replace the value in the resource field > with the argument to the local call to ARRAY_SIZE. > > Signed-off-by: Julia Lawall > > --- > arch/powerpc/platforms/chrp/pegasos_eth.c |2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) Thanks. This is close to EOL code I think, but I'll merge it anyway as it seems obviously correct. cheers > diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c > b/arch/powerpc/platforms/chrp/pegasos_eth.c > index 2b4dc6a..1976071 100644 > --- a/arch/powerpc/platforms/chrp/pegasos_eth.c > +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c > @@ -63,7 +63,7 @@ > .name = "orion-mdio", > .id = -1, > .num_resources = ARRAY_SIZE(mv643xx_eth_mvmdio_resources), > - .resource = mv643xx_eth_shared_resources, > + .resource = mv643xx_eth_mvmdio_resources, > }; > > static struct resource mv643xx_eth_port1_resources[] = {
Re: [PATCH 05/11] ASoC: omap: make snd_soc_platform_driver const
Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki. Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki On 2017-08-14 14:38, Bhumika Goyal wrote: > Make this const as it is only passed as the 2nd argument to the > function devm_snd_soc_register_platform, which is of type const. > Done using Coccinelle. > > Signed-off-by: Bhumika GoyalAcked-by: Peter Ujfalusi > --- > sound/soc/omap/omap-pcm.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/sound/soc/omap/omap-pcm.c b/sound/soc/omap/omap-pcm.c > index 94e9ff7..1283473 100644 > --- a/sound/soc/omap/omap-pcm.c > +++ b/sound/soc/omap/omap-pcm.c > @@ -243,7 +243,7 @@ static int omap_pcm_new(struct snd_soc_pcm_runtime *rtd) > return ret; > } > > -static struct snd_soc_platform_driver omap_soc_platform = { > +static const struct snd_soc_platform_driver omap_soc_platform = { > .ops= _pcm_ops, > .pcm_new= omap_pcm_new, > .pcm_free = omap_pcm_free_dma_buffers, > - Péter
Re: [PATCH v6 1/2] powerpc/fadump: reduce memory consumption for capture kernel
Hello, sorry about the late reply. Looks like I had too much faith in the parse_args sanity. Looking closely the parsing happens in next_arg and only outermost quotes are removed. So presumably >>foo="bar baz"<< gives >>bar baz<< as value and >>foo=bar" baz"<< gives >>bar" baz<< as value. And presumably you can do fadump_extra_args="par1=val1 par2=val2 pa3=val3" and fadump_extra_args=""par="value with spaces""" (each parameter which needs space in separate fadump_extra_args parameter) provided you remove the outermost quotes in the fadump_extra_args handler as well. Wanted to run some tests but did not get around to do it yet. On Sat, 29 Jul 2017 02:27:22 +0530 Hari Bathiniwrote: > With fadump (dump capture) kernel booting like a regular kernel, it > almost needs the same amount of memory to boot as the production > kernel, which is unwarranted for a dump capture kernel. But with no > option to disable some of the unnecessary subsystems in fadump > kernel, that much memory is wasted on fadump, depriving the > production kernel of that memory. > > Introduce kernel parameter 'fadump_extra_args=' that would take > regular parameters as a space separated quoted string, to be enforced > when fadump is active. This 'fadump_extra_args=' parameter can be > leveraged to pass parameters like nr_cpus=1, cgroup_disable=memory > and numa=off, to disable unwarranted resources/subsystems. > > Also, ensure the log "Firmware-assisted dump is active" is printed > early in the boot process to put the subsequent fadump messages in > context. > > Suggested-by: Michael Ellerman > Signed-off-by: Hari Bathini > --- > > Changes from v5: > * Using 'fadump_extra_args=' instead of 'fadump_append=' to pass > additional parameters, to be enforced when fadump is active. > * Using space-separated quoted list as syntax for 'fadump_extra_args=' > parameter. > > > arch/powerpc/include/asm/fadump.h |2 + > arch/powerpc/kernel/fadump.c | 125 > - > arch/powerpc/kernel/prom.c|7 ++ 3 files changed, 131 > insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/fadump.h > b/arch/powerpc/include/asm/fadump.h index ce88bbe..98ae009 100644 > --- a/arch/powerpc/include/asm/fadump.h > +++ b/arch/powerpc/include/asm/fadump.h > @@ -208,11 +208,13 @@ extern int early_init_dt_scan_fw_dump(unsigned > long node, const char *uname, int depth, void *data); > extern int fadump_reserve_mem(void); > extern int setup_fadump(void); > +extern void enforce_fadump_extra_args(char *cmdline); > extern int is_fadump_active(void); > extern void crash_fadump(struct pt_regs *, const char *); > extern void fadump_cleanup(void); > > #else/* CONFIG_FA_DUMP */ > +static inline void enforce_fadump_extra_args(char *cmdline) { } > static inline int is_fadump_active(void) { return 0; } > static inline void crash_fadump(struct pt_regs *regs, const char > *str) { } #endif > diff --git a/arch/powerpc/kernel/fadump.c > b/arch/powerpc/kernel/fadump.c index dc0c49c..d8cb829 100644 > --- a/arch/powerpc/kernel/fadump.c > +++ b/arch/powerpc/kernel/fadump.c > @@ -78,8 +78,10 @@ int __init early_init_dt_scan_fw_dump(unsigned > long node, >* dump data waiting for us. >*/ > fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", > NULL); > - if (fdm_active) > + if (fdm_active) { > + pr_info("Firmware-assisted dump is active.\n"); > fw_dump.dump_active = 1; > + } > > /* Get the sizes required to store dump data for the > firmware provided >* dump sections. > @@ -332,8 +334,11 @@ int __init fadump_reserve_mem(void) > { > unsigned long base, size, memory_boundary; > > - if (!fw_dump.fadump_enabled) > + if (!fw_dump.fadump_enabled) { > + if (fw_dump.dump_active) > + pr_warn("Firmware-assisted dump was active > but kernel booted with fadump disabled!\n"); return 0; > + } > > if (!fw_dump.fadump_supported) { > printk(KERN_INFO "Firmware-assisted dump is not > supported on" @@ -373,7 +378,6 @@ int __init fadump_reserve_mem(void) > memory_boundary = memblock_end_of_DRAM(); > > if (fw_dump.dump_active) { > - printk(KERN_INFO "Firmware-assisted dump is > active.\n"); /* >* If last boot has crashed then reserve all the > memory >* above boot_memory_size so that we don't touch it > until @@ -460,6 +464,121 @@ static int __init > early_fadump_reserve_mem(char *p) } > early_param("fadump_reserve_mem", early_fadump_reserve_mem); > > +#define FADUMP_EXTRA_ARGS_PARAM "fadump_extra_args=" > +#define INIT_ARGS_START "-- " > +#define INIT_ARGS_START_LEN strlen(INIT_ARGS_START) > + > +struct param_info { > + char*params; > + unsigned int
Re: [PATCH 2/2] of: Restrict DMA configuration
On 14/08/17 21:08, Rob Herring wrote: > +linuxppc-dev > > On Fri, Aug 11, 2017 at 11:29 AM, Robin Murphywrote: >> Moving DMA configuration to happen later at driver probe time had the >> unnoticed side-effect that we now perform DMA configuration for *every* >> device represented in DT, rather than only those explicitly created by >> the of_platform and PCI code. >> >> As Christoph points out, this is not really the best thing to do. Whilst >> there may well be other DMA-capable buses that can benefit from having >> their children automatically configured after the bridge has probed, >> there are also plenty of others like USB, MDIO, etc. that definitely do >> not support DMA and should not be indiscriminately processed. >> >> The good news is that DT already gives us the ammunition to do the right >> thing - anything lacking a "dma-ranges" property should be considered >> not to have a mapping of DMA address space from its children to its >> parent, thus anything for which of_dma_get_range() does not succeed does >> not need DMA configuration. >> >> The bad news is that strictly enforcing that would likely break just >> about every FDT platform out there, since most authors have either not >> considered the property at all or have mistakenly assumed that omitting >> "dma-ranges" is equivalent to including the empty property. Thus we have >> little choice but to special-case platform, AMBA and PCI devices so they >> continue to receive configuration unconditionally as before. At least >> anything new will have to get it right in future... > > By "anything new", you mean new buses, not new platforms, right? > What's a platform bus device today could be a different kernel bus > type tomorrow with no DT change. So this isn't really enforceable. Indeed, it would be virtually impossible to do anything on a per-platform basis, but I think per-bus is a workable compromise - if someone changes their hypothetical bus driver in a way that would affect deployed DTs that can't be updated, at worst they can still add their new bus type to the special case list at the same time. > I don't completely agree that omitting dma-ranges is wrong and that > new DTs have to have dma-ranges simply because there is much precedent > of DTs with dma-ranges omitted (just go look at PPC). If a bus has no > bus to cpu address translation nor size restrictions, then no > dma-ranges should be allowed. Sure, I agree that that genie is never going back in the bottle, but people seem to manage to get empty "ranges" right to differentiate between memory-mapped vs. non-memory-mapped buses, so it would be nice to encourage getting the other direction right as well. For the immediate issue at hand, I guess the alternative plan of attack would be to stick a flag in struct bus_type for the bus drivers themselves to opt into generic DMA configuration. That at least keeps everything within the kernel (and come to think of it probably works neatly for modular bus types as well). Robin. > Of course, DT standards can and do > evolve and we could decide to be stricter here, but that hasn't > happened. If it does, then we need to make that clear in the spec and > enforce it. > > Rob > > >> >> Fixes: 09515ef5ddad ("of/acpi: Configure dma operations at probe time for >> platform/amba/pci bus devices") >> Reported-by: Christoph Hellwig >> Signed-off-by: Robin Murphy >> --- >> drivers/of/device.c | 48 >> 1 file changed, 32 insertions(+), 16 deletions(-) >> >> diff --git a/drivers/of/device.c b/drivers/of/device.c >> index e0a28ea341fe..04c4c952dc57 100644 >> --- a/drivers/of/device.c >> +++ b/drivers/of/device.c >> @@ -9,6 +9,9 @@ >> #include >> #include >> #include >> +#include >> +#include >> +#include >> >> #include >> #include "of_private.h" >> @@ -84,31 +87,28 @@ int of_device_add(struct platform_device *ofdev) >> */ >> int of_dma_configure(struct device *dev, struct device_node *np) >> { >> - u64 dma_addr, paddr, size; >> + u64 dma_addr, paddr, size = 0; >> int ret; >> bool coherent; >> unsigned long offset; >> const struct iommu_ops *iommu; >> u64 mask; >> >> - /* >> -* Set default coherent_dma_mask to 32 bit. Drivers are expected to >> -* setup the correct supported mask. >> -*/ >> - if (!dev->coherent_dma_mask) >> - dev->coherent_dma_mask = DMA_BIT_MASK(32); >> - >> - /* >> -* Set it to coherent_dma_mask by default if the architecture >> -* code has not set it. >> -*/ >> - if (!dev->dma_mask) >> - dev->dma_mask = >coherent_dma_mask; >> - >> ret = of_dma_get_range(np, _addr, , ); >> if (ret < 0) { >> + /* >> +* For legacy reasons, we have to assume some devices need >> +* DMA configuration regardless
Re: [PATCH] powerpc/xmon: Exclude all of xmon/ from ftrace
Michael Ellermanwrites: > "Naveen N. Rao" writes: > >> diff --git a/arch/powerpc/xmon/Makefile b/arch/powerpc/xmon/Makefile >> index 0b2f771593eb..5f95af64cb8f 100644 >> --- a/arch/powerpc/xmon/Makefile >> +++ b/arch/powerpc/xmon/Makefile >> @@ -7,6 +7,19 @@ UBSAN_SANITIZE := n >> >> ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) >> >> +ifdef CONFIG_FUNCTION_TRACER >> +CFLAGS_REMOVE_xmon.o= -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +CFLAGS_REMOVE_nonstdio.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +ifdef CONFIG_XMON_DISASSEMBLY >> +CFLAGS_REMOVE_ppc-dis.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +CFLAGS_REMOVE_ppc-opc.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +ifdef CONFIG_SPU_BASE >> +CFLAGS_REMOVE_spu-dis.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +CFLAGS_REMOVE_spu-opc.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) >> +endif >> +endif >> +endif > > Urk. > > We want to disable it for everything in the directory, so can you do > something like: > > ORIG_CFLAGS := $(KBUILD_CFLAGS) > KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS)) Yes: # Disable ftrace for the entire directory ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -mno-sched-epilog,,$(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))) Seems to work. cheers
Re: [v6 15/15] mm: debug for raw alloctor
On Mon 14-08-17 10:01:52, Pasha Tatashin wrote: > >>However, now thinking about it, I will change it to CONFIG_MEMBLOCK_DEBUG, > >>and let users decide what other debugging configs need to be enabled, as > >>this is also OK. > > > >Actually the more I think about it the more I am convinced that a kernel > >boot parameter would be better because it doesn't need the kernel to be > >recompiled and it is a single branch in not so hot path. > > The main reason I do not like kernel parameter is that automated test suits > for every platform would need to be updated to include this new parameter in > order to test it. How does this differ from the enabling a config option and building a separate kernel? My primary point of the kernel option was to have something available to debug without recompiling the kernel which is more tedious than simply adding one option to the kernel command line. -- Michal Hocko SUSE Labs
Re: [v6 05/15] mm: don't accessed uninitialized struct pages
[CC Mel - the original patch was http://lkml.kernel.org/r/1502138329-123460-6-git-send-email-pasha.tatas...@oracle.com] On Mon 07-08-17 16:38:39, Pavel Tatashin wrote: > In deferred_init_memmap() where all deferred struct pages are initialized > we have a check like this: > > if (page->flags) { > VM_BUG_ON(page_zone(page) != zone); > goto free_range; > } > > This way we are checking if the current deferred page has already been > initialized. It works, because memory for struct pages has been zeroed, and > the only way flags are not zero if it went through __init_single_page() > before. But, once we change the current behavior and won't zero the memory > in memblock allocator, we cannot trust anything inside "struct page"es > until they are initialized. This patch fixes this. > > This patch defines a new accessor memblock_get_reserved_pfn_range() > which returns successive ranges of reserved PFNs. deferred_init_memmap() > calls it to determine if a PFN and its struct page has already been > initialized. Maybe I am missing something but how can we see reserved ranges here when for_each_mem_pfn_range iterates over memblock.memory? The loop is rather complex but I am wondering whether the page->flags check is needed at all. We shouldn't have duplicated memblocks covering the same pfn ranges so we cannot initialize the same range multiple times, right? Reserved ranges are excluded altogether so how exactly can we see an initialized struct page? In other words, why this simply doesn't work? --- diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90e331e4c077..987a340a5bed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1524,11 +1524,6 @@ static int __init deferred_init_memmap(void *data) cond_resched(); } - if (page->flags) { - VM_BUG_ON(page_zone(page) != zone); - goto free_range; - } - __init_single_page(page, pfn, zid, nid); if (!free_base_page) { free_base_page = page; -- Michal Hocko SUSE Labs
[PATCH v2 0/1] rtc: rtctest: Support opal-rtc and rtc-generic
On ppc64le machines the opal-rtc, resp rtc-generic in guest is used. They only support minimal set of functionality and fail this test in not-yet treated way. This extends the checks and skips to the next test when feature is not supported. Changes in v2: - Removed the double EINVAL check - Added missing space before "EIO" check. Lukáš Doktor (1): rtc: rtctest: Improve support detection tools/testing/selftests/timers/rtctest.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) -- 2.9.4
[PATCH v2 1/1] rtc: rtctest: Improve support detection
The rtc-generic and opal-rtc are failing to run this test as they do not support all the features. Let's treat the error returns and skip to the following test. Theoretically the test_DATE should be also adjusted, but as it's enabled on demand I think it makes sense to fail in such case. Signed-off-by: Lukáš Doktor--- tools/testing/selftests/timers/rtctest.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/timers/rtctest.c b/tools/testing/selftests/timers/rtctest.c index f61170f..411eff6 100644 --- a/tools/testing/selftests/timers/rtctest.c +++ b/tools/testing/selftests/timers/rtctest.c @@ -221,6 +221,11 @@ int main(int argc, char **argv) /* Read the current alarm settings */ retval = ioctl(fd, RTC_ALM_READ, _tm); if (retval == -1) { + if (errno == EINVAL) { + fprintf(stderr, + "\n...EINVAL reading current alarm setting.\n"); + goto test_PIE; + } perror("RTC_ALM_READ ioctl"); exit(errno); } @@ -231,7 +236,7 @@ int main(int argc, char **argv) /* Enable alarm interrupts */ retval = ioctl(fd, RTC_AIE_ON, 0); if (retval == -1) { - if (errno == EINVAL) { + if (errno == EINVAL || errno == EIO) { fprintf(stderr, "\n...Alarm IRQs not supported.\n"); goto test_PIE; -- 2.9.4
Re: [PATCH 01/11] ASoC: codecs: make snd_soc_platform_driver const
On Mon, Aug 14, 2017 at 05:08:40PM +0530, Bhumika Goyal wrote: > Make these const as they are either passed as the 2nd argument to the > function devm_snd_soc_register_platform or snd_soc_register_platform, > and the arguments are of type const. > Done using Coccinelle. > > Signed-off-by: Bhumika Goyal> --- For the Wolfson bits: Acked-by: Charles Keepax Thanks, Charles
Re: [PATCH] rtc: rtctest: Improve support detection
Dne 12.8.2017 v 22:22 Alexandre Belloni napsal(a): > Hi, > > On 11/08/2017 at 11:14:55 +0200, Lukáš Doktor wrote: >> The rtc-generic and opal-rtc are failing to run this test as they do not >> support all the features. Let's treat the error returns and skip to the >> following test. >> >> Theoretically the test_DATE should be also adjusted, but as it's enabled >> on demand I think it makes sense to fail in such case. >> >> Signed-off-by: Lukáš Doktor>> --- >> tools/testing/selftests/timers/rtctest.c | 9 +++-- >> 1 file changed, 7 insertions(+), 2 deletions(-) >> >> diff --git a/tools/testing/selftests/timers/rtctest.c >> b/tools/testing/selftests/timers/rtctest.c >> index f61170f..6344842 100644 >> --- a/tools/testing/selftests/timers/rtctest.c >> +++ b/tools/testing/selftests/timers/rtctest.c >> @@ -125,7 +125,7 @@ int main(int argc, char **argv) >> /* Turn on update interrupts (one per second) */ >> retval = ioctl(fd, RTC_UIE_ON, 0); >> if (retval == -1) { >> -if (errno == EINVAL) { >> +if (errno == EINVAL || errno == EINVAL) { > > Well, this needs to be fixed. > Yes, this chunk is actually not needed anymore (it was based on outdated copy we use in Autotest which had ENOTTY which is probably not needed nowadays). Let me send v2 without this. Regards, Lukáš >> fprintf(stderr, >> "\n...Update IRQs not supported.\n"); >> goto test_READ; >> @@ -221,6 +221,11 @@ int main(int argc, char **argv) >> /* Read the current alarm settings */ >> retval = ioctl(fd, RTC_ALM_READ, _tm); >> if (retval == -1) { >> +if (errno == EINVAL) { >> +fprintf(stderr, >> +"\n...EINVAL reading current alarm >> setting.\n"); >> +goto test_PIE; >> +} >> perror("RTC_ALM_READ ioctl"); >> exit(errno); >> } >> @@ -231,7 +236,7 @@ int main(int argc, char **argv) >> /* Enable alarm interrupts */ >> retval = ioctl(fd, RTC_AIE_ON, 0); >> if (retval == -1) { >> -if (errno == EINVAL) { >> +if (errno == EINVAL || errno ==EIO) { >> fprintf(stderr, >> "\n...Alarm IRQs not supported.\n"); >> goto test_PIE; >> -- >> 2.9.4 >> > signature.asc Description: OpenPGP digital signature
Re: [PATCH 05/11] ASoC: omap: make snd_soc_platform_driver const
On Mon, Aug 14, 2017 at 05:08:44PM +0530, Bhumika Goyal wrote: > Make this const as it is only passed as the 2nd argument to the > function devm_snd_soc_register_platform, which is of type const. > Done using Coccinelle. > > Signed-off-by: Bhumika Goyal> --- > sound/soc/omap/omap-pcm.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > Acked-by: Jarkko Nikula