Re: [PATCH v11 14/26] locking/lockdep, cpu/hotplus: Use a weaker annotation in AP thread

2024-02-12 Thread Thomas Gleixner
On Tue, Jan 30 2024 at 11:58, Byungchul Park wrote:
> On Fri, Jan 26, 2024 at 06:30:02PM +0100, Thomas Gleixner wrote:
>> On Wed, Jan 24 2024 at 20:59, Byungchul Park wrote:
>> 
>> Why is lockdep in the subsystem prefix here? You are changing the CPU
>> hotplug (not hotplus) code, right?
>> 
>> > cb92173d1f0 ("locking/lockdep, cpu/hotplug: Annotate AP thread") was
>> > introduced to make lockdep_assert_cpus_held() work in AP thread.
>> >
>> > However, the annotation is too strong for that purpose. We don't have to
>> > use more than try lock annotation for that.
>> 
>> This lacks a proper explanation why this is too strong.
>> 
>> > Furthermore, now that Dept was introduced, false positive alarms was
>> > reported by that. Replaced it with try lock annotation.
>> 
>> I still have zero idea what this is about.
>
> 1. can track PG_locked that is a potential deadlock trigger.
>
>
> https://lore.kernel.org/lkml/1674268856-31807-1-git-send-email-byungchul.p...@lge.com/

Sure, but that wants to be explicitly explained in the changelog and
not with a link. 'Now that Dept was introduced ...' is not an
explanation.

Thanks,

tglx




Re: [PATCH v11 14/26] locking/lockdep, cpu/hotplus: Use a weaker annotation in AP thread

2024-01-26 Thread Thomas Gleixner
On Wed, Jan 24 2024 at 20:59, Byungchul Park wrote:

Why is lockdep in the subsystem prefix here? You are changing the CPU
hotplug (not hotplus) code, right?

> cb92173d1f0 ("locking/lockdep, cpu/hotplug: Annotate AP thread") was
> introduced to make lockdep_assert_cpus_held() work in AP thread.
>
> However, the annotation is too strong for that purpose. We don't have to
> use more than try lock annotation for that.

This lacks a proper explanation why this is too strong.

> Furthermore, now that Dept was introduced, false positive alarms was
> reported by that. Replaced it with try lock annotation.

I still have zero idea what this is about.

Thanks,

tglx


Re: [DO NOT MERGE v5 16/37] irqchip: Add SH7751 INTC driver

2023-12-08 Thread Thomas Gleixner
On Tue, Dec 05 2023 at 18:45, Yoshinori Sato wrote:
> +static void endisable_irq(struct irq_data *data, int enable)

bool enable?

> +{
> + struct sh7751_intc_priv *priv;
> + unsigned int irq;
> +
> + priv = irq_data_to_priv(data);
> +
> + irq = irqd_to_hwirq(data);
> + if (!is_valid_irq(irq)) {
> + /* IRQ out of range */
> + pr_warn_once("%s: IRQ %u is out of range\n", __FILE__, irq);
> + return;
> + }
> +
> + if (irq <= MAX_IRL && !priv->irlm)
> + /* IRL encoded external interrupt */
> + /* disable for SR.IMASK */
> + update_sr_imask(irq - IRQ_START, enable);
> + else
> + /* Internal peripheral interrupt */
> + /* mask for IPR priority 0 */
> + update_ipr(priv, irq, enable);

Lacks curly brackets on the if/else

> +static int irq_sh7751_map(struct irq_domain *h, unsigned int virq,
> +   irq_hw_number_t hw_irq_num)
> +{
> + irq_set_chip_and_handler(virq, _irq_chip, handle_level_irq);
> + irq_get_irq_data(virq)->chip_data = h->host_data;
> + irq_modify_status(virq, IRQ_NOREQUEST, IRQ_NOPROBE);
> + return 0;
> +}
> +static const struct irq_domain_ops irq_ops = {

Newline before 'static ...'

> + .map= irq_sh7751_map,
> + .xlate  = irq_domain_xlate_onecell,
> +};
> +
> +static int __init load_ipr_map(struct device_node *intc,
> +struct sh7751_intc_priv *priv)
> +{
> + struct property *ipr_map;
> + unsigned int num_ipr, i;
> + struct ipr *ipr;
> + const __be32 *p;
> + u32 irq;
> +
> + ipr_map = of_find_property(intc, "renesas,ipr-map", &num_ipr);
> + if (IS_ERR(ipr_map))
> + return PTR_ERR(ipr_map);
> + num_ipr /= sizeof(u32);
> + /* 3words per entry. */
> + if (num_ipr % 3)

Three words per ... But you can spare the comment by doing:

if (num_ipr % WORDS_PER_ENTRY)

> + goto error1;
> + num_ipr /= 3;
> +static int __init sh7751_intc_of_init(struct device_node *intc,
> +   struct device_node *parent)
> +{
> + struct sh7751_intc_priv *priv;
> + void __iomem *base, *base2;
> + struct irq_domain *domain;
> + u16 icr;
> + int ret;
> +
> + base = of_iomap(intc, 0);
> + base2 = of_iomap(intc, 1);
> + if (!base || !base2) {
> + pr_err("%pOFP: Invalid register definition\n", intc);

What unmaps 'base' if 'base' is valid and base2 == NULL?

> + return -EINVAL;
> + }
> +
> + priv = kzalloc(sizeof(struct sh7751_intc_priv), GFP_KERNEL);
> + if (priv == NULL)
> + return -ENOMEM;

Leaks base[2] maps, no?
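
Something along these lines would avoid both leaks (sketch only, the
error labels are illustrative):

	base = of_iomap(intc, 0);
	if (!base)
		return -EINVAL;

	base2 = of_iomap(intc, 1);
	if (!base2) {
		ret = -EINVAL;
		goto err_unmap_base;
	}

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv) {
		ret = -ENOMEM;
		goto err_unmap_base2;
	}
	...
err_unmap_base2:
	iounmap(base2);
err_unmap_base:
	iounmap(base);
	return ret;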

> + ret = load_ipr_map(intc, priv);
> + if (ret < 0) {
> + kfree(priv);
> + return ret;
> + }
> +
> + priv->base = base;
> + priv->intpri00 = base2;
> +
> + if (of_property_read_bool(intc, "renesas,irlm")) {
> + priv->irlm = true;
> + icr = __raw_readw(priv->base + R_ICR);
> + icr |= ICR_IRLM;
> + __raw_writew(icr, priv->base + R_ICR);
> + }
> +
> + domain = irq_domain_add_linear(intc, NR_IRQS, &irq_ops, priv);
> + if (domain == NULL) {
> + pr_err("%pOFP: cannot initialize irq domain\n", intc);
> + kfree(priv);
> + return -ENOMEM;
> + }
> +
> + irq_set_default_host(domain);
> + pr_info("%pOFP: SH7751 Interrupt controller (%s external IRQ)",
> + intc, priv->irlm ? "4 lines" : "15 level");
> + return 0;
> +}
> +
> +IRQCHIP_DECLARE(sh_7751_intc,
> + "renesas,sh7751-intc", sh7751_intc_of_init);

One line please.

Thanks,

tglx


Re: drm/vkms: deadlock between dev->event_lock and timer

2023-09-13 Thread Thomas Gleixner
On Wed, Sep 13 2023 at 09:47, Linus Torvalds wrote:
> On Wed, 13 Sept 2023 at 07:21, Tetsuo Handa
>  wrote:
>>
>> Hello. A deadlock was reported in drivers/gpu/drm/vkms/ .
>> It looks like this locking pattern remains as of 6.6-rc1. Please fix.
>>
>> void drm_crtc_vblank_off(struct drm_crtc *crtc) {
>>   spin_lock_irq(&dev->event_lock);
>>   drm_vblank_disable_and_save(dev, pipe) {
>>     __disable_vblank(dev, pipe) {
>>       crtc->funcs->disable_vblank(crtc) == vkms_disable_vblank {
>>         hrtimer_cancel(&out->vblank_hrtimer) { // Retries with dev->event_lock held until lock_hrtimer_base() succeeds.
>>           ret = hrtimer_try_to_cancel(timer) {
>>             base = lock_hrtimer_base(timer, &flags); // Fails forever because vkms_vblank_simulate() is in progress.
>
> Heh. Ok. This is clearly a bug, but it does seem to be limited to just
> the vkms driver, and literally only to the "simulate vblank" case.
>
> The worst part about it is that it's so subtle and not obvious.
>
> Some light grepping seems to show that amdgpu has almost the exact
> same pattern in its own vkms thing, except it uses
>
> hrtimer_try_to_cancel(_crtc->vblank_timer);
>
> directly, which presumably fixes the livelock, but means that the
> cancel will fail if it's currently running.
>
> So just doing the same thing in the vkms driver probably fixes things.
>
> Maybe the vkms people need to add a flag to say "it's canceled" so
> that it doesn't then get re-enabled?  Or maybe it doesn't matter
> and/or already happens for some reason I didn't look into.

Maybe the VKMS people need to understand locking in the first place. The
first thing I saw in this code is:

static enum hrtimer_restart vkms_vblank_simulate(struct hrtimer *timer)
{
   ...
   mutex_unlock(>enabled_lock);

What?

Unlocking a mutex in the context of a hrtimer callback is simply
violating all mutex locking rules.

How has this code ever survived lock debugging without triggering a big
fat warning?

Thanks,

tglx


Re: [PATCH RFC v7 07/23] dept: Apply sdt_might_sleep_strong() to wait_for_completion()/complete()

2023-01-18 Thread Thomas Gleixner
On Mon, Jan 09 2023 at 12:33, Byungchul Park wrote:
> Makes Dept able to track dependencies by
> wait_for_completion()/complete().
>
> In order to obtain the meaningful caller points, replace all the
> wait_for_completion*() declarations with macros in the header.

That's just wrong.

> -extern void wait_for_completion(struct completion *);
> +extern void raw_wait_for_completion(struct completion *);

> +#define wait_for_completion(x)   \
> +({   \
> + sdt_might_sleep_strong(NULL);   \
> + raw_wait_for_completion(x); \
> + sdt_might_sleep_finish();   \
> +})

The very same can be achieved with a proper annotation which does not
enforce THIS_IP but allows to use __builtin_return_address($N). That's
what everything else uses too.
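
Something along these lines (sketch only; the *_ip() variant is
illustrative and not an existing Dept interface):

void wait_for_completion(struct completion *x)
{
	/* record the caller's address instead of _THIS_IP_ */
	sdt_might_sleep_strong_ip(NULL, _RET_IP_);
	__wait_for_completion(x);
	sdt_might_sleep_finish();
}

That keeps the declarations in completion.h untouched and still records
a meaningful caller point.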

Thanks,

tglx


Re: [PATCH RFC v7 06/23] dept: Add proc knobs to show stats and dependency graph

2023-01-18 Thread Thomas Gleixner
On Mon, Jan 09 2023 at 12:33, Byungchul Park wrote:
> It'd be useful to show Dept internal stats and dependency graph on
> runtime via proc for better information. Introduced the knobs.

proc?

That's what debugfs is for.
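
Exposing the same information via debugfs is only a handful of lines,
e.g. (sketch; the show callback contents are left out):

static int dept_stats_show(struct seq_file *m, void *v)
{
	/* dump the internal stats / dependency graph here */
	return 0;
}
DEFINE_SHOW_ATTRIBUTE(dept_stats);

static int __init dept_debugfs_init(void)
{
	debugfs_create_file("dept_stats", 0444, NULL, NULL,
			    &dept_stats_fops);
	return 0;
}
late_initcall(dept_debugfs_init);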


Re: [PATCH RFC v7 00/23] DEPT(Dependency Tracker)

2023-01-18 Thread Thomas Gleixner
On Tue, Jan 17 2023 at 10:18, Boqun Feng wrote:
> On Mon, Jan 16, 2023 at 10:00:52AM -0800, Linus Torvalds wrote:
>> I also recall this giving a fair amount of false positives, are they all 
>> fixed?
>
> From the following part in the cover letter, I guess the answer is no?
>   ...
> 6. Multiple reports are allowed.
> 7. Deduplication control on multiple reports.
> 8. Withstand false positives thanks to 6.
>   ...
>
> seems to me that the logic is since DEPT allows multiple reports so that
> false positives are fitlerable by users?

I really do not know what's so valuable about multiple reports. They
produce a flood of information which needs to be filtered, while a
single report ensures that the first detected issue is dumped, which
increases the probability that it can be recorded and acted upon.

Filtering out false positives is just the wrong approach. Decoding
dependency issues from any tracker is complex enough given the nature of
the problem, so adding the burden of filtering out issues from a stream
of dumps is not helpful at all. It's just a marketing gag.

> * Instead of introducing a brand new detector/dependency tracker,
>   could we first improve the lockdep's dependency tracker? I think
>   Byungchul also agrees that DEPT and lockdep should share the
>   same dependency tracker and the benefit of improving the
>   existing one is that we can always use the self test to catch
>   any regression. Thoughts?

Ack. If the internal implementation of lockdep has shortcomings, then we
can expand and/or replace it instead of having yet another
infrastructure which is not even remotely as mature.

Thanks,

tglx


Re: [PATCH RFC v7 03/23] dept: Add single event dependency tracker APIs

2023-01-18 Thread Thomas Gleixner
On Mon, Jan 09 2023 at 12:33, Byungchul Park wrote:
> +/*
> + * sdt_might_sleep() and its family will be committed in __schedule()
> + * when it actually gets to __schedule(). Both dept_request_event() and
> + * dept_wait() will be performed on the commit.
> + */
> +
> +/*
> + * Use the code location as the class key if an explicit map is not used.
> + */
> +#define sdt_might_sleep_strong(m)\
> + do {\
> + struct dept_map *__m = m;   \
> + static struct dept_key __key;   \
> + dept_stage_wait(__m, __m ? NULL : &__key, _THIS_IP_, __func__, 
> true);\
> + } while (0)
> +
> +/*
> + * Use the code location as the class key if an explicit map is not used.
> + */
> +#define sdt_might_sleep_weak(m)  
> \
> + do {\
> + struct dept_map *__m = m;   \
> + static struct dept_key __key;   \
> + dept_stage_wait(__m, __m ? NULL : &__key, _THIS_IP_, __func__, 
> false);\
> + } while (0)
> +
> +#define sdt_might_sleep_finish() dept_clean_stage()
> +
> +#define sdt_ecxt_enter(m)dept_ecxt_enter(m, 1UL, _THIS_IP_, 
> "start", "event", 0)
> +#define sdt_event(m) dept_event(m, 1UL, _THIS_IP_, __func__)
> +#define sdt_ecxt_exit(m) dept_ecxt_exit(m, 1UL, _THIS_IP_)

None of the above comes with a proper documentation of the various
macros/functions. How should anyone aside of you understand what this is
about and how this should be used?
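
At minimum each of them wants a kernel-doc comment along these lines
(content inferred from the comments above, so take it as an example of
the expected form rather than as authoritative documentation):

/**
 * sdt_might_sleep_strong - annotate the start of a strong sleepable wait
 * @m:	explicit dept map, or NULL to use the call site as the class key
 *
 * Must be paired with sdt_might_sleep_finish(). The wait is committed
 * in __schedule() when the task actually sleeps.
 */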

Thanks,

tglx


Re: [GIT PULL] treewide: timers: Use timer_shutdown*() before freeing timers

2022-11-07 Thread Thomas Gleixner
Linus,

On Sun, Nov 06 2022 at 22:32, Steven Rostedt wrote:
> As discussed here:
>
>   https://lore.kernel.org/all/20221106212427.739928...@goodmis.org/

Please hold off. It's only nits, but tip has documented rules and random
pull requests are not making them go away.

Thanks,

tglx


[patch 00/11] hrtimers: Cleanup hrtimer_forward() [ab]use

2021-09-23 Thread Thomas Gleixner
A recent syzbot report unearthed abuse of hrtimer_forward() which can cause
runaway timers hogging the CPU in timer expiry context by rearming the
timer in the past over and over.

This happens when the caller uses timer->expiry for the 'now' argument of
hrtimer_forward(). That works as long as the timer expiry is on time, but
can cause a long period of rearm/fire loops which hog the CPU. Expiring
late can have various causes, but obviously virtualization is prone to that
due to VCPU scheduling.

The correct usage of hrtimer_forward() is to hand the current time to the
'now' argument which ensures that the next event on the periodic time line
is past now. This is what hrtimer_forward_now() provides.
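
In other words (with 'interval' standing in for the timer period):

	/* Broken: rearms relative to the stale expiry time and can loop
	 * in the past when the callback runs late. */
	hrtimer_forward(timer, hrtimer_get_expires(timer), interval);

	/* Correct: rearms relative to the current time. */
	hrtimer_forward_now(timer, interval);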

The following series addresses this:

1) Add a debug mechanism to the hrtimer expiry loop

2) Convert all hrtimer_forward() usage outside of kernel/time/ to
   use hrtimer_forward_now().

3) Confine hrtimer_forward() to kernel/time/ core code.

The mac80211_hwsim patch has already been picked up by the wireless
maintainer and all other patches which affect usage outside the core code
can be picked up by the relevant subsystems. If a maintainer wants me to
pick a particular patch up, please let me know.

The last patch which confines hrtimer_forward() will be postponed until all
other patches have been merged into Linus tree.

The series is also available from git:

git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git hrtimer

Thanks,

tglx
---
 drivers/gpu/drm/i915/i915_pmu.c|2 -
 drivers/net/wireless/mac80211_hwsim.c  |4 +-
 drivers/net/wwan/iosm/iosm_ipc_imem.c  |4 +-
 drivers/power/reset/ltc2952-poweroff.c |4 --
 include/linux/hrtimer.h|   26 -
 include/linux/posix-timers.h   |3 ++
 kernel/signal.c|   14 +
 kernel/time/hrtimer.c  |   48 -
 kernel/time/itimer.c   |   13 
 kernel/time/posix-timers.c |   42 +++-
 kernel/time/tick-internal.h|1 
 net/can/bcm.c  |2 -
 sound/drivers/pcsp/pcsp_lib.c  |2 -
 13 files changed, 92 insertions(+), 73 deletions(-)



[patch 07/11] drm/i915/pmu: Use hrtimer_forward_now()

2021-09-23 Thread Thomas Gleixner
hrtimer_forward() is about to be removed from the public
interfaces. Replace it with hrtimer_forward_now() which provides the same
functionality.

Signed-off-by: Thomas Gleixner 
Cc: David Airlie 
Cc: intel-...@lists.freedesktop.org
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: dri-devel@lists.freedesktop.org
Cc: Daniel Vetter 
Cc: Rodrigo Vivi 
---
 drivers/gpu/drm/i915/i915_pmu.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -439,7 +439,7 @@ static enum hrtimer_restart i915_sample(
engines_sample(gt, period_ns);
frequency_sample(gt, period_ns);
 
-   hrtimer_forward(hrtimer, now, ns_to_ktime(PERIOD));
+   hrtimer_forward_now(hrtimer, ns_to_ktime(PERIOD));
 
return HRTIMER_RESTART;
 }



[patch 6/7] drm/i915: Replace io_mapping_map_atomic_wc()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

None of these mappings requires the side effect of disabling pagefaults and
preemption.

Use io_mapping_map_local_wc() instead, and clean up gtt_user_read() and
gtt_user_write() to use a plain copy_from_user() as the local maps are not
disabling pagefaults.

Signed-off-by: Thomas Gleixner 
Cc: Jani Nikula 
Cc: Joonas Lahtinen 
Cc: Rodrigo Vivi 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Chris Wilson 
Cc: intel-...@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c |7 +---
 drivers/gpu/drm/i915/i915_gem.c|   40 -
 drivers/gpu/drm/i915/selftests/i915_gem.c  |4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c  |8 ++---
 4 files changed, 22 insertions(+), 37 deletions(-)

--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1080,7 +1080,7 @@ static void reloc_cache_reset(struct rel
struct i915_ggtt *ggtt = cache_to_ggtt(cache);
 
intel_gt_flush_ggtt_writes(ggtt->vm.gt);
-   io_mapping_unmap_atomic((void __iomem *)vaddr);
+   io_mapping_unmap_local((void __iomem *)vaddr);
 
if (drm_mm_node_allocated(>node)) {
ggtt->vm.clear_range(>vm,
@@ -1146,7 +1146,7 @@ static void *reloc_iomap(struct drm_i915
 
if (cache->vaddr) {
intel_gt_flush_ggtt_writes(ggtt->vm.gt);
-   io_mapping_unmap_atomic((void __force __iomem *) 
unmask_page(cache->vaddr));
+   io_mapping_unmap_local((void __force __iomem *) 
unmask_page(cache->vaddr));
} else {
struct i915_vma *vma;
int err;
@@ -1194,8 +1194,7 @@ static void *reloc_iomap(struct drm_i915
offset += page << PAGE_SHIFT;
}
 
-   vaddr = (void __force *)io_mapping_map_atomic_wc(>iomap,
-offset);
+   vaddr = (void __force *)io_mapping_map_local_wc(>iomap, offset);
cache->page = page;
cache->vaddr = (unsigned long)vaddr;
 
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -253,22 +253,15 @@ gtt_user_read(struct io_mapping *mapping
  char __user *user_data, int length)
 {
void __iomem *vaddr;
-   unsigned long unwritten;
+   bool fail = false;
 
/* We can use the cpu mem copy function because this is X86. */
-   vaddr = io_mapping_map_atomic_wc(mapping, base);
-   unwritten = __copy_to_user_inatomic(user_data,
-   (void __force *)vaddr + offset,
-   length);
-   io_mapping_unmap_atomic(vaddr);
-   if (unwritten) {
-   vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
-   unwritten = copy_to_user(user_data,
-(void __force *)vaddr + offset,
-length);
-   io_mapping_unmap(vaddr);
-   }
-   return unwritten;
+   vaddr = io_mapping_map_local_wc(mapping, base);
+   if (copy_to_user(user_data, (void __force *)vaddr + offset, length))
+   fail = true;
+   io_mapping_unmap_local(vaddr);
+
+   return fail;
 }
 
 static int
@@ -437,21 +430,14 @@ ggtt_write(struct io_mapping *mapping,
   char __user *user_data, int length)
 {
void __iomem *vaddr;
-   unsigned long unwritten;
+   bool fail = false;
 
/* We can use the cpu mem copy function because this is X86. */
-   vaddr = io_mapping_map_atomic_wc(mapping, base);
-   unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + 
offset,
- user_data, length);
-   io_mapping_unmap_atomic(vaddr);
-   if (unwritten) {
-   vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE);
-   unwritten = copy_from_user((void __force *)vaddr + offset,
-  user_data, length);
-   io_mapping_unmap(vaddr);
-   }
-
-   return unwritten;
+   vaddr = io_mapping_map_local_wc(mapping, base);
+   if (copy_from_user((void __force *)vaddr + offset, user_data, length))
+   fail = true;
+   io_mapping_unmap_local(vaddr);
+   return fail;
 }
 
 /**
--- a/drivers/gpu/drm/i915/selftests/i915_gem.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem.c
@@ -58,12 +58,12 @@ static void trash_stolen(struct drm_i915
 
ggtt->vm.insert_page(>vm, dma, slot, I915_CACHE_NONE, 0);
 
-   s = io_mapping_map_atomic_wc(>iomap, slot);
+   s = io_mapping_map_local_wc(>iomap, slot);
for (x = 0; x < PAGE_SIZE / sizeof(u32); x++) {
prng = next_pseudo_r

[patch 4/7] drm/qxl: Replace io_mapping_map_atomic_wc()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

None of these mappings requires the side effect of disabling pagefaults and
preemption.

Use io_mapping_map_local_wc() instead, rename the related functions
accordingly and clean up qxl_process_single_command() to use a plain
copy_from_user() as the local maps are not disabling pagefaults.

Signed-off-by: Thomas Gleixner 
Cc: David Airlie 
Cc: Gerd Hoffmann 
Cc: Daniel Vetter 
Cc: virtualizat...@lists.linux-foundation.org
Cc: spice-de...@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/qxl/qxl_image.c   |   18 +-
 drivers/gpu/drm/qxl/qxl_ioctl.c   |   27 +--
 drivers/gpu/drm/qxl/qxl_object.c  |   12 ++--
 drivers/gpu/drm/qxl/qxl_object.h  |4 ++--
 drivers/gpu/drm/qxl/qxl_release.c |4 ++--
 5 files changed, 32 insertions(+), 33 deletions(-)

--- a/drivers/gpu/drm/qxl/qxl_image.c
+++ b/drivers/gpu/drm/qxl/qxl_image.c
@@ -124,12 +124,12 @@ qxl_image_init_helper(struct qxl_device
  wrong (check the bitmaps are sent correctly
  first) */
 
-   ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 0);
+   ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 0);
chunk = ptr;
chunk->data_size = height * chunk_stride;
chunk->prev_chunk = 0;
chunk->next_chunk = 0;
-   qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr);
 
{
void *k_data, *i_data;
@@ -143,7 +143,7 @@ qxl_image_init_helper(struct qxl_device
i_data = (void *)data;
 
while (remain > 0) {
-   ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 
page << PAGE_SHIFT);
+   ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 
page << PAGE_SHIFT);
 
if (page == 0) {
chunk = ptr;
@@ -157,7 +157,7 @@ qxl_image_init_helper(struct qxl_device
 
memcpy(k_data, i_data, size);
 
-   qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr);
i_data += size;
remain -= size;
page++;
@@ -175,10 +175,10 @@ qxl_image_init_helper(struct qxl_device
page_offset = 
offset_in_page(out_offset);
size = min((int)(PAGE_SIZE - 
page_offset), remain);
 
-   ptr = qxl_bo_kmap_atomic_page(qdev, 
chunk_bo, page_base);
+   ptr = qxl_bo_kmap_local_page(qdev, 
chunk_bo, page_base);
k_data = ptr + page_offset;
memcpy(k_data, i_data, size);
-   qxl_bo_kunmap_atomic_page(qdev, 
chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, 
chunk_bo, ptr);
remain -= size;
i_data += size;
out_offset += size;
@@ -189,7 +189,7 @@ qxl_image_init_helper(struct qxl_device
qxl_bo_kunmap(chunk_bo);
 
image_bo = dimage->bo;
-   ptr = qxl_bo_kmap_atomic_page(qdev, image_bo, 0);
+   ptr = qxl_bo_kmap_local_page(qdev, image_bo, 0);
image = ptr;
 
image->descriptor.id = 0;
@@ -212,7 +212,7 @@ qxl_image_init_helper(struct qxl_device
break;
default:
DRM_ERROR("unsupported image bit depth\n");
-   qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, image_bo, ptr);
return -EINVAL;
}
image->u.bitmap.flags = QXL_BITMAP_TOP_DOWN;
@@ -222,7 +222,7 @@ qxl_image_init_helper(struct qxl_device
image->u.bitmap.palette = 0;
image->u.bitmap.data = qxl_bo_physical_address(qdev, chunk_bo, 0);
 
-   qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, image_bo, ptr);
 
return 0;
 }
--- a/drivers/gpu/drm/qxl/qxl_ioctl.c
+++ b/drivers/gpu/drm/qxl/qxl_ioctl.c
@@ -89,11 +89,11 @@ apply_reloc(struct qxl_device *qdev, str
 {
void *reloc_page;
 
-   reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, 
info->dst_offset & PAGE_MASK);
+   reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, 
info->dst_offset & PAGE_MASK);
*(uint64_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = 
qxl_bo_physical_address(qdev,

  info->src_bo,

[patch 1/7] drm/ttm: Replace kmap_atomic() usage

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

There is no reason to disable pagefaults and preemption as a side effect of
kmap_atomic_prot().

Use kmap_local_page_prot() instead and document the reasoning for the
mapping usage with the given pgprot.

Remove the NULL pointer check for the map. These functions return a valid
address for valid pages and the return was bogus anyway as it would have
left preemption and pagefaults disabled.

Signed-off-by: Thomas Gleixner 
Cc: Christian Koenig 
Cc: Huang Rui 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/ttm/ttm_bo_util.c |   20 
 1 file changed, 12 insertions(+), 8 deletions(-)

--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -181,13 +181,15 @@ static int ttm_copy_io_ttm_page(struct t
return -ENOMEM;
 
src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
-   dst = kmap_atomic_prot(d, prot);
-   if (!dst)
-   return -ENOMEM;
+   /*
+* Ensure that a highmem page is mapped with the correct
+* pgprot. For non highmem the mapping is already there.
+*/
+   dst = kmap_local_page_prot(d, prot);
 
memcpy_fromio(dst, src, PAGE_SIZE);
 
-   kunmap_atomic(dst);
+   kunmap_local(dst);
 
return 0;
 }
@@ -203,13 +205,15 @@ static int ttm_copy_ttm_io_page(struct t
return -ENOMEM;
 
dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
-   src = kmap_atomic_prot(s, prot);
-   if (!src)
-   return -ENOMEM;
+   /*
+* Ensure that a highmem page is mapped with the correct
+* pgprot. For non highmem the mapping is already there.
+*/
+   src = kmap_local_page_prot(s, prot);
 
memcpy_toio(dst, src, PAGE_SIZE);
 
-   kunmap_atomic(src);
+   kunmap_local(src);
 
return 0;
 }




[patch 7/7] io-mapping: Remove io_mapping_map_atomic_wc()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

No more users. Get rid of it and remove the traces in documentation.

Signed-off-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: linux...@kvack.org
---
 Documentation/driver-api/io-mapping.rst |   22 +---
 include/linux/io-mapping.h  |   42 +---
 2 files changed, 9 insertions(+), 55 deletions(-)

--- a/Documentation/driver-api/io-mapping.rst
+++ b/Documentation/driver-api/io-mapping.rst
@@ -21,19 +21,15 @@ mappable, while 'size' indicates how lar
 enable. Both are in bytes.
 
 This _wc variant provides a mapping which may only be used with
-io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or
-io_mapping_map_wc().
+io_mapping_map_local_wc() or io_mapping_map_wc().
 
 With this mapping object, individual pages can be mapped either temporarily
 or long term, depending on the requirements. Of course, temporary maps are
-more efficient. They come in two flavours::
+more efficient.
 
void *io_mapping_map_local_wc(struct io_mapping *mapping,
  unsigned long offset)
 
-   void *io_mapping_map_atomic_wc(struct io_mapping *mapping,
-  unsigned long offset)
-
 'offset' is the offset within the defined mapping region.  Accessing
 addresses beyond the region specified in the creation function yields
 undefined results. Using an offset which is not page aligned yields an
@@ -50,9 +46,6 @@ io_mapping_map_local_wc() has a side eff
 migration to make the mapping code work. No caller can rely on this side
 effect.
 
-io_mapping_map_atomic_wc() has the side effect of disabling preemption and
-pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead.
-
 Nested mappings need to be undone in reverse order because the mapping
 code uses a stack for keeping track of them::
 
@@ -65,11 +58,10 @@ Nested mappings need to be undone in rev
 The mappings are released with::
 
void io_mapping_unmap_local(void *vaddr)
-   void io_mapping_unmap_atomic(void *vaddr)
 
-'vaddr' must be the value returned by the last io_mapping_map_local_wc() or
-io_mapping_map_atomic_wc() call. This unmaps the specified mapping and
-undoes the side effects of the mapping functions.
+'vaddr' must be the value returned by the last io_mapping_map_local_wc()
+call. This unmaps the specified mapping and undoes eventual side effects of
+the mapping function.
 
 If you need to sleep while holding a mapping, you can use the regular
 variant, although this may be significantly slower::
@@ -77,8 +69,8 @@ If you need to sleep while holding a map
void *io_mapping_map_wc(struct io_mapping *mapping,
unsigned long offset)
 
-This works like io_mapping_map_atomic/local_wc() except it has no side
-effects and the pointer is globaly visible.
+This works like io_mapping_map_local_wc() except it has no side effects and
+the pointer is globaly visible.
 
 The mappings are released with::
 
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -60,28 +60,7 @@ io_mapping_fini(struct io_mapping *mappi
iomap_free(mapping->base, mapping->size);
 }
 
-/* Atomic map/unmap */
-static inline void __iomem *
-io_mapping_map_atomic_wc(struct io_mapping *mapping,
-unsigned long offset)
-{
-   resource_size_t phys_addr;
-
-   BUG_ON(offset >= mapping->size);
-   phys_addr = mapping->base + offset;
-   preempt_disable();
-   pagefault_disable();
-   return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
-}
-
-static inline void
-io_mapping_unmap_atomic(void __iomem *vaddr)
-{
-   kunmap_local_indexed((void __force *)vaddr);
-   pagefault_enable();
-   preempt_enable();
-}
-
+/* Temporary mappings which are only valid in the current context */
 static inline void __iomem *
 io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset)
 {
@@ -163,24 +142,7 @@ io_mapping_unmap(void __iomem *vaddr)
 {
 }
 
-/* Atomic map/unmap */
-static inline void __iomem *
-io_mapping_map_atomic_wc(struct io_mapping *mapping,
-unsigned long offset)
-{
-   preempt_disable();
-   pagefault_disable();
-   return io_mapping_map_wc(mapping, offset, PAGE_SIZE);
-}
-
-static inline void
-io_mapping_unmap_atomic(void __iomem *vaddr)
-{
-   io_mapping_unmap(vaddr);
-   pagefault_enable();
-   preempt_enable();
-}
-
+/* Temporary mappings which are only valid in the current context */
 static inline void __iomem *
 io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset)
 {



[patch 3/7] highmem: Remove kmap_atomic_prot()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

No more users.

Signed-off-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: linux...@kvack.org
---
 include/linux/highmem-internal.h |   14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -88,16 +88,11 @@ static inline void __kunmap_local(void *
kunmap_local_indexed(vaddr);
 }
 
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic(struct page *page)
 {
preempt_disable();
pagefault_disable();
-   return __kmap_local_page_prot(page, prot);
-}
-
-static inline void *kmap_atomic(struct page *page)
-{
-   return kmap_atomic_prot(page, kmap_prot);
+   return __kmap_local_page_prot(page, kmap_prot);
 }
 
 static inline void *kmap_atomic_pfn(unsigned long pfn)
@@ -184,11 +179,6 @@ static inline void *kmap_atomic(struct p
return page_address(page);
 }
 
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
-{
-   return kmap_atomic(page);
-}
-
 static inline void *kmap_atomic_pfn(unsigned long pfn)
 {
return kmap_atomic(pfn_to_page(pfn));



[patch 5/7] drm/nouveau/device: Replace io_mapping_map_atomic_wc()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

Neither fbmem_peek() nor fbmem_poke() requires disabling pagefaults and
preemption as a side effect of io_mapping_map_atomic_wc().

Use io_mapping_map_local_wc() instead.

Signed-off-by: Thomas Gleixner 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: dri-devel@lists.freedesktop.org
Cc: nouv...@lists.freedesktop.org
---
 drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h
@@ -60,19 +60,19 @@ fbmem_fini(struct io_mapping *fb)
 static inline u32
 fbmem_peek(struct io_mapping *fb, u32 off)
 {
-   u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK);
+   u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK);
u32 val = ioread32(p + (off & ~PAGE_MASK));
-   io_mapping_unmap_atomic(p);
+   io_mapping_unmap_local(p);
return val;
 }
 
 static inline void
 fbmem_poke(struct io_mapping *fb, u32 off, u32 val)
 {
-   u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK);
+   u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK);
iowrite32(val, p + (off & ~PAGE_MASK));
wmb();
-   io_mapping_unmap_atomic(p);
+   io_mapping_unmap_local(p);
 }
 
 static inline bool




[patch 0/7] drm, highmem: Cleanup io/kmap_atomic*() usage

2021-03-05 Thread Thomas Gleixner
None of the DRM usage sites of temporary mappings requires the side
effects of io/kmap_atomic(), i.e. preemption and pagefault disable.

Replace them with the io/kmap_local() variants, simplify the
copy_to/from_user() error handling and remove the atomic variants.
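
The conversion is mechanical for the most part, along the lines of this
generic pattern (sketch, not a quote from one of the patches):

	/* Before: disables preemption and pagefaults as a side effect */
	vaddr = kmap_atomic(page);
	memcpy(dst, vaddr + offset, len);
	kunmap_atomic(vaddr);

	/* After: mapping is only valid in the current context, no
	 * implicit side effects */
	vaddr = kmap_local_page(page);
	memcpy(dst, vaddr + offset, len);
	kunmap_local(vaddr);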

Thanks,

tglx
---
 Documentation/driver-api/io-mapping.rst |   22 +++---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c  |7 +--
 drivers/gpu/drm/i915/i915_gem.c |   40 ++-
 drivers/gpu/drm/i915/selftests/i915_gem.c   |4 -
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c   |8 +--
 drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h |8 +--
 drivers/gpu/drm/qxl/qxl_image.c |   18 
 drivers/gpu/drm/qxl/qxl_ioctl.c |   27 ++--
 drivers/gpu/drm/qxl/qxl_object.c|   12 ++---
 drivers/gpu/drm/qxl/qxl_object.h|4 -
 drivers/gpu/drm/qxl/qxl_release.c   |4 -
 drivers/gpu/drm/ttm/ttm_bo_util.c   |   20 +
 drivers/gpu/drm/vmwgfx/vmwgfx_blit.c|   30 +-
 include/linux/highmem-internal.h|   14 --
 include/linux/io-mapping.h  |   42 
 15 files changed, 93 insertions(+), 167 deletions(-)


[patch 2/7] drm/vmgfx: Replace kmap_atomic()

2021-03-05 Thread Thomas Gleixner
From: Thomas Gleixner 

There is no reason to disable pagefaults and preemption as a side effect of
kmap_atomic_prot().

Use kmap_local_page_prot() instead and document the reasoning for the
mapping usage with the given pgprot.

Remove the NULL pointer check for the map. These functions return a valid
address for valid pages and the return was bogus anyway as it would have
left preemption and pagefaults disabled.

Signed-off-by: Thomas Gleixner 
Cc: VMware Graphics 
Cc: Roland Scheidegger 
Cc: Zack Rusin 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/vmwgfx/vmwgfx_blit.c |   30 --
 1 file changed, 12 insertions(+), 18 deletions(-)

--- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
@@ -375,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct v
copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset);
 
if (unmap_src) {
-   kunmap_atomic(d->src_addr);
+   kunmap_local(d->src_addr);
d->src_addr = NULL;
}
 
if (unmap_dst) {
-   kunmap_atomic(d->dst_addr);
+   kunmap_local(d->dst_addr);
d->dst_addr = NULL;
}
 
@@ -388,12 +388,8 @@ static int vmw_bo_cpu_blit_line(struct v
if (WARN_ON_ONCE(dst_page >= d->dst_num_pages))
return -EINVAL;
 
-   d->dst_addr =
-   kmap_atomic_prot(d->dst_pages[dst_page],
-d->dst_prot);
-   if (!d->dst_addr)
-   return -ENOMEM;
-
+   d->dst_addr = 
kmap_local_page_prot(d->dst_pages[dst_page],
+  d->dst_prot);
d->mapped_dst = dst_page;
}
 
@@ -401,12 +397,8 @@ static int vmw_bo_cpu_blit_line(struct v
if (WARN_ON_ONCE(src_page >= d->src_num_pages))
return -EINVAL;
 
-   d->src_addr =
-   kmap_atomic_prot(d->src_pages[src_page],
-d->src_prot);
-   if (!d->src_addr)
-   return -ENOMEM;
-
+   d->src_addr = 
kmap_local_page_prot(d->src_pages[src_page],
+  d->src_prot);
d->mapped_src = src_page;
}
diff->do_cpy(diff, d->dst_addr + dst_page_offset,
@@ -436,8 +428,10 @@ static int vmw_bo_cpu_blit_line(struct v
  *
  * Performs a CPU blit from one buffer object to another avoiding a full
  * bo vmap which may exhaust- or fragment vmalloc space.
- * On supported architectures (x86), we're using kmap_atomic which avoids
- * cross-processor TLB- and cache flushes and may, on non-HIGHMEM systems
+ *
+ * On supported architectures (x86), we're using kmap_local_prot() which
+ * avoids cross-processor TLB- and cache flushes. kmap_local_prot() will
+ * either map a highmem page with the proper pgprot on HIGHMEM=y systems or
  * reference already set-up mappings.
  *
  * Neither of the buffer objects may be placed in PCI memory
@@ -500,9 +494,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_ob
}
 out:
if (d.src_addr)
-   kunmap_atomic(d.src_addr);
+   kunmap_local(d.src_addr);
if (d.dst_addr)
-   kunmap_atomic(d.dst_addr);
+   kunmap_local(d.dst_addr);
 
return ret;
 }




Re: [patch 02/30] genirq: Move status flag checks to core

2021-01-11 Thread Thomas Gleixner
On Sun, Dec 27 2020 at 11:20, Guenter Roeck wrote:
> On Thu, Dec 10, 2020 at 08:25:38PM +0100, Thomas Gleixner wrote:
> Yes, but that means that irq_check_status_bit() may be called from modules,
> but it is not exported, resulting in build errors such as the following.
>
> arm64:allmodconfig:
>
> ERROR: modpost: "irq_check_status_bit" [drivers/perf/arm_spe_pmu.ko] 
> undefined!

Duh. Yes, that lacks an export obviously.
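
I.e. presumably just:

	EXPORT_SYMBOL_GPL(irq_check_status_bit);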

Thanks,

tglx


Re: [patch 03/30] genirq: Move irq_set_lockdep_class() to core

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 22:08, Thomas Gleixner wrote:

> On Fri, Dec 11 2020 at 19:53, Andy Shevchenko wrote:
>
>> On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner  wrote:
>>>
>>> irq_set_lockdep_class() is used from modules and requires irq_to_desc() to
>>> be exported. Move it into the core code which lifts another requirement for
>>> the export.
>>
>> ...
>>
>>> +   if (IS_ENABLED(CONFIG_LOCKDEP))
>>> +   __irq_set_lockdep_class(irq, lock_class, request_class);
>
> You are right. Let me fix that.

No. I have to correct myself. You're wrong.

The inline is evaluated in the compilation units which include that
header, and because the function declaration is unconditional the
compiler is happy.

Now the optimizer stage makes the whole thing a NOOP if CONFIG_LOCKDEP=n
and thereby drops the reference to the function which makes it not
required for linking.

So in the file where the function is implemented:

#ifdef CONFIG_LOCKDEP
void __irq_set_lockdep_class()
{
}
#endif

The whole block is either discarded because CONFIG_LOCKDEP is not
defined, or compiled if it is defined, which makes it available to the
linker.

And in the latter case the optimizer keeps the call in the inline (it
optimizes the condition away because it's always true).

So in both cases the compiler and the linker are happy and everything
works as expected.
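
For reference, the working arrangement in the header looks roughly like
this (simplified sketch of what is described above):

void __irq_set_lockdep_class(unsigned int irq,
			     struct lock_class_key *lock_class,
			     struct lock_class_key *request_class);

static inline void irq_set_lockdep_class(unsigned int irq,
					 struct lock_class_key *lock_class,
					 struct lock_class_key *request_class)
{
	if (IS_ENABLED(CONFIG_LOCKDEP))
		__irq_set_lockdep_class(irq, lock_class, request_class);
}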

It would fail if the header file had the following:

#ifdef CONFIG_LOCKDEP
void __irq_set_lockdep_class();
#endif

Because then it would complain about the missing function prototype when
it evaluates the inline.

Thanks,

tglx


Re: [patch 03/30] genirq: Move irq_set_lockdep_class() to core

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 19:53, Andy Shevchenko wrote:

> On Thu, Dec 10, 2020 at 10:14 PM Thomas Gleixner  wrote:
>>
>> irq_set_lockdep_class() is used from modules and requires irq_to_desc() to
>> be exported. Move it into the core code which lifts another requirement for
>> the export.
>
> ...
>
>> +   if (IS_ENABLED(CONFIG_LOCKDEP))
>> +   __irq_set_lockdep_class(irq, lock_class, request_class);

You are right. Let me fix that.


Re: [patch 27/30] xen/events: Only force affinity mask for percpu interrupts

2020-12-14 Thread Thomas Gleixner
Andrew,

On Fri, Dec 11 2020 at 22:21, Andrew Cooper wrote:
> On 11/12/2020 21:27, Thomas Gleixner wrote:
>> It's not any different from the hardware example at least not as far as
>> I understood the code.
>
> Xen's event channels do have a couple of quirks.

Why am I not surprised?

> Binding an event channel always results in one spurious event being
> delivered.  This is to cover notifications which can get lost during the
> bidirectional setup, or re-setups in certain configurations.
>
> Binding an interdomain or pirq event channel always defaults to vCPU0. 
> There is no way to atomically set the affinity while binding.  I believe
> the API predates SMP guest support in Xen, and noone has fixed it up
> since.

That's fine. I'm not changing that.

What I'm changing is the unwanted and unnecessary overwriting of the
actual affinity mask.

We have a similar issue on real hardware where we can only target _one_
CPU and not all CPUs in the affinity mask. So we still can preserve the
(user) requested mask and just affine it to one CPU which is reflected
in the effective affinity mask. This the right thing to do for two
reasons:

   1) It allows proper interrupt distribution

   2) It does not break (user) requested affinity when the effective
  target CPU goes offline and the affinity mask still contains
  online CPUs. If you overwrite it you lost track of the requested
  broader mask.
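
The usual pattern for such a chip looks roughly like this (illustrative
sketch, not the actual XEN callback):

static int xxx_set_affinity(struct irq_data *d, const struct cpumask *mask,
			    bool force)
{
	unsigned int cpu = cpumask_any_and(mask, cpu_online_mask);

	/* Program the hardware / hypervisor for the selected CPU ... */

	/* Record the real target, leave the requested mask alone */
	irq_data_update_effective_affinity(d, cpumask_of(cpu));
	return IRQ_SET_MASK_OK;
}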

> As a consequence, the guest will observe the event raised on vCPU0 as
> part of setting up the event, even if it attempts to set a different
> affinity immediately afterwards.  A little bit of care needs to be taken
> when binding an event channel on vCPUs other than 0, to ensure that the
> callback is safe with respect to any remaining state needing
> initialisation.

That's preserved for all non-percpu interrupts. The percpu variants of
VIRQ and IPIs were already bound to vCPU != 0 before this change.

> Beyond this, there is nothing magic I'm aware of.
>
> We have seen soft lockups before in certain scenarios, simply due to the
> quantity of events hitting vCPU0 before irqbalance gets around to
> spreading the load.  This is why there is an attempt to round-robin the
> userspace event channel affinities by default, but I still don't see why
> this would need custom affinity logic itself.

Just the previous attempt makes no sense for the reasons I outlined in
the changelog. So now with this new spreading mechanics you get the
distribution for all cases:

  1) Post setup using and respecting the default affinity mask which can
 be set as a kernel commandline parameter.

  2) Runtime (user) requested affinity change with a mask which contains
 more than one vCPU. The previous logic always chose the first one
 in the mask.

 So assume userspace affines 4 irqs to a CPU 0-3 and 4 irqs to CPU
 4-7 then 4 irqs end up on CPU0 and 4 on CPU4

 The new algorithm which is similar to what we have on x86 (minus
 the vector space limitation) picks the CPU which has the least
 number of channels affine to it at that moment. If e.g. all 8 CPUs
 have the same number of vectors before that change then in the
 example above the first 4 are spread to CPU0-3 and the second 4 to
 CPU4-7
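
The selection step is essentially (sketch; channels_on_cpu() stands in
for the per-CPU channel count which the series introduces):

static unsigned int select_target_cpu(const struct cpumask *mask)
{
	unsigned int cpu, best = UINT_MAX, min_chans = UINT_MAX;

	for_each_cpu_and(cpu, mask, cpu_online_mask) {
		if (channels_on_cpu(cpu) < min_chans) {
			min_chans = channels_on_cpu(cpu);
			best = cpu;
		}
	}
	/* Fall back to CPU0 if the mask contains no online CPU */
	return best == UINT_MAX ? 0 : best;
}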

Thanks,

tglx
   


Re: [patch 27/30] xen/events: Only force affinity mask for percpu interrupts

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
> On 11.12.20 00:20, boris.ostrov...@oracle.com wrote:
>> 
>> On 12/10/20 2:26 PM, Thomas Gleixner wrote:
>>> All event channel setups bind the interrupt on CPU0 or the target CPU for
>>> percpu interrupts and overwrite the affinity mask with the corresponding
>>> cpumask. That does not make sense.
>>>
>>> The XEN implementation of irqchip::irq_set_affinity() already picks a
>>> single target CPU out of the affinity mask and the actual target is stored
>>> in the effective CPU mask, so destroying the user chosen affinity mask
>>> which might contain more than one CPU is wrong.
>>>
>>> Change the implementation so that the channel is bound to CPU0 at the XEN
>>> level and leave the affinity mask alone. At startup of the interrupt
>>> affinity will be assigned out of the affinity mask and the XEN binding will
>>> be updated.
>> 
>> 
>> If that's the case then I wonder whether we need this call at all and 
>> instead bind at startup time.
>
> After some discussion with Thomas on IRC and xen-devel archaeology the
> result is: this will be needed especially for systems running on a
> single vcpu (e.g. small guests), as the .irq_set_affinity() callback
> won't be called in this case when starting the irq.

That's right, but not limited to ARM. The same problem exists on x86 UP.
So yes, the call makes sense, but the changelog is not really useful.
Let me add a comment to this.

Thanks,

tglx


Re: [patch 27/30] xen/events: Only force affinity mask for percpu interrupts

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 09:29, boris ostrovsky wrote:

> On 12/11/20 7:37 AM, Thomas Gleixner wrote:
>> On Fri, Dec 11 2020 at 13:10, Jürgen Groß wrote:
>>> On 11.12.20 00:20, boris.ostrov...@oracle.com wrote:
>>>> On 12/10/20 2:26 PM, Thomas Gleixner wrote:
>>>>> Change the implementation so that the channel is bound to CPU0 at the XEN
>>>>> level and leave the affinity mask alone. At startup of the interrupt
>>>>> affinity will be assigned out of the affinity mask and the XEN binding 
>>>>> will
>>>>> be updated.
>>>>
>>>> If that's the case then I wonder whether we need this call at all and 
>>>> instead bind at startup time.
>>> After some discussion with Thomas on IRC and xen-devel archaeology the
>>> result is: this will be needed especially for systems running on a
>>> single vcpu (e.g. small guests), as the .irq_set_affinity() callback
>>> won't be called in this case when starting the irq.
>
> On UP are we not then going to end up with an empty affinity mask? Or
> are we guaranteed to have it set to 1 by interrupt generic code?

An UP kernel does not ever look on the affinity mask. The
chip::irq_set_affinity() callback is not invoked so the mask is
irrelevant.

A SMP kernel on a UP machine sets CPU0 in the mask so all is good.

> This is actually why I brought this up in the first place --- a
> potential mismatch between the affinity mask and Xen-specific data
> (e.g. info->cpu and then protocol-specific data in event channel
> code). Even if they are re-synchronized later, at startup time (for
> SMP).

Which is not a problem either. The affinity mask is only relevant for
setting the affinity, but it's not relevant for delivery and never can
be.

> I don't see anything that would cause a problem right now but I worry
> that this inconsistency may come up at some point.

As long as the affinity mask does not become part of the event channel
magic, this should never matter.

Look at it from hardware:

interrupt is affine to CPU0

 CPU0 runs:
 
 set_affinity(CPU0 -> CPU1)
local_irq_disable()

 --> interrupt is raised in hardware and pending on CPU0

irq hardware is reconfigured to be affine to CPU1

local_irq_enable()

 --> interrupt is handled on CPU0

the next interrupt will be raised on CPU1

So info->cpu which is registered via the hypercall binds the 'hardware
delivery' and whenever the new affinity is written it is rebound to some
other CPU and the next interrupt is then raised on this other CPU.

It's not any different from the hardware example at least not as far as
I understood the code.

Thanks,

tglx


Re: [patch 14/30] drm/i915/pmu: Replace open coded kstat_irqs() copy

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 10:13, Tvrtko Ursulin wrote:
> On 10/12/2020 19:25, Thomas Gleixner wrote:

>> 
>> Aside of that the count is per interrupt line and therefore takes
>> interrupts from other devices into account which share the interrupt line
>> and are not handled by the graphics driver.
>> 
>> Replace it with a pmu private count which only counts interrupts which
>> originate from the graphics card.
>> 
>> To avoid atomics or heuristics of some sort make the counter field
>> 'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and
>> postprocessing can easily deal with the occasional wraparound.
>
> After my failed hasty sketch from last night I had a different one which 
> was kind of heuristics based (re-reading the upper dword and retrying if 
> it changed on 32-bit).

The problem is that there will be two seperate modifications for the low
and high word. Several ways how the compiler can translate this, but the
problem is the same for all of them:

CPU 0   CPU 1
load low
load high
add  low, 1
addc high, 0
store low   load high
--> NMI load low
load high and compare
store high

You can't catch that. If this really becomes an issue you need a
sequence counter around it.
  

> But you are right - it is okay to at least start 
> like this today and if later there is a need we can either do that or 
> deal with wrap at PMU read time.

Right.

>> +/*
>> + * Interrupt statistic for PMU. Increments the counter only if the
>> + * interrupt originated from the GPU so interrupts from a device which
>> + * shares the interrupt line are not accounted.
>> + */
>> +static inline void pmu_irq_stats(struct drm_i915_private *priv,
>
> We never use priv as a local name, it should be either i915 or
> dev_priv.

Sure, will fix.

>> +/*
>> + * A clever compiler translates that into INC. A not so clever one
>> + * should at least prevent store tearing.
>> + */
>> +WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1);
>
> Curious, probably more educational for me - given x86_32 and x86_64, and 
> the context of it getting called, what is the difference from just doing 
> irq_count++?

Several reasons:

1) The compiler can pretty much do what it wants with cnt++
   including tearing and whatever. https://lwn.net/Articles/816850/
   for the full set of insanities.

   Not really a problem here, but

2) It's annotating the reader and the writer side and documenting
   that this is subject to concurrency

3) It will prevent KCSAN from complaining about the data race,
   i.e. concurrent modification while reading.
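
The reader side then wants the matching annotation, i.e. something like
(illustrative, not a quote from the patch):

	u64 irqs = READ_ONCE(i915->pmu.irq_count);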

Thanks,

tglx

>> --- a/drivers/gpu/drm/i915/i915_pmu.c
>> +++ b/drivers/gpu/drm/i915/i915_pmu.c
>> @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample(
>>  return HRTIMER_RESTART;
>>   }
>
> In this file you can also drop the #include  line.

Indeed.

Thanks,

tglx


RE: [patch 14/30] drm/i915/pmu: Replace open coded kstat_irqs() copy

2020-12-14 Thread Thomas Gleixner
On Fri, Dec 11 2020 at 14:19, David Laight wrote:
> From: Thomas Gleixner
>> You can't catch that. If this really becomes an issue you need a
>> sequence counter around it.
>
> Or just two copies of the high word.
> Provided the accesses are sequenced:
> writer:
>   load high:low
>   add small_value,high:low
>   store high
>   store low
>   store high_copy
> reader:
>   load high_copy
>   load low
>   load high
>   if (high != high_copy)
>   low = 0;

And low = 0 is solving what? You need to loop back and retry until it's
consistent and then it's nothing else than an open coded sequence count.

Thanks,

tglx


[patch 16/30] mfd: ab8500-debugfs: Remove the racy fiddling with irq_desc

2020-12-11 Thread Thomas Gleixner
First of all drivers have absolutely no business to dig into the internals
of an irq descriptor. That's core code and subject to change. All of this
information is readily available to /proc/interrupts in a safe and race
free way.

Remove the inspection code which is a blatant violation of subsystem
boundaries and racy against concurrent modifications of the interrupt
descriptor.

Print the irq line instead so the information can be looked up in a sane
way in /proc/interrupts.

Signed-off-by: Thomas Gleixner 
Cc: Linus Walleij 
Cc: Lee Jones 
Cc: linux-arm-ker...@lists.infradead.org
---
 drivers/mfd/ab8500-debugfs.c |   16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

--- a/drivers/mfd/ab8500-debugfs.c
+++ b/drivers/mfd/ab8500-debugfs.c
@@ -1513,24 +1513,14 @@ static int ab8500_interrupts_show(struct
 {
int line;
 
-   seq_puts(s, "name: number:  number of: wake:\n");
+   seq_puts(s, "name: number: irq: number of: wake:\n");
 
for (line = 0; line < num_interrupt_lines; line++) {
-   struct irq_desc *desc = irq_to_desc(line + irq_first);
-
-   seq_printf(s, "%3i:  %6i %4i",
+   seq_printf(s, "%3i:  %6i %4i %4i\n",
   line,
+  line + irq_first,
   num_interrupts[line],
   num_wake_interrupts[line]);
-
-   if (desc && desc->name)
-   seq_printf(s, "-%-8s", desc->name);
-   if (desc && desc->action) {
-   struct irqaction *action = desc->action;
-
-   seq_printf(s, "  %s", action->name);
-   while ((action = action->next) != NULL)
-   seq_printf(s, ", %s", action->name);
}
seq_putc(s, '\n');
}



[patch 05/30] genirq: Annotate irq stats data races

2020-12-11 Thread Thomas Gleixner
Both the per cpu stats and the accumulated count are accessed lockless and
can be concurrently modified. That's intentional and the stats are a rough
estimate anyway. Annotate them with data_race().

Signed-off-by: Thomas Gleixner 
---
 kernel/irq/irqdesc.c |4 ++--
 kernel/irq/proc.c|5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -943,10 +943,10 @@ unsigned int kstat_irqs(unsigned int irq
if (!irq_settings_is_per_cpu_devid(desc) &&
!irq_settings_is_per_cpu(desc) &&
!irq_is_nmi(desc))
-   return desc->tot_count;
+   return data_race(desc->tot_count);
 
for_each_possible_cpu(cpu)
-   sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
+   sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu));
return sum;
 }
 
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p,
if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
 
-   if (desc->kstat_irqs)
+   if (desc->kstat_irqs) {
for_each_online_cpu(j)
-   any_count |= *per_cpu_ptr(desc->kstat_irqs, j);
+   any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, 
j));
+   }
 
if ((!desc->action || irq_desc_is_chained(desc)) && !any_count)
goto outsparse;



[patch 02/30] genirq: Move status flag checks to core

2020-12-11 Thread Thomas Gleixner
These checks are used by modules and prevent the removal of the export of
irq_to_desc(). Move the accessor into the core.

Signed-off-by: Thomas Gleixner 
---
 include/linux/irqdesc.h |   17 +
 kernel/irq/manage.c |   17 +
 2 files changed, 22 insertions(+), 12 deletions(-)

--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct
data->chip = chip;
 }
 
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask);
+
 static inline bool irq_balancing_disabled(unsigned int irq)
 {
-   struct irq_desc *desc;
-
-   desc = irq_to_desc(irq);
-   return desc->status_use_accessors & IRQ_NO_BALANCING_MASK;
+   return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK);
 }
 
 static inline bool irq_is_percpu(unsigned int irq)
 {
-   struct irq_desc *desc;
-
-   desc = irq_to_desc(irq);
-   return desc->status_use_accessors & IRQ_PER_CPU;
+   return irq_check_status_bit(irq, IRQ_PER_CPU);
 }
 
 static inline bool irq_is_percpu_devid(unsigned int irq)
 {
-   struct irq_desc *desc;
-
-   desc = irq_to_desc(irq);
-   return desc->status_use_accessors & IRQ_PER_CPU_DEVID;
+   return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID);
 }
 
 static inline void
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2769,3 +2769,23 @@ bool irq_has_action(unsigned int irq)
return res;
 }
 EXPORT_SYMBOL_GPL(irq_has_action);
+
+/**
+ * irq_check_status_bit - Check whether bits in the irq descriptor status are 
set
+ * @irq:   The linux irq number
+ * @bitmask:   The bitmask to evaluate
+ *
+ * Returns: True if one of the bits in @bitmask is set
+ */
+bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
+{
+   struct irq_desc *desc;
+   bool res = false;
+
+   rcu_read_lock();
+   desc = irq_to_desc(irq);
+   if (desc)
+   res = !!(desc->status_use_accessors & bitmask);
+   rcu_read_unlock();
+   return res;
+}



[patch 20/30] net/mlx4: Replace irq_to_desc() abuse

2020-12-11 Thread Thomas Gleixner
No driver has any business with the internals of an interrupt
descriptor. Storing a pointer to it just to use yet another helper at the
actual usage site to retrieve the affinity mask is creative at best. Just
because C does not allow encapsulation does not mean that the kernel has no
limits.

Retrieve a pointer to the affinity mask itself and use that. It's still
using an interface which is usually not for random drivers, but definitely
less hideous than the previous hack.

Signed-off-by: Thomas Gleixner 
Cc: Tariq Toukan 
Cc: "David S. Miller" 
Cc: Jakub Kicinski 
Cc: net...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
---
 drivers/net/ethernet/mellanox/mlx4/en_cq.c   |8 +++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |6 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |3 ++-
 3 files changed, 6 insertions(+), 11 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -90,7 +90,7 @@ int mlx4_en_activate_cq(struct mlx4_en_p
int cq_idx)
 {
struct mlx4_en_dev *mdev = priv->mdev;
-   int err = 0;
+   int irq, err = 0;
int timestamp_en = 0;
bool assigned_eq = false;
 
@@ -116,10 +116,8 @@ int mlx4_en_activate_cq(struct mlx4_en_p
 
assigned_eq = true;
}
-
-   cq->irq_desc =
-   irq_to_desc(mlx4_eq_get_irq(mdev->dev,
-   cq->vector));
+   irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
+   cq->aff_mask = irq_get_affinity_mask(irq);
} else {
/* For TX we use the same irq per
ring we assigned for the RX*/
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -959,8 +959,6 @@ int mlx4_en_poll_rx_cq(struct napi_struc
 
/* If we used up all the quota - we're probably not done yet... */
if (done == budget || !clean_complete) {
-   const struct cpumask *aff;
-   struct irq_data *idata;
int cpu_curr;
 
/* in case we got here because of !clean_complete */
@@ -969,10 +967,8 @@ int mlx4_en_poll_rx_cq(struct napi_struc
INC_PERF_COUNTER(priv->pstats.napi_quota);
 
cpu_curr = smp_processor_id();
-   idata = irq_desc_get_irq_data(cq->irq_desc);
-   aff = irq_data_get_affinity_mask(idata);
 
-   if (likely(cpumask_test_cpu(cpu_curr, aff)))
+   if (likely(cpumask_test_cpu(cpu_curr, cq->aff_mask)))
return budget;
 
/* Current cpu is not according to smp_irq_affinity -
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -46,6 +46,7 @@
 #endif
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -380,7 +381,7 @@ struct mlx4_en_cq {
struct mlx4_cqe *buf;
 #define MLX4_EN_OPCODE_ERROR   0x1e
 
-   struct irq_desc *irq_desc;
+   const struct cpumask *aff_mask;
 };
 
 struct mlx4_en_port_profile {



[patch 29/30] xen/events: Implement irq distribution

2020-12-11 Thread Thomas Gleixner
Keep track of the assignments of event channels to CPUs and select the
online CPU with the least assigned channels in the affinity mask which is
handed to irq_chip::irq_set_affinity() from the core code.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |   72 ++-
 1 file changed, 64 insertions(+), 8 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -96,6 +96,7 @@ struct irq_info {
struct list_head eoi_list;
short refcnt;
u8 spurious_cnt;
+   u8 is_accounted;
enum xen_irq_type type; /* type */
unsigned irq;
evtchn_port_t evtchn;   /* event channel */
@@ -161,6 +162,9 @@ static DEFINE_PER_CPU(int [NR_VIRQS], vi
 /* IRQ <-> IPI mapping */
 static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1};
 
+/* Event channel distribution data */
+static atomic_t channels_on_cpu[NR_CPUS];
+
 static int **evtchn_to_irq;
 #ifdef CONFIG_X86
 static unsigned long *pirq_eoi_map;
@@ -257,6 +261,32 @@ static void set_info_for_irq(unsigned in
irq_set_chip_data(irq, info);
 }
 
+/* Per CPU channel accounting */
+static void channels_on_cpu_dec(struct irq_info *info)
+{
+   if (!info->is_accounted)
+   return;
+
+   info->is_accounted = 0;
+
+   if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+   return;
+
+   WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], -1, 0));
+}
+
+static void channels_on_cpu_inc(struct irq_info *info)
+{
+   if (WARN_ON_ONCE(info->cpu >= nr_cpu_ids))
+   return;
+
+   if (WARN_ON_ONCE(!atomic_add_unless(&channels_on_cpu[info->cpu], 1,
+   INT_MAX)))
+   return;
+
+   info->is_accounted = 1;
+}
+
 /* Constructors for packed IRQ information. */
 static int xen_irq_info_common_setup(struct irq_info *info,
 unsigned irq,
@@ -339,6 +369,7 @@ static void xen_irq_info_cleanup(struct
 {
set_evtchn_to_irq(info->evtchn, -1);
info->evtchn = 0;
+   channels_on_cpu_dec(info);
 }
 
 /*
@@ -449,7 +480,9 @@ static void bind_evtchn_to_cpu(evtchn_po
 
xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);
 
+   channels_on_cpu_dec(info);
info->cpu = cpu;
+   channels_on_cpu_inc(info);
 }
 
 /**
@@ -622,11 +655,6 @@ static void xen_irq_init(unsigned irq)
 {
struct irq_info *info;
 
-#ifdef CONFIG_SMP
-   /* By default all event channels notify CPU#0. */
-   cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(0));
-#endif
-
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (info == NULL)
panic("Unable to allocate metadata for IRQ%d\n", irq);
@@ -1691,10 +1719,34 @@ static int xen_rebind_evtchn_to_cpu(evtc
return 0;
 }
 
+/*
+ * Find the CPU within @dest mask which has the least number of channels
+ * assigned. This is not precise as the per cpu counts can be modified
+ * concurrently.
+ */
+static unsigned int select_target_cpu(const struct cpumask *dest)
+{
+   unsigned int cpu, best_cpu = UINT_MAX, minch = UINT_MAX;
+
+   for_each_cpu_and(cpu, dest, cpu_online_mask) {
+   unsigned int curch = atomic_read(&channels_on_cpu[cpu]);
+
+   if (curch < minch) {
+   minch = curch;
+   best_cpu = cpu;
+   }
+   }
+
+   /* If this happens accounting is screwed up */
+   if (WARN_ON_ONCE(best_cpu == UINT_MAX))
+   best_cpu = cpumask_first(dest);
+   return best_cpu;
+}
+
 static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest,
bool force)
 {
-   unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
+   unsigned int tcpu = select_target_cpu(dest);
int ret;
 
ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
@@ -1922,8 +1974,12 @@ void xen_irq_resume(void)
xen_evtchn_resume();
 
/* No IRQ <-> event-channel mappings. */
-   list_for_each_entry(info, &xen_irq_list_head, list)
-   info->evtchn = 0; /* zap event-channel binding */
+   list_for_each_entry(info, &xen_irq_list_head, list) {
+   /* Zap event-channel binding */
+   info->evtchn = 0;
+   /* Adjust accounting */
+   channels_on_cpu_dec(info);
+   }
 
clear_evtchn_to_irq_all();
 



[patch 30/30] genirq: Remove export of irq_to_desc()

2020-12-11 Thread Thomas Gleixner
No more (ab)use in modules, finally. Remove the export so that no new ones
can show up.

Signed-off-by: Thomas Gleixner 
---
 kernel/irq/irqdesc.c |1 -
 1 file changed, 1 deletion(-)

--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -352,7 +352,6 @@ struct irq_desc *irq_to_desc(unsigned in
 {
return radix_tree_lookup(_desc_tree, irq);
 }
-EXPORT_SYMBOL(irq_to_desc);
 
 static void delete_irq_desc(unsigned int irq)
 {



[patch 25/30] xen/events: Remove disfunct affinity spreading

2020-12-11 Thread Thomas Gleixner
This function can only ever work when the event channels:

  - are already established
  - interrupts assigned to them
  - the affinity has been set by user space already

because any newly set up event channel is forced to be bound to CPU0 and
the affinity mask of the interrupt is forced to contain cpumask_of(0).

As the CPU0 enforcement was in place _before_ this was implemented it's
entirely unclear how that can ever have worked at all.

Remove it as preparation for doing it proper.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |9 -
 drivers/xen/evtchn.c |   34 +-
 2 files changed, 1 insertion(+), 42 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1696,15 +1696,6 @@ static int set_affinity_irq(struct irq_d
return ret;
 }
 
-/* To be called with desc->lock held. */
-int xen_set_affinity_evtchn(struct irq_desc *desc, unsigned int tcpu)
-{
-   struct irq_data *d = irq_desc_get_irq_data(desc);
-
-   return set_affinity_irq(d, cpumask_of(tcpu), false);
-}
-EXPORT_SYMBOL_GPL(xen_set_affinity_evtchn);
-
 static void enable_dynirq(struct irq_data *data)
 {
evtchn_port_t evtchn = evtchn_from_irq(data->irq);
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -421,36 +421,6 @@ static void evtchn_unbind_from_user(stru
del_evtchn(u, evtchn);
 }
 
-static DEFINE_PER_CPU(int, bind_last_selected_cpu);
-
-static void evtchn_bind_interdom_next_vcpu(evtchn_port_t evtchn)
-{
-   unsigned int selected_cpu, irq;
-   struct irq_desc *desc;
-   unsigned long flags;
-
-   irq = irq_from_evtchn(evtchn);
-   desc = irq_to_desc(irq);
-
-   if (!desc)
-   return;
-
-   raw_spin_lock_irqsave(&desc->lock, flags);
-   selected_cpu = this_cpu_read(bind_last_selected_cpu);
-   selected_cpu = cpumask_next_and(selected_cpu,
-   desc->irq_common_data.affinity, cpu_online_mask);
-
-   if (unlikely(selected_cpu >= nr_cpu_ids))
-   selected_cpu = cpumask_first_and(desc->irq_common_data.affinity,
-   cpu_online_mask);
-
-   this_cpu_write(bind_last_selected_cpu, selected_cpu);
-
-   /* unmask expects irqs to be disabled */
-   xen_set_affinity_evtchn(desc, selected_cpu);
-   raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-
 static long evtchn_ioctl(struct file *file,
 unsigned int cmd, unsigned long arg)
 {
@@ -508,10 +478,8 @@ static long evtchn_ioctl(struct file *fi
break;
 
rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
-   if (rc == 0) {
+   if (rc == 0)
rc = bind_interdomain.local_port;
-   evtchn_bind_interdom_next_vcpu(rc);
-   }
break;
}
 



[patch 27/30] xen/events: Only force affinity mask for percpu interrupts

2020-12-11 Thread Thomas Gleixner
All event channel setups bind the interrupt on CPU0 or the target CPU for
percpu interrupts and overwrite the affinity mask with the corresponding
cpumask. That does not make sense.

The XEN implementation of irqchip::irq_set_affinity() already picks a
single target CPU out of the affinity mask and the actual target is stored
in the effective CPU mask, so destroying the user chosen affinity mask
which might contain more than one CPU is wrong.

Change the implementation so that the channel is bound to CPU0 at the XEN
level and leave the affinity mask alone. At startup of the interrupt
affinity will be assigned out of the affinity mask and the XEN binding will
be updated. Only keep the enforcement for real percpu interrupts.

On resume the overwrite is not required either because info->cpu and the
affinity mask are still the same as at the time of suspend. Same for
rebind_evtchn_irq().

This also prepares for proper interrupt spreading.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |   42 ++-
 1 file changed, 28 insertions(+), 14 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -433,15 +433,20 @@ static bool pirq_needs_eoi_flag(unsigned
return info->u.pirq.flags & PIRQ_NEEDS_EOI;
 }
 
-static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu)
+static void bind_evtchn_to_cpu(evtchn_port_t evtchn, unsigned int cpu,
+  bool force_affinity)
 {
int irq = get_evtchn_to_irq(evtchn);
struct irq_info *info = info_for_irq(irq);
 
BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
-   cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
-#endif
+
+   if (IS_ENABLED(CONFIG_SMP) && force_affinity) {
+   cpumask_copy(irq_get_affinity_mask(irq), cpumask_of(cpu));
+   cpumask_copy(irq_get_effective_affinity_mask(irq),
+cpumask_of(cpu));
+   }
+
xen_evtchn_port_bind_to_cpu(evtchn, cpu, info->cpu);
 
info->cpu = cpu;
@@ -788,7 +793,7 @@ static unsigned int __startup_pirq(unsig
goto err;
 
info->evtchn = evtchn;
-   bind_evtchn_to_cpu(evtchn, 0);
+   bind_evtchn_to_cpu(evtchn, 0, false);
 
rc = xen_evtchn_port_setup(evtchn);
if (rc)
@@ -1107,8 +1112,8 @@ static int bind_evtchn_to_irq_chip(evtch
irq = ret;
goto out;
}
-   /* New interdomain events are bound to VCPU 0. */
-   bind_evtchn_to_cpu(evtchn, 0);
+   /* New interdomain events are initially bound to VCPU 0. */
+   bind_evtchn_to_cpu(evtchn, 0, false);
} else {
struct irq_info *info = info_for_irq(irq);
WARN_ON(info == NULL || info->type != IRQT_EVTCHN);
@@ -1156,7 +1161,11 @@ static int bind_ipi_to_irq(unsigned int
irq = ret;
goto out;
}
-   bind_evtchn_to_cpu(evtchn, cpu);
+   /*
+* Force the affinity mask to the target CPU so proc shows
+* the correct target.
+*/
+   bind_evtchn_to_cpu(evtchn, cpu, true);
} else {
struct irq_info *info = info_for_irq(irq);
WARN_ON(info == NULL || info->type != IRQT_IPI);
@@ -1269,7 +1278,11 @@ int bind_virq_to_irq(unsigned int virq,
goto out;
}
 
-   bind_evtchn_to_cpu(evtchn, cpu);
+   /*
+* Force the affinity mask for percpu interrupts so proc
+* shows the correct target.
+*/
+   bind_evtchn_to_cpu(evtchn, cpu, percpu);
} else {
struct irq_info *info = info_for_irq(irq);
WARN_ON(info == NULL || info->type != IRQT_VIRQ);
@@ -1634,8 +1647,7 @@ void rebind_evtchn_irq(evtchn_port_t evt
 
mutex_unlock(_mapping_update_lock);
 
-bind_evtchn_to_cpu(evtchn, info->cpu);
-   irq_set_affinity(irq, cpumask_of(info->cpu));
+   bind_evtchn_to_cpu(evtchn, info->cpu, false);
 
/* Unmask the event channel. */
enable_irq(irq);
@@ -1669,7 +1681,7 @@ static int xen_rebind_evtchn_to_cpu(evtc
 * it, but don't do the xenlinux-level rebind in that case.
 */
if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, _vcpu) >= 0)
-   bind_evtchn_to_cpu(evtchn, tcpu);
+   bind_evtchn_to_cpu(evtchn, tcpu, false);
 
if (!masked)
unmask_evtchn(evtchn);
@@ -1798,7 +1810,8 @@ static void restore_cpu_virqs(unsigned i
 
/* Record the new mapping. */
(void)xen_irq_info_virq_setup(cpu, irq, evtchn, 

[patch 24/30] xen/events: Remove unused bind_evtchn_to_irq_lateeoi()

2020-12-11 Thread Thomas Gleixner
Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |6 --
 1 file changed, 6 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -1132,12 +1132,6 @@ int bind_evtchn_to_irq(evtchn_port_t evt
 }
 EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
 
-int bind_evtchn_to_irq_lateeoi(evtchn_port_t evtchn)
-{
-   return bind_evtchn_to_irq_chip(evtchn, _lateeoi_chip);
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
-
 static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 {
struct evtchn_bind_ipi bind_ipi;



[patch 09/30] ARM: smp: Use irq_desc_kstat_cpu() in show_ipi_list()

2020-12-11 Thread Thomas Gleixner
The irq descriptor is already there, no need to look it up again.

Signed-off-by: Thomas Gleixner 
Cc: Marc Zyngier 
Cc: Russell King 
Cc: linux-arm-ker...@lists.infradead.org
---
 arch/arm/kernel/smp.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -550,7 +550,7 @@ void show_ipi_list(struct seq_file *p, i
seq_printf(p, "%*s%u: ", prec - 1, "IPI", i);
 
for_each_online_cpu(cpu)
-   seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+   seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
 
seq_printf(p, " %s\n", ipi_types[i]);
}



[patch 28/30] xen/events: Reduce irq_info::spurious_cnt storage size

2020-12-11 Thread Thomas Gleixner
To prepare for interrupt spreading reduce the storage size of
irq_info::spurious_cnt to u8 so the required flag for the spreading logic
will not increase the storage size.

Protect the usage site against overruns.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -95,7 +95,7 @@ struct irq_info {
struct list_head list;
struct list_head eoi_list;
short refcnt;
-   short spurious_cnt;
+   u8 spurious_cnt;
enum xen_irq_type type; /* type */
unsigned irq;
evtchn_port_t evtchn;   /* event channel */
@@ -528,8 +528,10 @@ static void xen_irq_lateeoi_locked(struc
return;
 
if (spurious) {
-   if ((1 << info->spurious_cnt) < (HZ << 2))
-   info->spurious_cnt++;
+   if ((1 << info->spurious_cnt) < (HZ << 2)) {
+   if (info->spurious_cnt != 0xFF)
+   info->spurious_cnt++;
+   }
if (info->spurious_cnt > 1) {
delay = 1 << (info->spurious_cnt - 2);
if (delay > HZ)



[patch 10/30] arm64/smp: Use irq_desc_kstat_cpu() in arch_show_interrupts()

2020-12-11 Thread Thomas Gleixner
The irq descriptor is already there, no need to look it up again.

Signed-off-by: Thomas Gleixner 
Cc: Mark Rutland 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Marc Zyngier 
Cc: linux-arm-ker...@lists.infradead.org
---
 arch/arm64/kernel/smp.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -809,7 +809,7 @@ int arch_show_interrupts(struct seq_file
seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
   prec >= 4 ? " " : "");
for_each_online_cpu(cpu)
-   seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+   seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
seq_printf(p, "  %s\n", ipi_types[i]);
}
 



[patch 13/30] drm/i915/lpe_audio: Remove pointless irq_to_desc() usage

2020-12-11 Thread Thomas Gleixner
Nothing uses the result and nothing should ever use it in driver code.

Signed-off-by: Thomas Gleixner 
Cc: Jani Nikula 
Cc: Joonas Lahtinen 
Cc: Rodrigo Vivi 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Pankaj Bharadiya 
Cc: Chris Wilson 
Cc: Wambui Karuga 
Cc: intel-...@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/i915/display/intel_lpe_audio.c |4 
 1 file changed, 4 deletions(-)

--- a/drivers/gpu/drm/i915/display/intel_lpe_audio.c
+++ b/drivers/gpu/drm/i915/display/intel_lpe_audio.c
@@ -297,13 +297,9 @@ int intel_lpe_audio_init(struct drm_i915
  */
 void intel_lpe_audio_teardown(struct drm_i915_private *dev_priv)
 {
-   struct irq_desc *desc;
-
if (!HAS_LPE_AUDIO(dev_priv))
return;
 
-   desc = irq_to_desc(dev_priv->lpe_audio.irq);
-
lpe_audio_platdev_destroy(dev_priv);
 
irq_free_desc(dev_priv->lpe_audio.irq);



[patch 06/30] parisc/irq: Simplify irq count output for /proc/interrupts

2020-12-11 Thread Thomas Gleixner
The SMP variant works perfectly fine on UP as well.

Signed-off-by: Thomas Gleixner 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
Cc: afzal mohammed 
Cc: linux-par...@vger.kernel.org
---
 arch/parisc/kernel/irq.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -216,12 +216,9 @@ int show_interrupts(struct seq_file *p,
if (!action)
goto skip;
seq_printf(p, "%3d: ", i);
-#ifdef CONFIG_SMP
+
for_each_online_cpu(j)
seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-#else
-   seq_printf(p, "%10u ", kstat_irqs(i));
-#endif
 
seq_printf(p, " %14s", irq_desc_get_chip(desc)->name);
 #ifndef PARISC_IRQ_CR16_COUNTS



[patch 11/30] parisc/irq: Use irq_desc_kstat_cpu() in show_interrupts()

2020-12-11 Thread Thomas Gleixner
The irq descriptor is already there, no need to look it up again.

Signed-off-by: Thomas Gleixner 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
Cc: afzal mohammed 
Cc: linux-par...@vger.kernel.org
---
 arch/parisc/kernel/irq.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -218,7 +218,7 @@ int show_interrupts(struct seq_file *p,
seq_printf(p, "%3d: ", i);
 
for_each_online_cpu(j)
-   seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+   seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, j));
 
seq_printf(p, " %14s", irq_desc_get_chip(desc)->name);
 #ifndef PARISC_IRQ_CR16_COUNTS



[patch 21/30] net/mlx4: Use effective interrupt affinity

2020-12-11 Thread Thomas Gleixner
Using the interrupt affinity mask for checking locality is not really
working well on architectures which support effective affinity masks.

The affinity mask is either the system wide default or set by user space,
but the architecture can or even must reduce the mask to the effective set,
which means that checking the affinity mask itself does not really tell
about the actual target CPUs.

Signed-off-by: Thomas Gleixner 
Cc: Tariq Toukan 
Cc: "David S. Miller" 
Cc: Jakub Kicinski 
Cc: net...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
---
 drivers/net/ethernet/mellanox/mlx4/en_cq.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -117,7 +117,7 @@ int mlx4_en_activate_cq(struct mlx4_en_p
assigned_eq = true;
}
irq = mlx4_eq_get_irq(mdev->dev, cq->vector);
-   cq->aff_mask = irq_get_affinity_mask(irq);
+   cq->aff_mask = irq_get_effective_affinity_mask(irq);
} else {
/* For TX we use the same irq per
ring we assigned for the RX*/



[patch 18/30] PCI: xilinx-nwl: Use irq_data_get_irq_chip_data()

2020-12-11 Thread Thomas Gleixner
Going through a full irq descriptor lookup instead of just using the proper
helper function which provides direct access is suboptimal.

In fact it _is_ wrong because the chip callback needs to get the chip data
which is relevant for the chip while using the irq descriptor variant
returns the irq chip data of the top level chip of a hierarchy. It does not
matter in this case because the chip is the top level chip, but that
doesn't make it more correct.

Signed-off-by: Thomas Gleixner 
Cc: Lorenzo Pieralisi 
Cc: Rob Herring 
Cc: Bjorn Helgaas 
Cc: Michal Simek 
Cc: linux-...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
---
 drivers/pci/controller/pcie-xilinx-nwl.c |8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

--- a/drivers/pci/controller/pcie-xilinx-nwl.c
+++ b/drivers/pci/controller/pcie-xilinx-nwl.c
@@ -379,13 +379,11 @@ static void nwl_pcie_msi_handler_low(str
 
 static void nwl_mask_leg_irq(struct irq_data *data)
 {
-   struct irq_desc *desc = irq_to_desc(data->irq);
-   struct nwl_pcie *pcie;
+   struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
unsigned long flags;
u32 mask;
u32 val;
 
-   pcie = irq_desc_get_chip_data(desc);
mask = 1 << (data->hwirq - 1);
	raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);
@@ -395,13 +393,11 @@ static void nwl_mask_leg_irq(struct irq_
 
 static void nwl_unmask_leg_irq(struct irq_data *data)
 {
-   struct irq_desc *desc = irq_to_desc(data->irq);
-   struct nwl_pcie *pcie;
+   struct nwl_pcie *pcie = irq_data_get_irq_chip_data(data);
unsigned long flags;
u32 mask;
u32 val;
 
-   pcie = irq_desc_get_chip_data(desc);
mask = 1 << (data->hwirq - 1);
	raw_spin_lock_irqsave(&pcie->leg_mask_lock, flags);
val = nwl_bridge_readl(pcie, MSGF_LEG_MASK);



[patch 22/30] net/mlx5: Replace irq_to_desc() abuse

2020-12-11 Thread Thomas Gleixner
No driver has any business with the internals of an interrupt
descriptor. Storing a pointer to it just to use yet another helper at the
actual usage site to retrieve the affinity mask is creative at best. Just
because C does not allow encapsulation does not mean that the kernel has no
limits.

Retrieve a pointer to the affinity mask itself and use that. It's still
using an interface which is usually not for random drivers, but definitely
less hideous than the previous hack.

Signed-off-by: Thomas Gleixner 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c |6 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -669,7 +669,7 @@ struct mlx5e_channel {
spinlock_t async_icosq_lock;
 
/* data path - accessed per napi poll */
-   struct irq_desc *irq_desc;
+   const struct cpumask  *aff_mask;
struct mlx5e_ch_stats *stats;
 
/* control */
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
c->num_tc   = params->num_tc;
c->xdp  = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-   c->irq_desc = irq_to_desc(irq);
+   c->aff_mask = irq_get_affinity_mask(irq);
c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);
 
	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -40,12 +40,8 @@
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
 {
int current_cpu = smp_processor_id();
-   const struct cpumask *aff;
-   struct irq_data *idata;
 
-   idata = irq_desc_get_irq_data(c->irq_desc);
-   aff = irq_data_get_affinity_mask(idata);
-   return cpumask_test_cpu(current_cpu, aff);
+   return cpumask_test_cpu(current_cpu, c->aff_mask);
 }
 
 static void mlx5e_handle_tx_dim(struct mlx5e_txqsq *sq)



Re: [patch 24/30] xen/events: Remove unused bind_evtchn_to_irq_lateeoi()

2020-12-11 Thread Thomas Gleixner
On Thu, Dec 10 2020 at 18:19, boris ostrovsky wrote:
> On 12/10/20 2:26 PM, Thomas Gleixner wrote:
>> -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq_lateeoi);
>
> include/xen/events.h also needs to be updated (and in the next patch for 
> xen_set_affinity_evtchn() as well).

Darn, I lost that.


Re: [Intel-gfx] [PATCH v3 2/4] drm/i915/pmu: Use kstat_irqs to get interrupt count

2020-12-11 Thread Thomas Gleixner
On Thu, Dec 10 2020 at 10:45, Tvrtko Ursulin wrote:
> On 10/12/2020 07:53, Joonas Lahtinen wrote:
>> I think later in the thread there was a suggestion to replace this with
>> simple counter increment in IRQ handler.
>
> It was indeed unsafe until recent b00bccb3f0bb ("drm/i915/pmu: Handle 
> PCI unbind") but now should be fine.
>
> If kstat_irqs does not get exported it is easy enough for i915 to keep a 
> local counter. Reasoning was very infrequent per cpu summation is much 
> cheaper than very frequent atomic add. Up to thousands of interrupts per 
> second vs "once per second" PMU read kind of thing.

Why do you need a atomic_add? It's ONE interrupt which can only be
executed on ONE CPU at a time. Interrupt handlers are non-reentrant.

The core code function will just return an accumulated counter nowadays
which is only 32bit wide, which is what the interface provided forever.
That needs to be fixed first.

Aside from that, the accounting is wrong when the interrupt line is shared
because the core accounts interrupts per line, not per device sharing the
line. Don't know whether you care or not.

I'll send out a series addressing irq_to_desc() (ab)use all over the
place shortly. i915 is in there...

Thanks,

tglx
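
A minimal sketch of such a private counter, essentially what the i915 patch
in this series ends up doing (field names as in that patch, the handler is
assumed to be the only writer):

	/* Interrupt handler: non-reentrant, so a plain increment is enough.
	   WRITE_ONCE() merely prevents store tearing vs. concurrent readers. */
	WRITE_ONCE(i915->pmu.irq_count, i915->pmu.irq_count + 1);

	/* PMU read side (any context): */
	unsigned long count = READ_ONCE(i915->pmu.irq_count);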


[patch 00/30] genirq: Treewide hunt for irq descriptor abuse and assorted fixes

2020-12-11 Thread Thomas Gleixner
A recent request to export kstat_irqs() pointed to a copy of the same in
the i915 code, which made me look for further usage of irq descriptors in
drivers.

The usage in drivers ranges from creative to broken in all colours.

irqdesc.h clearly says that this is core functionality and the fact C does
not allow full encapsulation is not a justification to fiddle with it just
because. It took us a lot of effort to make the core functionality provide
what drivers need.

If there is a shortcoming, it's not too much to ask to talk to the relevant
maintainers instead of going off and fiddling with the guts of interrupt
descriptors, often enough without understanding lifetime and locking
rules.

As people insist on not respecting boundaries, this series cleans up the
(ab)use and at the end removes the export of irq_to_desc() to make it at
least harder. All legitimate users of this are built in.

While at it I stumbled over some other oddities related to interrupt
counting and cleaned them up as well.

The series applies on top of

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core

and is also available from git:

  git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git genirq

Thanks,

tglx
---
 arch/alpha/kernel/sys_jensen.c   |2 
 arch/arm/kernel/smp.c|2 
 arch/parisc/kernel/irq.c |7 
 arch/s390/kernel/irq.c   |2 
 arch/x86/kernel/topology.c   |1 
 arch/arm64/kernel/smp.c  |2 
 drivers/gpu/drm/i915/display/intel_lpe_audio.c   |4 
 drivers/gpu/drm/i915/i915_irq.c  |   34 +++
 drivers/gpu/drm/i915/i915_pmu.c  |   18 -
 drivers/gpu/drm/i915/i915_pmu.h  |8 
 drivers/mfd/ab8500-debugfs.c |   16 -
 drivers/net/ethernet/mellanox/mlx4/en_cq.c   |8 
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |6 
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |3 
 drivers/net/ethernet/mellanox/mlx5/core/en.h |2 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c|2 
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c|6 
 drivers/ntb/msi.c|4 
 drivers/pci/controller/mobiveil/pcie-mobiveil-host.c |8 
 drivers/pci/controller/pcie-xilinx-nwl.c |8 
 drivers/pinctrl/nomadik/pinctrl-nomadik.c|3 
 drivers/xen/events/events_base.c |  172 +++
 drivers/xen/evtchn.c |   34 ---
 include/linux/interrupt.h|1 
 include/linux/irq.h  |7 
 include/linux/irqdesc.h  |   40 +---
 include/linux/kernel_stat.h  |1 
 kernel/irq/irqdesc.c |   42 ++--
 kernel/irq/manage.c  |   37 
 kernel/irq/proc.c|5 
 30 files changed, 263 insertions(+), 222 deletions(-)




[patch 07/30] genirq: Make kstat_irqs() static

2020-12-11 Thread Thomas Gleixner
No more users outside the core code.

Signed-off-by: Thomas Gleixner 
---
 include/linux/kernel_stat.h |1 -
 kernel/irq/irqdesc.c|   19 ++-
 2 files changed, 6 insertions(+), 14 deletions(-)

--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -67,7 +67,6 @@ static inline unsigned int kstat_softirq
 /*
  * Number of interrupts per specific IRQ source, since bootup
  */
-extern unsigned int kstat_irqs(unsigned int irq);
 extern unsigned int kstat_irqs_usr(unsigned int irq);
 
 /*
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc *
return desc->istate & IRQS_NMI;
 }
 
-/**
- * kstat_irqs - Get the statistics for an interrupt
- * @irq:   The interrupt number
- *
- * Returns the sum of interrupt counts on all cpus since boot for
- * @irq. The caller must ensure that the interrupt is not removed
- * concurrently.
- */
-unsigned int kstat_irqs(unsigned int irq)
+static unsigned int kstat_irqs(unsigned int irq)
 {
struct irq_desc *desc = irq_to_desc(irq);
unsigned int sum = 0;
@@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq
 }
 
 /**
- * kstat_irqs_usr - Get the statistics for an interrupt
+ * kstat_irqs_usr - Get the statistics for an interrupt from thread context
  * @irq:   The interrupt number
  *
  * Returns the sum of interrupt counts on all cpus since boot for @irq.
- * Contrary to kstat_irqs() this can be called from any context.
- * It uses rcu since a concurrent removal of an interrupt descriptor is
- * observing an rcu grace period before delayed_free_desc()/irq_kobj_release().
+ *
+ * It uses rcu to protect the access since a concurrent removal of an
+ * interrupt descriptor is observing an rcu grace period before
+ * delayed_free_desc()/irq_kobj_release().
  */
 unsigned int kstat_irqs_usr(unsigned int irq)
 {



[patch 01/30] genirq: Move irq_has_action() into core code

2020-12-11 Thread Thomas Gleixner
This function uses irq_to_desc() and is going to be used by modules to
replace the open coded irq_to_desc() (ab)usage. The final goal is to remove
the export of irq_to_desc() so driver cannot fiddle with it anymore.

Move it into the core code and fixup the usage sites to include the proper
header.

Signed-off-by: Thomas Gleixner 
---
 arch/alpha/kernel/sys_jensen.c |2 +-
 arch/x86/kernel/topology.c |1 +
 include/linux/interrupt.h  |1 +
 include/linux/irqdesc.h|7 +--
 kernel/irq/manage.c|   17 +
 5 files changed, 21 insertions(+), 7 deletions(-)

--- a/arch/alpha/kernel/sys_jensen.c
+++ b/arch/alpha/kernel/sys_jensen.c
@@ -7,7 +7,7 @@
  *
  * Code supporting the Jensen.
  */
-
+#include 
 #include 
 #include 
 #include 
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,6 +25,7 @@
  *
  * Send feedback to 
  */
+#include 
 #include 
 #include 
 #include 
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -232,6 +232,7 @@ extern void devm_free_irq(struct device
 # define local_irq_enable_in_hardirq() local_irq_enable()
 #endif
 
+bool irq_has_action(unsigned int irq);
 extern void disable_irq_nosync(unsigned int irq);
 extern bool disable_hardirq(unsigned int irq);
 extern void disable_irq(unsigned int irq);
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain
 /* Test to see if a driver has successfully requested an irq */
 static inline int irq_desc_has_action(struct irq_desc *desc)
 {
-   return desc->action != NULL;
-}
-
-static inline int irq_has_action(unsigned int irq)
-{
-   return irq_desc_has_action(irq_to_desc(irq));
+   return desc && desc->action != NULL;
 }
 
 /**
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2752,3 +2752,20 @@ int irq_set_irqchip_state(unsigned int i
return err;
 }
 EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
+
+/**
+ * irq_has_action - Check whether an interrupt is requested
+ * @irq:   The linux irq number
+ *
+ * Returns: A snapshot of the current state
+ */
+bool irq_has_action(unsigned int irq)
+{
+   bool res;
+
+   rcu_read_lock();
+   res = irq_desc_has_action(irq_to_desc(irq));
+   rcu_read_unlock();
+   return res;
+}
+EXPORT_SYMBOL_GPL(irq_has_action);
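
For illustration, the replacement pattern in module code is simply (a
hypothetical request path, the real conversions follow later in the series):

	/* Instead of peeking at irq_to_desc(irq)->action: */
	if (!irq_has_action(irq))
		ret = request_irq(irq, handler, IRQF_SHARED, "mydev", dev);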



[patch 23/30] net/mlx5: Use effective interrupt affinity

2020-12-11 Thread Thomas Gleixner
Using the interrupt affinity mask for checking locality is not really
working well on architectures which support effective affinity masks.

The affinity mask is either the system wide default or set by user space,
but the architecture can or even must reduce the mask to the effective set,
which means that checking the affinity mask itself does not really tell
about the actual target CPUs.

Signed-off-by: Thomas Gleixner 
Cc: Saeed Mahameed 
Cc: Leon Romanovsky 
Cc: "David S. Miller" 
Cc: Jakub Kicinski 
Cc: net...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1998,7 +1998,7 @@ static int mlx5e_open_channel(struct mlx
c->num_tc   = params->num_tc;
c->xdp  = !!params->xdp_prog;
	c->stats    = &priv->channel_stats[ix].ch;
-   c->aff_mask = irq_get_affinity_mask(irq);
+   c->aff_mask = irq_get_effective_affinity_mask(irq);
c->lag_port = mlx5e_enumerate_lag_port(priv->mdev, ix);
 
	netif_napi_add(netdev, &c->napi, mlx5e_napi_poll, 64);



[patch 08/30] genirq: Provide kstat_irqdesc_cpu()

2020-12-11 Thread Thomas Gleixner
Most users of kstat_irqs_cpu() have the irq descriptor already. No point in
calling into the core code and looking it up once more.

Use it in per_cpu_count_show() to start with.

Signed-off-by: Thomas Gleixner 
---
 include/linux/irqdesc.h |6 ++
 kernel/irq/irqdesc.c|4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(voi
 extern struct irq_desc irq_desc[NR_IRQS];
 #endif
 
+static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
+ unsigned int cpu)
+{
+   return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+}
+
 static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
 {
return container_of(data->common, struct irq_desc, irq_common_data);
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct
  struct kobj_attribute *attr, char *buf)
 {
struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
-   int cpu, irq = desc->irq_data.irq;
ssize_t ret = 0;
char *p = "";
+   int cpu;
 
for_each_possible_cpu(cpu) {
-   unsigned int c = kstat_irqs_cpu(irq, cpu);
+   unsigned int c = irq_desc_kstat_cpu(desc, cpu);
 
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c);
p = ",";



[patch 03/30] genirq: Move irq_set_lockdep_class() to core

2020-12-11 Thread Thomas Gleixner
irq_set_lockdep_class() is used from modules and requires irq_to_desc() to
be exported. Move it into the core code which lifts another requirement for
the export.

Signed-off-by: Thomas Gleixner 
---
 include/linux/irqdesc.h |   10 --
 kernel/irq/irqdesc.c|   14 ++
 2 files changed, 18 insertions(+), 6 deletions(-)

--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(u
return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID);
 }
 
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+			     struct lock_class_key *request_class);
 static inline void
 irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
  struct lock_class_key *request_class)
 {
-   struct irq_desc *desc = irq_to_desc(irq);
-
-   if (desc) {
-   lockdep_set_class(&desc->lock, lock_class);
-   lockdep_set_class(&desc->request_mutex, request_class);
-   }
+   if (IS_ENABLED(CONFIG_LOCKDEP))
+   __irq_set_lockdep_class(irq, lock_class, request_class);
 }
 
 #endif
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int
rcu_read_unlock();
return sum;
 }
+
+#ifdef CONFIG_LOCKDEP
+void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
+			     struct lock_class_key *request_class)
+{
+   struct irq_desc *desc = irq_to_desc(irq);
+
+   if (desc) {
+   lockdep_set_class(&desc->lock, lock_class);
+   lockdep_set_class(&desc->request_mutex, request_class);
+   }
+}
+EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
+#endif
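
Typical module usage is unchanged by the move; a short sketch with
hypothetical keys:

	static struct lock_class_key my_irq_lock_class;
	static struct lock_class_key my_irq_request_class;

	/* Give this interrupt its own lockdep classes to avoid false positive
	   nesting reports. Compiles away without CONFIG_LOCKDEP. */
	irq_set_lockdep_class(irq, &my_irq_lock_class, &my_irq_request_class);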



[patch 15/30] pinctrl: nomadik: Use irq_has_action()

2020-12-11 Thread Thomas Gleixner
Let the core code do the fiddling with irq_desc.

Signed-off-by: Thomas Gleixner 
Cc: Linus Walleij 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-g...@vger.kernel.org
---
 drivers/pinctrl/nomadik/pinctrl-nomadik.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

--- a/drivers/pinctrl/nomadik/pinctrl-nomadik.c
+++ b/drivers/pinctrl/nomadik/pinctrl-nomadik.c
@@ -948,7 +948,6 @@ static void nmk_gpio_dbg_show_one(struct
   (mode < 0) ? "unknown" : modes[mode]);
} else {
int irq = chip->to_irq(chip, offset);
-   struct irq_desc *desc = irq_to_desc(irq);
const int pullidx = pull ? 1 : 0;
int val;
static const char * const pulls[] = {
@@ -969,7 +968,7 @@ static void nmk_gpio_dbg_show_one(struct
 * This races with request_irq(), set_irq_type(),
 * and set_irq_wake() ... but those are "rare".
 */
-   if (irq > 0 && desc && desc->action) {
+   if (irq > 0 && irq_has_action(irq)) {
char *trigger;
 
if (nmk_chip->edge_rising & BIT(offset))



Re: [patch 27/30] xen/events: Only force affinity mask for percpu interrupts

2020-12-11 Thread Thomas Gleixner
On Thu, Dec 10 2020 at 18:20, boris ostrovsky wrote:
> On 12/10/20 2:26 PM, Thomas Gleixner wrote:
>> All event channel setups bind the interrupt on CPU0 or the target CPU for
>> percpu interrupts and overwrite the affinity mask with the corresponding
>> cpumask. That does not make sense.
>>
>> The XEN implementation of irqchip::irq_set_affinity() already picks a
>> single target CPU out of the affinity mask and the actual target is stored
>> in the effective CPU mask, so destroying the user chosen affinity mask
>> which might contain more than one CPU is wrong.
>>
>> Change the implementation so that the channel is bound to CPU0 at the XEN
>> level and leave the affinity mask alone. At startup of the interrupt
>> affinity will be assigned out of the affinity mask and the XEN binding will
>> be updated. 
>
> If that's the case then I wonder whether we need this call at all and
> instead bind at startup time.

I was wondering about that, but my knowledge about the Xen internal
requirements is pretty limited. The current set at least survived basic
testing by Jürgen.

Thanks,

tglx


[patch 17/30] NTB/msi: Use irq_has_action()

2020-12-11 Thread Thomas Gleixner
Use the proper core function.

Signed-off-by: Thomas Gleixner 
Cc: Jon Mason 
Cc: Dave Jiang 
Cc: Allen Hubbe 
Cc: linux-...@googlegroups.com
---
 drivers/ntb/msi.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

--- a/drivers/ntb/msi.c
+++ b/drivers/ntb/msi.c
@@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct
  struct ntb_msi_desc *msi_desc)
 {
struct msi_desc *entry;
-   struct irq_desc *desc;
int ret;
 
if (!ntb->msi)
return -EINVAL;
 
for_each_pci_msi_entry(entry, ntb->pdev) {
-   desc = irq_to_desc(entry->irq);
-   if (desc->action)
+   if (irq_has_action(entry->irq))
continue;
 
	ret = devm_request_threaded_irq(&ntb->dev, entry->irq, handler,



[patch 19/30] PCI: mobiveil: Use irq_data_get_irq_chip_data()

2020-12-11 Thread Thomas Gleixner
Going through a full irq descriptor lookup instead of just using the proper
helper function which provides direct access is suboptimal.

In fact it _is_ wrong because the chip callback needs to get the chip data
which is relevant for the chip while using the irq descriptor variant
returns the irq chip data of the top level chip of a hierarchy. It does not
matter in this case because the chip is the top level chip, but that
doesn't make it more correct.

Signed-off-by: Thomas Gleixner 
Cc: Karthikeyan Mitran 
Cc: Hou Zhiqiang 
Cc: Lorenzo Pieralisi 
Cc: Rob Herring 
Cc: Bjorn Helgaas 
Cc: linux-...@vger.kernel.org
---
 drivers/pci/controller/mobiveil/pcie-mobiveil-host.c |8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

--- a/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c
+++ b/drivers/pci/controller/mobiveil/pcie-mobiveil-host.c
@@ -306,13 +306,11 @@ int mobiveil_host_init(struct mobiveil_p
 
 static void mobiveil_mask_intx_irq(struct irq_data *data)
 {
-   struct irq_desc *desc = irq_to_desc(data->irq);
-   struct mobiveil_pcie *pcie;
+   struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
struct mobiveil_root_port *rp;
unsigned long flags;
u32 mask, shifted_val;
 
-   pcie = irq_desc_get_chip_data(desc);
	rp = &pcie->rp;
mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
	raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);
@@ -324,13 +322,11 @@ static void mobiveil_mask_intx_irq(struc
 
 static void mobiveil_unmask_intx_irq(struct irq_data *data)
 {
-   struct irq_desc *desc = irq_to_desc(data->irq);
-   struct mobiveil_pcie *pcie;
+   struct mobiveil_pcie *pcie = irq_data_get_irq_chip_data(data);
struct mobiveil_root_port *rp;
unsigned long flags;
u32 shifted_val, mask;
 
-   pcie = irq_desc_get_chip_data(desc);
	rp = &pcie->rp;
mask = 1 << ((data->hwirq + PAB_INTX_START) - 1);
	raw_spin_lock_irqsave(&rp->intx_mask_lock, flags);



Re: [Intel-gfx] [PATCH v3 2/4] drm/i915/pmu: Use kstat_irqs to get interrupt count

2020-12-11 Thread Thomas Gleixner
On Thu, Dec 10 2020 at 17:09, Tvrtko Ursulin wrote:
> On 10/12/2020 16:35, Thomas Gleixner wrote:
>> I'll send out a series addressing irq_to_desc() (ab)use all over the
>> place shortly. i915 is in there...
>
> Yep we don't need atomic, my bad. And we would care about the shared 
> interrupt line. And without atomic the extra accounting falls way below 
> noise.

You have to be careful though. If you make the accumulated counter 64
bit wide then you need to be careful vs. 32bit machines.

> So in the light of it all, it sounds best I just quickly replace our 
> abuse with private counting and then you don't have to deal with it in 
> your series.

I mostly have it. Still chewing on the 32bit vs. 64bit thing. And
keeping it in my series allows me to remove the export of irq_to_desc()
at the end without waiting for your tree to be merged.

Give me a few.

Thanks,

tglx
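
For reference, one way to make a 64 bit counter safe against torn reads on
32bit is the u64_stats helper family; a minimal sketch, not what the i915
patch in this series ends up doing (that one settles for unsigned long):

	struct pmu_irq_stat {
		u64			count;
		struct u64_stats_sync	sync;	/* u64_stats_init() at setup */
	};

	/* Writer, in the interrupt handler: */
	u64_stats_update_begin(&stat->sync);
	stat->count++;
	u64_stats_update_end(&stat->sync);

	/* Reader, any context: */
	unsigned int seq;
	u64 val;

	do {
		seq = u64_stats_fetch_begin_irq(&stat->sync);
		val = stat->count;
	} while (u64_stats_fetch_retry_irq(&stat->sync, seq));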
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch 04/30] genirq: Provide irq_get_effective_affinity()

2020-12-11 Thread Thomas Gleixner
Provide an accessor to the effective interrupt affinity mask. Going to be
used to replace open coded fiddling with the irq descriptor.

Signed-off-by: Thomas Gleixner 
---
 include/linux/irq.h |7 +++
 1 file changed, 7 insertions(+)

--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -907,6 +907,13 @@ struct cpumask *irq_data_get_effective_a
 }
 #endif
 
+static inline struct cpumask *irq_get_effective_affinity_mask(unsigned int irq)
+{
+   struct irq_data *d = irq_get_irq_data(irq);
+
+   return d ? irq_data_get_effective_affinity_mask(d) : NULL;
+}
+
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
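
The intended usage pattern in drivers, as the mlx4/mlx5 conversions later in
the series show, is to cache the pointer once and test against it in the hot
path ('c' being the driver's per-channel struct in those patches):

	c->aff_mask = irq_get_effective_affinity_mask(irq);
	...
	if (likely(cpumask_test_cpu(smp_processor_id(), c->aff_mask)))
		return budget;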



[patch 12/30] s390/irq: Use irq_desc_kstat_cpu() in show_msi_interrupt()

2020-12-11 Thread Thomas Gleixner
The irq descriptor is already there, no need to look it up again.

Signed-off-by: Thomas Gleixner 
Cc: Christian Borntraeger 
Cc: Heiko Carstens 
Cc: linux-s...@vger.kernel.org
---
 arch/s390/kernel/irq.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -124,7 +124,7 @@ static void show_msi_interrupt(struct se
	raw_spin_lock_irqsave(&desc->lock, flags);
seq_printf(p, "%3d: ", irq);
for_each_online_cpu(cpu)
-   seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu));
+   seq_printf(p, "%10u ", irq_desc_kstat_cpu(desc, cpu));
 
if (desc->irq_data.chip)
seq_printf(p, " %8s", desc->irq_data.chip->name);



[patch 26/30] xen/events: Use immediate affinity setting

2020-12-11 Thread Thomas Gleixner
There is absolutely no reason to mimic the x86 deferred affinity
setting. This mechanism is required to handle the hardware induced issues
of IO/APIC and MSI and is not in use when the interrupts are remapped.

XEN does not need this and can simply change the affinity from the calling
context. The core code invokes this with the interrupt descriptor lock held
so it is fully serialized against any other operation.

Mark the interrupts with IRQ_MOVE_PCNTXT to disable the deferred affinity
setting. The conditional mask/unmask operation is already handled in
xen_rebind_evtchn_to_cpu().

This makes XEN on x86 use the same mechanics as on e.g. ARM64 where
deferred affinity setting is not required and not implemented and the code
path in the ack functions is compiled out.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
---
 drivers/xen/events/events_base.c |   35 +--
 1 file changed, 9 insertions(+), 26 deletions(-)

--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -628,6 +628,11 @@ static void xen_irq_init(unsigned irq)
info->refcnt = -1;
 
set_info_for_irq(irq, info);
+   /*
+* Interrupt affinity setting can be immediate. No point
+* in delaying it until an interrupt is handled.
+*/
+   irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
 
	INIT_LIST_HEAD(&info->eoi_list);
	list_add_tail(&info->list, &xen_irq_list_head);
@@ -739,18 +744,7 @@ static void eoi_pirq(struct irq_data *da
if (!VALID_EVTCHN(evtchn))
return;
 
-   if (unlikely(irqd_is_setaffinity_pending(data)) &&
-   likely(!irqd_irq_disabled(data))) {
-   int masked = test_and_set_mask(evtchn);
-
-   clear_evtchn(evtchn);
-
-   irq_move_masked_irq(data);
-
-   if (!masked)
-   unmask_evtchn(evtchn);
-   } else
-   clear_evtchn(evtchn);
+   clear_evtchn(evtchn);
 
if (pirq_needs_eoi(data->irq)) {
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
@@ -1641,7 +1635,6 @@ void rebind_evtchn_irq(evtchn_port_t evt
mutex_unlock(_mapping_update_lock);
 
 bind_evtchn_to_cpu(evtchn, info->cpu);
-   /* This will be deferred until interrupt is processed */
irq_set_affinity(irq, cpumask_of(info->cpu));
 
/* Unmask the event channel. */
@@ -1688,8 +1681,9 @@ static int set_affinity_irq(struct irq_d
bool force)
 {
unsigned tcpu = cpumask_first_and(dest, cpu_online_mask);
-   int ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
+   int ret;
 
+   ret = xen_rebind_evtchn_to_cpu(evtchn_from_irq(data->irq), tcpu);
if (!ret)
irq_data_update_effective_affinity(data, cpumask_of(tcpu));
 
@@ -1719,18 +1713,7 @@ static void ack_dynirq(struct irq_data *
if (!VALID_EVTCHN(evtchn))
return;
 
-   if (unlikely(irqd_is_setaffinity_pending(data)) &&
-   likely(!irqd_irq_disabled(data))) {
-   int masked = test_and_set_mask(evtchn);
-
-   clear_evtchn(evtchn);
-
-   irq_move_masked_irq(data);
-
-   if (!masked)
-   unmask_evtchn(evtchn);
-   } else
-   clear_evtchn(evtchn);
+   clear_evtchn(evtchn);
 }
 
 static void mask_ack_dynirq(struct irq_data *data)



[patch 14/30] drm/i915/pmu: Replace open coded kstat_irqs() copy

2020-12-11 Thread Thomas Gleixner
Driver code has no business with the internals of the irq descriptor.

Aside of that the count is per interrupt line and therefore takes
interrupts from other devices into account which share the interrupt line
and are not handled by the graphics driver.

Replace it with a pmu private count which only counts interrupts which
originate from the graphics card.

To avoid atomics or heuristics of some sort make the counter field
'unsigned long'. That limits the count to 4e9 on 32bit which is a lot and
postprocessing can easily deal with the occasional wraparound.

Signed-off-by: Thomas Gleixner 
Cc: Tvrtko Ursulin 
Cc: Jani Nikula 
Cc: Joonas Lahtinen 
Cc: Rodrigo Vivi 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: intel-...@lists.freedesktop.org
Cc: dri-devel@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_irq.c |   34 ++
 drivers/gpu/drm/i915/i915_pmu.c |   18 +-
 drivers/gpu/drm/i915/i915_pmu.h |8 
 3 files changed, 43 insertions(+), 17 deletions(-)

--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -60,6 +60,24 @@
  * and related files, but that will be described in separate chapters.
  */
 
+/*
+ * Interrupt statistic for PMU. Increments the counter only if the
+ * interrupt originated from the GPU so interrupts from a device which
+ * shares the interrupt line are not accounted.
+ */
+static inline void pmu_irq_stats(struct drm_i915_private *priv,
+irqreturn_t res)
+{
+   if (unlikely(res != IRQ_HANDLED))
+   return;
+
+   /*
+* A clever compiler translates that into INC. A not so clever one
+* should at least prevent store tearing.
+*/
+   WRITE_ONCE(priv->pmu.irq_count, priv->pmu.irq_count + 1);
+}
+
 typedef bool (*long_pulse_detect_func)(enum hpd_pin pin, u32 val);
 
 static const u32 hpd_ilk[HPD_NUM_PINS] = {
@@ -1599,6 +1617,8 @@ static irqreturn_t valleyview_irq_handle
valleyview_pipestat_irq_handler(dev_priv, pipe_stats);
} while (0);
 
+   pmu_irq_stats(dev_priv, ret);
+
	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
return ret;
@@ -1676,6 +1696,8 @@ static irqreturn_t cherryview_irq_handle
valleyview_pipestat_irq_handler(dev_priv, pipe_stats);
} while (0);
 
+   pmu_irq_stats(dev_priv, ret);
+
	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
return ret;
@@ -2103,6 +2125,8 @@ static irqreturn_t ilk_irq_handler(int i
if (sde_ier)
raw_reg_write(regs, SDEIER, sde_ier);
 
+   pmu_irq_stats(i915, ret);
+
/* IRQs are synced during runtime_suspend, we don't require a wakeref */
	enable_rpm_wakeref_asserts(&i915->runtime_pm);
 
@@ -2419,6 +2443,8 @@ static irqreturn_t gen8_irq_handler(int
 
gen8_master_intr_enable(regs);
 
+   pmu_irq_stats(dev_priv, IRQ_HANDLED);
+
return IRQ_HANDLED;
 }
 
@@ -2514,6 +2540,8 @@ static __always_inline irqreturn_t
 
gen11_gu_misc_irq_handler(gt, gu_misc_iir);
 
+   pmu_irq_stats(i915, IRQ_HANDLED);
+
return IRQ_HANDLED;
 }
 
@@ -3688,6 +3716,8 @@ static irqreturn_t i8xx_irq_handler(int
i8xx_pipestat_irq_handler(dev_priv, iir, pipe_stats);
} while (0);
 
+   pmu_irq_stats(dev_priv, ret);
+
	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
return ret;
@@ -3796,6 +3826,8 @@ static irqreturn_t i915_irq_handler(int
i915_pipestat_irq_handler(dev_priv, iir, pipe_stats);
} while (0);
 
+   pmu_irq_stats(dev_priv, ret);
+
	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
return ret;
@@ -3941,6 +3973,8 @@ static irqreturn_t i965_irq_handler(int
i965_pipestat_irq_handler(dev_priv, iir, pipe_stats);
} while (0);
 
+   pmu_irq_stats(dev_priv, IRQ_HANDLED);
+
	enable_rpm_wakeref_asserts(&dev_priv->runtime_pm);
 
return ret;
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample(
return HRTIMER_RESTART;
 }
 
-static u64 count_interrupts(struct drm_i915_private *i915)
-{
-   /* open-coded kstat_irqs() */
-   struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq);
-   u64 sum = 0;
-   int cpu;
-
-   if (!desc || !desc->kstat_irqs)
-   return 0;
-
-   for_each_possible_cpu(cpu)
-   sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
-
-   return sum;
-}
-
 static void i915_pmu_event_destroy(struct perf_event *event)
 {
struct drm_i915_private *i915 =
@@ -581,7 +565,7 @@ static u64 __i915_pmu_event_read(struct
   USEC_PER_SEC /* to MHz */);
break;
case I915_PMU_INTERRUPTS:
-   val = count_interrupts(i915);
+   val = READ_ONCE(pmu->irq_count);

Re: [PATCH v3 3/4] tpm_tis: Disable interrupts if interrupt storm detected

2020-12-07 Thread Thomas Gleixner
Jerry,

On Fri, Dec 04 2020 at 18:43, Jerry Snitselaar wrote:
> @@ -715,9 +717,23 @@ static irqreturn_t tis_int_handler(int dummy, void 
> *dev_id)
>  {
>   struct tpm_chip *chip = dev_id;
>   struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev);
> + static bool check_storm = true;
> + static unsigned int check_start;

So this assumes that there can't be two TPMs which is probably true, but
everything else in this driver has stuff in tpm_tis_data per device.

>   u32 interrupt;
>   int i, rc;
>  
> + if (unlikely(check_storm)) {
> + if (!check_start) {
> + check_start = jiffies_to_msecs(jiffies);

Yuck. I had to read that twice to figure out that it's correct vs. the
truncation of the result to unsigned int. You can spare that conversion
by simply doing

   unsigned long end_of_check = jiffies + HZ / 2;

and then the check becomes

time_before(jiffies, end_of_check)

> + } else if ((kstat_irqs(priv->irq) > 1000) &&
> +(jiffies_to_msecs(jiffies) - check_start < 500)) {

I assume you can't call disable_irq_nosync() here, but shouldn't this
shut up the interrupt at the TPM level right here?

> + check_storm = false;
> + schedule_work(&priv->storm_work);
> + } else if (jiffies_to_msecs(jiffies) - check_start >= 500) {
> + check_storm = false;
> + }
> + }

So back to kstat_irqs(). As this needs two extra variables anyway:

init()
priv->irq_check = 1;
priv->end_check = 0;

isr()
if (unlikely(priv->irq_check)) {
if (!priv->end_check) {
priv->end_check = jiffies + HZ / 2;
} else if (time_before(jiffies, priv->end_check)) {
if (priv->irq_check++ > 1000)
schedule_work(...);
} else {
priv->irq_check = 0;
}
}

Hmm? I still need to see an argument for an kstat_irqs() export being
superior.

Though I wonder whether such an infrastructure should be provided in the
irq core. Let me think about it.
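
For completeness, a compilable variant of that sketch. irq_check and
end_check are illustrative member names only, not existing tpm_tis_data
fields; storm_work is the work item from the patch above:

	/* init, e.g. in tpm_tis_core_init() */
	priv->irq_check = 1;
	priv->end_check = 0;

	/* interrupt handler */
	if (unlikely(priv->irq_check)) {
		if (!priv->end_check) {
			/* First interrupt opens a 500ms observation window */
			priv->end_check = jiffies + HZ / 2;
		} else if (time_before(jiffies, priv->end_check)) {
			/* Inside the window: count and react to a storm */
			if (++priv->irq_check > 1000) {
				priv->irq_check = 0;
				schedule_work(&priv->storm_work);
			}
		} else {
			/* Window expired without a storm: stop checking */
			priv->irq_check = 0;
		}
	}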

Just as a side note. I was looking at tpm_tis_probe_irq_single() and
that function is leaking the interrupt request if any of the checks
afterwards fails, except for the final interrupt probe check which does
a cleanup. That means on fail before that the interrupt handler stays
requested up to the point where the module is removed. If that's a
shared interrupt and some other device is active on the same line, then
each interrupt from that device will call into the TPM code. Something
like the below is needed.

Also the X86 autoprobe mechanism is interesting:

if (IS_ENABLED(CONFIG_X86))
for (i = 3; i <= 15; i++)
if (!tpm_tis_probe_irq_single(chip, intmask, 0, i))
return;

The third argument is 'flags' which is handed to request_irq(). So that
won't ever be able to probe a shared interrupt. But if an interrupt
number > 0 is handed to tpm_tis_core_init() the interrupt is requested
with IRQF_SHARED. Same issue when the chip has an interrupt number in
the register. It's also requested exclusive which is pretty likely
to fail on ancient x86 machines.

The vast amount of comments didn't help to figure out what the reasoning
is.

Thanks,

tglx
---
 drivers/char/tpm/tpm_tis_core.c |   14 +-
 1 file changed, 9 insertions(+), 5 deletions(-)

--- a/drivers/char/tpm/tpm_tis_core.c
+++ b/drivers/char/tpm/tpm_tis_core.c
@@ -782,26 +782,26 @@ static int tpm_tis_probe_irq_single(stru
rc = tpm_tis_read8(priv, TPM_INT_VECTOR(priv->locality),
   &original_int_vec);
if (rc < 0)
-   return rc;
+   goto fail;
 
rc = tpm_tis_write8(priv, TPM_INT_VECTOR(priv->locality), irq);
if (rc < 0)
-   return rc;
+   goto fail;
 
rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &int_status);
if (rc < 0)
-   return rc;
+   goto fail;
 
/* Clear all existing */
rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), int_status);
if (rc < 0)
-   return rc;
+   goto fail;
 
/* Turn on */
rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality),
 intmask | TPM_GLOBAL_INT_ENABLE);
if (rc < 0)
-   return rc;
+   goto fail;
 
priv->irq_tested = false;
 
@@ -825,6 +825,10 @@ static int tpm_tis_probe_irq_single(stru
}
 
return 0;
+
+fail:
+   disable_interrupts(chip);
+   return rc;
 }
 
 /* Try to find the IRQ the TPM is using. This is for legacy x86 systems that
___
dri-devel mailing list
dri-devel@lists.freedesktop.org

Re: [PATCH v3 2/4] drm/i915/pmu: Use kstat_irqs to get interrupt count

2020-12-07 Thread Thomas Gleixner
On Fri, Dec 04 2020 at 18:43, Jerry Snitselaar wrote:

> Now that kstat_irqs is exported, get rid of count_interrupts in
> i915_pmu.c
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -423,22 +423,6 @@ static enum hrtimer_restart i915_sample(struct hrtimer 
> *hrtimer)
>   return HRTIMER_RESTART;
>  }
>  
> -static u64 count_interrupts(struct drm_i915_private *i915)
> -{
> - /* open-coded kstat_irqs() */
> - struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq);
> - u64 sum = 0;
> - int cpu;
> -
> - if (!desc || !desc->kstat_irqs)
> - return 0;
> -
> - for_each_possible_cpu(cpu)
> - sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
> -
> - return sum;
> -}

May I ask why this has been merged in the first place?

Nothing in a driver has ever to fiddle with the internals of an irq
descriptor. We have functions for properly accessing them. Just because
C allows to fiddle with everything is not a justification. If the
required function is not exported then adding the export with a proper
explanation is not asked too much.

Also this lacks protection or at least a comment why this can be called
safely and is not subject to a concurrent removal of the irq descriptor.
The same problem exists when calling kstat_irqs(). It's even documented
at the top of the function.

Thanks,

tglx


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v3 1/4] irq: export kstat_irqs

2020-12-07 Thread Thomas Gleixner
On Sun, Dec 06 2020 at 09:40, James Bottomley wrote:
> On Sun, 2020-12-06 at 17:40 +0100, Thomas Gleixner wrote:
>> On Sat, Dec 05 2020 at 12:39, Jarkko Sakkinen wrote:
>> > On Fri, Dec 04, 2020 at 06:43:37PM -0700, Jerry Snitselaar wrote:
>> > > To try and detect potential interrupt storms that
>> > > have been occurring with tpm_tis devices it was suggested
>> > > to use kstat_irqs() to get the number of interrupts.
>> > > Since tpm_tis can be built as a module it needs kstat_irqs
>> > > exported.
>> > 
>> > I think you should also have a paragraph explicitly stating that
>> > i915_pmu.c contains a duplicate of kstat_irqs() because it is not
>> > exported as of today. It adds a lot more weight to this given that
>> > there is already existing mainline usage (kind of).
>> 
>> It's abusage and just the fact that it exists is not an argument by
>> itself.
>
> What we want is a count of the interrupts to see if we're having an
> interrupt storm from the TPM device (some seem to be wired to fire the
> interrupt even when there's no event to warrant it).  Since
> kstat_irqs_user() does the correct RCU locking, should we be using that
> instead?

If we need to export it, yes. But I still have to understand the
value. See my other reply.

Thanks,

tglx
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v3 2/4] drm/i915/pmu: Use kstat_irqs to get interrupt count

2020-12-07 Thread Thomas Gleixner
On Sun, Dec 06 2020 at 17:38, Thomas Gleixner wrote:
> On Fri, Dec 04 2020 at 18:43, Jerry Snitselaar wrote:
>> Now that kstat_irqs is exported, get rid of count_interrupts in
>> i915_pmu.c
>
> May I ask why this has been merged in the first place?
>
> Nothing in a driver has ever to fiddle with the internals of an irq
> descriptor. We have functions for properly accessing them. Just because
> C allows to fiddle with everything is not a justification. If the
> required function is not exported then adding the export with a proper
> explanation is not asked too much.
>
> Also this lacks protection or at least a comment why this can be called
> safely and is not subject to a concurrent removal of the irq descriptor.
> The same problem exists when calling kstat_irqs(). It's even documented
> at the top of the function.

And as pointed out vs. that TPM thing this really could have been a
trivial

i915->irqs++;

in the interrupt handler and a read of that instead of iterating over
all possible cpus and summing it up. Oh well...
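
For comparison, the driver local variant boils down to this sketch, which
mirrors what the pmu_irq_stats() patch above ends up doing with
pmu.irq_count:

	/* interrupt handler, i915 private counter */
	WRITE_ONCE(i915->pmu.irq_count, i915->pmu.irq_count + 1);

	/* PMU read side */
	case I915_PMU_INTERRUPTS:
		val = READ_ONCE(i915->pmu.irq_count);
		break;

No irq descriptor lookup, no per CPU summation, no questions about the
lifetime of the descriptor.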

Thanks,

tglx
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v3 1/4] irq: export kstat_irqs

2020-12-07 Thread Thomas Gleixner
On Sat, Dec 05 2020 at 12:39, Jarkko Sakkinen wrote:
> On Fri, Dec 04, 2020 at 06:43:37PM -0700, Jerry Snitselaar wrote:
>> To try and detect potential interrupt storms that
>> have been occurring with tpm_tis devices it was suggested
>> to use kstat_irqs() to get the number of interrupts.
>> Since tpm_tis can be built as a module it needs kstat_irqs
>> exported.
>
> I think you should also have a paragraph explicitly stating that
> i915_pmu.c contains a duplicate of kstat_irqs() because it is not
> exported as of today. It adds a lot more weight to this given that
> there is already existing mainline usage (kind of).

It's abusage and just the fact that it exists is not an argument by
itself.

Thanks,

tglx
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v3 2/4] drm/i915/pmu: Use kstat_irqs to get interrupt count

2020-12-07 Thread Thomas Gleixner
On Sun, Dec 06 2020 at 14:47, Jerry Snitselaar wrote:
> Thomas Gleixner @ 2020-12-06 09:38 MST:
>
> I don't know the history behind this bit. I stumbled across it in cscope
> when looking for places using kstat_irqs.

I'm not ranting at you. The i915 people are on Cc.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v3 1/4] irq: export kstat_irqs

2020-12-07 Thread Thomas Gleixner
Jerry,

On Fri, Dec 04 2020 at 18:43, Jerry Snitselaar wrote:

The proper prefix is 'genirq:' git log kernel/irq/irqdesc.c would have
told you. 

> To try and detect potential interrupt storms that
> have been occurring with tpm_tis devices it was suggested
> to use kstat_irqs() to get the number of interrupts.
> Since tpm_tis can be built as a module it needs kstat_irqs
> exported.

I'm not really enthused about exporting this without making it at least
safe. Using it from an interrupt handler is obviously safe vs. concurrent
removal, but the next driver writer who thinks this is cool is going to
get it wrong for sure.

Though I still have to figure out what the advantage of invoking a
function which needs to do a radix tree lookup over a device local
counter is just to keep track of this.

I'll reply on the TPM part of this as well.

Thanks,

tglx
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [patch V3 10/37] ARM: highmem: Switch to generic kmap atomic

2020-11-13 Thread Thomas Gleixner
Marek,

On Thu, Nov 12 2020 at 09:10, Marek Szyprowski wrote:
> On 03.11.2020 10:27, Thomas Gleixner wrote:
>
> I can do more tests to help fixing this issue. Just let me know what to do.

Just sent out the fix before I saw your report.

 https://lore.kernel.org/r/87y2j6n8mj@nanos.tec.linutronix.de

Thanks,

tglx
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH RFC PKS/PMEM 05/58] kmap: Introduce k[un]map_thread

2020-11-10 Thread Thomas Gleixner
On Mon, Nov 09 2020 at 20:59, Ira Weiny wrote:
> On Tue, Nov 10, 2020 at 02:13:56AM +0100, Thomas Gleixner wrote:
> Also, we can convert the new memcpy_*_page() calls to kmap_local() as well.
> [For now my patch just uses kmap_atomic().]
>
> I've not looked at all of the patches in your latest version.  Have you
> included converting any of the kmap() call sites?  I thought you were more
> focused on converting the kmap_atomic() to kmap_local()?

I did not touch any of those yet, but it's a logical consequence to
convert all kmap() instances which are _not_ creating a global mapping
over to it.

Thanks,

tglx

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH RFC PKS/PMEM 05/58] kmap: Introduce k[un]map_thread

2020-11-10 Thread Thomas Gleixner
Ira,

On Fri, Oct 09 2020 at 12:49, ira weiny wrote:
> From: Ira Weiny 
>
> To correctly support the semantics of kmap() with Kernel protection keys
> (PKS), kmap() may be required to set the protections on multiple
> processors (globally).  Enabling PKS globally can be very expensive
> depending on the requested operation.  Furthermore, enabling a domain
> globally reduces the protection afforded by PKS.
>
> Most kmap() (Aprox 209 of 229) callers use the map within a single thread and
> have no need for the protection domain to be enabled globally.  However, the
> remaining callers do not follow this pattern and, as best I can tell, expect
> the mapping to be 'global' and available to any thread who may access the
> mapping.[1]
>
> We don't anticipate global mappings to pmem, however in general there is a
> danger in changing the semantics of kmap().  Effectively, this would cause an
> unresolved page fault with little to no information about why the failure
> occurred.
>
> To resolve this a number of options were considered.
>
> 1) Attempt to change all the thread local kmap() calls to kmap_atomic()[2]
> 2) Introduce a flags parameter to kmap() to indicate if the mapping should be
>global or not
> 3) Change ~20 call sites to 'kmap_global()' to indicate that they require a
>global enablement of the pages.
> 4) Change ~209 call sites to 'kmap_thread()' to indicate that the mapping is 
> to
>be used within that thread of execution only
>
> Option 1 is simply not feasible.  Option 2 would require all of the call sites
> of kmap() to change.  Option 3 seems like a good minimal change but there is a
> danger that new code may miss the semantic change of kmap() and not get the
> behavior the developer intended.  Therefore, #4 was chosen.

There is Option #5:

Convert the thread local kmap() invocations to the proposed kmap_local()
interface which is coming along [1].

That solves a couple of issues:

 1) It relieves the current kmap_atomic() usage sites from the implict
pagefault/preempt disable semantics which apply even when
CONFIG_HIGHMEM is disabled. kmap_local() still can be invoked from
atomic context.

 2) Due to #1 it allows to replace the conditional usage of kmap() and
kmap_atomic() for purely thread local mappings.

 3) It puts the burden on the HIGHMEM inflicted systems

 4) It is actually more efficient for most of the pure thread local use
cases on HIGHMEM inflicted systems because it avoids the overhead of
the global lock and the potential kmap slot exhaustion. A potential
preemption will be more expensive, but that's not really the case we
want to optimize for.

 5) It solves the RT issue vs. kmap_atomic()

So instead of creating yet another variety of kmap() which is just
scratching the particular PKRS itch, can we please consolidate all of
that on the wider reaching kmap_local() approach?
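
As a concrete example, the conversion pattern for such a thread local user
would be along these lines (sketch only, using the kmap_local_page() /
kunmap_local() helpers from [1]):

	/* before: kmap() or kmap_atomic() with implicit side effects */
	addr = kmap(page);
	memcpy(addr + offset, buf, len);
	kunmap(page);

	/* after: thread local, preemptible, valid only in this context */
	addr = kmap_local_page(page);
	memcpy(addr + offset, buf, len);
	kunmap_local(addr);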

Thanks,

tglx
 
[1] https://lore.kernel.org/lkml/20201103092712.714480...@linutronix.de/

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 19/37] mm/highmem: Remove the old kmap_atomic cruft

2020-11-04 Thread Thomas Gleixner
All users gone.

Signed-off-by: Thomas Gleixner 
---
 include/linux/highmem.h |   63 +++-
 mm/highmem.c|7 -
 2 files changed, 5 insertions(+), 65 deletions(-)

--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -86,31 +86,16 @@ static inline void kunmap(struct page *p
  * be used in IRQ contexts, so in some (very limited) cases we need
  * it.
  */
-
-#ifndef CONFIG_KMAP_LOCAL
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
-void kunmap_atomic_high(void *kvaddr);
-
 static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
preempt_disable();
pagefault_disable();
-   if (!PageHighMem(page))
-   return page_address(page);
-   return kmap_atomic_high_prot(page, prot);
-}
-
-static inline void __kunmap_atomic(void *vaddr)
-{
-   kunmap_atomic_high(vaddr);
+   return __kmap_local_page_prot(page, prot);
 }
-#else /* !CONFIG_KMAP_LOCAL */
 
-static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+static inline void *kmap_atomic(struct page *page)
 {
-   preempt_disable();
-   pagefault_disable();
-   return __kmap_local_page_prot(page, prot);
+   return kmap_atomic_prot(page, kmap_prot);
 }
 
 static inline void *kmap_atomic_pfn(unsigned long pfn)
@@ -125,13 +110,6 @@ static inline void __kunmap_atomic(void
kunmap_local_indexed(addr);
 }
 
-#endif /* CONFIG_KMAP_LOCAL */
-
-static inline void *kmap_atomic(struct page *page)
-{
-   return kmap_atomic_prot(page, kmap_prot);
-}
-
 /* declarations for linux/mm/highmem.c */
 unsigned int nr_free_highpages(void);
 extern atomic_long_t _totalhigh_pages;
@@ -212,41 +190,8 @@ static inline void __kunmap_atomic(void
 
 #define kmap_flush_unused()do {} while(0)
 
-#endif /* CONFIG_HIGHMEM */
-
-#if !defined(CONFIG_KMAP_LOCAL)
-#if defined(CONFIG_HIGHMEM)
-
-DECLARE_PER_CPU(int, __kmap_atomic_idx);
-
-static inline int kmap_atomic_idx_push(void)
-{
-   int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1;
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-   WARN_ON_ONCE(in_irq() && !irqs_disabled());
-   BUG_ON(idx >= KM_TYPE_NR);
-#endif
-   return idx;
-}
-
-static inline int kmap_atomic_idx(void)
-{
-   return __this_cpu_read(__kmap_atomic_idx) - 1;
-}
 
-static inline void kmap_atomic_idx_pop(void)
-{
-#ifdef CONFIG_DEBUG_HIGHMEM
-   int idx = __this_cpu_dec_return(__kmap_atomic_idx);
-
-   BUG_ON(idx < 0);
-#else
-   __this_cpu_dec(__kmap_atomic_idx);
-#endif
-}
-#endif
-#endif
+#endif /* CONFIG_HIGHMEM */
 
 /*
  * Prevent people trying to call kunmap_atomic() as if it were kunmap()
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -31,12 +31,6 @@
 #include 
 #include 
 
-#ifndef CONFIG_KMAP_LOCAL
-#ifdef CONFIG_HIGHMEM
-DEFINE_PER_CPU(int, __kmap_atomic_idx);
-#endif
-#endif
-
 /*
  * Virtual_count is not a pure "count".
  *  0 means that it is not mapped, and has not been mapped
@@ -410,6 +404,7 @@ static inline void kmap_local_idx_pop(vo
 #ifndef arch_kmap_local_post_map
 # define arch_kmap_local_post_map(vaddr, pteval)   do { } while (0)
 #endif
+
 #ifndef arch_kmap_local_pre_unmap
 # define arch_kmap_local_pre_unmap(vaddr)  do { } while (0)
 #endif

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 02/37] highmem: Remove unused functions

2020-11-04 Thread Thomas Gleixner
Nothing uses totalhigh_pages_dec() and totalhigh_pages_set().

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 include/linux/highmem.h |   10 --
 1 file changed, 10 deletions(-)

--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -104,21 +104,11 @@ static inline void totalhigh_pages_inc(v
atomic_long_inc(&_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_dec(void)
-{
-   atomic_long_dec(&_totalhigh_pages);
-}
-
 static inline void totalhigh_pages_add(long count)
 {
atomic_long_add(count, &_totalhigh_pages);
 }
 
-static inline void totalhigh_pages_set(long val)
-{
-   atomic_long_set(&_totalhigh_pages, val);
-}
-
 void kmap_flush_unused(void);
 
 struct page *kmap_to_page(void *addr);

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 03/37] fs: Remove asm/kmap_types.h includes

2020-11-04 Thread Thomas Gleixner
Historical leftovers from the time where kmap() had fixed slots.

Signed-off-by: Thomas Gleixner 
Cc: Alexander Viro 
Cc: Benjamin LaHaise 
Cc: linux-fsde...@vger.kernel.org
Cc: linux-...@kvack.org
Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Cc: linux-bt...@vger.kernel.org
---
 fs/aio.c |1 -
 fs/btrfs/ctree.h |1 -
 2 files changed, 2 deletions(-)

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -43,7 +43,6 @@
 #include 
 #include 
 
-#include <asm/kmap_types.h>
 #include 
 #include 
 
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -17,7 +17,6 @@
 #include 
 #include 
 #include 
-#include <asm/kmap_types.h>
 #include 
 #include 
 #include 

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 26/37] io-mapping: Provide iomap_local variant

2020-11-04 Thread Thomas Gleixner
Similar to kmap local provide a iomap local variant which only disables
migration, but neither disables pagefaults nor preemption.

Signed-off-by: Thomas Gleixner 
---
V3: Restrict migrate disable to the 32bit mapping case and update documentation.

V2: Split out from the large combo patch and add the !IOMAP_ATOMIC variants
---
 Documentation/driver-api/io-mapping.rst |   76 +++-
 include/linux/io-mapping.h  |   30 +++-
 2 files changed, 74 insertions(+), 32 deletions(-)

--- a/Documentation/driver-api/io-mapping.rst
+++ b/Documentation/driver-api/io-mapping.rst
@@ -20,55 +20,71 @@ as it would consume too much of the kern
 mappable, while 'size' indicates how large a mapping region to
 enable. Both are in bytes.
 
-This _wc variant provides a mapping which may only be used
-with the io_mapping_map_atomic_wc or io_mapping_map_wc.
+This _wc variant provides a mapping which may only be used with
+io_mapping_map_atomic_wc(), io_mapping_map_local_wc() or
+io_mapping_map_wc().
+
+With this mapping object, individual pages can be mapped either temporarily
+or long term, depending on the requirements. Of course, temporary maps are
+more efficient. They come in two flavours::
 
-With this mapping object, individual pages can be mapped either atomically
-or not, depending on the necessary scheduling environment. Of course, atomic
-maps are more efficient::
+   void *io_mapping_map_local_wc(struct io_mapping *mapping,
+ unsigned long offset)
 
void *io_mapping_map_atomic_wc(struct io_mapping *mapping,
   unsigned long offset)
 
-'offset' is the offset within the defined mapping region.
-Accessing addresses beyond the region specified in the
-creation function yields undefined results. Using an offset
-which is not page aligned yields an undefined result. The
-return value points to a single page in CPU address space.
-
-This _wc variant returns a write-combining map to the
-page and may only be used with mappings created by
-io_mapping_create_wc
+'offset' is the offset within the defined mapping region.  Accessing
+addresses beyond the region specified in the creation function yields
+undefined results. Using an offset which is not page aligned yields an
+undefined result. The return value points to a single page in CPU address
+space.
 
-Note that the task may not sleep while holding this page
-mapped.
+This _wc variant returns a write-combining map to the page and may only be
+used with mappings created by io_mapping_create_wc()
 
-::
+Temporary mappings are only valid in the context of the caller. The mapping
+is not guaranteed to be globally visible.
 
-   void io_mapping_unmap_atomic(void *vaddr)
+io_mapping_map_local_wc() has a side effect on X86 32bit as it disables
+migration to make the mapping code work. No caller can rely on this side
+effect.
+
+io_mapping_map_atomic_wc() has the side effect of disabling preemption and
+pagefaults. Don't use in new code. Use io_mapping_map_local_wc() instead.
 
-'vaddr' must be the value returned by the last
-io_mapping_map_atomic_wc call. This unmaps the specified
-page and allows the task to sleep once again.
+Nested mappings need to be undone in reverse order because the mapping
+code uses a stack for keeping track of them::
 
-If you need to sleep while holding the lock, you can use the non-atomic
-variant, although they may be significantly slower.
+ addr1 = io_mapping_map_local_wc(map1, offset1);
+ addr2 = io_mapping_map_local_wc(map2, offset2);
+ ...
+ io_mapping_unmap_local(addr2);
+ io_mapping_unmap_local(addr1);
 
-::
+The mappings are released with::
+
+   void io_mapping_unmap_local(void *vaddr)
+   void io_mapping_unmap_atomic(void *vaddr)
+
+'vaddr' must be the value returned by the last io_mapping_map_local_wc() or
+io_mapping_map_atomic_wc() call. This unmaps the specified mapping and
+undoes the side effects of the mapping functions.
+
+If you need to sleep while holding a mapping, you can use the regular
+variant, although this may be significantly slower::
 
void *io_mapping_map_wc(struct io_mapping *mapping,
unsigned long offset)
 
-This works like io_mapping_map_atomic_wc except it allows
-the task to sleep while holding the page mapped.
-
+This works like io_mapping_map_atomic/local_wc() except it has no side
+effects and the pointer is globally visible.
 
-::
+The mappings are released with::
 
void io_mapping_unmap(void *vaddr)
 
-This works like io_mapping_unmap_atomic, except it is used
-for pages mapped with io_mapping_map_wc.
+Use for pages mapped with io_mapping_map_wc().
 
 At driver close time, the io_mapping object must be freed::
 
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -83,6 +83,21 @@ io_mapping_unmap_atomic(void __iomem *va
 }
 
 static inline void __iomem *
+io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset)

Re: [patch V3 22/37] highmem: High implementation details and document API

2020-11-04 Thread Thomas Gleixner
On Tue, Nov 03 2020 at 09:48, Linus Torvalds wrote:
> I have no complaints about the patch, but it strikes me that if people
> want to actually have much better debug coverage, this is where it
> should be (I like the "every other address" thing too, don't get me
> wrong).
>
> In particular, instead of these PageHighMem(page) tests, I think
> something like this would be better:
>
>#ifdef CONFIG_DEBUG_HIGHMEM
>  #define page_use_kmap(page) ((page),1)
>#else
>  #define page_use_kmap(page) PageHighMem(page)
>#endif
>
> adn then replace those "if (!PageHighMem(page))" tests with "if
> (!page_use_kmap())" instead.
>
> IOW, in debug mode, it would _always_ remap the page, whether it's
> highmem or not. That would really stress the highmem code and find any
> fragilities.

Yes, that makes a lot of sense. We just have to avoid that for the
architectures with aliasing issues.

> Anyway, this is all sepatrate from the series, which still looks fine
> to me. Just a reaction to seeing the patch, and Thomas' earlier
> mention that the highmem debugging doesn't actually do much.

Right, forcing it for both kmap and kmap_local is straight forward. I'll
cook a patch on top for that.

Thanks,

tglx


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 34/37] drm/qxl: Replace io_mapping_map_atomic_wc()

2020-11-04 Thread Thomas Gleixner
None of these mapping requires the side effect of disabling pagefaults and
preemption.

Use io_mapping_map_local_wc() instead, rename the related functions
accordingly and clean up qxl_process_single_command() to use a plain
copy_from_user() as the local maps are not disabling pagefaults.

Signed-off-by: Thomas Gleixner 
Cc: Dave Airlie 
Cc: Gerd Hoffmann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: virtualizat...@lists.linux-foundation.org
Cc: spice-de...@lists.freedesktop.org
---
V3: New patch
---
 drivers/gpu/drm/qxl/qxl_image.c   |   18 +-
 drivers/gpu/drm/qxl/qxl_ioctl.c   |   27 +--
 drivers/gpu/drm/qxl/qxl_object.c  |   12 ++--
 drivers/gpu/drm/qxl/qxl_object.h  |4 ++--
 drivers/gpu/drm/qxl/qxl_release.c |4 ++--
 5 files changed, 32 insertions(+), 33 deletions(-)

--- a/drivers/gpu/drm/qxl/qxl_image.c
+++ b/drivers/gpu/drm/qxl/qxl_image.c
@@ -124,12 +124,12 @@ qxl_image_init_helper(struct qxl_device
  wrong (check the bitmaps are sent correctly
  first) */
 
-   ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 0);
+   ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 0);
chunk = ptr;
chunk->data_size = height * chunk_stride;
chunk->prev_chunk = 0;
chunk->next_chunk = 0;
-   qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr);
 
{
void *k_data, *i_data;
@@ -143,7 +143,7 @@ qxl_image_init_helper(struct qxl_device
i_data = (void *)data;
 
while (remain > 0) {
-   ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 
page << PAGE_SHIFT);
+   ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 
page << PAGE_SHIFT);
 
if (page == 0) {
chunk = ptr;
@@ -157,7 +157,7 @@ qxl_image_init_helper(struct qxl_device
 
memcpy(k_data, i_data, size);
 
-   qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr);
i_data += size;
remain -= size;
page++;
@@ -175,10 +175,10 @@ qxl_image_init_helper(struct qxl_device
page_offset = 
offset_in_page(out_offset);
size = min((int)(PAGE_SIZE - 
page_offset), remain);
 
-   ptr = qxl_bo_kmap_atomic_page(qdev, 
chunk_bo, page_base);
+   ptr = qxl_bo_kmap_local_page(qdev, 
chunk_bo, page_base);
k_data = ptr + page_offset;
memcpy(k_data, i_data, size);
-   qxl_bo_kunmap_atomic_page(qdev, 
chunk_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, 
chunk_bo, ptr);
remain -= size;
i_data += size;
out_offset += size;
@@ -189,7 +189,7 @@ qxl_image_init_helper(struct qxl_device
qxl_bo_kunmap(chunk_bo);
 
image_bo = dimage->bo;
-   ptr = qxl_bo_kmap_atomic_page(qdev, image_bo, 0);
+   ptr = qxl_bo_kmap_local_page(qdev, image_bo, 0);
image = ptr;
 
image->descriptor.id = 0;
@@ -212,7 +212,7 @@ qxl_image_init_helper(struct qxl_device
break;
default:
DRM_ERROR("unsupported image bit depth\n");
-   qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, image_bo, ptr);
return -EINVAL;
}
image->u.bitmap.flags = QXL_BITMAP_TOP_DOWN;
@@ -222,7 +222,7 @@ qxl_image_init_helper(struct qxl_device
image->u.bitmap.palette = 0;
image->u.bitmap.data = qxl_bo_physical_address(qdev, chunk_bo, 0);
 
-   qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr);
+   qxl_bo_kunmap_local_page(qdev, image_bo, ptr);
 
return 0;
 }
--- a/drivers/gpu/drm/qxl/qxl_ioctl.c
+++ b/drivers/gpu/drm/qxl/qxl_ioctl.c
@@ -89,11 +89,11 @@ apply_reloc(struct qxl_device *qdev, str
 {
void *reloc_page;
 
-   reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, 
info->dst_offset & PAGE_MASK);
+   reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, 
info->dst_offset & PAGE_MASK);
*(uint64_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = 
qxl_bo_physical_address(qdev,

  info->src_bo,

[patch V3 10/37] ARM: highmem: Switch to generic kmap atomic

2020-11-04 Thread Thomas Gleixner
No reason having the same code in every architecture.

Signed-off-by: Thomas Gleixner 
Cc: Russell King 
Cc: Arnd Bergmann 
Cc: linux-arm-ker...@lists.infradead.org
---
V3: Remove the kmap types cruft
---
 arch/arm/Kconfig  |1 
 arch/arm/include/asm/fixmap.h |4 -
 arch/arm/include/asm/highmem.h|   33 +++---
 arch/arm/include/asm/kmap_types.h |   10 ---
 arch/arm/mm/Makefile  |1 
 arch/arm/mm/highmem.c |  121 --
 6 files changed, 26 insertions(+), 144 deletions(-)

--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1498,6 +1498,7 @@ config HAVE_ARCH_PFN_VALID
 config HIGHMEM
bool "High Memory Support"
depends on MMU
+   select KMAP_LOCAL
help
  The address space of ARM processors is only 4 Gigabytes large
  and it has to accommodate user address space, kernel address
--- a/arch/arm/include/asm/fixmap.h
+++ b/arch/arm/include/asm/fixmap.h
@@ -7,14 +7,14 @@
 #define FIXADDR_TOP(FIXADDR_END - PAGE_SIZE)
 
 #include 
-#include <asm/kmap_types.h>
+#include <asm/kmap_size.h>
 
 enum fixed_addresses {
FIX_EARLYCON_MEM_BASE,
__end_of_permanent_fixed_addresses,
 
FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses,
-   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
+   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
 
/* Support writing RO kernel text via kprobes, jump labels, etc. */
FIX_TEXT_POKE0,
--- a/arch/arm/include/asm/highmem.h
+++ b/arch/arm/include/asm/highmem.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_HIGHMEM_H
 #define _ASM_HIGHMEM_H
 
-#include <asm/kmap_types.h>
+#include <asm/kmap_size.h>
 
 #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE)
 #define LAST_PKMAP PTRS_PER_PTE
@@ -46,19 +46,32 @@ extern pte_t *pkmap_page_table;
 
 #ifdef ARCH_NEEDS_KMAP_HIGH_GET
 extern void *kmap_high_get(struct page *page);
-#else
+
+static inline void *arch_kmap_local_high_get(struct page *page)
+{
+   if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt())
+   return NULL;
+   return kmap_high_get(page);
+}
+#define arch_kmap_local_high_get arch_kmap_local_high_get
+
+#else /* ARCH_NEEDS_KMAP_HIGH_GET */
 static inline void *kmap_high_get(struct page *page)
 {
return NULL;
 }
-#endif
+#endif /* !ARCH_NEEDS_KMAP_HIGH_GET */
 
-/*
- * The following functions are already defined by 
- * when CONFIG_HIGHMEM is not set.
- */
-#ifdef CONFIG_HIGHMEM
-extern void *kmap_atomic_pfn(unsigned long pfn);
-#endif
+#define arch_kmap_local_post_map(vaddr, pteval)
\
+   local_flush_tlb_kernel_page(vaddr)
+
+#define arch_kmap_local_pre_unmap(vaddr)   \
+do {   \
+   if (cache_is_vivt())\
+   __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); \
+} while (0)
+
+#define arch_kmap_local_post_unmap(vaddr)  \
+   local_flush_tlb_kernel_page(vaddr)
 
 #endif
--- a/arch/arm/include/asm/kmap_types.h
+++ /dev/null
@@ -1,10 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __ARM_KMAP_TYPES_H
-#define __ARM_KMAP_TYPES_H
-
-/*
- * This is the "bare minimum".  AIO seems to require this.
- */
-#define KM_TYPE_NR 16
-
-#endif
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_MODULES) += proc-syms.o
 obj-$(CONFIG_DEBUG_VIRTUAL)+= physaddr.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
-obj-$(CONFIG_HIGHMEM)  += highmem.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o
 
--- a/arch/arm/mm/highmem.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * arch/arm/mm/highmem.c -- ARM highmem support
- *
- * Author: Nicolas Pitre
- * Created:september 8, 2008
- * Copyright:  Marvell Semiconductors Inc.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include "mm.h"
-
-static inline void set_fixmap_pte(int idx, pte_t pte)
-{
-   unsigned long vaddr = __fix_to_virt(idx);
-   pte_t *ptep = virt_to_kpte(vaddr);
-
-   set_pte_ext(ptep, pte, 0);
-   local_flush_tlb_kernel_page(vaddr);
-}
-
-static inline pte_t get_fixmap_pte(unsigned long vaddr)
-{
-   pte_t *ptep = virt_to_kpte(vaddr);
-
-   return *ptep;
-}
-
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
-   unsigned int idx;
-   unsigned long vaddr;
-   void *kmap;
-   int type;
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-   /*
-* There is no cache coherency issue when non VIVT, so force the
-* dedicated kmap usage for better debugging purposes in that case.
-*/
-   if (!cache_is_vivt())
-   kmap = NULL;
-   else
-#endif
-   kmap = kmap_high_get(page);
-   if (kmap)
-

[patch V3 15/37] powerpc/mm/highmem: Switch to generic kmap atomic

2020-11-04 Thread Thomas Gleixner
No reason having the same code in every architecture

Signed-off-by: Thomas Gleixner 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: linuxppc-...@lists.ozlabs.org
---
V3: Remove the kmap types cruft
---
 arch/powerpc/Kconfig  |1 
 arch/powerpc/include/asm/fixmap.h |4 +-
 arch/powerpc/include/asm/highmem.h|7 ++-
 arch/powerpc/include/asm/kmap_types.h |   13 --
 arch/powerpc/mm/Makefile  |1 
 arch/powerpc/mm/highmem.c |   67 --
 arch/powerpc/mm/mem.c |7 ---
 7 files changed, 8 insertions(+), 92 deletions(-)

--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -409,6 +409,7 @@ menu "Kernel options"
 config HIGHMEM
bool "High memory support"
depends on PPC32
+   select KMAP_LOCAL
 
 source "kernel/Kconfig.hz"
 
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -20,7 +20,7 @@
 #include 
 #ifdef CONFIG_HIGHMEM
 #include 
-#include <asm/kmap_types.h>
+#include <asm/kmap_size.h>
 #endif
 
 #ifdef CONFIG_KASAN
@@ -55,7 +55,7 @@ enum fixed_addresses {
FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, 
PAGE_SIZE)/PAGE_SIZE)-1,
 #ifdef CONFIG_HIGHMEM
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
-   FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
 #endif
 #ifdef CONFIG_PPC_8xx
/* For IMMR we need an aligned 512K area */
--- a/arch/powerpc/include/asm/highmem.h
+++ b/arch/powerpc/include/asm/highmem.h
@@ -24,12 +24,10 @@
 #ifdef __KERNEL__
 
 #include 
-#include <asm/kmap_types.h>
 #include 
 #include 
 #include 
 
-extern pte_t *kmap_pte;
 extern pte_t *pkmap_page_table;
 
 /*
@@ -60,6 +58,11 @@ extern pte_t *pkmap_page_table;
 
 #define flush_cache_kmaps()flush_cache_all()
 
+#define arch_kmap_local_post_map(vaddr, pteval)\
+   local_flush_tlb_page(NULL, vaddr)
+#define arch_kmap_local_post_unmap(vaddr)  \
+   local_flush_tlb_page(NULL, vaddr)
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
--- a/arch/powerpc/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifndef _ASM_POWERPC_KMAP_TYPES_H
-#define _ASM_POWERPC_KMAP_TYPES_H
-
-#ifdef __KERNEL__
-
-/*
- */
-
-#define KM_TYPE_NR 16
-
-#endif /* __KERNEL__ */
-#endif /* _ASM_POWERPC_KMAP_TYPES_H */
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -16,7 +16,6 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += num
 obj-$(CONFIG_PPC_MM_SLICES)+= slice.o
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
 obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
-obj-$(CONFIG_HIGHMEM)  += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
--- a/arch/powerpc/mm/highmem.c
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * highmem.c: virtual kernel memory mappings for high memory
- *
- * PowerPC version, stolen from the i386 version.
- *
- * Used in CONFIG_HIGHMEM systems for memory pages which
- * are not addressable by direct kernel virtual addresses.
- *
- * Copyright (C) 1999 Gerhard Wichert, Siemens AG
- *   gerhard.wich...@pdb.siemens.de
- *
- *
- * Redesigned the x86 32-bit VM architecture to deal with
- * up to 16 Terrabyte physical memory. With current x86 CPUs
- * we now support up to 64 Gigabytes physical RAM.
- *
- * Copyright (C) 1999 Ingo Molnar 
- *
- * Reworked for PowerPC by various contributors. Moved from
- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
- */
-
-#include 
-#include 
-
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
-   unsigned long vaddr;
-   int idx, type;
-
-   type = kmap_atomic_idx_push();
-   idx = type + KM_TYPE_NR*smp_processor_id();
-   vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-   WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - 
idx)));
-   __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1);
-   local_flush_tlb_page(NULL, vaddr);
-
-   return (void*) vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_high_prot);
-
-void kunmap_atomic_high(void *kvaddr)
-{
-   unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-
-   if (vaddr < __fix_to_virt(FIX_KMAP_END))
-   return;
-
-   if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) {
-   int type = kmap_atomic_idx();
-   unsigned int idx;
-
-   idx = type + KM_TYPE_NR * smp_processor_id();
-   WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-
-   /*
-* force other mappings to Oops if they'll try to access
-* this pte without first remap it
-*/
-   pte_clear(&init_mm, vaddr, kmap_pte-idx);
-  

[patch V3 12/37] microblaze/mm/highmem: Switch to generic kmap atomic

2020-11-04 Thread Thomas Gleixner
No reason having the same code in every architecture.

Signed-off-by: Thomas Gleixner 
Cc: Michal Simek 
---
V3: Remove the kmap types cruft
---
 arch/microblaze/Kconfig   |1 
 arch/microblaze/include/asm/fixmap.h  |4 -
 arch/microblaze/include/asm/highmem.h |6 ++
 arch/microblaze/mm/Makefile   |1 
 arch/microblaze/mm/highmem.c  |   78 --
 arch/microblaze/mm/init.c |6 --
 6 files changed, 8 insertions(+), 88 deletions(-)

--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -155,6 +155,7 @@ config XILINX_UNCACHED_SHADOW
 config HIGHMEM
bool "High memory support"
depends on MMU
+   select KMAP_LOCAL
help
  The address space of Microblaze processors is only 4 Gigabytes large
  and it has to accommodate user address space, kernel address
--- a/arch/microblaze/include/asm/fixmap.h
+++ b/arch/microblaze/include/asm/fixmap.h
@@ -20,7 +20,7 @@
 #include 
 #ifdef CONFIG_HIGHMEM
 #include 
-#include <asm/kmap_types.h>
+#include <asm/kmap_size.h>
 #endif
 
 #define FIXADDR_TOP((unsigned long)(-PAGE_SIZE))
@@ -47,7 +47,7 @@ enum fixed_addresses {
FIX_HOLE,
 #ifdef CONFIG_HIGHMEM
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
-   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * num_possible_cpus()) - 1,
+   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * num_possible_cpus()) - 1,
 #endif
__end_of_fixed_addresses
 };
--- a/arch/microblaze/include/asm/highmem.h
+++ b/arch/microblaze/include/asm/highmem.h
@@ -25,7 +25,6 @@
 #include 
 #include 
 
-extern pte_t *kmap_pte;
 extern pte_t *pkmap_page_table;
 
 /*
@@ -52,6 +51,11 @@ extern pte_t *pkmap_page_table;
 
 #define flush_cache_kmaps(){ flush_icache(); flush_dcache(); }
 
+#define arch_kmap_local_post_map(vaddr, pteval)\
+   local_flush_tlb_page(NULL, vaddr);
+#define arch_kmap_local_post_unmap(vaddr)  \
+   local_flush_tlb_page(NULL, vaddr);
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
--- a/arch/microblaze/mm/Makefile
+++ b/arch/microblaze/mm/Makefile
@@ -6,4 +6,3 @@
 obj-y := consistent.o init.o
 
 obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o
-obj-$(CONFIG_HIGHMEM) += highmem.o
--- a/arch/microblaze/mm/highmem.c
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * highmem.c: virtual kernel memory mappings for high memory
- *
- * PowerPC version, stolen from the i386 version.
- *
- * Used in CONFIG_HIGHMEM systems for memory pages which
- * are not addressable by direct kernel virtual addresses.
- *
- * Copyright (C) 1999 Gerhard Wichert, Siemens AG
- *   gerhard.wich...@pdb.siemens.de
- *
- *
- * Redesigned the x86 32-bit VM architecture to deal with
- * up to 16 Terrabyte physical memory. With current x86 CPUs
- * we now support up to 64 Gigabytes physical RAM.
- *
- * Copyright (C) 1999 Ingo Molnar 
- *
- * Reworked for PowerPC by various contributors. Moved from
- * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp.
- */
-
-#include 
-#include 
-
-/*
- * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap
- * gives a more generic (and caching) interface. But kmap_atomic can
- * be used in IRQ contexts, so in some (very limited) cases we need
- * it.
- */
-#include 
-
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
-
-   unsigned long vaddr;
-   int idx, type;
-
-   type = kmap_atomic_idx_push();
-   idx = type + KM_TYPE_NR*smp_processor_id();
-   vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-#ifdef CONFIG_DEBUG_HIGHMEM
-   BUG_ON(!pte_none(*(kmap_pte-idx)));
-#endif
-   set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot));
-   local_flush_tlb_page(NULL, vaddr);
-
-   return (void *) vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_high_prot);
-
-void kunmap_atomic_high(void *kvaddr)
-{
-   unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
-   int type;
-   unsigned int idx;
-
-   if (vaddr < __fix_to_virt(FIX_KMAP_END))
-   return;
-
-   type = kmap_atomic_idx();
-
-   idx = type + KM_TYPE_NR * smp_processor_id();
-#ifdef CONFIG_DEBUG_HIGHMEM
-   BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
-#endif
-   /*
-* force other mappings to Oops if they'll try to access
-* this pte without first remap it
-*/
-   pte_clear(&init_mm, vaddr, kmap_pte-idx);
-   local_flush_tlb_page(NULL, vaddr);
-
-   kmap_atomic_idx_pop();
-}
-EXPORT_SYMBOL(kunmap_atomic_high);
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -49,17 +49,11 @@ unsigned long lowmem_size;
 EXPORT_SYMBOL(min_low_pfn);
 EXPORT_SYMBOL(max_low_pfn);
 
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-EXPORT_SYMBOL(kmap_pte);
-
 static void __init highmem_init(void)
 {
pr_debug("%x\n", (u32)PKMAP_BASE);
map_page(PKMAP_BASE, 0, 0); /* XXX gross */
pkma

[patch V4 24/37] sched: highmem: Store local kmaps in task struct

2020-11-04 Thread Thomas Gleixner
Instead of storing the map per CPU provide and use per task storage. That
prepares for local kmaps which are preemptible.

The context switch code is preparatory and not yet in use because
kmap_atomic() runs with preemption disabled. Will be made usable in the
next step.

The context switch logic is safe even when an interrupt happens after
clearing or before restoring the kmaps. The kmap index in task struct is
not modified so any nesting kmap in an interrupt will use unused indices
and on return the counter is the same as before.

Also add an assert into the return to user space code. Going back to user
space with an active kmap local is a nono.

Signed-off-by: Thomas Gleixner 
---
V4: Use the version which actually compiles and works
V3: Handle the debug case correctly
---
 include/linux/highmem-internal.h |   10 +++
 include/linux/sched.h|9 +++
 kernel/entry/common.c|2 
 kernel/fork.c|1 
 kernel/sched/core.c  |   18 +++
 mm/highmem.c |   99 +++
 6 files changed, 129 insertions(+), 10 deletions(-)

--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -9,6 +9,16 @@
 void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
 void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
 void kunmap_local_indexed(void *vaddr);
+void kmap_local_fork(struct task_struct *tsk);
+void __kmap_local_sched_out(void);
+void __kmap_local_sched_in(void);
+static inline void kmap_assert_nomap(void)
+{
+   DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
+}
+#else
+static inline void kmap_local_fork(struct task_struct *tsk) { }
+static inline void kmap_assert_nomap(void) { }
 #endif
 
 #ifdef CONFIG_HIGHMEM
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -629,6 +630,13 @@ struct wake_q_node {
struct wake_q_node *next;
 };
 
+struct kmap_ctrl {
+#ifdef CONFIG_KMAP_LOCAL
+   int idx;
+   pte_t   pteval[KM_MAX_IDX];
+#endif
+};
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -1294,6 +1302,7 @@ struct task_struct {
unsigned intsequential_io;
unsigned intsequential_io_avg;
 #endif
+   struct kmap_ctrlkmap_ctrl;
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long   task_state_change;
 #endif
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(st
 
/* Ensure that the address limit is intact and no locks are held */
addr_limit_user_check();
+   kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
 }
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -930,6 +930,7 @@ static struct task_struct *dup_task_stru
account_kernel_stack(tsk, 1);
 
kcov_task_init(tsk);
+   kmap_local_fork(tsk);
 
 #ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4053,6 +4053,22 @@ static inline void finish_lock_switch(st
 # define finish_arch_post_lock_switch()do { } while (0)
 #endif
 
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+   if (unlikely(current->kmap_ctrl.idx))
+   __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+   if (unlikely(current->kmap_ctrl.idx))
+   __kmap_local_sched_in();
+#endif
+}
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -4075,6 +4091,7 @@ prepare_task_switch(struct rq *rq, struc
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+   kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
 }
@@ -4141,6 +4158,7 @@ static struct rq *finish_task_switch(str
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
+   kmap_local_sched_in();
 
fire_sched_in_preempt_notifiers(current);
/*
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high);
 
 #include 
 
-static DEFINE_PER_CPU(int, __kmap_local_idx);
-
 /*
  * With DEBUG_HIGHMEM the stack depth is doubled and every second
  * slot is unused which acts as a guard page
@@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_
 
 static inline int kmap_local_idx_push(void)
 {
-   int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1;
-
WARN_ON_ONCE(in_irq() && !irqs_disabled());

[patch V3 20/37] io-mapping: Cleanup atomic iomap

2020-11-04 Thread Thomas Gleixner
Switch the atomic iomap implementation over to kmap_local and stick the
preempt/pagefault mechanics into the generic code similar to the
kmap_atomic variants.

Rename the x86 map function in preparation for a non-atomic variant.

Signed-off-by: Thomas Gleixner 
---
V2: New patch to make review easier
---
 arch/x86/include/asm/iomap.h |9 +
 arch/x86/mm/iomap_32.c   |6 ++
 include/linux/io-mapping.h   |8 ++--
 3 files changed, 9 insertions(+), 14 deletions(-)

--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -13,14 +13,7 @@
 #include 
 #include 
 
-void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot);
-
-static inline void iounmap_atomic(void __iomem *vaddr)
-{
-   kunmap_local_indexed((void __force *)vaddr);
-   pagefault_enable();
-   preempt_enable();
-}
+void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
 
 int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
 
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -44,7 +44,7 @@ void iomap_free(resource_size_t base, un
 }
 EXPORT_SYMBOL_GPL(iomap_free);
 
-void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot)
+void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
 {
/*
 * For non-PAT systems, translate non-WB request to UC- just in
@@ -60,8 +60,6 @@ void __iomem *iomap_atomic_pfn_prot(unsi
/* Filter out unsupported __PAGE_KERNEL* bits: */
pgprot_val(prot) &= __default_kernel_pte_mask;
 
-   preempt_disable();
-   pagefault_disable();
return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot);
 }
-EXPORT_SYMBOL_GPL(iomap_atomic_pfn_prot);
+EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot);
--- a/include/linux/io-mapping.h
+++ b/include/linux/io-mapping.h
@@ -69,13 +69,17 @@ io_mapping_map_atomic_wc(struct io_mappi
 
BUG_ON(offset >= mapping->size);
phys_addr = mapping->base + offset;
-   return iomap_atomic_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
+   preempt_disable();
+   pagefault_disable();
+   return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot);
 }
 
 static inline void
 io_mapping_unmap_atomic(void __iomem *vaddr)
 {
-   iounmap_atomic(vaddr);
+   kunmap_local_indexed((void __force *)vaddr);
+   pagefault_enable();
+   preempt_enable();
 }
 
 static inline void __iomem *

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[patch V3 24/37] sched: highmem: Store local kmaps in task struct

2020-11-04 Thread Thomas Gleixner
Instead of storing the map per CPU provide and use per task storage. That
prepares for local kmaps which are preemptible.

The context switch code is preparatory and not yet in use because
kmap_atomic() runs with preemption disabled. Will be made usable in the
next step.

The context switch logic is safe even when an interrupt happens after
clearing or before restoring the kmaps. The kmap index in task struct is
not modified so any nesting kmap in an interrupt will use unused indices
and on return the counter is the same as before.

Also add an assert into the return to user space code. Going back to user
space with an active kmap local is a nono.

Signed-off-by: Thomas Gleixner 
---
V3: Handle the debug case correctly
---
 include/linux/highmem-internal.h |   10 +++
 include/linux/sched.h|9 +++
 kernel/entry/common.c|2 
 kernel/fork.c|1 
 kernel/sched/core.c  |   18 +++
 mm/highmem.c |   99 +++
 6 files changed, 129 insertions(+), 10 deletions(-)

--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -9,6 +9,16 @@
 void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
 void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
 void kunmap_local_indexed(void *vaddr);
+void kmap_local_fork(struct task_struct *tsk);
+void __kmap_local_sched_out(void);
+void __kmap_local_sched_in(void);
+static inline void kmap_assert_nomap(void)
+{
+   DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
+}
+#else
+static inline void kmap_local_fork(struct task_struct *tsk) { }
+static inline void kmap_assert_nomap(void) { }
 #endif
 
 #ifdef CONFIG_HIGHMEM
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* task_struct member predeclarations (sorted alphabetically): */
 struct audit_context;
@@ -629,6 +630,13 @@ struct wake_q_node {
struct wake_q_node *next;
 };
 
+struct kmap_ctrl {
+#ifdef CONFIG_KMAP_LOCAL
+   int idx;
+   pte_t   pteval[KM_TYPE_NR];
+#endif
+};
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
/*
@@ -1294,6 +1302,7 @@ struct task_struct {
unsigned intsequential_io;
unsigned intsequential_io_avg;
 #endif
+   struct kmap_ctrlkmap_ctrl;
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long   task_state_change;
 #endif
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,6 +2,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(st
 
/* Ensure that the address limit is intact and no locks are held */
addr_limit_user_check();
+   kmap_assert_nomap();
lockdep_assert_irqs_disabled();
lockdep_sys_exit();
 }
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -930,6 +930,7 @@ static struct task_struct *dup_task_stru
account_kernel_stack(tsk, 1);
 
kcov_task_init(tsk);
+   kmap_local_fork(tsk);
 
 #ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4053,6 +4053,22 @@ static inline void finish_lock_switch(st
 # define finish_arch_post_lock_switch()do { } while (0)
 #endif
 
+static inline void kmap_local_sched_out(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+   if (unlikely(current->kmap_ctrl.idx))
+   __kmap_local_sched_out();
+#endif
+}
+
+static inline void kmap_local_sched_in(void)
+{
+#ifdef CONFIG_KMAP_LOCAL
+   if (unlikely(current->kmap_ctrl.idx))
+   __kmap_local_sched_in();
+#endif
+}
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -4075,6 +4091,7 @@ prepare_task_switch(struct rq *rq, struc
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
+   kmap_local_sched_out();
prepare_task(next);
prepare_arch_switch(next);
 }
@@ -4141,6 +4158,7 @@ static struct rq *finish_task_switch(str
finish_lock_switch(rq);
finish_arch_post_lock_switch();
kcov_finish_switch(current);
+   kmap_local_sched_in();
 
fire_sched_in_preempt_notifiers(current);
/*
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high);
 
 #include 
 
-static DEFINE_PER_CPU(int, __kmap_local_idx);
-
 /*
  * With DEBUG_HIGHMEM the stack depth is doubled and every second
  * slot is unused which acts as a guard page
@@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_
 
 static inline int kmap_local_idx_push(void)
 {
-   int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1;
-
WARN_ON_ONCE(in_irq() && !irqs_disabled());
-   BUG_ON(idx >= KM_MAX_IDX);

[patch V3 35/37] drm/nouveau/device: Replace io_mapping_map_atomic_wc()

2020-11-04 Thread Thomas Gleixner
Neither fbmem_peek() nor fbmem_poke() require to disable pagefaults and
preemption as a side effect of io_mapping_map_atomic_wc().

Use io_mapping_map_local_wc() instead.

Signed-off-by: Thomas Gleixner 
Cc: Ben Skeggs 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: dri-devel@lists.freedesktop.org
Cc: nouv...@lists.freedesktop.org
---
V3: New patch
---
 drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h
@@ -60,19 +60,19 @@ fbmem_fini(struct io_mapping *fb)
 static inline u32
 fbmem_peek(struct io_mapping *fb, u32 off)
 {
-   u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK);
+   u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK);
u32 val = ioread32(p + (off & ~PAGE_MASK));
-   io_mapping_unmap_atomic(p);
+   io_mapping_unmap_local(p);
return val;
 }
 
 static inline void
 fbmem_poke(struct io_mapping *fb, u32 off, u32 val)
 {
-   u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK);
+   u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK);
iowrite32(val, p + (off & ~PAGE_MASK));
wmb();
-   io_mapping_unmap_atomic(p);
+   io_mapping_unmap_local(p);
 }
 
 static inline bool



Re: [patch V3 24/37] sched: highmem: Store local kmaps in task struct

2020-11-04 Thread Thomas Gleixner
On Tue, Nov 03 2020 at 10:27, Thomas Gleixner wrote:
> +struct kmap_ctrl {
> +#ifdef CONFIG_KMAP_LOCAL
> + int idx;
> + pte_t   pteval[KM_TYPE_NR];

I'm a moron. Fixed it on the test machine ...


[patch V3 06/37] highmem: Provide generic variant of kmap_atomic*

2020-11-04 Thread Thomas Gleixner
The kmap_atomic* interfaces in all architectures are pretty much the same
except for post-map operations (flush) and pre- and post-unmap operations.

Provide a generic variant for that.
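
For orientation, a sketch of what the per-architecture part boils down to
once the generic variant is used (the hook names are taken from the x86 and
nds32 conversions later in this series; the placeholder bodies are not part
of this patch):

  /*
   * Hypothetical asm/highmem.h fragment, for illustration only. The
   * generic map/unmap code invokes these hooks around the fixmap PTE
   * update; the bodies below are placeholders.
   */
  #define arch_kmap_local_post_map(vaddr, pteval)		\
          do { /* e.g. flush the newly installed TLB entry */ } while (0)

  #define arch_kmap_local_post_unmap(vaddr)			\
          do { /* e.g. invalidate the stale TLB entry */ } while (0)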

Signed-off-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: linux...@kvack.org
---
V3: Do not reuse the kmap_atomic_idx pile and use kmap_size.h right away
V2: Address review comments from Christoph (style and EXPORT variant)
---
 include/linux/highmem.h |   82 ++-
 mm/Kconfig  |3 +
 mm/highmem.c|  144 +++-
 3 files changed, 211 insertions(+), 18 deletions(-)

--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -31,9 +31,16 @@ static inline void invalidate_kernel_vma
 
 #include 
 
+/*
+ * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
+ */
+#ifdef CONFIG_KMAP_LOCAL
+void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
+void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
+void kunmap_local_indexed(void *vaddr);
+#endif
+
 #ifdef CONFIG_HIGHMEM
-extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
-extern void kunmap_atomic_high(void *kvaddr);
 #include 
 
 #ifndef ARCH_HAS_KMAP_FLUSH_TLB
@@ -81,6 +88,11 @@ static inline void kunmap(struct page *p
  * be used in IRQ contexts, so in some (very limited) cases we need
  * it.
  */
+
+#ifndef CONFIG_KMAP_LOCAL
+void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
+void kunmap_atomic_high(void *kvaddr);
+
 static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
preempt_disable();
@@ -89,7 +101,38 @@ static inline void *kmap_atomic_prot(str
return page_address(page);
return kmap_atomic_high_prot(page, prot);
 }
-#define kmap_atomic(page)  kmap_atomic_prot(page, kmap_prot)
+
+static inline void __kunmap_atomic(void *vaddr)
+{
+   kunmap_atomic_high(vaddr);
+}
+#else /* !CONFIG_KMAP_LOCAL */
+
+static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+   preempt_disable();
+   pagefault_disable();
+   return __kmap_local_page_prot(page, prot);
+}
+
+static inline void *kmap_atomic_pfn(unsigned long pfn)
+{
+   preempt_disable();
+   pagefault_disable();
+   return __kmap_local_pfn_prot(pfn, kmap_prot);
+}
+
+static inline void __kunmap_atomic(void *addr)
+{
+   kunmap_local_indexed(addr);
+}
+
+#endif /* CONFIG_KMAP_LOCAL */
+
+static inline void *kmap_atomic(struct page *page)
+{
+   return kmap_atomic_prot(page, kmap_prot);
+}
 
 /* declarations for linux/mm/highmem.c */
 unsigned int nr_free_highpages(void);
@@ -147,25 +190,33 @@ static inline void *kmap_atomic(struct p
pagefault_disable();
return page_address(page);
 }
-#define kmap_atomic_prot(page, prot)   kmap_atomic(page)
 
-static inline void kunmap_atomic_high(void *addr)
+static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+   return kmap_atomic(page);
+}
+
+static inline void *kmap_atomic_pfn(unsigned long pfn)
+{
+   return kmap_atomic(pfn_to_page(pfn));
+}
+
+static inline void __kunmap_atomic(void *addr)
 {
/*
 * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic()
-* handles re-enabling faults + preemption
+* handles re-enabling faults and preemption
 */
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr);
 #endif
 }
 
-#define kmap_atomic_pfn(pfn)   kmap_atomic(pfn_to_page(pfn))
-
 #define kmap_flush_unused()do {} while(0)
 
 #endif /* CONFIG_HIGHMEM */
 
+#if !defined(CONFIG_KMAP_LOCAL)
 #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
 
 DECLARE_PER_CPU(int, __kmap_atomic_idx);
@@ -196,22 +247,21 @@ static inline void kmap_atomic_idx_pop(v
__this_cpu_dec(__kmap_atomic_idx);
 #endif
 }
-
+#endif
 #endif
 
 /*
  * Prevent people trying to call kunmap_atomic() as if it were kunmap()
  * kunmap_atomic() should get the return value of kmap_atomic, not the page.
  */
-#define kunmap_atomic(addr) \
-do {\
-   BUILD_BUG_ON(__same_type((addr), struct page *));   \
-   kunmap_atomic_high(addr);  \
-   pagefault_enable(); \
-   preempt_enable();   \
+#define kunmap_atomic(__addr)  \
+do {   \
+   BUILD_BUG_ON(__same_type((__addr), struct page *)); \
+   __kunmap_atomic(__addr);\
+   pagefault_enable(); \
+   preempt_enable();   \
 } while (0)
 
-
 /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
 #ifndef clear_user_highpage
 static inline void clear_user_highpage

[patch V3 07/37] highmem: Make DEBUG_HIGHMEM functional

2020-11-04 Thread Thomas Gleixner
For some obscure reason when CONFIG_DEBUG_HIGHMEM is enabled the stack
depth is increased from 20 to 41. But the only thing DEBUG_HIGHMEM does is
to enable a few BUG_ON()'s in the mapping code.

That's a leftover from the historical mapping code which had fixed entries
for various purposes. DEBUG_HIGHMEM inserted guard mappings between the map
types. But that got all ditched when kmap_atomic() switched to a stack
based map management. Though the WITH_KM_FENCE magic survived without being
functional. All it does today is increase the stack depth.

Add a working implementation to the generic kmap_local* implementation.
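
For illustration, the index sequence that results from the change below when
CONFIG_DEBUG_HIGHMEM is enabled (KM_INCR == 2):

  push #1: idx = (0 + 2) - 1 = 1
  push #2: idx = (2 + 2) - 1 = 3
  push #3: idx = (4 + 2) - 1 = 5

so the even slots 0, 2, 4, ... are never mapped and act as the guard entries.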

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 mm/highmem.c |   14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -374,9 +374,19 @@ EXPORT_SYMBOL(kunmap_high);
 
 static DEFINE_PER_CPU(int, __kmap_local_idx);
 
+/*
+ * With DEBUG_HIGHMEM the stack depth is doubled and every second
+ * slot is unused which acts as a guard page
+ */
+#ifdef CONFIG_DEBUG_HIGHMEM
+# define KM_INCR   2
+#else
+# define KM_INCR   1
+#endif
+
 static inline int kmap_local_idx_push(void)
 {
-   int idx = __this_cpu_inc_return(__kmap_local_idx) - 1;
+   int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1;
 
WARN_ON_ONCE(in_irq() && !irqs_disabled());
BUG_ON(idx >= KM_MAX_IDX);
@@ -390,7 +400,7 @@ static inline int kmap_local_idx(void)
 
 static inline void kmap_local_idx_pop(void)
 {
-   int idx = __this_cpu_dec_return(__kmap_local_idx);
+   int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR);
 
BUG_ON(idx < 0);
 }



[patch V3 18/37] highmem: Get rid of kmap_types.h

2020-11-04 Thread Thomas Gleixner
The header is no longer used. On alpha, ia64, openrisc, parisc and um
it was completely unused anyway, as these architectures have no highmem
support.

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 arch/alpha/include/asm/kmap_types.h  |   15 ---
 arch/ia64/include/asm/kmap_types.h   |   13 -
 arch/openrisc/mm/init.c  |1 -
 arch/openrisc/mm/ioremap.c   |1 -
 arch/parisc/include/asm/kmap_types.h |   13 -
 arch/um/include/asm/fixmap.h |1 -
 arch/um/include/asm/kmap_types.h |   13 -
 include/asm-generic/Kbuild   |1 -
 include/asm-generic/kmap_types.h |   11 ---
 include/linux/highmem.h  |2 --
 10 files changed, 71 deletions(-)

--- a/arch/alpha/include/asm/kmap_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_KMAP_TYPES_H
-#define _ASM_KMAP_TYPES_H
-
-/* Dummy header just to define km_type. */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include 
-
-#undef __WITH_KM_FENCE
-
-#endif
--- a/arch/ia64/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_IA64_KMAP_TYPES_H
-#define _ASM_IA64_KMAP_TYPES_H
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include 
-
-#undef __WITH_KM_FENCE
-
-#endif /* _ASM_IA64_KMAP_TYPES_H */
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -33,7 +33,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
--- a/arch/openrisc/mm/ioremap.c
+++ b/arch/openrisc/mm/ioremap.c
@@ -15,7 +15,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
--- a/arch/parisc/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_KMAP_TYPES_H
-#define _ASM_KMAP_TYPES_H
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include 
-
-#undef __WITH_KM_FENCE
-
-#endif
--- a/arch/um/include/asm/fixmap.h
+++ b/arch/um/include/asm/fixmap.h
@@ -3,7 +3,6 @@
 #define __UM_FIXMAP_H
 
 #include 
-#include 
 #include 
 #include 
 #include 
--- a/arch/um/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* 
- * Copyright (C) 2002 Jeff Dike (jd...@karaya.com)
- */
-
-#ifndef __UM_KMAP_TYPES_H
-#define __UM_KMAP_TYPES_H
-
-/* No more #include "asm/arch/kmap_types.h" ! */
-
-#define KM_TYPE_NR 14
-
-#endif
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -30,7 +30,6 @@ mandatory-y += irq.h
 mandatory-y += irq_regs.h
 mandatory-y += irq_work.h
 mandatory-y += kdebug.h
-mandatory-y += kmap_types.h
 mandatory-y += kmap_size.h
 mandatory-y += kprobes.h
 mandatory-y += linkage.h
--- a/include/asm-generic/kmap_types.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_GENERIC_KMAP_TYPES_H
-#define _ASM_GENERIC_KMAP_TYPES_H
-
-#ifdef __WITH_KM_FENCE
-# define KM_TYPE_NR 41
-#else
-# define KM_TYPE_NR 20
-#endif
-
-#endif
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -29,8 +29,6 @@ static inline void invalidate_kernel_vma
 }
 #endif
 
-#include 
-
 /*
  * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
  */



[patch V3 14/37] nds32/mm/highmem: Switch to generic kmap atomic

2020-11-04 Thread Thomas Gleixner
The mapping code is odd and looks broken. See FIXME in the comment.

Also fix the harmless off by one in the FIX_KMAP_END define.

Signed-off-by: Thomas Gleixner 
Cc: Nick Hu 
Cc: Greentime Hu 
Cc: Vincent Chen 
---
V3: Remove the kmap types cruft
---
 arch/nds32/Kconfig.cpu   |1 
 arch/nds32/include/asm/fixmap.h  |4 +--
 arch/nds32/include/asm/highmem.h |   22 +
 arch/nds32/mm/Makefile   |1 
 arch/nds32/mm/highmem.c  |   48 ---
 5 files changed, 19 insertions(+), 57 deletions(-)

--- a/arch/nds32/Kconfig.cpu
+++ b/arch/nds32/Kconfig.cpu
@@ -157,6 +157,7 @@ config HW_SUPPORT_UNALIGNMENT_ACCESS
 config HIGHMEM
bool "High Memory Support"
depends on MMU && !CPU_CACHE_ALIASING
+   select KMAP_LOCAL
help
  The address space of Andes processors is only 4 Gigabytes large
  and it has to accommodate user address space, kernel address
--- a/arch/nds32/include/asm/fixmap.h
+++ b/arch/nds32/include/asm/fixmap.h
@@ -6,7 +6,7 @@
 
 #ifdef CONFIG_HIGHMEM
 #include 
-#include 
+#include 
 #endif
 
 enum fixed_addresses {
@@ -14,7 +14,7 @@ enum fixed_addresses {
FIX_KMAP_RESERVED,
FIX_KMAP_BEGIN,
 #ifdef CONFIG_HIGHMEM
-   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS),
+   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
 #endif
FIX_EARLYCON_MEM_BASE,
__end_of_fixed_addresses
--- a/arch/nds32/include/asm/highmem.h
+++ b/arch/nds32/include/asm/highmem.h
@@ -5,7 +5,6 @@
 #define _ASM_HIGHMEM_H
 
 #include 
-#include 
 #include 
 
 /*
@@ -45,11 +44,22 @@ extern pte_t *pkmap_page_table;
 extern void kmap_init(void);
 
 /*
- * The following functions are already defined by 
- * when CONFIG_HIGHMEM is not set.
+ * FIXME: The below looks broken vs. a kmap_atomic() in task context which
+ * is interrupted and another kmap_atomic() happens in interrupt context.
+ * But what do I know about nds32. -- tglx
  */
-#ifdef CONFIG_HIGHMEM
-extern void *kmap_atomic_pfn(unsigned long pfn);
-#endif
+#define arch_kmap_local_post_map(vaddr, pteval)\
+   do {\
+   __nds32__tlbop_inv(vaddr);  \
+   __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); \
+   __nds32__tlbop_rwr(pteval); \
+   __nds32__isb(); \
+   } while (0)
+
+#define arch_kmap_local_pre_unmap(vaddr)   \
+   do {\
+   __nds32__tlbop_inv(vaddr);  \
+   __nds32__isb(); \
+   } while (0)
 
 #endif
--- a/arch/nds32/mm/Makefile
+++ b/arch/nds32/mm/Makefile
@@ -3,7 +3,6 @@ obj-y   := extable.o tlb.o fault.o init
mm-nds32.o cacheflush.o proc.o
 
 obj-$(CONFIG_ALIGNMENT_TRAP)   += alignment.o
-obj-$(CONFIG_HIGHMEM)   += highmem.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE)
--- a/arch/nds32/mm/highmem.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-// Copyright (C) 2005-2017 Andes Technology Corporation
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
-   unsigned int idx;
-   unsigned long vaddr, pte;
-   int type;
-   pte_t *ptep;
-
-   type = kmap_atomic_idx_push();
-
-   idx = type + KM_TYPE_NR * smp_processor_id();
-   vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-   pte = (page_to_pfn(page) << PAGE_SHIFT) | prot;
-   ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
-   set_pte(ptep, pte);
-
-   __nds32__tlbop_inv(vaddr);
-   __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN);
-   __nds32__tlbop_rwr(pte);
-   __nds32__isb();
-   return (void *)vaddr;
-}
-EXPORT_SYMBOL(kmap_atomic_high_prot);
-
-void kunmap_atomic_high(void *kvaddr)
-{
-   if (kvaddr >= (void *)FIXADDR_START) {
-   unsigned long vaddr = (unsigned long)kvaddr;
-   pte_t *ptep;
-   kmap_atomic_idx_pop();
-   __nds32__tlbop_inv(vaddr);
-   __nds32__isb();
-   ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr);
-   set_pte(ptep, 0);
-   }
-}
-EXPORT_SYMBOL(kunmap_atomic_high);



[patch V3 29/37] ARM: mm: Replace kmap_atomic_pfn()

2020-11-04 Thread Thomas Gleixner
There is no requirement to disable pagefaults and preemption for these
cache management mappings.

Replace kmap_atomic_pfn() with kmap_local_pfn(). This allows kmap_atomic_pfn()
to be removed in the next step.

Signed-off-by: Thomas Gleixner 
Cc: Russell King 
Cc: linux-arm-ker...@lists.infradead.org
---
V3: New patch
---
 arch/arm/mm/cache-feroceon-l2.c |6 +++---
 arch/arm/mm/cache-xsc3l2.c  |4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

--- a/arch/arm/mm/cache-feroceon-l2.c
+++ b/arch/arm/mm/cache-feroceon-l2.c
@@ -49,9 +49,9 @@ static inline unsigned long l2_get_va(un
 * we simply install a virtual mapping for it only for the
 * TLB lookup to occur, hence no need to flush the untouched
 * memory mapping afterwards (note: a cache flush may happen
-* in some circumstances depending on the path taken in kunmap_atomic).
+* in some circumstances depending on the path taken in kunmap_local).
 */
-   void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT);
+   void *vaddr = kmap_local_pfn(paddr >> PAGE_SHIFT);
return (unsigned long)vaddr + (paddr & ~PAGE_MASK);
 #else
return __phys_to_virt(paddr);
@@ -61,7 +61,7 @@ static inline unsigned long l2_get_va(un
 static inline void l2_put_va(unsigned long vaddr)
 {
 #ifdef CONFIG_HIGHMEM
-   kunmap_atomic((void *)vaddr);
+   kunmap_local((void *)vaddr);
 #endif
 }
 
--- a/arch/arm/mm/cache-xsc3l2.c
+++ b/arch/arm/mm/cache-xsc3l2.c
@@ -59,7 +59,7 @@ static inline void l2_unmap_va(unsigned
 {
 #ifdef CONFIG_HIGHMEM
if (va != -1)
-   kunmap_atomic((void *)va);
+   kunmap_local((void *)va);
 #endif
 }
 
@@ -75,7 +75,7 @@ static inline unsigned long l2_map_va(un
 * in place for it.
 */
l2_unmap_va(prev_va);
-   va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT);
+   va = (unsigned long)kmap_local_pfn(pa >> PAGE_SHIFT);
}
return va + (pa_offset >> (32 - PAGE_SHIFT));
 #else



[patch V3 22/37] highmem: High implementation details and document API

2020-11-04 Thread Thomas Gleixner
Move the gory details of kmap et al. into a private header and only document
the interfaces which are usable by drivers.

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 include/linux/highmem-internal.h |  174 +
 include/linux/highmem.h  |  270 ++-
 mm/highmem.c |   11 -
 3 files changed, 276 insertions(+), 179 deletions(-)

--- /dev/null
+++ b/include/linux/highmem-internal.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_HIGHMEM_INTERNAL_H
+#define _LINUX_HIGHMEM_INTERNAL_H
+
+/*
+ * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
+ */
+#ifdef CONFIG_KMAP_LOCAL
+void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
+void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
+void kunmap_local_indexed(void *vaddr);
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#include 
+
+#ifndef ARCH_HAS_KMAP_FLUSH_TLB
+static inline void kmap_flush_tlb(unsigned long addr) { }
+#endif
+
+#ifndef kmap_prot
+#define kmap_prot PAGE_KERNEL
+#endif
+
+void *kmap_high(struct page *page);
+void kunmap_high(struct page *page);
+void __kmap_flush_unused(void);
+struct page *__kmap_to_page(void *addr);
+
+static inline void *kmap(struct page *page)
+{
+   void *addr;
+
+   might_sleep();
+   if (!PageHighMem(page))
+   addr = page_address(page);
+   else
+   addr = kmap_high(page);
+   kmap_flush_tlb((unsigned long)addr);
+   return addr;
+}
+
+static inline void kunmap(struct page *page)
+{
+   might_sleep();
+   if (!PageHighMem(page))
+   return;
+   kunmap_high(page);
+}
+
+static inline struct page *kmap_to_page(void *addr)
+{
+   return __kmap_to_page(addr);
+}
+
+static inline void kmap_flush_unused(void)
+{
+   __kmap_flush_unused();
+}
+
+static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+   preempt_disable();
+   pagefault_disable();
+   return __kmap_local_page_prot(page, prot);
+}
+
+static inline void *kmap_atomic(struct page *page)
+{
+   return kmap_atomic_prot(page, kmap_prot);
+}
+
+static inline void *kmap_atomic_pfn(unsigned long pfn)
+{
+   preempt_disable();
+   pagefault_disable();
+   return __kmap_local_pfn_prot(pfn, kmap_prot);
+}
+
+static inline void __kunmap_atomic(void *addr)
+{
+   kunmap_local_indexed(addr);
+   pagefault_enable();
+   preempt_enable();
+}
+
+unsigned int __nr_free_highpages(void);
+extern atomic_long_t _totalhigh_pages;
+
+static inline unsigned int nr_free_highpages(void)
+{
+   return __nr_free_highpages();
+}
+
+static inline unsigned long totalhigh_pages(void)
+{
+   return (unsigned long)atomic_long_read(&_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_inc(void)
+{
+   atomic_long_inc(&_totalhigh_pages);
+}
+
+static inline void totalhigh_pages_add(long count)
+{
+   atomic_long_add(count, &_totalhigh_pages);
+}
+
+#else /* CONFIG_HIGHMEM */
+
+static inline struct page *kmap_to_page(void *addr)
+{
+   return virt_to_page(addr);
+}
+
+static inline void *kmap(struct page *page)
+{
+   might_sleep();
+   return page_address(page);
+}
+
+static inline void kunmap_high(struct page *page) { }
+static inline void kmap_flush_unused(void) { }
+
+static inline void kunmap(struct page *page)
+{
+#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
+   kunmap_flush_on_unmap(page_address(page));
+#endif
+}
+
+static inline void *kmap_atomic(struct page *page)
+{
+   preempt_disable();
+   pagefault_disable();
+   return page_address(page);
+}
+
+static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
+{
+   return kmap_atomic(page);
+}
+
+static inline void *kmap_atomic_pfn(unsigned long pfn)
+{
+   return kmap_atomic(pfn_to_page(pfn));
+}
+
+static inline void __kunmap_atomic(void *addr)
+{
+#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
+   kunmap_flush_on_unmap(addr);
+#endif
+   pagefault_enable();
+   preempt_enable();
+}
+
+static inline unsigned int nr_free_highpages(void) { return 0; }
+static inline unsigned long totalhigh_pages(void) { return 0UL; }
+
+#endif /* CONFIG_HIGHMEM */
+
+/*
+ * Prevent people trying to call kunmap_atomic() as if it were kunmap()
+ * kunmap_atomic() should get the return value of kmap_atomic, not the page.
+ */
+#define kunmap_atomic(__addr)  \
+do {   \
+   BUILD_BUG_ON(__same_type((__addr), struct page *)); \
+   __kunmap_atomic(__addr);\
+} while (0)
+
+#endif
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -11,199 +11,125 @@
 
 #include 
 
-#ifndef ARCH_HAS_FLUSH_ANON_PAGE
-static inline void flush_anon_page(struct vm_area_struct *vma, struct page 
*page, unsigned long vmaddr)
-{
-}
-#endif
+#include "highmem-internal.h"
 
-#ifndef ARCH_HA

[patch V3 08/37] x86/mm/highmem: Use generic kmap atomic implementation

2020-11-04 Thread Thomas Gleixner
Convert X86 to the generic kmap atomic implementation and make the
iomap_atomic() naming convention consistent while at it.

Signed-off-by: Thomas Gleixner 
Cc: x...@kernel.org
---
V3: Remove the kmap_types cruft
---
 arch/x86/Kconfig  |3 +
 arch/x86/include/asm/fixmap.h |5 +-
 arch/x86/include/asm/highmem.h|   13 +--
 arch/x86/include/asm/iomap.h  |   18 +-
 arch/x86/include/asm/kmap_types.h |   13 ---
 arch/x86/include/asm/paravirt_types.h |1 
 arch/x86/mm/highmem_32.c  |   59 --
 arch/x86/mm/init_32.c |   15 
 arch/x86/mm/iomap_32.c|   59 ++
 include/linux/highmem.h   |2 -
 include/linux/io-mapping.h|2 -
 mm/highmem.c  |2 -
 12 files changed, 31 insertions(+), 161 deletions(-)

--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -14,10 +14,11 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION
select CLKSRC_I8253
select CLONE_BACKWARDS
+   select GENERIC_VDSO_32
select HAVE_DEBUG_STACKOVERFLOW
+   select KMAP_LOCAL
select MODULES_USE_ELF_REL
select OLD_SIGACTION
-   select GENERIC_VDSO_32
 
 config X86_64
def_bool y
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -31,7 +31,7 @@
 #include 
 #ifdef CONFIG_X86_32
 #include 
-#include 
+#include 
 #else
 #include 
 #endif
@@ -94,7 +94,7 @@ enum fixed_addresses {
 #endif
 #ifdef CONFIG_X86_32
FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
-   FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
+   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1,
 #ifdef CONFIG_PCI_MMCONFIG
FIX_PCIE_MCFG,
 #endif
@@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned
 
 extern int fixmaps_set;
 
-extern pte_t *kmap_pte;
 extern pte_t *pkmap_page_table;
 
 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -23,7 +23,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, high
 #define PKMAP_NR(virt)  ((virt-PKMAP_BASE) >> PAGE_SHIFT)
 #define PKMAP_ADDR(nr)  (PKMAP_BASE + ((nr) << PAGE_SHIFT))
 
-void *kmap_atomic_pfn(unsigned long pfn);
-void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
-
 #define flush_cache_kmaps()do { } while (0)
 
+#definearch_kmap_local_post_map(vaddr, pteval) \
+   arch_flush_lazy_mmu_mode()
+
+#definearch_kmap_local_post_unmap(vaddr)   \
+   do {\
+   flush_tlb_one_kernel((vaddr));  \
+   arch_flush_lazy_mmu_mode(); \
+   } while (0)
+
 extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
unsigned long end_pfn);
 
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -9,19 +9,21 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
-void __iomem *
-iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
+void __iomem *iomap_atomic_pfn_prot(unsigned long pfn, pgprot_t prot);
 
-void
-iounmap_atomic(void __iomem *kvaddr);
+static inline void iounmap_atomic(void __iomem *vaddr)
+{
+   kunmap_local_indexed((void __force *)vaddr);
+   pagefault_enable();
+   preempt_enable();
+}
 
-int
-iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
+int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
 
-void
-iomap_free(resource_size_t base, unsigned long size);
+void iomap_free(resource_size_t base, unsigned long size);
 
 #endif /* _ASM_X86_IOMAP_H */
--- a/arch/x86/include/asm/kmap_types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_KMAP_TYPES_H
-#define _ASM_X86_KMAP_TYPES_H
-
-#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
-#define  __WITH_KM_FENCE
-#endif
-
-#include 
-
-#undef __WITH_KM_FENCE
-
-#endif /* _ASM_X86_KMAP_TYPES_H */
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -41,7 +41,6 @@
 #ifndef __ASSEMBLY__
 
 #include 
-#include 
 #include 
 #include 
 
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -4,65 +4,6 @@
 #include  /* for totalram_pages */
 #include 
 
-void *kmap_atomic_high_prot(struct page *page, pgprot_t prot)
-{
-   unsigned long vaddr;
-   int idx, type;
-
-   type = kmap_atomic_idx_push();
-   idx = type + KM_TYPE_NR*smp_processor_id();
-   vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
-   BUG_ON(!pte_none(*(kmap_pte-idx)));
-   set_pte(kmap_pte-idx, mk_pte(page, prot));
-   arch_flush_lazy_mmu_mode

[patch V3 28/37] mips/crashdump: Simplify copy_oldmem_page()

2020-11-04 Thread Thomas Gleixner
Replace kmap_atomic_pfn() with kmap_local_pfn() which is preemptible and
can take page faults.

Remove the indirection of the dump page and the related cruft which is no
longer required.

Signed-off-by: Thomas Gleixner 
Cc: Thomas Bogendoerfer 
Cc: linux-m...@vger.kernel.org
---
V3: New patch
---
 arch/mips/kernel/crash_dump.c |   42 +++---
 1 file changed, 7 insertions(+), 35 deletions(-)

--- a/arch/mips/kernel/crash_dump.c
+++ b/arch/mips/kernel/crash_dump.c
@@ -5,8 +5,6 @@
 #include 
 #include 
 
-static void *kdump_buf_page;
-
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
@@ -17,51 +15,25 @@ static void *kdump_buf_page;
  * @userbuf: if set, @buf is in user address space, use copy_to_user(),
  * otherwise @buf is in kernel address space, use memcpy().
  *
- * Copy a page from "oldmem". For this page, there is no pte mapped
+ * Copy a page from "oldmem". For this page, there might be no pte mapped
  * in the current kernel.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
  */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+unsigned long offset, int userbuf)
 {
void  *vaddr;
 
if (!csize)
return 0;
 
-   vaddr = kmap_atomic_pfn(pfn);
+   vaddr = kmap_local_pfn(pfn);
 
if (!userbuf) {
-   memcpy(buf, (vaddr + offset), csize);
-   kunmap_atomic(vaddr);
+   memcpy(buf, vaddr + offset, csize);
} else {
-   if (!kdump_buf_page) {
-   pr_warn("Kdump: Kdump buffer page not allocated\n");
-
-   return -EFAULT;
-   }
-   copy_page(kdump_buf_page, vaddr);
-   kunmap_atomic(vaddr);
-   if (copy_to_user(buf, (kdump_buf_page + offset), csize))
-   return -EFAULT;
+   if (copy_to_user(buf, vaddr + offset, csize))
+   csize = -EFAULT;
}
 
return csize;
 }
-
-static int __init kdump_buf_page_init(void)
-{
-   int ret = 0;
-
-   kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
-   if (!kdump_buf_page) {
-   pr_warn("Kdump: Failed to allocate kdump buffer page\n");
-   ret = -ENOMEM;
-   }
-
-   return ret;
-}
-arch_initcall(kdump_buf_page_init);



[patch V3 27/37] x86/crashdump/32: Simplify copy_oldmem_page()

2020-11-04 Thread Thomas Gleixner
Replace kmap_atomic_pfn() with kmap_local_pfn() which is preemptible and
can take page faults.

Remove the indirection of the dump page and the related cruft which is no
longer required.

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 arch/x86/kernel/crash_dump_32.c |   48 
 1 file changed, 10 insertions(+), 38 deletions(-)

--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,8 +13,6 @@
 
 #include 
 
-static void *kdump_buf_page;
-
 static inline bool is_crashed_pfn_valid(unsigned long pfn)
 {
 #ifndef CONFIG_X86_PAE
@@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(
  * @userbuf: if set, @buf is in user address space, use copy_to_user(),
  * otherwise @buf is in kernel address space, use memcpy().
  *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
+ * Copy a page from "oldmem". For this page, there might be no pte mapped
+ * in the current kernel.
  */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-   size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+unsigned long offset, int userbuf)
 {
void  *vaddr;
 
@@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long p
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
 
-   vaddr = kmap_atomic_pfn(pfn);
+   vaddr = kmap_local_pfn(pfn);
 
if (!userbuf) {
-   memcpy(buf, (vaddr + offset), csize);
-   kunmap_atomic(vaddr);
+   memcpy(buf, vaddr + offset, csize);
} else {
-   if (!kdump_buf_page) {
-   printk(KERN_WARNING "Kdump: Kdump buffer page not"
-   " allocated\n");
-   kunmap_atomic(vaddr);
-   return -EFAULT;
-   }
-   copy_page(kdump_buf_page, vaddr);
-   kunmap_atomic(vaddr);
-   if (copy_to_user(buf, (kdump_buf_page + offset), csize))
-   return -EFAULT;
+   if (copy_to_user(buf, vaddr + offset, csize))
+   csize = -EFAULT;
}
 
-   return csize;
-}
+   kunmap_local(vaddr);
 
-static int __init kdump_buf_page_init(void)
-{
-   int ret = 0;
-
-   kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
-   if (!kdump_buf_page) {
-   printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
-" page\n");
-   ret = -ENOMEM;
-   }
-
-   return ret;
+   return csize;
 }
-arch_initcall(kdump_buf_page_init);



[patch V3 05/37] asm-generic: Provide kmap_size.h

2020-11-04 Thread Thomas Gleixner
kmap_types.h is a misnomer because the old atomic MAP based array does not
exist anymore and the whole indirection of architectures including
kmap_types.h is inconsistent and does not allow providing guard page
debugging for this misfeature.

Add a common header file which defines the mapping stack size for all
architectures. Will be used when converting architectures over to a
generic kmap_local/atomic implementation.

The array size is chosen with the following constraints in mind:

- The deepest nest level in one context is 3 according to code
  inspection.

- The worst case nesting for the upcoming preemptible version would be:

  2 maps in task context and a fault inside
  2 maps in the fault handler
  3 maps in softirq
  2 maps in interrupt

So a total of 16 is sufficient and probably overestimated.
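
Spelled out, that worst case adds up to

  2 (task) + 2 (fault handler) + 3 (softirq) + 2 (interrupt) = 9

simultaneous maps, which leaves comfortable headroom below the 16 entries
chosen here.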

Signed-off-by: Thomas Gleixner 
---
V3: New patch
---
 include/asm-generic/Kbuild  |1 +
 include/asm-generic/kmap_size.h |   12 
 2 files changed, 13 insertions(+)

--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -31,6 +31,7 @@ mandatory-y += irq_regs.h
 mandatory-y += irq_work.h
 mandatory-y += kdebug.h
 mandatory-y += kmap_types.h
+mandatory-y += kmap_size.h
 mandatory-y += kprobes.h
 mandatory-y += linkage.h
 mandatory-y += local.h
--- /dev/null
+++ b/include/asm-generic/kmap_size.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_GENERIC_KMAP_SIZE_H
+#define _ASM_GENERIC_KMAP_SIZE_H
+
+/* For debug this provides guard pages between the maps */
+#ifdef CONFIG_DEBUG_HIGHMEM
+# define KM_MAX_IDX33
+#else
+# define KM_MAX_IDX16
+#endif
+
+#endif



[patch V3 25/37] mm/highmem: Provide kmap_local*

2020-11-04 Thread Thomas Gleixner
Now that the kmap atomic index is stored in task struct provide a
preemptible variant. On context switch the maps of an outgoing task are
removed and the maps of the incoming task are restored. That's obviously
slow, but highmem is slow anyway.

The kmap_local.*() functions can be invoked from both preemptible and
atomic context. kmap local sections disable migration to keep the resulting
virtual mapping address correct, but disable neither pagefaults nor
preemption.

A wholesale conversion of kmap_atomic to be fully preemptible is not
possible because some of the usage sites might rely on the preemption
disable for serialization or on the implicit pagefault disable. Needs to be
done on a case by case basis.
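
As a usage sketch (a hypothetical caller, not part of this patch), following
the nesting rule documented in the hunk below, where maps may be taken in
preemptible context and must be released in reverse order:

  static void copy_highmem_page(struct page *dst, struct page *src)
  {
          void *d = kmap_local_page(dst);
          void *s = kmap_local_page(src);

          memcpy(d, s, PAGE_SIZE);        /* may be preempted, may take pagefaults */

          kunmap_local(s);                /* unmap in reverse order of the maps */
          kunmap_local(d);
  }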

Signed-off-by: Thomas Gleixner 
---
V3: Move migrate disable into the actual highmem mapping code so it only
affects real highmem mappings.
   
V2: Make it more consistent and add commentary
---
 include/linux/highmem-internal.h |   48 +++
 include/linux/highmem.h  |   43 +-
 mm/highmem.c |6 
 3 files changed, 81 insertions(+), 16 deletions(-)

--- a/include/linux/highmem-internal.h
+++ b/include/linux/highmem-internal.h
@@ -69,6 +69,26 @@ static inline void kmap_flush_unused(voi
__kmap_flush_unused();
 }
 
+static inline void *kmap_local_page(struct page *page)
+{
+   return __kmap_local_page_prot(page, kmap_prot);
+}
+
+static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+{
+   return __kmap_local_page_prot(page, prot);
+}
+
+static inline void *kmap_local_pfn(unsigned long pfn)
+{
+   return __kmap_local_pfn_prot(pfn, kmap_prot);
+}
+
+static inline void __kunmap_local(void *vaddr)
+{
+   kunmap_local_indexed(vaddr);
+}
+
 static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
preempt_disable();
@@ -141,6 +161,28 @@ static inline void kunmap(struct page *p
 #endif
 }
 
+static inline void *kmap_local_page(struct page *page)
+{
+   return page_address(page);
+}
+
+static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
+{
+   return kmap_local_page(page);
+}
+
+static inline void *kmap_local_pfn(unsigned long pfn)
+{
+   return kmap_local_page(pfn_to_page(pfn));
+}
+
+static inline void __kunmap_local(void *addr)
+{
+#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
+   kunmap_flush_on_unmap(addr);
+#endif
+}
+
 static inline void *kmap_atomic(struct page *page)
 {
preempt_disable();
@@ -182,4 +224,10 @@ do {   
\
__kunmap_atomic(__addr);\
 } while (0)
 
+#define kunmap_local(__addr)   \
+do {   \
+   BUILD_BUG_ON(__same_type((__addr), struct page *)); \
+   __kunmap_local(__addr); \
+} while (0)
+
 #endif
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -60,24 +60,22 @@ static inline struct page *kmap_to_page(
 static inline void kmap_flush_unused(void);
 
 /**
- * kmap_atomic - Atomically map a page for temporary usage
+ * kmap_local_page - Map a page for temporary usage
  * @page:  Pointer to the page to be mapped
  *
  * Returns: The virtual address of the mapping
  *
- * Side effect: On return pagefaults and preemption are disabled.
- *
  * Can be invoked from any context.
  *
  * Requires careful handling when nesting multiple mappings because the map
  * management is stack based. The unmap has to be in the reverse order of
  * the map operation:
  *
- * addr1 = kmap_atomic(page1);
- * addr2 = kmap_atomic(page2);
+ * addr1 = kmap_local_page(page1);
+ * addr2 = kmap_local_page(page2);
  * ...
- * kunmap_atomic(addr2);
- * kunmap_atomic(addr1);
+ * kunmap_local(addr2);
+ * kunmap_local(addr1);
  *
  * Unmapping addr1 before addr2 is invalid and causes malfunction.
  *
@@ -88,10 +86,26 @@ static inline void kmap_flush_unused(voi
  * virtual address of the direct mapping. Only real highmem pages are
  * temporarily mapped.
  *
- * While it is significantly faster than kmap() it comes with restrictions
- * about the pointer validity and the side effects of disabling page faults
- * and preemption. Use it only when absolutely necessary, e.g. from non
- * preemptible contexts.
+ * While it is significantly faster than kmap() for the highmem case it
+ * comes with restrictions about the pointer validity. Only use when really
+ * necessary.
+ *
+ * On HIGHMEM enabled systems mapping a highmem page has the side effect of
+ * disabling migration in order to keep the virtual address stable across
+ * preemption. No caller of kmap_local_page() can rely on this side effect.
+ */
+static inline void *kmap_local_page(struct page *page);
+
+/**
+ * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
+ * @page:  Pointer to the page

[patch V3 31/37] drm/ttm: Replace kmap_atomic() usage

2020-11-04 Thread Thomas Gleixner
There is no reason to disable pagefaults and preemption as a side effect of
kmap_atomic_prot().

Use kmap_local_page_prot() instead and document the reasoning for the
mapping usage with the given pgprot.

Remove the NULL pointer check for the map. These functions return a valid
address for valid pages and the return was bogus anyway as it would have
left preemption and pagefaults disabled.

Signed-off-by: Thomas Gleixner 
Cc: Christian Koenig 
Cc: Huang Rui 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: dri-devel@lists.freedesktop.org
---
V3: New patch
---
 drivers/gpu/drm/ttm/ttm_bo_util.c |   20 
 1 file changed, 12 insertions(+), 8 deletions(-)

--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -181,13 +181,15 @@ static int ttm_copy_io_ttm_page(struct t
return -ENOMEM;
 
src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
-   dst = kmap_atomic_prot(d, prot);
-   if (!dst)
-   return -ENOMEM;
+   /*
+* Ensure that a highmem page is mapped with the correct
+* pgprot. For non highmem the mapping is already there.
+*/
+   dst = kmap_local_page_prot(d, prot);
 
memcpy_fromio(dst, src, PAGE_SIZE);
 
-   kunmap_atomic(dst);
+   kunmap_local(dst);
 
return 0;
 }
@@ -203,13 +205,15 @@ static int ttm_copy_ttm_io_page(struct t
return -ENOMEM;
 
dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
-   src = kmap_atomic_prot(s, prot);
-   if (!src)
-   return -ENOMEM;
+   /*
+* Ensure that a highmem page is mapped with the correct
+* pgprot. For non highmem the mapping is already there.
+*/
+   src = kmap_local_page_prot(s, prot);
 
memcpy_toio(dst, src, PAGE_SIZE);
 
-   kunmap_atomic(src);
+   kunmap_local(src);
 
return 0;
 }



[patch V3 01/37] mm/highmem: Un-EXPORT __kmap_atomic_idx()

2020-11-04 Thread Thomas Gleixner
Nothing in modules can use that.

Signed-off-by: Thomas Gleixner 
Reviewed-by: Christoph Hellwig 
Cc: Andrew Morton 
Cc: linux...@kvack.org
---
 mm/highmem.c |2 --
 1 file changed, 2 deletions(-)

--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -108,8 +108,6 @@ static inline wait_queue_head_t *get_pkm
 atomic_long_t _totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(_totalhigh_pages);
 
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
 unsigned int nr_free_highpages (void)
 {
struct zone *zone;



[patch V3 04/37] sh/highmem: Remove all traces of unused cruft

2020-11-04 Thread Thomas Gleixner
For whatever reasons SH has highmem bits all over the place but does
not enable it via Kconfig. Remove the bitrot.

Signed-off-by: Thomas Gleixner 
---
 arch/sh/include/asm/fixmap.h |8 
 arch/sh/include/asm/kmap_types.h |   15 ---
 arch/sh/mm/init.c|8 
 3 files changed, 31 deletions(-)

--- a/arch/sh/include/asm/fixmap.h
+++ b/arch/sh/include/asm/fixmap.h
@@ -13,9 +13,6 @@
 #include 
 #include 
 #include 
-#ifdef CONFIG_HIGHMEM
-#include 
-#endif
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -53,11 +50,6 @@ enum fixed_addresses {
FIX_CMAP_BEGIN,
FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1,
 
-#ifdef CONFIG_HIGHMEM
-   FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
-   FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1,
-#endif
-
 #ifdef CONFIG_IOREMAP_FIXED
/*
 * FIX_IOREMAP entries are useful for mapping physical address
--- a/arch/sh/include/asm/kmap_types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __SH_KMAP_TYPES_H
-#define __SH_KMAP_TYPES_H
-
-/* Dummy header just to define km_type. */
-
-#ifdef CONFIG_DEBUG_HIGHMEM
-#define  __WITH_KM_FENCE
-#endif
-
-#include 
-
-#undef __WITH_KM_FENCE
-
-#endif
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -362,9 +362,6 @@ void __init mem_init(void)
mem_init_print_info(NULL);
pr_info("virtual kernel memory layout:\n"
"fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#ifdef CONFIG_HIGHMEM
-   "pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
-#endif
"vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
"lowmem  : 0x%08lx - 0x%08lx   (%4ld MB) (cached)\n"
 #ifdef CONFIG_UNCACHED_MAPPING
@@ -376,11 +373,6 @@ void __init mem_init(void)
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
 
-#ifdef CONFIG_HIGHMEM
-   PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
-   (LAST_PKMAP*PAGE_SIZE) >> 10,
-#endif
-
(unsigned long)VMALLOC_START, VMALLOC_END,
(VMALLOC_END - VMALLOC_START) >> 20,
 
