[PATCH v2 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-22 Thread Dongli Zhang
The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of
interrupt affinity reconfiguration via procfs. Instead, the change is
deferred until the next instance of the interrupt being triggered on the
original CPU.

When the interrupt next triggers on the original CPU, the new affinity is
enforced within __irq_move_irq(). A vector is allocated on the new CPU, but
the old vector on the original CPU is not immediately reclaimed while that
CPU remains online. Instead, apicd->move_in_progress is flagged, and the
reclaiming process is delayed until the next trigger of the interrupt on
the new CPU.

Upon the subsequent triggering of the interrupt on the new CPU,
irq_complete_move() adds a task to the old CPU's vector_cleanup list if it
remains online. Subsequently, the timer on the old CPU iterates over its
vector_cleanup list, reclaiming old vectors.

However, a rare scenario arises if the old CPU is outgoing before the
interrupt triggers again on the new CPU. The irq_force_complete_move() may
not have the chance to be invoked on the outgoing CPU to reclaim the old
apicd->prev_vector. This is because the interrupt isn't currently affine to
the outgoing CPU, and irq_needs_fixup() returns false. Even though
__vector_schedule_cleanup() is later called on the new CPU, it doesn't
reclaim apicd->prev_vector; instead, it simply resets both
apicd->move_in_progress and apicd->prev_vector to 0.

As a result, the vector remains unreclaimed in vector_matrix, leading to a
CPU vector leak.
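
For illustration only (not part of the patch): a minimal userspace model of the
accounting above. All names in the sketch are made up; the 2D 'matrix' stands in
for vector_matrix. A slot allocated on the old CPU but never freed stays
accounted forever, which is exactly what happens to apicd->prev_vector when the
old CPU goes offline first.

#include <stdio.h>

#define NR_CPUS		4
#define NR_VECTORS	8

static int matrix[NR_CPUS][NR_VECTORS];	/* stand-in for vector_matrix */

struct irq_state {			/* stand-in for apic_chip_data */
	int cpu, vector;		/* current target */
	int prev_cpu, prev_vector;	/* pending cleanup, 0 == none */
};

static void move_irq(struct irq_state *s, int new_cpu, int new_vector)
{
	/* Allocate on the new CPU and remember the old slot for later cleanup. */
	matrix[new_cpu][new_vector] = 1;
	s->prev_cpu = s->cpu;
	s->prev_vector = s->vector;
	s->cpu = new_cpu;
	s->vector = new_vector;
}

static void buggy_offline_path(struct irq_state *s)
{
	/* Pre-fix behavior: forget the pending cleanup without freeing it. */
	s->prev_vector = 0;
}

int main(void)
{
	struct irq_state s = { .cpu = 1, .vector = 3 };
	int cpu, vec, leaked = 0;

	matrix[1][3] = 1;		/* vector in use on CPU 1 */
	move_irq(&s, 2, 5);		/* affinity change: CPU 1 -> CPU 2 */
	buggy_offline_path(&s);		/* CPU 1 goes offline before cleanup runs */

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (vec = 0; vec < NR_VECTORS; vec++)
			if (cpu != s.cpu && matrix[cpu][vec])
				leaked++;

	printf("leaked vectors: %d\n", leaked);	/* prints 1: CPU 1 / vector 3 */
	return 0;
}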

To address this issue, move the invocation of irq_force_complete_move()
before the irq_needs_fixup() call to reclaim apicd->prev_vector, if the
interrupt is currently or was previously affine to the outgoing CPU. Additionally,
reclaim the vector in __vector_schedule_cleanup() as well, following a
warning message, although theoretically it should never see
apicd->move_in_progress with apicd->prev_cpu pointing to an offline CPU.

Fixes: f0383c24b485 ("genirq/cpuhotplug: Add support for cleaning up move in progress")
Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Changed since v1:
- Add Fixes (suggested by Thomas)
- Add warning to __vector_schedule_cleanup() (suggested by Thomas)
- Use free_moved_vector() not irq_matrix_free() (suggested by Thomas)
- Move irq_force_complete_move() to early migrate_one_irq()
- Add more conditions in irq_force_complete_move() (suggested by Thomas)

 arch/x86/kernel/apic/vector.c |  9 ++++++---
 kernel/irq/cpuhotplug.c       | 16 ++++++++--------
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 9eec52925fa3..b385bb5eac10 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1035,7 +1035,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
 			add_timer_on(&cl->timer, cpu);
 		}
 	} else {
-		apicd->prev_vector = 0;
+		pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
+		free_moved_vector(apicd);
 	}
 	raw_spin_unlock(&vector_lock);
 }
@@ -1072,6 +1073,7 @@ void irq_complete_move(struct irq_cfg *cfg)
  */
 void irq_force_complete_move(struct irq_desc *desc)
 {
+   unsigned int cpu = smp_processor_id();
struct apic_chip_data *apicd;
struct irq_data *irqd;
unsigned int vector;
@@ -1096,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc)
goto unlock;
 
/*
-* If prev_vector is empty, no action required.
+* If prev_vector is empty or the descriptor is neither currently
+* nor previously on the outgoing CPU no action required.
 */
vector = apicd->prev_vector;
-   if (!vector)
+   if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
goto unlock;
 
/*
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 75cadbc3c232..eb8628390156 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -69,6 +69,14 @@ static bool migrate_one_irq(struct irq_desc *desc)
return false;
}
 
+   /*
+* Complete an eventually pending irq move cleanup. If this
+* interrupt was moved in hard irq context, then the vectors need
+* to be cleaned up. It can't wait until this interrupt actually
+* happens and this CPU was involved.
+*/
+   irq_force_complete_move(desc);
+
/*
 * No move required, if:
 * - Interrupt is per cpu
@@ -87,14 +95,6 @@ static bool migrate_one_irq(struct irq_desc *desc)
return false;
}
 
-	/*
-	 * Complete an eventually pending irq move cleanup. If this
-	 * interrupt was moved in hard irq context, then the vectors need
-	 * to be cleaned up. It can't wait until this interrupt actually
-	 * happens and this CPU was involved.
-	 */
-	irq_force_complete_move(desc);
-

Re: [PATCH 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-22 Thread Dongli Zhang



On 5/21/24 5:00 AM, Thomas Gleixner wrote:
> On Wed, May 15 2024 at 12:51, Dongli Zhang wrote:
>> On 5/13/24 3:46 PM, Thomas Gleixner wrote:
>>> So yes, moving the invocation of irq_force_complete_move() before the
>>> irq_needs_fixup() call makes sense, but it wants this to actually work
>>> correctly:
>>> @@ -1097,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc)
>>> goto unlock;
>>>  
>>> /*
>>> -* If prev_vector is empty, no action required.
>>> +* If prev_vector is empty or the descriptor was previously
>>> +* not on the outgoing CPU no action required.
>>>  */
>>> vector = apicd->prev_vector;
>>> -   if (!vector)
>>> +   if (!vector || apicd->prev_cpu != smp_processor_id())
>>> goto unlock;
>>>  
>>
>> The above may not work. migrate_one_irq() relies on 
>> irq_force_complete_move() to
>> always reclaim the apicd->prev_vector. Otherwise, the call of
>> irq_do_set_affinity() later may return -EBUSY.
> 
> You're right. But that still can be handled in irq_force_complete_move()
> with a single unconditional invocation in migrate_one_irq():
> 
>   cpu = smp_processor_id();
>   if (!vector || (apicd->cur_cpu != cpu && apicd->prev_cpu != cpu))
>   goto unlock;

The field holding the current affinity is apicd->cpu, not apicd->cur_cpu :)

Thank you very much for the suggestion!

> 
> because there are only two cases when a cleanup is required:
> 
>1) The outgoing CPU is the current target
> 
>2) The outgoing CPU was the previous target
> 
> No?

I agree with this statement.
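
Restated as a standalone check, just to spell out the two cases (a sketch, not a
kernel excerpt; the parameter names are invented): cleanup is needed only when a
move is still pending and the outgoing CPU is either the current or the previous
target.

#include <stdbool.h>

/* prev_vector == 0 means there is no pending move/cleanup. */
static bool needs_forced_cleanup(unsigned int prev_vector, unsigned int cur_cpu,
				 unsigned int prev_cpu, unsigned int outgoing_cpu)
{
	return prev_vector &&
	       (cur_cpu == outgoing_cpu || prev_cpu == outgoing_cpu);
}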

My only concern is that, while we use "apicd->cpu", irq_needs_fixup() decides
differently: it checks d->common->effective_affinity or d->common->affinity to
decide whether to move forward and migrate the interrupt.

I have spent some time reading the discussion from 2017 (link below). According
to my understanding, CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK always depends on
CONFIG_SMP, so we should never hit that mismatch on x86.

https://lore.kernel.org/all/alpine.DEB.2.20.1710042208400.2406@nanos/T/#u

I have tested the new patch for a while and never encountered any issue.

Therefore, I will send v2.

Thank you very much for all suggestions!

Dongli Zhang



Re: [PATCH 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-15 Thread Dongli Zhang



On 5/13/24 3:46 PM, Thomas Gleixner wrote:
> On Mon, May 13 2024 at 10:43, Dongli Zhang wrote:
>> On 5/13/24 5:44 AM, Thomas Gleixner wrote:
>>> On Fri, May 10 2024 at 12:06, Dongli Zhang wrote:
>>> Any interrupt which is affine to an outgoing CPU is migrated and
>>> eventually pending moves are enforced:
>>>
>>> cpu_down()
>>>   ...
>>>   cpu_disable_common()
>>> fixup_irqs()
>>>   irq_migrate_all_off_this_cpu()
>>> migrate_one_irq()
>>>   irq_force_complete_move()
>>> free_moved_vector();
>>>
>>> No?
>>
>> I noticed this and finally abandoned the solution to fix at 
>> migrate_one_irq(),
>> because:
>>
>> 1. The objective of migrate_one_irq()-->irq_force_complete_move() looks to
>> cleanup before irq_do_set_affinity().
>>
>> 2. The irq_needs_fixup() may return false so that irq_force_complete_move() 
>> does
>> not get the chance to trigger.
>>
>> 3. Even irq_force_complete_move() is triggered, it exits early if
>> apicd->prev_vector==0.
> 
> But that's not the case, really.
> 
>> The apicd->prev_vector can be cleared by __vector_schedule_cleanup() because
>> cpu_disable_common() releases the vector_lock after CPU is flagged offline.
> 
> Nothing can schedule vector cleanup at that point because _all_ other
> CPUs spin in stop_machine() with interrupts disabled and therefore
> cannot handle interrupts which might invoke it.

Thank you very much for the explanation! Now I see why to drop the vector_lock
is not an issue.


[snip]


> 
>> diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
>> index 1ed2b1739363..5ecd072a34fe 100644
>> --- a/kernel/irq/cpuhotplug.c
>> +++ b/kernel/irq/cpuhotplug.c
>> @@ -69,6 +69,14 @@ static bool migrate_one_irq(struct irq_desc *desc)
>> return false;
>> }
>>
>> +   /*
>> +* Complete an eventually pending irq move cleanup. If this
>> +* interrupt was moved in hard irq context, then the vectors need
>> +* to be cleaned up. It can't wait until this interrupt actually
>> +* happens and this CPU was involved.
>> +*/
>> +   irq_force_complete_move(desc);
> 
> You cannot do that here because it is only valid when the interrupt is
> affine to the outgoing CPU.
> 
> In the problem case the interrupt was affine to the outgoing CPU, but
> the core code does not know that it has not been cleaned up yet. It does
> not even know that the interrupt was affine to the outgoing CPU before.
> 
> So in principle we could just go and do:
> 
>   } else {
> - apicd->prev_vector = 0;
> + free_moved_vector(apicd);
>   }
>   	raw_spin_unlock(&vector_lock);
> 
> but that would not give enough information and would depend on the
> interrupt to actually arrive.
> 
> irq_force_complete_move() already describes this case, but obviously it
> does not work because the core code does not invoke it in that
> situation.
> 
> So yes, moving the invocation of irq_force_complete_move() before the
> irq_needs_fixup() call makes sense, but it wants this to actually work
> correctly:
> 
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -1036,7 +1036,8 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
>   			add_timer_on(&cl->timer, cpu);
>   		}
>   	} else {
> -		apicd->prev_vector = 0;
> +		pr_warn("IRQ %u schedule cleanup for offline CPU %u\n", apicd->irq, cpu);
> +		free_moved_vector(apicd);
>   	}
>   	raw_spin_unlock(&vector_lock);
>  }
> @@ -1097,10 +1098,11 @@ void irq_force_complete_move(struct irq_desc *desc)
>   goto unlock;
>  
>   /*
> -  * If prev_vector is empty, no action required.
> +  * If prev_vector is empty or the descriptor was previously
> +  * not on the outgoing CPU no action required.
>*/
>   vector = apicd->prev_vector;
> - if (!vector)
> + if (!vector || apicd->prev_cpu != smp_processor_id())
>   goto unlock;
>  

The above may not work. migrate_one_irq() relies on irq_force_complete_move() to
always reclaim the apicd->prev_vector. Otherwise, the call of
irq_do_set_affinity() later may return -EBUSY.


 801 /*
 802  * Careful here. @apicd might either have move_in_progress set or
 803  * be enqueued for cleanup. Assigning a new vector would either
 804  * leave a stale vector on some CPU around or in case of a pending
 805  * cleanup

Re: [PATCH 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-13 Thread Dongli Zhang
Hi Thomas,

On 5/13/24 5:44 AM, Thomas Gleixner wrote:
> On Fri, May 10 2024 at 12:06, Dongli Zhang wrote:
>> The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of
>> interrupt affinity reconfiguration via procfs. Instead, the change is
>> deferred until the next instance of the interrupt being triggered on the
>> original CPU.
>>
>> When the interrupt next triggers on the original CPU, the new affinity is
>> enforced within __irq_move_irq(). A vector is allocated from the new CPU,
>> but if the old vector on the original CPU remains online, it is not
>> immediately reclaimed. Instead, apicd->move_in_progress is flagged, and the
>> reclaiming process is delayed until the next trigger of the interrupt on
>> the new CPU.
>>
>> Upon the subsequent triggering of the interrupt on the new CPU,
>> irq_complete_move() adds a task to the old CPU's vector_cleanup list if it
>> remains online. Subsequently, the timer on the old CPU iterates over its
>> vector_cleanup list, reclaiming vectors.
>>
>> However, if the old CPU is offline before the interrupt triggers again on
>> the new CPU, irq_complete_move() simply resets both apicd->move_in_progress
>> and apicd->prev_vector to 0. Consequently, the vector remains unreclaimed
>> in vector_matrix, resulting in a CPU vector leak.
> 
> I doubt that.
> 
> Any interrupt which is affine to an outgoing CPU is migrated and
> eventually pending moves are enforced:
> 
> cpu_down()
>   ...
>   cpu_disable_common()
> fixup_irqs()
>   irq_migrate_all_off_this_cpu()
> migrate_one_irq()
>   irq_force_complete_move()
> free_moved_vector();
> 
> No?

I noticed this and finally abandoned the solution to fix at migrate_one_irq(),
because:

1. The objective of migrate_one_irq()-->irq_force_complete_move() appears to be
to clean up before irq_do_set_affinity().

2. irq_needs_fixup() may return false, so irq_force_complete_move() does not
get a chance to run.

3. Even if irq_force_complete_move() is triggered, it exits early if
apicd->prev_vector==0.

The apicd->prev_vector can be cleared by __vector_schedule_cleanup() because
cpu_disable_common() releases the vector_lock after the CPU is flagged offline.

void cpu_disable_common(void)
{
int cpu = smp_processor_id();

remove_siblinginfo(cpu);

/* It's now safe to remove this processor from the online map */
lock_vector_lock();
remove_cpu_from_maps(cpu); ---> CPU is flagged offline
unlock_vector_lock();  ---> release the vector_lock here!
fixup_irqs();
lapic_offline();
}


Therefore, the bugfix may become something like (just to demo the idea):

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 185738c72766..247a53fe9ada 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1035,8 +1035,6 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
 			cl->timer.expires = jiffies + 1;
 			add_timer_on(&cl->timer, cpu);
 		}
-	} else {
-		apicd->prev_vector = 0; // or print a warning
 	}
 	raw_spin_unlock(&vector_lock);
 }
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 1ed2b1739363..5ecd072a34fe 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -69,6 +69,14 @@ static bool migrate_one_irq(struct irq_desc *desc)
return false;
}

+   /*
+* Complete an eventually pending irq move cleanup. If this
+* interrupt was moved in hard irq context, then the vectors need
+* to be cleaned up. It can't wait until this interrupt actually
+* happens and this CPU was involved.
+*/
+   irq_force_complete_move(desc);
+
/*
 * No move required, if:
 * - Interrupt is per cpu
@@ -87,14 +95,6 @@ static bool migrate_one_irq(struct irq_desc *desc)
return false;
}

-   /*
-* Complete an eventually pending irq move cleanup. If this
-* interrupt was moved in hard irq context, then the vectors need
-* to be cleaned up. It can't wait until this interrupt actually
-* happens and this CPU was involved.
-*/
-   irq_force_complete_move(desc);
-
/*
 * If there is a setaffinity pending, then try to reuse the pending
 * mask, so the last change of the affinity does not get lost. If



That's why I modified only __vector_schedule_cleanup(), as it looked simpler.

I will fix it in the CPU hotplug interrupt migration code instead.

> 
> In fact irq_complete_move() should never see apicd->move_in_progress
> with apicd->prev_cpu pointing to an offline CPU.

I think it is possible. The fact that a CPU is offline does

[PATCH RESEND 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-10 Thread Dongli Zhang
The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of
interrupt affinity reconfiguration via procfs. Instead, the change is
deferred until the next instance of the interrupt being triggered on the
original CPU.

When the interrupt next triggers on the original CPU, the new affinity is
enforced within __irq_move_irq(). A vector is allocated from the new CPU,
but if the old vector on the original CPU remains online, it is not
immediately reclaimed. Instead, apicd->move_in_progress is flagged, and the
reclaiming process is delayed until the next trigger of the interrupt on
the new CPU.

Upon the subsequent triggering of the interrupt on the new CPU,
irq_complete_move() adds a task to the old CPU's vector_cleanup list if it
remains online. Subsequently, the timer on the old CPU iterates over its
vector_cleanup list, reclaiming vectors.

However, if the old CPU is offline before the interrupt triggers again on
the new CPU, irq_complete_move() simply resets both apicd->move_in_progress
and apicd->prev_vector to 0. Consequently, the vector remains unreclaimed
in vector_matrix, resulting in a CPU vector leak.

To address this issue, the fix borrows from the comments and implementation
of apic_update_vector(): "If the target CPU is offline then the regular
release mechanism via the cleanup vector is not possible and the vector can
be immediately freed in the underlying matrix allocator.".

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Corrected the email address of b...@alien8.de. Sorry for my mistake.

 arch/x86/kernel/apic/vector.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 185738c72766..aad189a3bac9 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1036,6 +1036,15 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
 			add_timer_on(&cl->timer, cpu);
 		}
 	} else {
+   /*
+* This call borrows from the comments and implementation
+* of apic_update_vector(): "If the target CPU is offline
+* then the regular release mechanism via the cleanup
+* vector is not possible and the vector can be immediately
+* freed in the underlying matrix allocator.".
+*/
+   irq_matrix_free(vector_matrix, apicd->prev_cpu,
+   apicd->prev_vector, apicd->is_managed);
apicd->prev_vector = 0;
}
 	raw_spin_unlock(&vector_lock);
-- 
2.39.3




[PATCH 1/1] x86/vector: Fix vector leak during CPU offline

2024-05-10 Thread Dongli Zhang
The absence of IRQD_MOVE_PCNTXT prevents immediate effectiveness of
interrupt affinity reconfiguration via procfs. Instead, the change is
deferred until the next instance of the interrupt being triggered on the
original CPU.

When the interrupt next triggers on the original CPU, the new affinity is
enforced within __irq_move_irq(). A vector is allocated from the new CPU,
but if the old vector on the original CPU remains online, it is not
immediately reclaimed. Instead, apicd->move_in_progress is flagged, and the
reclaiming process is delayed until the next trigger of the interrupt on
the new CPU.

Upon the subsequent triggering of the interrupt on the new CPU,
irq_complete_move() adds a task to the old CPU's vector_cleanup list if it
remains online. Subsequently, the timer on the old CPU iterates over its
vector_cleanup list, reclaiming vectors.

However, if the old CPU is offline before the interrupt triggers again on
the new CPU, irq_complete_move() simply resets both apicd->move_in_progress
and apicd->prev_vector to 0. Consequently, the vector remains unreclaimed
in vector_matrix, resulting in a CPU vector leak.

To address this issue, the fix borrows from the comments and implementation
of apic_update_vector(): "If the target CPU is offline then the regular
release mechanism via the cleanup vector is not possible and the vector can
be immediately freed in the underlying matrix allocator.".

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 arch/x86/kernel/apic/vector.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 185738c72766..aad189a3bac9 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1036,6 +1036,15 @@ static void __vector_schedule_cleanup(struct apic_chip_data *apicd)
 			add_timer_on(&cl->timer, cpu);
 		}
 	} else {
+   /*
+* This call borrows from the comments and implementation
+* of apic_update_vector(): "If the target CPU is offline
+* then the regular release mechanism via the cleanup
+* vector is not possible and the vector can be immediately
+* freed in the underlying matrix allocator.".
+*/
+   irq_matrix_free(vector_matrix, apicd->prev_cpu,
+   apicd->prev_vector, apicd->is_managed);
apicd->prev_vector = 0;
}
 	raw_spin_unlock(&vector_lock);
-- 
2.39.3




[PATCH v2 1/1] genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity returns -ENOSPC

2024-04-23 Thread Dongli Zhang
When a CPU goes offline, the interrupts pinned to that CPU are
re-configured.

Its managed interrupts undergo either migration to other CPUs or shutdown
if all CPUs listed in the affinity are offline. This patch doesn't affect
managed interrupts.

For regular interrupts, they are migrated to other selected online CPUs.
The target CPUs are chosen from either desc->pending_mask (if
CONFIG_GENERIC_PENDING_IRQ is enabled) or d->common->affinity (if CONFIG_SMP
is enabled). The cpu_online_mask is used as the target only when all CPUs in
both desc->pending_mask and d->common->affinity are offline.

However, there is a bad corner case: desc->pending_mask or d->common->affinity
is selected as the target cpumask, but none of its CPUs has any available
vectors.

In this case the migration fails and the device interrupt becomes
stale. This is not any different from the case where the affinity
mask does not contain any online CPU, but there is no fallback
operation for this.

Instead of giving up, retry the migration attempt with the online CPU
mask if the interrupt is not managed, as managed interrupts cannot be
affected by this problem.
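
The retry is a plain fallback pattern; a minimal userspace sketch of it is below
(everything in it is invented for the illustration, it is not kernel code):

#include <stdio.h>
#include <errno.h>

/*
 * Stand-in for irq_do_set_affinity(): pretend the preferred mask has no free
 * vectors left while the online mask still does.
 */
static int fake_set_affinity(const char *mask)
{
	return mask[0] == 'p' ? -ENOSPC : 0;
}

int main(void)
{
	const char *mask = "preferred";	/* pending_mask or affinity mask */
	int brokeaff = 0;
	int err = fake_set_affinity(mask);

	if (err == -ENOSPC) {
		/* Fall back to all online CPUs and record the broken affinity. */
		mask = "online";
		brokeaff = 1;
		err = fake_set_affinity(mask);
	}

	printf("err=%d mask=%s brokeaff=%d\n", err, mask, brokeaff);
	return 0;
}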

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
[tglx: massage some changelog]
---
Changed since v1:
  - Re-work the commit message
  - Move pr_debug before setting affinity
  - Remove 'all' from pr_debug message

 kernel/irq/cpuhotplug.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 1ed2b1739363..19babb914949 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -130,6 +130,17 @@ static bool migrate_one_irq(struct irq_desc *desc)
 * CPU.
 */
err = irq_do_set_affinity(d, affinity, false);
+
+	if (err == -ENOSPC && !irqd_affinity_is_managed(d) && affinity != cpu_online_mask) {
+		pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with online CPUs\n",
+			 d->irq, cpumask_pr_args(affinity));
+
+   affinity = cpu_online_mask;
+   brokeaff = true;
+
+   err = irq_do_set_affinity(d, affinity, false);
+   }
+
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
-- 
2.34.1




Re: [PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure

2024-04-22 Thread Dongli Zhang
Hi Thomas,

On 4/22/24 13:58, Thomas Gleixner wrote:
> On Thu, Apr 18 2024 at 18:33, Dongli Zhang wrote:
> 
>> When a CPU is offline, its IRQs may migrate to other CPUs. For managed
>> IRQs, they are migrated, or shutdown (if all CPUs of the managed IRQ
>> affinity are offline). For regular IRQs, there will only be a
>> migration.
> 
> Please write out interrupts. There is enough space for it and IRQ is
> just not a regular word.

I will use "interrupts".

> 
>> The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ.
>>
>> 104 if (irq_fixup_move_pending(desc, true))
>> 105 affinity = irq_desc_get_pending_mask(desc);
>> 106 else
>> 107 affinity = irq_data_get_affinity_mask(d);
>>
>> The migrate_one_irq() may use all online CPUs, if all CPUs in
>> pending_mask/affinity_mask are already offline.
>>
>> 113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
>> 114 /*
>> 115  * If the interrupt is managed, then shut it down and 
>> leave
>> 116  * the affinity untouched.
>> 117  */
>> 118 if (irqd_affinity_is_managed(d)) {
>> 119 irqd_set_managed_shutdown(d);
>> 120 irq_shutdown_and_deactivate(desc);
>> 121 return false;
>> 122 }
>> 123 affinity = cpu_online_mask;
>> 124 brokeaff = true;
>> 125 }
> 
> Please don't copy code into the change log. Describe the problem in
> text.

Would you mind checking whether the commit message below looks good to you?


genirq/cpuhotplug: retry with cpu_online_mask when irq_do_set_affinity returns -ENOSPC

When a CPU goes offline, the interrupts pinned to that CPU are
re-configured.

Its managed interrupts undergo either migration to other CPUs or shutdown
if all CPUs listed in the affinity are offline. This patch doesn't affect
managed interrupts.

For regular interrupts, they are migrated to other selected online CPUs.
The target CPUs are chosen from either desc->pending_mask (suppose
CONFIG_GENERIC_PENDING_IRQ) or d->common->affinity (suppose CONFIG_SMP).
The cpu_online_mask is used as target CPUs only when CPUs in both
desc->pending_mask and d->common->affinity are offline.

However, there is a bad corner case, when desc->pending_mask or
d->common->affinity is selected as the target cpumask, but none of their
CPUs has any available vectors.

As a result, an -ENOSPC error happens:

  "IRQ151: set affinity failed(-28)."

The VECTOR irq domain state below is from debugfs. The allocation fails
although the other online CPUs (all except CPU=2) still have many free vectors.

name:   VECTOR
 size:   0
 mapped: 529
 flags:  0x0103
Online bitmaps:7
Global available:884
Global reserved:   6
Total allocated: 539
System: 36: 0-19,21,50,128,236,243-244,246-255
 | CPU | avl | man | mac | act | vectors
 0   147 0 0   55  32-49,51-87
 1   147 0 0   55  32-49,51-87
 2 0 0 0  202  32-49,51-127,129-235
 4   147 0 0   55  32-49,51-87
 5   147 0 0   55  32-49,51-87
 6   148 0 0   54  32-49,51-86
 7   148 0 0   54  32-49,51-86

The steps to reproduce the issue are in [1]. The core idea is:

1. Create a KVM guest with many virtio-net PCI devices, each configured
with a very large number of queues/vectors.

2. Set the affinity of all virtio-net interrupts to "2,3".

3. Offline many CPUs, excluding "2,3".

4. Offline CPU=2, and irq_do_set_affinity() returns -ENOSPC.

For regular interrupts, if irq_do_set_affinity() returns -ENOSPC, retry it
with all online CPUs. The issue does not happen for managed interrupts
because the vectors are always reserved (in cm->managed_map) before the CPU
offline operation.

[1] https://lore.kernel.org/all/20240419013322.58500-1-dongli.zh...@oracle.com/

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 


> 
>> However, there is a corner case. Although some CPUs in
>> pending_mask/affinity_mask are still online, they are lack of available
>> vectors. If the kernel continues calling irq_do_set_affinity() with those 
>> CPUs,
>> there will be -ENOSPC error.
>>
>> This is not reasonable as other online CPUs still have many available
>> vectors.
> 
> Reasonable is not the question here. It's either correct or not.

This has been re-written in the new commit message.

> 
>> name:   VECTOR
>>  size:   0
>>  mapped: 529
>>  flags:  0x0103
>> Online bitmaps:7
>> Global available:884
>> Global reserved:   6
>> Total allocated: 539
>

[PATCH 1/1] genirq/cpuhotplug: retry with online CPUs on irq_do_set_affinity failure

2024-04-18 Thread Dongli Zhang
When a CPU goes offline, its IRQs may migrate to other CPUs. Managed IRQs are
either migrated or shut down (if all CPUs in the managed IRQ's affinity are
offline). Regular IRQs are only migrated.

The migrate_one_irq() first uses pending_mask or affinity_mask of the IRQ.

104 if (irq_fixup_move_pending(desc, true))
105 affinity = irq_desc_get_pending_mask(desc);
106 else
107 affinity = irq_data_get_affinity_mask(d);

The migrate_one_irq() may use all online CPUs, if all CPUs in
pending_mask/affinity_mask are already offline.

113 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
114 /*
115  * If the interrupt is managed, then shut it down and leave
116  * the affinity untouched.
117  */
118 if (irqd_affinity_is_managed(d)) {
119 irqd_set_managed_shutdown(d);
120 irq_shutdown_and_deactivate(desc);
121 return false;
122 }
123 affinity = cpu_online_mask;
124 brokeaff = true;
125 }

However, there is a corner case. Although some CPUs in
pending_mask/affinity_mask are still online, they lack available vectors. If
the kernel keeps calling irq_do_set_affinity() with only those CPUs, it fails
with -ENOSPC.

This is not reasonable, as other online CPUs still have many available vectors.

name:   VECTOR
 size:   0
 mapped: 529
 flags:  0x0103
Online bitmaps:7
Global available:884
Global reserved:   6
Total allocated: 539
System: 36: 0-19,21,50,128,236,243-244,246-255
 | CPU | avl | man | mac | act | vectors
 0   147 0 0   55  32-49,51-87
 1   147 0 0   55  32-49,51-87
 2 0 0 0  202  32-49,51-127,129-235
 4   147 0 0   55  32-49,51-87
 5   147 0 0   55  32-49,51-87
 6   148 0 0   54  32-49,51-86
 7   148 0 0   54  32-49,51-86

This issue should not happen for managed IRQs because the vectors are already
reserved before CPU hotplug. For regular IRQs, retry with all online CPUs if
the prior irq_do_set_affinity() failed with -ENOSPC.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 kernel/irq/cpuhotplug.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 1ed2b1739363..d1666a6b73f4 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -130,6 +130,19 @@ static bool migrate_one_irq(struct irq_desc *desc)
 * CPU.
 */
err = irq_do_set_affinity(d, affinity, false);
+
+   if (err == -ENOSPC &&
+   !irqd_affinity_is_managed(d) &&
+   affinity != cpu_online_mask) {
+   affinity = cpu_online_mask;
+   brokeaff = true;
+
+   pr_debug("IRQ%u: set affinity failed for %*pbl, re-try with all 
online CPUs\n",
+d->irq, cpumask_pr_args(affinity));
+
+   err = irq_do_set_affinity(d, affinity, false);
+   }
+
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
-- 
2.34.1




[PATCH 0/1] genirq/cpuhotplug: fix CPU hotplug set affinity failure issue

2024-04-18 Thread Dongli Zhang
Please refer to the commit message of the patch for details.

The cover letter is to demonstrate how to reproduce the issue on purpose with
QEMU/KVM + virtio-net (that's why virtualizat...@lists.linux.dev is CCed).

Thank you very much!



1. Build the mainline linux kernel.

$ make defconfig
$ scripts/config --file ".config" -e CONFIG_X86_X2APIC \
  -e CONFIG_GENERIC_IRQ_DEBUGFS
$ make olddefconfig
$ make -j24 > /dev/null

Confirm the config is enabled.

$ cat .config | grep CONFIG_GENERIC_IRQ_DEBUGFS
CONFIG_GENERIC_IRQ_DEBUGFS=y


2. Create the VM with the below QEMU command line. The libvirt virbr0 is used
as bridge for virtio-net.

---
$ cat qemu-ifup
#!/bin/sh
# Script to bring a network (tap) device for qemu up.

br="virbr0"
ifconfig $1 up
brctl addif $br "$1"
exit
---

/home/zhang/kvm/qemu-8.2.0/build/qemu-system-x86_64 \
-hda ubuntu2204.qcow2 -m 8G -smp 32 -vnc :5 -enable-kvm -cpu host \
-net nic -net user,hostfwd=tcp::5025-:22 \
-device virtio-net-pci,netdev=tapnet01,id=net01,mac=01:54:00:12:34:56,bus=pci.0,addr=0x4,mq=true,vectors=257 \
-netdev tap,id=tapnet01,ifname=tap01,script=qemu-ifup,downscript=no,queues=128,vhost=off \
-device virtio-net-pci,netdev=tapnet02,id=net02,mac=02:54:00:12:34:56,bus=pci.0,addr=0x5,mq=true,vectors=257 \
-netdev tap,id=tapnet02,ifname=tap02,script=qemu-ifup,downscript=no,queues=128,vhost=off \
-kernel /home/zhang/img/debug/mainline-linux/arch/x86_64/boot/bzImage \
-append "root=/dev/sda3 init=/sbin/init text loglevel=7 console=ttyS0" \
-serial stdio -name debug-threads=on


3. Use procfs to confirm the virtio IRQ numbers.

$ cat /proc/interrupts | grep virtio
 24: ... ... PCI-MSIX-0000:00:04.0   0-edge  virtio0-config
 25: ... ... PCI-MSIX-0000:00:04.0   1-edge  virtio0-input.0
... ...
537: ... ... PCI-MSIX-0000:00:05.0 256-edge  virtio1-output.127

Reset the affinity of IRQs 25-537 to CPUs=2,3.

---
#!/bin/sh

for irq in {25..537}
do
  echo $irq
  echo 2,3 > /proc/irq/$irq/smp_affinity_list
  cat /proc/irq/$irq/smp_affinity_list
  cat /proc/irq/$irq/effective_affinity_list
  echo ""
done
---

Now offline CPU=8-31.

---
#!/bin/sh

for cpu in {8..31}
do
  echo $cpu
  echo 0 > /sys/devices/system/cpu/cpu$cpu/online
done
---


The below is the current VECTOR debugfs.

# cat /sys/kernel/debug/irq/domains/VECTOR
name:   VECTOR
 size:   0
 mapped: 529
 flags:  0x0103
Online bitmaps:8
Global available:   1090
Global reserved:   6
Total allocated: 536
System: 36: 0-19,21,50,128,236,243-244,246-255
 | CPU | avl | man | mac | act | vectors
 0   169 0 0   33  32-49,51-65
 1   171 0 0   31  32-49,51-63
 2    26     0     0  176  32-49,52-127,129-210
 3    27     0     0  175  32-49,51-127,129-171,173-209
 4   175 0 0   27  32-49,51-59
 5   175 0 0   27  32-49,51-59
 6   172 0 0   30  32-49,51-62
 7   175 0 0   27  32-49,51-59


4. Now offline CPU=3.

# echo 0 > /sys/devices/system/cpu/cpu3/online

There are below from dmesg.

[   96.234045] IRQ151: set affinity failed(-28).
[   96.234064] IRQ156: set affinity failed(-28).
[   96.234078] IRQ158: set affinity failed(-28).
[   96.234091] IRQ159: set affinity failed(-28).
[   96.234105] IRQ161: set affinity failed(-28).
[   96.234118] IRQ162: set affinity failed(-28).
[   96.234132] IRQ163: set affinity failed(-28).
[   96.234145] IRQ164: set affinity failed(-28).
[   96.234159] IRQ165: set affinity failed(-28).
[   96.234172] IRQ166: set affinity failed(-28).
[   96.235013] IRQ fixup: irq 339 move in progress, old vector 48
[   96.237129] smpboot: CPU 3 is now offline


Although other CPUs have many available vectors, only CPU=2 is used.

# cat /sys/kernel/debug/irq/domains/VECTOR
name:   VECTOR
 size:   0
 mapped: 529
 flags:  0x0103
Online bitmaps:7
Global available:   1022
Global reserved:   6
Total allocated: 533
System: 36: 0-19,21,50,128,236,243-244,246-255
 | CPU | avl | man | mac | act | vectors
 0   168 0 0   34  32-49,51-53,55-57,59-68
 1   165 0 0   37  32-49,51-57,59-60,64-73
 2 0 0 0  202  32-49,51-127,129-235
 4   173 0 0   29  32-40,42-48,52-63,65
 5   171 0 0   31  32-49,51-54,56,58-62,64-66
 6   172 0 0   30  32-49,51-52,54-57,59-63,65
 7   173 0 0   29  32-49,51-52,54-58,60-62,64


Dongli Zhang





[PATCH v2 1/1] kernel/cpu: to track which CPUHP callback is failed

2021-04-08 Thread Dongli Zhang
During bootup or cpu hotplug, the cpuhp_up_callbacks() or
cpuhp_down_callbacks() call many CPUHP callbacks (e.g., perf, mm,
workqueue, RCU, kvmclock and more) for each cpu to online/offline. It may
roll back to its previous state if any of the callbacks fails. As a result,
the user will not be able to know which callback failed and usually the
only symptom is a cpu online/offline failure.

This patch prints an extra debug log to help the user narrow down the
root cause.

Below is an example of how the patch helps narrow down the root cause of the
issue fixed by commit d7eb79c6290c ("KVM: kvmclock: Fix vCPUs > 64 can't be
online/hotpluged").

We will get the dynamic debug log below once we add
dyndbg="file kernel/cpu.c +p" to the kernel command line and the issue is
reproduced.

"CPUHP up callback failure (-12) for cpu 64 at kvmclock:setup_percpu (66)"

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Changed since v1 RFC:
  - use pr_debug() but not pr_err_once() (suggested by Qais Yousef)
  - print log for cpuhp_down_callbacks() as well (suggested by Qais Yousef)

 kernel/cpu.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1b6302ecbabe..bcd4dd7de9c3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -621,6 +621,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 		st->state++;
 		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 		if (ret) {
+			pr_debug("CPUHP up callback failure (%d) for cpu %u at %s (%d)\n",
+				 ret, cpu, cpuhp_get_step(st->state)->name,
+				 st->state);
+
if (can_rollback_cpu(st)) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -990,6 +994,10 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 	for (; st->state > target; st->state--) {
 		ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
 		if (ret) {
+			pr_debug("CPUHP down callback failure (%d) for cpu %u at %s (%d)\n",
+				 ret, cpu, cpuhp_get_step(st->state)->name,
+				 st->state);
+
st->target = prev_state;
if (st->state < prev_state)
undo_cpu_down(cpu, st);
-- 
2.17.1



Re: [PATCH RFC 1/1] kernel/cpu: to track which CPUHP callback is failed

2021-04-07 Thread Dongli Zhang



On 4/6/21 9:10 AM, Qais Yousef wrote:
> On 04/05/21 14:22, Dongli Zhang wrote:
>> May I have if there is any feedback on this? To pr_err_once() here helps 
>> narrow
>> down what is the root cause of cpu online failure.
>>
>>
>> The issue fixed by d7eb79c6290c ("KVM: kvmclock: Fix vCPUs > 64 can't be
>> online/hotpluged") is able to demonstrate how this pr_err_once() helps.
>>
>> Due to VM kernel issue, at most 64 VCPUs can online if host clocksource is
>> switched to hpet.
>>
>> # echo hpet > 
>> /sys/devices/system/clocksource/clocksource0/current_clocksource
>>
>>
>> By default, we have no idea why only 64 out of 100 VCPUs are online in VM. We
>> need to customize kernel to debug why some CPUs are not able to online.
>>
>>
>> We will have below and know the root cause is with kvmclock, if we add the
>> pr_err_once().
> 
> Isn't pr_debug() more appropriate here? Don't you want one on the
> cpuhp_down_callbacks() too?

I will add one to cpuhp_down_callbacks() as well. Thank you very much for the
suggestion.

> 
> I *think* it's common now to have CONFIG_DYNAMIC_DEBUG enabled so one can
> enable that line when they need to
> 
>   
> https://www.kernel.org/doc/html/latest/admin-guide/dynamic-debug-howto.html
> 
> I can see the appeal of pr_err_once() but I think this can fail for legitimate
> reasons and is not necessarily strictly always an error?

The reason I used pr_err_once() is to remind the sysadmin that something bad
(maybe not strictly an error) happened at some point. It is quite helpful for a
developer/sysadmin who is not familiar with the cpu/smp/bootup/online code.

Otherwise, someone will need to manually configure the kernel command line with
"loglevel=8 dyndbg="file kernel/cpu.c +p"".

The good thing about pr_err_once() is that it does not assume someone already
knows the root cause is related to kernel/cpu.c.

The good thing about pr_debug() is that there is no need to restrict it to
printing at most once.

I will send v2 with pr_debug() and let the maintainers decide whether it is
fine/required to print something here. Some places use WARN_ON_ONCE() for the
return value of cpuhp_invoke_callback(). I prefer to use pr_debug/pr_err to
print extra debug information for the cpu online/hotplug case.

Thank you very much!

Dongli Zhang

> 
> Thanks
> 
> --
> Qais Yousef
> 


Re: [PATCH RFC 1/1] kernel/cpu: to track which CPUHP callback is failed

2021-04-05 Thread Dongli Zhang
May I ask if there is any feedback on this? Adding pr_err_once() here helps
narrow down the root cause of a cpu online failure.


The issue fixed by d7eb79c6290c ("KVM: kvmclock: Fix vCPUs > 64 can't be
online/hotpluged") is able to demonstrate how this pr_err_once() helps.

Due to VM kernel issue, at most 64 VCPUs can online if host clocksource is
switched to hpet.

# echo hpet > /sys/devices/system/clocksource/clocksource0/current_clocksource


By default, we have no idea why only 64 out of 100 vCPUs are online in the VM.
We need a customized kernel to debug why some CPUs are not able to come online.


If we add the pr_err_once(), we will get the message below and know the root
cause is in kvmclock.

[0.693112] CPUHP callback failure (-12) for cpu 64 at kvmclock:setup_percpu 
(66)

Thank you very much!

Dongli Zhang


On 3/9/21 11:08 PM, Dongli Zhang wrote:
> During bootup or cpu hotplug, the cpuhp_up_callbacks() calls many CPUHP
> callbacks (e.g., perf, mm, workqueue, RCU, kvmclock and more) for each
> cpu to online. It may roll back to its previous state if any of
> callbacks is failed. As a result, the user will not be able to know
> which callback is failed and usually the only symptom is cpu online
> failure.
> 
> The error log is printed for once to have confirm which callback is
> failed.
> 
> Cc: Joe Jin 
> Signed-off-by: Dongli Zhang 
> ---
> I used 'RFC' because WARN_ON_ONCE() is always used for the result from
> cpuhp_invoke_callback(). I would prefer to get feedback from
> maintainers/reviewers. Here I prefer to print the cpuhp name and state
> value to help confirm the specific callback that is failed.
> 
>  kernel/cpu.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 1b6302ecbabe..c7a719079272 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -621,6 +621,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct 
> cpuhp_cpu_state *st,
>   st->state++;
>   ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
>   if (ret) {
> + pr_err_once("CPUHP callback failure (%d) for cpu %u at 
> %s (%d)\n",
> + ret, cpu, cpuhp_get_step(st->state)->name,
> + st->state);
> +
>   if (can_rollback_cpu(st)) {
>   st->target = prev_state;
>   undo_cpu_up(cpu, st);
> 


[PATCH 1/1] KVM: x86: remove unused declaration of kvm_write_tsc()

2021-03-26 Thread Dongli Zhang
The kvm_write_tsc() was not used since commit 0c899c25d754 ("KVM: x86: do
not attempt TSC synchronization on guest writes"). Remove its unused
declaration.

Signed-off-by: Dongli Zhang 
---
 arch/x86/kvm/x86.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 39eb04887141..9035e34aa156 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
 void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs);
 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
 u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
-- 
2.17.1



Re: [PATCH 1/1] KVM: x86: to track if L1 is running L2 VM

2021-03-12 Thread Dongli Zhang
Hi Paolo,

On 3/6/21 5:56 AM, Paolo Bonzini wrote:
> On 05/03/21 23:57, Dongli Zhang wrote:
>> The new per-cpu stat 'nested_run' is introduced in order to track if L1 VM
>> is running or used to run L2 VM.
>>
>> An example of the usage of 'nested_run' is to help the host administrator
>> to easily track if any L1 VM is used to run L2 VM. Suppose there is issue
>> that may happen with nested virtualization, the administrator will be able
>> to easily narrow down and confirm if the issue is due to nested
>> virtualization via 'nested_run'. For example, whether the fix like
>> commit 88dddc11a8d6 ("KVM: nVMX: do not use dangling shadow VMCS after
>> guest reset") is required.
>>
>> Cc: Joe Jin 
>> Signed-off-by: Dongli Zhang 
...
> 
> Queued, thanks.
> 
> Paolo
>

While testing the most recent kvm tree, I did not find this patch queued (the
debugfs entry for nested_run is not available). Would you mind helping to confirm?

Thank you very much!

Dongli Zhang



[PATCH RFC 1/1] kernel/cpu: to track which CPUHP callback is failed

2021-03-09 Thread Dongli Zhang
During bootup or cpu hotplug, the cpuhp_up_callbacks() calls many CPUHP
callbacks (e.g., perf, mm, workqueue, RCU, kvmclock and more) for each
cpu to online. It may roll back to its previous state if any of the
callbacks fails. As a result, the user will not be able to know which
callback failed and usually the only symptom is a cpu online failure.

The error log is printed once to help confirm which callback failed.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
I used 'RFC' because WARN_ON_ONCE() is always used for the result from
cpuhp_invoke_callback(). I would prefer to get feedback from
maintainers/reviewers. Here I prefer to print the cpuhp name and state
value to help confirm the specific callback that failed.

 kernel/cpu.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1b6302ecbabe..c7a719079272 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -621,6 +621,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
 		st->state++;
 		ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
 		if (ret) {
+			pr_err_once("CPUHP callback failure (%d) for cpu %u at %s (%d)\n",
+				    ret, cpu, cpuhp_get_step(st->state)->name,
+				    st->state);
+
if (can_rollback_cpu(st)) {
st->target = prev_state;
undo_cpu_up(cpu, st);
-- 
2.17.1



[PATCH 1/1] KVM: x86: to track if L1 is running L2 VM

2021-03-05 Thread Dongli Zhang
The new per-vCPU stat 'nested_run' is introduced in order to track whether the
L1 VM is running, or has ever run, an L2 VM.

An example of the usage of 'nested_run' is to help the host administrator
easily track whether any L1 VM has been used to run an L2 VM. Suppose there is
an issue that may happen with nested virtualization; the administrator will be
able to easily narrow down and confirm whether the issue is due to nested
virtualization via 'nested_run'. For example, whether a fix like
commit 88dddc11a8d6 ("KVM: nVMX: do not use dangling shadow VMCS after
guest reset") is required.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/svm/nested.c   | 2 ++
 arch/x86/kvm/vmx/nested.c   | 2 ++
 arch/x86/kvm/x86.c  | 1 +
 4 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 877a4025d8da..7669215426ac 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1125,6 +1125,7 @@ struct kvm_vcpu_stat {
u64 req_event;
u64 halt_poll_success_ns;
u64 halt_poll_fail_ns;
+   u64 nested_run;
 };
 
 struct x86_instruction_info;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 35891d9a1099..18c02e958a09 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -494,6 +494,8 @@ int nested_svm_vmrun(struct vcpu_svm *svm)
struct kvm_host_map map;
u64 vmcb12_gpa;
 
+   ++svm->vcpu.stat.nested_run;
+
 	if (is_smm(&svm->vcpu)) {
 		kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcca0b80e0d0..bd1343a0896e 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3453,6 +3453,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
enum nested_evmptrld_status evmptrld_status;
 
+   ++vcpu->stat.nested_run;
+
if (!nested_vmx_check_permission(vcpu))
return 1;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2a20ce60152e..f296febb0485 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -245,6 +245,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("l1d_flush", l1d_flush),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+   VCPU_STAT("nested_run", nested_run),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
-- 
2.17.1



[PATCH v2 1/1] KVM: x86: remove misplaced comment on active_mmu_pages

2021-02-25 Thread Dongli Zhang
The 'mmu_page_hash' is used as a hash table while 'active_mmu_pages' is a
list. Remove the misplaced comment, as it's mostly stating the obvious
anyway.

Signed-off-by: Dongli Zhang 
Reviewed-by: Sean Christopherson 
---
Changed since v1:
  - change 'incorrect' to 'misplaced'

 arch/x86/include/asm/kvm_host.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 84499aad01a4..318242512407 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -937,9 +937,6 @@ struct kvm_arch {
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-   /*
-* Hash table of struct kvm_mmu_page.
-*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
struct list_head lpage_disallowed_mmu_pages;
-- 
2.17.1



Re: [PATCH 1/1] KVM: x86: remove incorrect comment on active_mmu_pages

2021-02-25 Thread Dongli Zhang



On 2/25/21 4:44 PM, Sean Christopherson wrote:
> On Tue, Feb 23, 2021, Dongli Zhang wrote:
>> The 'mmu_page_hash' is used as hash table while 'active_mmu_pages' is a
>> list. This patch removes the incorrect comment on active_mmu_pages.
> 
> Maybe change the last sentence to "Remove the misplaced comment, it's mostly
> stating the obvious anyways."  It's more misplaced than flat out incorrect, 
> e.g.
> the alternative would be to hoist the comment above mmu_page_hash.  I like
> removing it though, IMO mmu_page_hash is the most obvious name out of the
> various structures that track shadow pages.

Thank you very much!

I will change the last sentence and send v2 with your Reviewed-by.

Dongli Zhang

> 
> With that tweak:
> 
> Reviewed-by: Sean Christopherson 
> 
> 
>> Signed-off-by: Dongli Zhang 
>> ---
>>  arch/x86/include/asm/kvm_host.h | 3 ---
>>  1 file changed, 3 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h 
>> b/arch/x86/include/asm/kvm_host.h
>> index 84499aad01a4..318242512407 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -937,9 +937,6 @@ struct kvm_arch {
>>  unsigned int indirect_shadow_pages;
>>  u8 mmu_valid_gen;
>>  struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
>> -/*
>> - * Hash table of struct kvm_mmu_page.
>> - */
>>  struct list_head active_mmu_pages;
>>  struct list_head zapped_obsolete_pages;
>>  struct list_head lpage_disallowed_mmu_pages;
>> -- 
>> 2.17.1
>>


[PATCH 1/1] KVM: x86: remove incorrect comment on active_mmu_pages

2021-02-23 Thread Dongli Zhang
The 'mmu_page_hash' is used as a hash table while 'active_mmu_pages' is a
list. This patch removes the incorrect comment on active_mmu_pages.

Signed-off-by: Dongli Zhang 
---
 arch/x86/include/asm/kvm_host.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 84499aad01a4..318242512407 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -937,9 +937,6 @@ struct kvm_arch {
unsigned int indirect_shadow_pages;
u8 mmu_valid_gen;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
-   /*
-* Hash table of struct kvm_mmu_page.
-*/
struct list_head active_mmu_pages;
struct list_head zapped_obsolete_pages;
struct list_head lpage_disallowed_mmu_pages;
-- 
2.17.1



Re: [vdpa_sim_net] 79991caf52: net/ipv4/ipmr.c:#RCU-list_traversed_in_non-reader_section

2021-02-07 Thread Dongli Zhang
Is it possible that the issue is not due to this change?

This change just calls a different API to allocate memory, which is
equivalent to kzalloc()+vzalloc().

Before the change:

try kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | __GFP_RETRY_MAYFAIL);

... and then the below if the former fails.

vzalloc(sizeof(*vs));


After the change:

try kmalloc_node(size, FP_KERNEL|GFP_ZERO|__GFP_NOWARN|__GFP_NORETRY, node);

... and then the below if the former fails

__vmalloc_node(size, 1, GFP_KERNEL|GFP_ZERO, node, __builtin_return_address(0));
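
Both the old and the new path follow the same try-fast-then-fall-back shape; a
minimal userspace stand-in of that shape (purely illustrative, with made-up
limits and names; the kernel pairs are kzalloc()/vzalloc() before the change and
kmalloc_node()/__vmalloc_node() after it):

#include <stdlib.h>

#define FAST_LIMIT	(64 * 1024)	/* invented cut-off for the fast path */

static void *fast_zalloc(size_t size)	/* models the kmalloc-style path */
{
	return size > FAST_LIMIT ? NULL : calloc(1, size);
}

static void *slow_zalloc(size_t size)	/* models the vmalloc-style path */
{
	return calloc(1, size);
}

static void *zalloc_with_fallback(size_t size)
{
	void *p = fast_zalloc(size);

	return p ? p : slow_zalloc(size);
}

int main(void)
{
	void *p = zalloc_with_fallback(256 * 1024);	/* takes the fallback path */

	free(p);
	return 0;
}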


Below is the first WARNING in the uploaded dmesg. I assume it was triggered
before /dev/vhost-scsi was even opened.

Will this test try to open /dev/vhost-scsi?

[5.095515] =
[5.095515] WARNING: suspicious RCU usage
[5.095515] 5.11.0-rc4-8-g79991caf5202 #1 Not tainted
[5.095534] -
[5.096041] security/smack/smack_lsm.c:351 RCU-list traversed in non-reader
section!!
[5.096982]
[5.096982] other info that might help us debug this:
[5.096982]
[5.097953]
[5.097953] rcu_scheduler_active = 1, debug_locks = 1
[5.098739] no locks held by kthreadd/2.
[5.099237]
[5.099237] stack backtrace:
[5.099537] CPU: 0 PID: 2 Comm: kthreadd Not tainted
5.11.0-rc4-8-g79991caf5202 #1
[5.100470] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
1.12.0-1 04/01/2014
[5.101442] Call Trace:
[5.101807]  dump_stack+0x15f/0x1bf
[5.102298]  smack_cred_prepare+0x400/0x420
[5.102840]  ? security_prepare_creds+0xd4/0x120
[5.103441]  security_prepare_creds+0x84/0x120
[5.103515]  prepare_creds+0x3f1/0x580
[5.103515]  copy_creds+0x65/0x480
[5.103515]  copy_process+0x7b4/0x3600
[5.103515]  ? check_prev_add+0xa40/0xa40
[5.103515]  ? lockdep_enabled+0xd/0x60
[5.103515]  ? lock_is_held_type+0x1a/0x100
[5.103515]  ? __cleanup_sighand+0xc0/0xc0
[5.103515]  ? lockdep_unlock+0x39/0x160
[5.103515]  kernel_clone+0x165/0xd20
[5.103515]  ? copy_init_mm+0x20/0x20
[5.103515]  ? pvclock_clocksource_read+0xd9/0x1a0
[5.103515]  ? sched_clock_local+0x99/0xc0
[5.103515]  ? kthread_insert_work_sanity_check+0xc0/0xc0
[5.103515]  kernel_thread+0xba/0x100
[5.103515]  ? __ia32_sys_clone3+0x40/0x40
[5.103515]  ? kthread_insert_work_sanity_check+0xc0/0xc0
[5.103515]  ? do_raw_spin_unlock+0xa9/0x160
[5.103515]  kthreadd+0x68f/0x7a0
[5.103515]  ? kthread_create_on_cpu+0x160/0x160
[5.103515]  ? lockdep_hardirqs_on+0x77/0x100
[5.103515]  ? _raw_spin_unlock_irq+0x24/0x60
[5.103515]  ? kthread_create_on_cpu+0x160/0x160
[5.103515]  ret_from_fork+0x22/0x30

Thank you very much!

Dongli Zhang


On 2/6/21 7:03 PM, kernel test robot wrote:
> 
> Greeting,
> 
> FYI, we noticed the following commit (built with gcc-9):
> 
> commit: 79991caf5202c7989928be534727805f8f68bb8d ("vdpa_sim_net: Add support 
> for user supported devices")
> https://urldefense.com/v3/__https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git__;!!GqivPVa7Brio!LfgrgVVtPAjwjqTZX8yANgsix4f3cJmAA_CcMeCVymh5XYcamWdR9dnbIQA-p61PJtI$
>   
> Dongli-Zhang/vhost-scsi-alloc-vhost_scsi-with-kvzalloc-to-avoid-delay/20210129-191605
> 
> 
> in testcase: trinity
> version: trinity-static-x86_64-x86_64-f93256fb_2019-08-28
> with following parameters:
> 
>   runtime: 300s
> 
> test-description: Trinity is a linux system call fuzz tester.
> test-url: 
> https://urldefense.com/v3/__http://codemonkey.org.uk/projects/trinity/__;!!GqivPVa7Brio!LfgrgVVtPAjwjqTZX8yANgsix4f3cJmAA_CcMeCVymh5XYcamWdR9dnbIQA-6Y4x88c$
>  
> 
> 
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 8G
> 
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
> 
> 
> +--------------------------------------------------------------------------+------------+------------+
> |                                                                          | 39502d042a | 79991caf52 |
> +--------------------------------------------------------------------------+------------+------------+
> | boot_successes                                                           | 0          | 0          |
> | boot_failures                                                            | 62         | 57         |
> | WARNING:suspicious_RCU_usage                                             | 62         | 57         |
> | security/smack/smack_lsm.c:#RCU-list_traversed_in_non-reader_section     | 62         | 57         |
> | security/smack/smack_access.c:#RCU-list_traversed_in_non-reader_section  | 62         | 57         |
> | BUG:workqueue_lockup-pool                                                | 33         | 40         |
> | BUG:kernel_hang_in_boot_stage

[PATCH RFC v1 2/6] swiotlb: convert variables to arrays

2021-02-03 Thread Dongli Zhang
This patch converts several swiotlb-related variables to arrays, in
order to maintain stats/status for different swiotlb buffers. Here are
the variables involved:

- io_tlb_start and io_tlb_end
- io_tlb_nslabs and io_tlb_used
- io_tlb_list
- io_tlb_index
- max_segment
- io_tlb_orig_addr
- no_iotlb_memory

There is no functional change and this is to prepare to enable 64-bit
swiotlb.
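
The conversion itself is mechanical: every former scalar becomes an array
indexed by the buffer id, and every access gains an index. A tiny self-contained
sketch of the pattern (illustrative only; SWIOTLB_LO/SWIOTLB_MAX mirror the
patch, SWIOTLB_HI is an assumed name for the second buffer, and the state is
reduced to two counters):

#include <stdio.h>

enum swiotlb_id { SWIOTLB_LO, SWIOTLB_HI, SWIOTLB_MAX };

static unsigned long io_tlb_nslabs[SWIOTLB_MAX];	/* was: one scalar */
static unsigned long io_tlb_used[SWIOTLB_MAX];		/* was: one scalar */

static unsigned long slots_free(enum swiotlb_id id)
{
	return io_tlb_nslabs[id] - io_tlb_used[id];	/* every access gains an index */
}

int main(void)
{
	io_tlb_nslabs[SWIOTLB_LO] = 1024;
	io_tlb_used[SWIOTLB_LO] = 100;
	printf("free LO slots: %lu\n", slots_free(SWIOTLB_LO));
	return 0;
}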

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 arch/powerpc/platforms/pseries/svm.c |   6 +-
 drivers/xen/swiotlb-xen.c|   4 +-
 include/linux/swiotlb.h  |   5 +-
 kernel/dma/swiotlb.c | 257 ++-
 4 files changed, 140 insertions(+), 132 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 7b739cc7a8a9..9f8842d0da1f 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -55,9 +55,9 @@ void __init svm_swiotlb_init(void)
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
return;
 
-   if (io_tlb_start)
-   memblock_free_early(io_tlb_start,
-   PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+   if (io_tlb_start[SWIOTLB_LO])
+   memblock_free_early(io_tlb_start[SWIOTLB_LO],
+				    PAGE_ALIGN(io_tlb_nslabs[SWIOTLB_LO] << IO_TLB_SHIFT));
panic("SVM: Cannot allocate SWIOTLB buffer");
 }
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 2b385c1b4a99..3261880ad859 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -192,8 +192,8 @@ int __ref xen_swiotlb_init(int verbose, bool early)
/*
 * IO TLB memory already allocated. Just use it.
 */
-   if (io_tlb_start != 0) {
-   xen_io_tlb_start = phys_to_virt(io_tlb_start);
+   if (io_tlb_start[SWIOTLB_LO] != 0) {
+   xen_io_tlb_start = phys_to_virt(io_tlb_start[SWIOTLB_LO]);
goto end;
}
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ca125c1b1281..777046cd4d1b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -76,11 +76,12 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys,
 
 #ifdef CONFIG_SWIOTLB
 extern enum swiotlb_force swiotlb_force;
-extern phys_addr_t io_tlb_start, io_tlb_end;
+extern phys_addr_t io_tlb_start[], io_tlb_end[];
 
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
 {
-   return paddr >= io_tlb_start && paddr < io_tlb_end;
+   return paddr >= io_tlb_start[SWIOTLB_LO] &&
+  paddr < io_tlb_end[SWIOTLB_LO];
 }
 
 void __init swiotlb_exit(void);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 7c42df6e6100..1fbb65daa2dd 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -69,38 +69,38 @@ enum swiotlb_force swiotlb_force;
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by 
this
  * API.
  */
-phys_addr_t io_tlb_start, io_tlb_end;
+phys_addr_t io_tlb_start[SWIOTLB_MAX], io_tlb_end[SWIOTLB_MAX];
 
 /*
  * The number of IO TLB blocks (in groups of 64) between io_tlb_start and
  * io_tlb_end.  This is command line adjustable via setup_io_tlb_npages.
  */
-static unsigned long io_tlb_nslabs;
+static unsigned long io_tlb_nslabs[SWIOTLB_MAX];
 
 /*
  * The number of used IO TLB block
  */
-static unsigned long io_tlb_used;
+static unsigned long io_tlb_used[SWIOTLB_MAX];
 
 /*
  * This is a free list describing the number of free entries available from
  * each index
  */
-static unsigned int *io_tlb_list;
-static unsigned int io_tlb_index;
+static unsigned int *io_tlb_list[SWIOTLB_MAX];
+static unsigned int io_tlb_index[SWIOTLB_MAX];
 
 /*
  * Max segment that we can provide which (if pages are contingous) will
  * not be bounced (unless SWIOTLB_FORCE is set).
  */
-static unsigned int max_segment;
+static unsigned int max_segment[SWIOTLB_MAX];
 
 /*
  * We need to save away the original address corresponding to a mapped entry
  * for the sync operations.
  */
 #define INVALID_PHYS_ADDR (~(phys_addr_t)0)
-static phys_addr_t *io_tlb_orig_addr;
+static phys_addr_t *io_tlb_orig_addr[SWIOTLB_MAX];
 
 /*
  * Protect the above data structures in the map and unmap calls
@@ -113,9 +113,9 @@ static int __init
 setup_io_tlb_npages(char *str)
 {
if (isdigit(*str)) {
-   io_tlb_nslabs = simple_strtoul(str, , 0);
+   io_tlb_nslabs[SWIOTLB_LO] = simple_strtoul(str, , 0);
/* avoid tail segment of size < IO_TLB_SEGSIZE */
-   io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE);
+   io_tlb_nslabs[SWIOTLB_LO] = ALIGN(io_tlb_nslabs[SWIOTLB_LO], 
IO_TLB_SEGSIZE);
}
if (*str == ',')
++str;
@@ -123,40 +123,40 @@ setup_io_tlb_npages(char *str)
swiotlb_fo

[PATCH RFC v1 6/6] xen-swiotlb: enable 64-bit xen-swiotlb

2021-02-03 Thread Dongli Zhang
This patch enables the 64-bit xen-swiotlb buffer.

For Xen PVM DMA addresses, a 64-bit device will be able to allocate from
the 64-bit swiotlb buffer.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 drivers/xen/swiotlb-xen.c | 117 --
 1 file changed, 74 insertions(+), 43 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index e18cae693cdc..c9ab07809e32 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -108,27 +108,36 @@ static int is_xen_swiotlb_buffer(struct device *dev, 
dma_addr_t dma_addr)
unsigned long bfn = XEN_PFN_DOWN(dma_to_phys(dev, dma_addr));
unsigned long xen_pfn = bfn_to_local_pfn(bfn);
phys_addr_t paddr = (phys_addr_t)xen_pfn << XEN_PAGE_SHIFT;
+   int i;
 
/* If the address is outside our domain, it CAN
 * have the same virtual address as another address
 * in our domain. Therefore _only_ check address within our domain.
 */
-   if (pfn_valid(PFN_DOWN(paddr))) {
-   return paddr >= virt_to_phys(xen_io_tlb_start[SWIOTLB_LO]) &&
-  paddr < virt_to_phys(xen_io_tlb_end[SWIOTLB_LO]);
-   }
+   if (!pfn_valid(PFN_DOWN(paddr)))
+   return 0;
+
+   for (i = 0; i < swiotlb_nr; i++)
+   if (paddr >= virt_to_phys(xen_io_tlb_start[i]) &&
+   paddr < virt_to_phys(xen_io_tlb_end[i]))
+   return 1;
+
return 0;
 }
 
 static int
-xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs)
+xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs,
+ enum swiotlb_t type)
 {
int i, rc;
int dma_bits;
dma_addr_t dma_handle;
phys_addr_t p = virt_to_phys(buf);
 
-   dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT;
+   if (type == SWIOTLB_HI)
+   dma_bits = max_dma_bits[SWIOTLB_HI];
+   else
+   dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + 
PAGE_SHIFT;
 
i = 0;
do {
@@ -139,7 +148,7 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long 
nslabs)
p + (i << IO_TLB_SHIFT),
get_order(slabs << IO_TLB_SHIFT),
dma_bits, _handle);
-   } while (rc && dma_bits++ < max_dma_bits[SWIOTLB_LO]);
+   } while (rc && dma_bits++ < max_dma_bits[type]);
if (rc)
return rc;
 
@@ -147,16 +156,17 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long 
nslabs)
} while (i < nslabs);
return 0;
 }
-static unsigned long xen_set_nslabs(unsigned long nr_tbl)
+
+static unsigned long xen_set_nslabs(unsigned long nr_tbl, enum swiotlb_t type)
 {
if (!nr_tbl) {
-   xen_io_tlb_nslabs[SWIOTLB_LO] = (64 * 1024 * 1024 >> 
IO_TLB_SHIFT);
-   xen_io_tlb_nslabs[SWIOTLB_LO] = 
ALIGN(xen_io_tlb_nslabs[SWIOTLB_LO],
+   xen_io_tlb_nslabs[type] = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
+   xen_io_tlb_nslabs[type] = ALIGN(xen_io_tlb_nslabs[type],
  IO_TLB_SEGSIZE);
} else
-   xen_io_tlb_nslabs[SWIOTLB_LO] = nr_tbl;
+   xen_io_tlb_nslabs[type] = nr_tbl;
 
-   return xen_io_tlb_nslabs[SWIOTLB_LO] << IO_TLB_SHIFT;
+   return xen_io_tlb_nslabs[type] << IO_TLB_SHIFT;
 }
 
 enum xen_swiotlb_err {
@@ -180,23 +190,24 @@ static const char *xen_swiotlb_error(enum xen_swiotlb_err 
err)
}
return "";
 }
-int __ref xen_swiotlb_init(int verbose, bool early)
+
+static int xen_swiotlb_init_type(int verbose, bool early, enum swiotlb_t type)
 {
unsigned long bytes, order;
int rc = -ENOMEM;
enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
unsigned int repeat = 3;
 
-   xen_io_tlb_nslabs[SWIOTLB_LO] = swiotlb_nr_tbl(SWIOTLB_LO);
+   xen_io_tlb_nslabs[type] = swiotlb_nr_tbl(type);
 retry:
-   bytes = xen_set_nslabs(xen_io_tlb_nslabs[SWIOTLB_LO]);
-   order = get_order(xen_io_tlb_nslabs[SWIOTLB_LO] << IO_TLB_SHIFT);
+   bytes = xen_set_nslabs(xen_io_tlb_nslabs[type], type);
+   order = get_order(xen_io_tlb_nslabs[type] << IO_TLB_SHIFT);
 
/*
 * IO TLB memory already allocated. Just use it.
 */
-   if (io_tlb_start[SWIOTLB_LO] != 0) {
-   xen_io_tlb_start[SWIOTLB_LO] = 
phys_to_virt(io_tlb_start[SWIOTLB_LO]);
+   if (io_tlb_start[type] != 0) {
+   xen_io_tlb_start[type] = phys_to_virt(io_tlb_start[type]);
goto end;
}
 
@@ -204,81 +215,95 @@ int __ref xen_swiotlb_init(int verbose, bool early)
 * Get IO TLB memory from any location.
 */
if (early) {
-  

[PATCH RFC v1 5/6] xen-swiotlb: convert variables to arrays

2021-02-03 Thread Dongli Zhang
This patch converts several xen-swiotlb-related variables to arrays, in
order to maintain state and statistics for the different swiotlb buffers.
Here are the variables involved:

- xen_io_tlb_start and xen_io_tlb_end
- xen_io_tlb_nslabs
- MAX_DMA_BITS

There is no functional change and this is to prepare to enable 64-bit
xen-swiotlb.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 drivers/xen/swiotlb-xen.c | 75 +--
 1 file changed, 40 insertions(+), 35 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 662638093542..e18cae693cdc 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -39,15 +39,17 @@
 #include 
 
 #include 
-#define MAX_DMA_BITS 32
 /*
  * Used to do a quick range check in swiotlb_tbl_unmap_single and
  * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by 
this
  * API.
  */
 
-static char *xen_io_tlb_start, *xen_io_tlb_end;
-static unsigned long xen_io_tlb_nslabs;
+static char *xen_io_tlb_start[SWIOTLB_MAX], *xen_io_tlb_end[SWIOTLB_MAX];
+static unsigned long xen_io_tlb_nslabs[SWIOTLB_MAX];
+
+static int max_dma_bits[] = {32, 64};
+
 /*
  * Quick lookup value of the bus address of the IOTLB.
  */
@@ -112,8 +114,8 @@ static int is_xen_swiotlb_buffer(struct device *dev, 
dma_addr_t dma_addr)
 * in our domain. Therefore _only_ check address within our domain.
 */
if (pfn_valid(PFN_DOWN(paddr))) {
-   return paddr >= virt_to_phys(xen_io_tlb_start) &&
-  paddr < virt_to_phys(xen_io_tlb_end);
+   return paddr >= virt_to_phys(xen_io_tlb_start[SWIOTLB_LO]) &&
+  paddr < virt_to_phys(xen_io_tlb_end[SWIOTLB_LO]);
}
return 0;
 }
@@ -137,7 +139,7 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long 
nslabs)
p + (i << IO_TLB_SHIFT),
get_order(slabs << IO_TLB_SHIFT),
dma_bits, _handle);
-   } while (rc && dma_bits++ < MAX_DMA_BITS);
+   } while (rc && dma_bits++ < max_dma_bits[SWIOTLB_LO]);
if (rc)
return rc;
 
@@ -148,12 +150,13 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long 
nslabs)
 static unsigned long xen_set_nslabs(unsigned long nr_tbl)
 {
if (!nr_tbl) {
-   xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT);
-   xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE);
+   xen_io_tlb_nslabs[SWIOTLB_LO] = (64 * 1024 * 1024 >> 
IO_TLB_SHIFT);
+   xen_io_tlb_nslabs[SWIOTLB_LO] = 
ALIGN(xen_io_tlb_nslabs[SWIOTLB_LO],
+ IO_TLB_SEGSIZE);
} else
-   xen_io_tlb_nslabs = nr_tbl;
+   xen_io_tlb_nslabs[SWIOTLB_LO] = nr_tbl;
 
-   return xen_io_tlb_nslabs << IO_TLB_SHIFT;
+   return xen_io_tlb_nslabs[SWIOTLB_LO] << IO_TLB_SHIFT;
 }
 
 enum xen_swiotlb_err {
@@ -184,16 +187,16 @@ int __ref xen_swiotlb_init(int verbose, bool early)
enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN;
unsigned int repeat = 3;
 
-   xen_io_tlb_nslabs = swiotlb_nr_tbl(SWIOTLB_LO);
+   xen_io_tlb_nslabs[SWIOTLB_LO] = swiotlb_nr_tbl(SWIOTLB_LO);
 retry:
-   bytes = xen_set_nslabs(xen_io_tlb_nslabs);
-   order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT);
+   bytes = xen_set_nslabs(xen_io_tlb_nslabs[SWIOTLB_LO]);
+   order = get_order(xen_io_tlb_nslabs[SWIOTLB_LO] << IO_TLB_SHIFT);
 
/*
 * IO TLB memory already allocated. Just use it.
 */
if (io_tlb_start[SWIOTLB_LO] != 0) {
-   xen_io_tlb_start = phys_to_virt(io_tlb_start[SWIOTLB_LO]);
+   xen_io_tlb_start[SWIOTLB_LO] = 
phys_to_virt(io_tlb_start[SWIOTLB_LO]);
goto end;
}
 
@@ -201,76 +204,78 @@ int __ref xen_swiotlb_init(int verbose, bool early)
 * Get IO TLB memory from any location.
 */
if (early) {
-   xen_io_tlb_start = memblock_alloc(PAGE_ALIGN(bytes),
+   xen_io_tlb_start[SWIOTLB_LO] = memblock_alloc(PAGE_ALIGN(bytes),
  PAGE_SIZE);
-   if (!xen_io_tlb_start)
+   if (!xen_io_tlb_start[SWIOTLB_LO])
panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
  __func__, PAGE_ALIGN(bytes), PAGE_SIZE);
} else {
 #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT))
 #define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT)
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
-   xen_io_tlb_start = (void 
*)xen_get_swiotlb_free_pages(order);
-   if (xen_io_tlb_start)
+   

[PATCH RFC v1 1/6] swiotlb: define new enumerated type

2021-02-03 Thread Dongli Zhang
This is just to define a new enumerated type, without functional change.

'SWIOTLB_LO' indexes the legacy 32-bit swiotlb buffer, while 'SWIOTLB_HI'
indexes the 64-bit buffer.

This is to prepare to enable 64-bit swiotlb.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 include/linux/swiotlb.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index d9c9fc9ca5d2..ca125c1b1281 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -17,6 +17,12 @@ enum swiotlb_force {
SWIOTLB_NO_FORCE,   /* swiotlb=noforce */
 };
 
+enum swiotlb_t {
+   SWIOTLB_LO,
+   SWIOTLB_HI,
+   SWIOTLB_MAX,
+};
+
 /*
  * Maximum allowable number of contiguous slabs to map,
  * must be a power of 2.  What is the appropriate value ?
-- 
2.17.1



[PATCH RFC v1 4/6] swiotlb: enable 64-bit swiotlb

2021-02-03 Thread Dongli Zhang
This patch is to enable the 64-bit swiotlb buffer.

The state-of-the-art swiotlb pre-allocates <=32-bit memory in order to meet
the DMA mask requirement of some 32-bit legacy devices. Considering that
most devices nowadays support 64-bit DMA and an IOMMU is available, the
swiotlb is not used most of the time, except:

1. The Xen PVM domain requires DMA addresses to be both (1) less than the
DMA mask, and (2) contiguous in machine address. Therefore, a 64-bit
device may still require swiotlb on a PVM domain.

2. From the source code, AMD SME/SEV enables SWIOTLB_FORCE. As a result it
is always required to allocate from the swiotlb buffer even when the
device DMA mask is 64-bit.

sme_early_init()
-> if (sev_active())
   swiotlb_force = SWIOTLB_FORCE;

Therefore, this patch introduces a 2nd swiotlb buffer for 64-bit DMA
access. For instance, swiotlb_tbl_map_single() allocates from the 2nd
64-bit buffer if the device DMA mask
min_not_zero(*hwdev->dma_mask, hwdev->bus_dma_limit) is 64-bit.

The example to configure the 64-bit swiotlb is "swiotlb=65536,524288,force"
or "swiotlb=,524288,force", where 524288 is the slab count of the 64-bit
buffer (524288 slabs of 2 KB each, i.e. 1 GB).

With the patch, the kernel will be able to allocate >4GB swiotlb buffer.
This patch is only for swiotlb, not including xen-swiotlb.
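
As a rough illustration of the buffer selection described above, here is a
minimal sketch (not the patch code; the helper name swiotlb_type_for_device()
is made up for this example):

#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/swiotlb.h>

/* Pick the bounce buffer type from the effective DMA addressing limit. */
static enum swiotlb_t swiotlb_type_for_device(struct device *hwdev)
{
	u64 mask = min_not_zero(*hwdev->dma_mask, hwdev->bus_dma_limit);

	/* Only a device that can address the full 64 bits uses the 2nd buffer. */
	return (mask == DMA_BIT_MASK(64)) ? SWIOTLB_HI : SWIOTLB_LO;
}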

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 arch/mips/cavium-octeon/dma-octeon.c |   3 +-
 arch/powerpc/kernel/dma-swiotlb.c|   2 +-
 arch/powerpc/platforms/pseries/svm.c |   2 +-
 arch/x86/kernel/pci-swiotlb.c|   5 +-
 arch/x86/pci/sta2x11-fixup.c |   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_internal.c |   4 +-
 drivers/gpu/drm/i915/i915_scatterlist.h  |   2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c|   2 +-
 drivers/mmc/host/sdhci.c |   2 +-
 drivers/pci/xen-pcifront.c   |   2 +-
 drivers/xen/swiotlb-xen.c|   9 +-
 include/linux/swiotlb.h  |  28 +-
 kernel/dma/swiotlb.c | 339 +++
 13 files changed, 238 insertions(+), 164 deletions(-)

diff --git a/arch/mips/cavium-octeon/dma-octeon.c 
b/arch/mips/cavium-octeon/dma-octeon.c
index df70308db0e6..3480555d908a 100644
--- a/arch/mips/cavium-octeon/dma-octeon.c
+++ b/arch/mips/cavium-octeon/dma-octeon.c
@@ -245,6 +245,7 @@ void __init plat_swiotlb_setup(void)
panic("%s: Failed to allocate %zu bytes align=%lx\n",
  __func__, swiotlbsize, PAGE_SIZE);
 
-   if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs, 1) == -ENOMEM)
+   if (swiotlb_init_with_tbl(octeon_swiotlb, swiotlb_nslabs,
+ SWIOTLB_LO, 1) == -ENOMEM)
panic("Cannot allocate SWIOTLB buffer");
 }
diff --git a/arch/powerpc/kernel/dma-swiotlb.c 
b/arch/powerpc/kernel/dma-swiotlb.c
index fc7816126a40..88113318c53f 100644
--- a/arch/powerpc/kernel/dma-swiotlb.c
+++ b/arch/powerpc/kernel/dma-swiotlb.c
@@ -20,7 +20,7 @@ void __init swiotlb_detect_4g(void)
 static int __init check_swiotlb_enabled(void)
 {
if (ppc_swiotlb_enable)
-   swiotlb_print_info();
+   swiotlb_print_info(SWIOTLB_LO);
else
swiotlb_exit();
 
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index 9f8842d0da1f..77910e4ffad8 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -52,7 +52,7 @@ void __init svm_swiotlb_init(void)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
 
vstart = memblock_alloc(PAGE_ALIGN(bytes), PAGE_SIZE);
-   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, false))
+   if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, SWIOTLB_LO, 
false))
return;
 
if (io_tlb_start[SWIOTLB_LO])
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index c2cfa5e7c152..950624fd95a4 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -67,12 +67,15 @@ void __init pci_swiotlb_init(void)
 
 void __init pci_swiotlb_late_init(void)
 {
+   int i;
+
/* An IOMMU turned us off. */
if (!swiotlb)
swiotlb_exit();
else {
printk(KERN_INFO "PCI-DMA: "
   "Using software bounce buffering for IO (SWIOTLB)\n");
-   swiotlb_print_info();
+   for (i = 0; i < swiotlb_nr; i++)
+   swiotlb_print_info(i);
}
 }
diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c
index 7d2525691854..c440520b2055 100644
--- a/arch/x86/pci/sta2x11-fixup.c
+++ b/arch/x86/pci/sta2x11-fixup.c
@@ -57,7 +57,7 @@ static void sta2x11_new_instance(struct pci_dev *pdev)
int size = STA2X11_SWIOTLB_SIZE;
/* First instance

[PATCH RFC v1 0/6] swiotlb: 64-bit DMA buffer

2021-02-03 Thread Dongli Zhang
This RFC is to introduce the 2nd swiotlb buffer for 64-bit DMA access.  The
prototype is based on v5.11-rc6.

The state-of-the-art swiotlb pre-allocates <=32-bit memory in order to meet
the DMA mask requirement of some 32-bit legacy devices. Considering that
most devices nowadays support 64-bit DMA and an IOMMU is available, the
swiotlb is not used most of the time, except:

1. The Xen PVM domain requires DMA addresses to be both (1) <= the device
DMA mask, and (2) contiguous in machine address. Therefore, a 64-bit
device may still require swiotlb on a PVM domain.

2. From the source code, AMD SME/SEV enables SWIOTLB_FORCE. As a result it
is always required to allocate from the swiotlb buffer even when the
device DMA mask is 64-bit.

sme_early_init()
-> if (sev_active())
   swiotlb_force = SWIOTLB_FORCE;


Therefore, this RFC introduces the 2nd swiotlb buffer for 64-bit DMA
access.  For instance, the swiotlb_tbl_map_single() allocates from the 2nd
64-bit buffer if the device DMA mask min_not_zero(*hwdev->dma_mask,
hwdev->bus_dma_limit) is 64-bit.  With the RFC, the Xen/AMD will be able to
allocate >4GB swiotlb buffer.

With the buffer being 64-bit, it could (not in this patch set, but certainly
possible) be allocated at runtime, meaning the size could change depending
on the device MMIO buffers, etc.


I have tested the patch set on Xen PVM dom0 boot via QEMU. The dom0 is
booted via:

qemu-system-x86_64 -smp 8 -m 20G -enable-kvm -vnc :9 \
-net nic -net user,hostfwd=tcp::5029-:22 \
-hda disk.img \
-device nvme,drive=nvme0,serial=deudbeaf1,max_ioqpairs=16 \
-drive file=test.qcow2,if=none,id=nvme0 \
-serial stdio

The "swiotlb=65536,1048576,force" is to configure 32-bit swiotlb as 128MB
and 64-bit swiotlb as 2048MB. The swiotlb is enforced.
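
As a quick sanity check of those sizes, here is a minimal sketch of the
slab-count to byte conversion (IO_TLB_SHIFT comes from <linux/swiotlb.h>
and is 11 here, i.e. one slab is 2 KB):

	unsigned long lo_bytes = 65536UL   << IO_TLB_SHIFT;  /* 128 MB  (32-bit buffer) */
	unsigned long hi_bytes = 1048576UL << IO_TLB_SHIFT;  /* 2048 MB (64-bit buffer) */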

vm# cat /proc/cmdline 
placeholder root=UUID=4e942d60-c228-4caf-b98e-f41c365d9703 ro text
swiotlb=65536,1048576,force quiet splash

[5.119877] Booting paravirtualized kernel on Xen
... ...
[5.190423] software IO TLB: Low Mem mapped [mem 
0x000234e0-0x00023ce0] (128MB)
[6.276161] software IO TLB: High Mem mapped [mem 
0x000166f33000-0x0001e6f33000] (2048MB)

0x000234e0 is mapped to 0x001c (32-bit machine address)
0x00023ce0-1 is mapped to 0x0ff3 (32-bit machine address)
0x000166f33000 is mapped to 0x0004b728 (64-bit machine address)
0x0001e6f33000-1 is mapped to 0x00033a07 (64-bit machine address)


While running fio for emulated-NVMe, the swiotlb is allocating from 64-bit
io_tlb_used-highmem.

vm# cat /sys/kernel/debug/swiotlb/io_tlb_nslabs
65536
vm# cat /sys/kernel/debug/swiotlb/io_tlb_used
258
vm# cat /sys/kernel/debug/swiotlb/io_tlb_nslabs-highmem
1048576
vm# cat /sys/kernel/debug/swiotlb/io_tlb_used-highmem
58880


I also tested virtio-scsi (with "disable-legacy=on,iommu_platform=true") on
VM with AMD SEV enabled.

qemu-system-x86_64 -enable-kvm -machine q35 -smp 36 -m 20G \
-drive if=pflash,format=raw,unit=0,file=OVMF_CODE.pure-efi.fd,readonly \
-drive if=pflash,format=raw,unit=1,file=OVMF_VARS.fd \
-hda ol7-uefi.qcow2 -serial stdio -vnc :9 \
-net nic -net user,hostfwd=tcp::5029-:22 \
-cpu EPYC -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1 \
-machine memory-encryption=sev0 \
-device virtio-scsi-pci,id=scsi,disable-legacy=on,iommu_platform=true \
-device scsi-hd,drive=disk0 \
-drive file=test.qcow2,if=none,id=disk0,format=qcow2

The "swiotlb=65536,1048576" is to configure 32-bit swiotlb as 128MB and
64-bit swiotlb as 2048MB. We do not need to force swiotlb because AMD SEV
will set SWIOTLB_FORCE.

# cat /proc/cmdline
BOOT_IMAGE=/vmlinuz-5.11.0-rc6swiotlb+ root=/dev/mapper/ol-root ro
crashkernel=auto rd.lvm.lv=ol/root rd.lvm.lv=ol/swap rhgb quiet
LANG=en_US.UTF-8 swiotlb=65536,1048576

[0.729790] AMD Memory Encryption Features active: SEV
... ...
[2.113147] software IO TLB: Low Mem mapped [mem 
0x73e1e000-0x7be1e000] (128MB)
[2.113151] software IO TLB: High Mem mapped [mem 
0x0004e840-0x00056840] (2048MB)

While running fio for virtio-scsi, the swiotlb is allocating from 64-bit
io_tlb_used-highmem.

vm# cat /sys/kernel/debug/swiotlb/io_tlb_nslabs
65536
vm# cat /sys/kernel/debug/swiotlb/io_tlb_used
0
vm# cat /sys/kernel/debug/swiotlb/io_tlb_nslabs-highmem
1048576
vm# cat /sys/kernel/debug/swiotlb/io_tlb_used-highmem
64647


Please let me know if there is any feedback for this idea and RFC.


Dongli Zhang (6):
   swiotlb: define new enumerated type
   swiotlb: convert variables to arrays
   swiotlb: introduce swiotlb_get_type() to calculate swiotlb buffer type
   swiotlb: enable 64-bit swiotlb
   xen-swiotlb: convert variables to arrays
   xen-swiotlb: enable 64-bit xen-swiotlb

 arch/mips/cavium-octeon/dma-octeon.c |   3 +-
 arch/powerpc/kernel/dma-swiotlb.c|   2 +-
 arch/powerpc/platforms/pseries/svm.c |   8 +-
 arch/x86/kernel/pci-swiotlb.c  

[PATCH RFC v1 3/6] swiotlb: introduce swiotlb_get_type() to calculate swiotlb buffer type

2021-02-03 Thread Dongli Zhang
This patch introduces swiotlb_get_type() in order to calculate which
swiotlb buffer the given DMA address belongs to.

This is to prepare to enable 64-bit swiotlb.

Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
 include/linux/swiotlb.h | 14 ++
 kernel/dma/swiotlb.c|  2 ++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 777046cd4d1b..3d5980d77810 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -3,6 +3,7 @@
 #define __LINUX_SWIOTLB_H
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -23,6 +24,8 @@ enum swiotlb_t {
SWIOTLB_MAX,
 };
 
+extern int swiotlb_nr;
+
 /*
  * Maximum allowable number of contiguous slabs to map,
  * must be a power of 2.  What is the appropriate value ?
@@ -84,6 +87,17 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr)
   paddr < io_tlb_end[SWIOTLB_LO];
 }
 
+static inline int swiotlb_get_type(phys_addr_t paddr)
+{
+   int i;
+
+   for (i = 0; i < swiotlb_nr; i++)
+   if (paddr >= io_tlb_start[i] && paddr < io_tlb_end[i])
+   return i;
+
+   return -ENOENT;
+}
+
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fbb65daa2dd..c91d3d2c3936 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -109,6 +109,8 @@ static DEFINE_SPINLOCK(io_tlb_lock);
 
 static int late_alloc;
 
+int swiotlb_nr = 1;
+
 static int __init
 setup_io_tlb_npages(char *str)
 {
-- 
2.17.1



Re: [PATCH v2] xen-blkback: fix compatibility bug with single page rings

2021-01-30 Thread Dongli Zhang



On 1/29/21 12:13 AM, Paul Durrant wrote:
>> -Original Message-
>> From: Jürgen Groß 
>> Sent: 29 January 2021 07:35
>> To: Dongli Zhang ; Paul Durrant ; xen-
>> de...@lists.xenproject.org; linux-bl...@vger.kernel.org; 
>> linux-kernel@vger.kernel.org
>> Cc: Paul Durrant ; Konrad Rzeszutek Wilk 
>> ; Roger Pau
>> Monné ; Jens Axboe 
>> Subject: Re: [PATCH v2] xen-blkback: fix compatibility bug with single page 
>> rings
>>
>> On 29.01.21 07:20, Dongli Zhang wrote:
>>>
>>>
>>> On 1/28/21 5:04 AM, Paul Durrant wrote:
>>>> From: Paul Durrant 
>>>>
>>>> Prior to commit 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid
>>>> inconsistent xenstore 'ring-page-order' set by malicious blkfront"), the
>>>> behaviour of xen-blkback when connecting to a frontend was:
>>>>
>>>> - read 'ring-page-order'
>>>> - if not present then expect a single page ring specified by 'ring-ref'
>>>> - else expect a ring specified by 'ring-refX' where X is between 0 and
>>>>1 << ring-page-order
>>>>
>>>> This was correct behaviour, but was broken by the afforementioned commit to
>>>> become:
>>>>
>>>> - read 'ring-page-order'
>>>> - if not present then expect a single page ring (i.e. ring-page-order = 0)
>>>> - expect a ring specified by 'ring-refX' where X is between 0 and
>>>>1 << ring-page-order
>>>> - if that didn't work then see if there's a single page ring specified by
>>>>'ring-ref'
>>>>
>>>> This incorrect behaviour works most of the time but fails when a frontend
>>>> that sets 'ring-page-order' is unloaded and replaced by one that does not
>>>> because, instead of reading 'ring-ref', xen-blkback will read the stale
>>>> 'ring-ref0' left around by the previous frontend will try to map the wrong
>>>> grant reference.
>>>>
>>>> This patch restores the original behaviour.
>>>>
>>>> Fixes: 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid 
>>>> inconsistent xenstore 'ring-page-
>> order' set by malicious blkfront")
>>>> Signed-off-by: Paul Durrant 
>>>> ---
>>>> Cc: Konrad Rzeszutek Wilk 
>>>> Cc: "Roger Pau Monné" 
>>>> Cc: Jens Axboe 
>>>> Cc: Dongli Zhang 
>>>>
>>>> v2:
>>>>   - Remove now-spurious error path special-case when nr_grefs == 1
>>>> ---
>>>>   drivers/block/xen-blkback/common.h |  1 +
>>>>   drivers/block/xen-blkback/xenbus.c | 38 +-
>>>>   2 files changed, 17 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/drivers/block/xen-blkback/common.h 
>>>> b/drivers/block/xen-blkback/common.h
>>>> index b0c71d3a81a0..524a79f10de6 100644
>>>> --- a/drivers/block/xen-blkback/common.h
>>>> +++ b/drivers/block/xen-blkback/common.h
>>>> @@ -313,6 +313,7 @@ struct xen_blkif {
>>>>
>>>>struct work_struct  free_work;
>>>>unsigned intnr_ring_pages;
>>>> +  boolmulti_ref;
>>>
>>> Is it really necessary to introduce 'multi_ref' here or we may just re-use
>>> 'nr_ring_pages'?
>>>
>>> According to blkfront code, 'ring-page-order' is set only when it is not 
>>> zero,
>>> that is, only when (info->nr_ring_pages > 1).
>>
> 
> That's how it is *supposed* to be. Windows certainly behaves that way too.
> 
>> Did you look into all other OS's (Windows, OpenBSD, FreebSD, NetBSD,
>> Solaris, Netware, other proprietary systems) implementations to verify
>> that claim?
>>
>> I don't think so. So better safe than sorry.
>>
> 
> Indeed. It was unfortunate that the commit to blkif.h documenting multi-page 
> (829f2a9c6dfae) was not crystal clear and (possibly as a consequence) blkback 
> was implemented to read ring-ref0 rather than ring-ref if ring-page-order was 
> present and 0. Hence the only safe thing to do is to restore that behaviour.
> 

Thank you very much for the explanation!

Reviewed-by: Dongli Zhang 

Dongli Zhang


Re: [PATCH v2] xen-blkback: fix compatibility bug with single page rings

2021-01-28 Thread Dongli Zhang



On 1/28/21 5:04 AM, Paul Durrant wrote:
> From: Paul Durrant 
> 
> Prior to commit 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid
> inconsistent xenstore 'ring-page-order' set by malicious blkfront"), the
> behaviour of xen-blkback when connecting to a frontend was:
> 
> - read 'ring-page-order'
> - if not present then expect a single page ring specified by 'ring-ref'
> - else expect a ring specified by 'ring-refX' where X is between 0 and
>   1 << ring-page-order
> 
> This was correct behaviour, but was broken by the afforementioned commit to
> become:
> 
> - read 'ring-page-order'
> - if not present then expect a single page ring (i.e. ring-page-order = 0)
> - expect a ring specified by 'ring-refX' where X is between 0 and
>   1 << ring-page-order
> - if that didn't work then see if there's a single page ring specified by
>   'ring-ref'
> 
> This incorrect behaviour works most of the time but fails when a frontend
> that sets 'ring-page-order' is unloaded and replaced by one that does not
> because, instead of reading 'ring-ref', xen-blkback will read the stale
> 'ring-ref0' left around by the previous frontend will try to map the wrong
> grant reference.
> 
> This patch restores the original behaviour.
> 
> Fixes: 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid 
> inconsistent xenstore 'ring-page-order' set by malicious blkfront")
> Signed-off-by: Paul Durrant 
> ---
> Cc: Konrad Rzeszutek Wilk 
> Cc: "Roger Pau Monné" 
> Cc: Jens Axboe 
> Cc: Dongli Zhang 
> 
> v2:
>  - Remove now-spurious error path special-case when nr_grefs == 1
> ---
>  drivers/block/xen-blkback/common.h |  1 +
>  drivers/block/xen-blkback/xenbus.c | 38 +-
>  2 files changed, 17 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/common.h 
> b/drivers/block/xen-blkback/common.h
> index b0c71d3a81a0..524a79f10de6 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -313,6 +313,7 @@ struct xen_blkif {
>  
>   struct work_struct  free_work;
>   unsigned intnr_ring_pages;
> + boolmulti_ref;

Is it really necessary to introduce 'multi_ref' here or we may just re-use
'nr_ring_pages'?

According to blkfront code, 'ring-page-order' is set only when it is not zero,
that is, only when (info->nr_ring_pages > 1).

1819 if (info->nr_ring_pages > 1) {
1820 err = xenbus_printf(xbt, dev->nodename, "ring-page-order",
"%u",
1821 ring_page_order);
1822 if (err) {
1823 message = "writing ring-page-order";
1824 goto abort_transaction;
1825 }
1826 }

Therefore, can we assume 'ring-page-order' can never be 0? Once we have
'ring-page-order' set, it should be >= 1 and we should read from "ring-ref%u"?

If the specification allows 'ring-page-order' to be zero with "ring-ref%u"
available, we should introduce 'multi_ref'.

Thank you very much!

Dongli Zhang


>   /* All rings for this device. */
>   struct xen_blkif_ring   *rings;
>   unsigned intnr_rings;
> diff --git a/drivers/block/xen-blkback/xenbus.c 
> b/drivers/block/xen-blkback/xenbus.c
> index 9860d4842f36..6c5e9373e91c 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -998,14 +998,17 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   for (i = 0; i < nr_grefs; i++) {
>   char ring_ref_name[RINGREF_NAME_LEN];
>  
> - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> + if (blkif->multi_ref)
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", 
> i);
> + else {
> + WARN_ON(i != 0);
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref");
> + }
> +
>   err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
>  "%u", _ref[i]);
>  
>   if (err != 1) {
> - if (nr_grefs == 1)
> - break;
> -
>   err = -EINVAL;
>   xenbus_dev_fatal(dev, err, "reading %s/%s",
>dir, ring_ref_name);
> @@ -1013,18 +1016,6 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   }
>   }
>  
> - i

Re: [PATCH] xen-blkback: fix compatibility bug with single page rings

2021-01-27 Thread Dongli Zhang



On 1/27/21 2:30 AM, Paul Durrant wrote:
> From: Paul Durrant 
> 
> Prior to commit 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid
> inconsistent xenstore 'ring-page-order' set by malicious blkfront"), the
> behaviour of xen-blkback when connecting to a frontend was:
> 
> - read 'ring-page-order'
> - if not present then expect a single page ring specified by 'ring-ref'
> - else expect a ring specified by 'ring-refX' where X is between 0 and
>   1 << ring-page-order
> 
> This was correct behaviour, but was broken by the afforementioned commit to
> become:
> 
> - read 'ring-page-order'
> - if not present then expect a single page ring
> - expect a ring specified by 'ring-refX' where X is between 0 and
>   1 << ring-page-order
> - if that didn't work then see if there's a single page ring specified by
>   'ring-ref'
> 
> This incorrect behaviour works most of the time but fails when a frontend
> that sets 'ring-page-order' is unloaded and replaced by one that does not
> because, instead of reading 'ring-ref', xen-blkback will read the stale
> 'ring-ref0' left around by the previous frontend will try to map the wrong
> grant reference.
> 
> This patch restores the original behaviour.
> 
> Fixes: 4a8c31a1c6f5 ("xen/blkback: rework connect_ring() to avoid 
> inconsistent xenstore 'ring-page-order' set by malicious blkfront")
> Signed-off-by: Paul Durrant 
> ---
> Cc: Konrad Rzeszutek Wilk 
> Cc: "Roger Pau Monné" 
> Cc: Jens Axboe 
> Cc: Dongli Zhang 
> ---
>  drivers/block/xen-blkback/common.h |  1 +
>  drivers/block/xen-blkback/xenbus.c | 36 +-
>  2 files changed, 17 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/common.h 
> b/drivers/block/xen-blkback/common.h
> index b0c71d3a81a0..524a79f10de6 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -313,6 +313,7 @@ struct xen_blkif {
>  
>   struct work_struct  free_work;
>   unsigned intnr_ring_pages;
> + boolmulti_ref;
>   /* All rings for this device. */
>   struct xen_blkif_ring   *rings;
>   unsigned intnr_rings;
> diff --git a/drivers/block/xen-blkback/xenbus.c 
> b/drivers/block/xen-blkback/xenbus.c
> index 9860d4842f36..4c1541cde68c 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -998,10 +998,15 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   for (i = 0; i < nr_grefs; i++) {
>   char ring_ref_name[RINGREF_NAME_LEN];
>  
> - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> + if (blkif->multi_ref)
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", 
> i);
> + else {
> + WARN_ON(i != 0);
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref");
> + }
> +
>   err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
>  "%u", _ref[i]);
> -
>   if (err != 1) {
>       if (nr_grefs == 1)
>   break;

I think we should not simply break here, because the failure can also happen
when (nr_grefs == 1) and we are reading from the legacy "ring-ref".

Should we do something like below?

err = -EINVAL;
xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
return err;

Dongli Zhang


> @@ -1013,18 +1018,6 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   }
>   }
>  
> - if (err != 1) {
> - WARN_ON(nr_grefs != 1);
> -
> - err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u",
> -_ref[0]);
> - if (err != 1) {
> - err = -EINVAL;
> - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
> - return err;
> - }
> - }
> -
>   err = -ENOMEM;
>   for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
>   req = kzalloc(sizeof(*req), GFP_KERNEL);
> @@ -1129,10 +1122,15 @@ static int connect_ring(struct backend_info *be)
>blkif->nr_rings, blkif->blk_protocol, protocol,
>blkif->vbd.feature_gnt_persistent ? "persistent grants" : "");
>  
> - ring_page_order = xenbus_read_unsigned(dev->otherend,
> -&

Re: [PATCH v2 1/1] vhost scsi: alloc vhost_scsi with kvzalloc() to avoid delay

2021-01-23 Thread Dongli Zhang
According to my "git send-email" history, I have CCed jasow...@redhat.com. Not
sure why Jason is not on the list.

CCed Jason. Thank you very much!

Dongli Zhang

On 1/23/21 12:08 AM, Dongli Zhang wrote:
> The size of 'struct vhost_scsi' is order-10 (~2.3MB). It may take long time
> delay by kzalloc() to compact memory pages by retrying multiple times when
> there is a lack of high-order pages. As a result, there is latency to
> create a VM (with vhost-scsi) or to hotadd vhost-scsi-based storage.
> 
> The prior commit 595cb754983d ("vhost/scsi: use vmalloc for order-10
> allocation") prefers to fallback only when really needed, while this patch
> allocates with kvzalloc() with __GFP_NORETRY implicitly set to avoid
> retrying memory pages compact for multiple times.
> 
> The __GFP_NORETRY is implicitly set if the size to allocate is more than
> PAGE_SZIE and when __GFP_RETRY_MAYFAIL is not explicitly set.
> 
> Cc: Aruna Ramakrishna 
> Cc: Joe Jin 
> Signed-off-by: Dongli Zhang 
> ---
> Changed since v1:
>   - To combine kzalloc() and vzalloc() as kvzalloc()
> (suggested by Jason Wang)
> 
>  drivers/vhost/scsi.c | 9 +++--
>  1 file changed, 3 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 4ce9f00ae10e..5de21ad4bd05 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1814,12 +1814,9 @@ static int vhost_scsi_open(struct inode *inode, struct 
> file *f)
>   struct vhost_virtqueue **vqs;
>   int r = -ENOMEM, i;
>  
> - vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | 
> __GFP_RETRY_MAYFAIL);
> - if (!vs) {
> - vs = vzalloc(sizeof(*vs));
> - if (!vs)
> - goto err_vs;
> - }
> + vs = kvzalloc(sizeof(*vs), GFP_KERNEL);
> + if (!vs)
> + goto err_vs;
>  
>   vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL);
>   if (!vqs)
> 


[PATCH v2 1/1] vhost scsi: alloc vhost_scsi with kvzalloc() to avoid delay

2021-01-23 Thread Dongli Zhang
The size of 'struct vhost_scsi' is order-10 (~2.3MB). kzalloc() may incur a
long delay compacting memory pages, retrying multiple times, when there is
a lack of high-order pages. As a result, there is latency to create a VM
(with vhost-scsi) or to hotadd vhost-scsi-based storage.

The prior commit 595cb754983d ("vhost/scsi: use vmalloc for order-10
allocation") prefers to fall back only when really needed, while this patch
allocates with kvzalloc(), which implicitly sets __GFP_NORETRY to avoid
retrying memory page compaction multiple times.

The __GFP_NORETRY flag is implicitly set if the size to allocate is more
than PAGE_SIZE and __GFP_RETRY_MAYFAIL is not explicitly set.
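
For reference, a minimal sketch of where the implicit __GFP_NORETRY comes
from (paraphrasing the kvmalloc_node() logic in mm/util.c of this era, not
a verbatim copy; the helper name kvzalloc_sketch() is made up):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *kvzalloc_sketch(size_t size, gfp_t flags)
{
	gfp_t kmalloc_flags = flags;
	void *ret;

	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;
		/* Skip repeated costly compaction unless the caller opted in. */
		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;
	}

	ret = kzalloc(size, kmalloc_flags);
	if (ret || size <= PAGE_SIZE)
		return ret;

	/* Physically contiguous allocation failed; fall back to vmalloc. */
	return vzalloc(size);
}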

Cc: Aruna Ramakrishna 
Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Changed since v1:
  - To combine kzalloc() and vzalloc() as kvzalloc()
(suggested by Jason Wang)

 drivers/vhost/scsi.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 4ce9f00ae10e..5de21ad4bd05 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1814,12 +1814,9 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
struct vhost_virtqueue **vqs;
int r = -ENOMEM, i;
 
-   vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | 
__GFP_RETRY_MAYFAIL);
-   if (!vs) {
-   vs = vzalloc(sizeof(*vs));
-   if (!vs)
-   goto err_vs;
-   }
+   vs = kvzalloc(sizeof(*vs), GFP_KERNEL);
+   if (!vs)
+   goto err_vs;
 
vqs = kmalloc_array(VHOST_SCSI_MAX_VQ, sizeof(*vqs), GFP_KERNEL);
if (!vqs)
-- 
2.17.1



Re: [PATCH 1/1] vhost scsi: allocate vhost_scsi with GFP_NOWAIT to avoid delay

2021-01-23 Thread Dongli Zhang



On 1/21/21 1:00 AM, Jason Wang wrote:
> 
> On 2021/1/21 13:03, Dongli Zhang wrote:
>> The size of 'struct vhost_scsi' is order-10 (~2.3MB). It may take long time
>> delay by kzalloc() to compact memory pages when there is a lack of
>> high-order pages. As a result, there is latency to create a VM (with
>> vhost-scsi) or to hotadd vhost-scsi-based storage.
>>
>> The prior commit 595cb754983d ("vhost/scsi: use vmalloc for order-10
>> allocation") prefers to fallback only when really needed, while this patch
>> changes allocation to GFP_NOWAIT in order to avoid the delay caused by
>> memory page compact.
>>
>> Cc: Aruna Ramakrishna 
>> Cc: Joe Jin 
>> Signed-off-by: Dongli Zhang 
>> ---
>> Another option is to rework by reducing the size of 'struct vhost_scsi',
>> e.g., by replacing inline vhost_scsi.vqs with just memory pointers while
>> each vhost_scsi.vqs[i] should be allocated separately. Please let me
>> know if that option is better.
>>
>>   drivers/vhost/scsi.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
>> index 4ce9f00ae10e..85eaa4e883f4 100644
>> --- a/drivers/vhost/scsi.c
>> +++ b/drivers/vhost/scsi.c
>> @@ -1814,7 +1814,7 @@ static int vhost_scsi_open(struct inode *inode, struct
>> file *f)
>>   struct vhost_virtqueue **vqs;
>>   int r = -ENOMEM, i;
>>   -    vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN |
>> __GFP_RETRY_MAYFAIL);
>> +    vs = kzalloc(sizeof(*vs), GFP_NOWAIT | __GFP_NOWARN);
>>   if (!vs) {
>>   vs = vzalloc(sizeof(*vs));
>>   if (!vs)
> 
> 
> Can we use kvzalloc?
> 
Thank you very much for the suggestion.

Using 'GFP_NOWAIT' avoids any direct compaction in __alloc_pages_slowpath(),
while using kvzalloc() only avoids retrying direct compaction multiple
times.

Although the latter still does direct compaction (without retrying), I think
it is better than the former with GFP_NOWAIT.

I will send v2 with kvzalloc().

Thank you very much!

Dongli Zhang


[PATCH 1/1] vhost scsi: allocate vhost_scsi with GFP_NOWAIT to avoid delay

2021-01-20 Thread Dongli Zhang
The size of 'struct vhost_scsi' is order-10 (~2.3MB). kzalloc() may incur a
long delay compacting memory pages when there is a lack of high-order
pages. As a result, there is latency to create a VM (with vhost-scsi) or to
hotadd vhost-scsi-based storage.

The prior commit 595cb754983d ("vhost/scsi: use vmalloc for order-10
allocation") prefers to fall back only when really needed, while this patch
changes the allocation to GFP_NOWAIT in order to avoid the delay caused by
memory page compaction.

Cc: Aruna Ramakrishna 
Cc: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Another option is to rework by reducing the size of 'struct vhost_scsi',
e.g., by replacing inline vhost_scsi.vqs with just memory pointers while
each vhost_scsi.vqs[i] should be allocated separately. Please let me
know if that option is better.

 drivers/vhost/scsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 4ce9f00ae10e..85eaa4e883f4 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1814,7 +1814,7 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
struct vhost_virtqueue **vqs;
int r = -ENOMEM, i;
 
-   vs = kzalloc(sizeof(*vs), GFP_KERNEL | __GFP_NOWARN | 
__GFP_RETRY_MAYFAIL);
+   vs = kzalloc(sizeof(*vs), GFP_NOWAIT | __GFP_NOWARN);
if (!vs) {
vs = vzalloc(sizeof(*vs));
if (!vs)
-- 
2.17.1



[PATCH v3 1/1] page_frag: Recover from memory pressure

2020-11-15 Thread Dongli Zhang
The ethernet driver may allocate an skb (and skb->data) via
napi_alloc_skb(). This ends up in page_frag_alloc(), which allocates
skb->data from page_frag_cache->va.

Under memory pressure, page_frag_cache->va may be allocated as a
pfmemalloc page. As a result, skb->pfmemalloc is always true because
skb->data comes from page_frag_cache->va. The skb will be dropped if the
sock (receiver) does not have SOCK_MEMALLOC. This is expected behaviour
under memory pressure.

However, once the kernel is no longer under memory pressure (suppose a
large amount of memory pages have just been reclaimed), page_frag_alloc()
may still re-use the prior pfmemalloc page_frag_cache->va to allocate
skb->data. As a result, skb->pfmemalloc stays true unless
page_frag_cache->va is re-allocated, even though the kernel is no longer
under memory pressure.

Here is how kernel runs into issue.

1. The kernel is under memory pressure and allocation of
PAGE_FRAG_CACHE_MAX_ORDER in __page_frag_cache_refill() will fail. Instead,
the pfmemalloc page is allocated for page_frag_cache->va.

2: All skb->data from page_frag_cache->va (pfmemalloc) will have
skb->pfmemalloc=true. The skb will always be dropped by sock without
SOCK_MEMALLOC. This is an expected behaviour.

3. Suppose a large amount of pages are reclaimed and kernel is not under
memory pressure any longer. We expect skb->pfmemalloc drop will not happen.

4. Unfortunately, page_frag_alloc() does not proactively re-allocate
page_frag_cache->va and will always re-use the prior pfmemalloc page.
skb->pfmemalloc stays true even when the kernel is no longer under memory
pressure.

Fix this by freeing and re-allocating the page instead of recycling it.

References: 
https://lore.kernel.org/lkml/20201103193239.1807-1-dongli.zh...@oracle.com/
References: 
https://lore.kernel.org/linux-mm/20201105042140.5253-1-wi...@infradead.org/
Suggested-by: Matthew Wilcox (Oracle) 
Cc: Aruna Ramakrishna 
Cc: Bert Barbe 
Cc: Rama Nichanamatlu 
Cc: Venkat Venkatsubra 
Cc: Manjunath Patil 
Cc: Joe Jin 
Cc: SRINIVAS 
Cc: sta...@vger.kernel.org
Fixes: 79930f5892e ("net: do not deplete pfmemalloc reserve")
Signed-off-by: Dongli Zhang 
Acked-by: Vlastimil Babka 
---
Changed since v1:
  - change author from Matthew to Dongli
  - Add references to all prior discussions
  - Add more details to commit message
Changed since v2:
  - add unlikely (suggested by Eric Dumazet)

 mm/page_alloc.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..91129ce75ed4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5103,6 +5103,11 @@ void *page_frag_alloc(struct page_frag_cache *nc,
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
 
+   if (unlikely(nc->pfmemalloc)) {
+   free_the_page(page, compound_order(page));
+   goto refill;
+   }
+
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
-- 
2.17.1



Re: [PATCH v2 1/1] page_frag: Recover from memory pressure

2020-11-15 Thread Dongli Zhang



On 11/15/20 4:18 AM, Matthew Wilcox wrote:
> On Sat, Nov 14, 2020 at 10:51:06PM -0800, Dongli Zhang wrote:
>> +if (nc->pfmemalloc) {
> 
> You missed the unlikely() change that Eric recommended.
> 

Thank you very much. I missed that email.

I will send v3.

Dongli Zhang


[PATCH v2 1/1] page_frag: Recover from memory pressure

2020-11-14 Thread Dongli Zhang
The ethernet driver may allocate an skb (and skb->data) via
napi_alloc_skb(). This ends up in page_frag_alloc(), which allocates
skb->data from page_frag_cache->va.

Under memory pressure, page_frag_cache->va may be allocated as a
pfmemalloc page. As a result, skb->pfmemalloc is always true because
skb->data comes from page_frag_cache->va. The skb will be dropped if the
sock (receiver) does not have SOCK_MEMALLOC. This is expected behaviour
under memory pressure.

However, once the kernel is no longer under memory pressure (suppose a
large amount of memory pages have just been reclaimed), page_frag_alloc()
may still re-use the prior pfmemalloc page_frag_cache->va to allocate
skb->data. As a result, skb->pfmemalloc stays true unless
page_frag_cache->va is re-allocated, even though the kernel is no longer
under memory pressure.

Here is how kernel runs into issue.

1. The kernel is under memory pressure and allocation of
PAGE_FRAG_CACHE_MAX_ORDER in __page_frag_cache_refill() will fail. Instead,
the pfmemalloc page is allocated for page_frag_cache->va.

2: All skb->data from page_frag_cache->va (pfmemalloc) will have
skb->pfmemalloc=true. The skb will always be dropped by sock without
SOCK_MEMALLOC. This is an expected behaviour.

3. Suppose a large amount of pages are reclaimed and kernel is not under
memory pressure any longer. We expect skb->pfmemalloc drop will not happen.

4. Unfortunately, page_frag_alloc() does not proactively re-allocate
page_frag_cache->va and will always re-use the prior pfmemalloc page.
skb->pfmemalloc stays true even when the kernel is no longer under memory
pressure.

Fix this by freeing and re-allocating the page instead of recycling it.

References: 
https://lore.kernel.org/lkml/20201103193239.1807-1-dongli.zh...@oracle.com/
References: 
https://lore.kernel.org/linux-mm/20201105042140.5253-1-wi...@infradead.org/
Suggested-by: Matthew Wilcox (Oracle) 
Cc: Aruna Ramakrishna 
Cc: Bert Barbe 
Cc: Rama Nichanamatlu 
Cc: Venkat Venkatsubra 
Cc: Manjunath Patil 
Cc: Joe Jin 
Cc: SRINIVAS 
Cc: sta...@vger.kernel.org
Fixes: 79930f5892e ("net: do not deplete pfmemalloc reserve")
Signed-off-by: Dongli Zhang 
Acked-by: Vlastimil Babka 
---
Changed since v1:
  - change author from Matthew to Dongli
  - Add references to all prior discussions
  - Add more details to commit message

 mm/page_alloc.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..91129ce75ed4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5103,6 +5103,11 @@ void *page_frag_alloc(struct page_frag_cache *nc,
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
 
+   if (nc->pfmemalloc) {
+   free_the_page(page, compound_order(page));
+   goto refill;
+   }
+
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
-- 
2.17.1



Re: [PATCH 1/1] mm: avoid re-using pfmemalloc page in page_frag_alloc()

2020-11-04 Thread Dongli Zhang



On 11/4/20 4:51 AM, Eric Dumazet wrote:
> 
> 
> On 11/4/20 1:36 PM, Matthew Wilcox wrote:
>> On Wed, Nov 04, 2020 at 09:50:30AM +0100, Eric Dumazet wrote:
>>> On 11/4/20 2:16 AM, Rama Nichanamatlu wrote:
>>>>> Thanks for providing the numbers.  Do you think that dropping (up to)
>>>>> 7 packets is acceptable?
>>>>
>>>> net.ipv4.tcp_syn_retries = 6
>>>>
>>>> tcp clients wouldn't even get that far leading to connect establish issues.
>>>
>>> This does not really matter. If host was under memory pressure,
>>> dropping a few packets is really not an issue.
>>>
>>> Please do not add expensive checks in fast path, just to "not drop a packet"
>>> even if the world is collapsing.
>>
>> Right, that was my first patch -- to only recheck if we're about to
>> reuse the page.  Do you think that's acceptable, or is that still too
>> close to the fast path?
> 
> I think it is totally acceptable.
> 
> The same strategy is used in NIC drivers, before recycling a page.
> 
> If page_is_pfmemalloc() returns true, they simply release the 
> 'problematic'page
> and attempt a new allocation.
> 
> ( git grep -n page_is_pfmemalloc -- drivers/net/ethernet/ )

While the drivers may implement their own page_frag_cache to manage
skb->frags ...

... the skb->data is usually allocated via __netdev_alloc_skb() or
napi_alloc_skb(), which end up using the global
this_cpu_ptr(&netdev_alloc_cache) or this_cpu_ptr(&napi_alloc_cache.page).
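
For context, a minimal sketch of that path (simplified from
net/core/skbuff.c of this era, not a verbatim copy; the helper name
napi_frag_alloc_sketch() is made up):

/* Per-CPU page_frag_cache feeding skb->data for napi_alloc_skb(). */
static void *napi_frag_alloc_sketch(unsigned int fragsz, gfp_t gfp_mask)
{
	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);

	/* The same page_frag_alloc() discussed in this thread. */
	return page_frag_alloc(&nc->page, fragsz, gfp_mask);
}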

> 
> 
>>
>>> Also consider that NIC typically have thousands of pre-allocated page/frags
>>> for their RX ring buffers, they might all have pfmemalloc set, so we are 
>>> speaking
>>> of thousands of packet drops before the RX-ring can be refilled with normal 
>>> (non pfmemalloc) page/frags.
>>>
>>> If we want to solve this issue more generically, we would have to try
>>> to copy data into a non pfmemalloc frag instead of dropping skb that
>>> had frags allocated minutes ago under memory pressure.
>>
>> I don't think we need to copy anything.  We need to figure out if the
>> system is still under memory pressure, and if not, we can clear the
>> pfmemalloc bit on the frag, as in my second patch.  The 'least change'
>> way of doing that is to try to allocate a page, but the VM could export
>> a symbol that says "we're not under memory pressure any more".
>>
>> Did you want to move checking that into the networking layer, or do you
>> want to keep it in the pagefrag allocator?
> 
> I think your proposal is fine, thanks !

Hi Matthew, are you going to send out the patch to avoid pfmemalloc recycle?

Thank you very much!

Dongli Zhang


Re: [PATCH 1/1] mm: avoid re-using pfmemalloc page in page_frag_alloc()

2020-11-03 Thread Dongli Zhang



On 11/3/20 1:15 PM, Matthew Wilcox wrote:
> On Tue, Nov 03, 2020 at 12:57:33PM -0800, Dongli Zhang wrote:
>> On 11/3/20 12:35 PM, Matthew Wilcox wrote:
>>> On Tue, Nov 03, 2020 at 11:32:39AM -0800, Dongli Zhang wrote:
>>>> However, once kernel is not under memory pressure any longer (suppose large
>>>> amount of memory pages are just reclaimed), the page_frag_alloc() may still
>>>> re-use the prior pfmemalloc page_frag_cache->va to allocate skb->data. As a
>>>> result, the skb->pfmemalloc is always true unless page_frag_cache->va is
>>>> re-allocated, even the kernel is not under memory pressure any longer.
>>>> +  /*
>>>> +   * Try to avoid re-using pfmemalloc page because kernel may already
>>>> +   * run out of the memory pressure situation at any time.
>>>> +   */
>>>> +  if (unlikely(nc->va && nc->pfmemalloc)) {
>>>> +  page = virt_to_page(nc->va);
>>>> +  __page_frag_cache_drain(page, nc->pagecnt_bias);
>>>> +  nc->va = NULL;
>>>> +  }
>>>
>>> I think this is the wrong way to solve this problem.  Instead, we should
>>> use up this page, but refuse to recycle it.  How about something like this 
>>> (not even compile tested):
>>
>> Thank you very much for the feedback. Yes, the option is to use the same page
>> until it is used up (offset < 0). Instead of recycling it, the kernel free it
>> and allocate new one.
>>
>> This depends on whether we will tolerate the packet drop until this page is 
>> used up.
>>
>> For virtio-net, the payload (skb->data) is of size 128-byte. The padding and
>> alignment will finally make it as 512-byte.
>>
>> Therefore, for virtio-net, we will have at most 4096/512-1=7 packets dropped
>> before the page is used up.
> 
> My thinking is that if the kernel is under memory pressure then freeing
> the page and allocating a new one is likely to put even more strain
> on the memory allocator, so we want to do this "soon", rather than at
> each allocation.
> 
> Thanks for providing the numbers.  Do you think that dropping (up to)
> 7 packets is acceptable?>
> We could also do something like ...
> 
> if (unlikely(nc->pfmemalloc)) {
> page = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
> if (page)
> nc->pfmemalloc = 0;
> put_page(page);
> }
> 
> to test if the memory allocator has free pages at the moment.  Not sure
> whether that's a good idea or not -- hopefully you have a test environment
> set up where you can reproduce this condition on demand and determine
> which of these three approaches is best!
> 


From mm's perspective, we expect to reduce the number of page allocations
(especially under memory pressure).

From networking's perspective, we expect to reduce the number of skb drops.

That's why I CCed netdev folks (including David and Jakub), although the patch
is for mm/page_alloc.c. The page_frag_alloc() is primarily used by networking
and nvme-tcp.


Unfortunately, so far I do not have the env to reproduce. I reproduced with a
patch to fail page allocation and set nc->pfmemalloc on purpose.

From mm's perspective, I think using up the page is a good option. Indeed it
is the system administrator's duty to avoid memory pressure, in order to
avoid the extra packet drops.

Dongli Zhang


Re: [PATCH 1/1] mm: avoid re-using pfmemalloc page in page_frag_alloc()

2020-11-03 Thread Dongli Zhang
Hi Matthew,

On 11/3/20 12:35 PM, Matthew Wilcox wrote:
> On Tue, Nov 03, 2020 at 11:32:39AM -0800, Dongli Zhang wrote:
>> The ethernet driver may allocates skb (and skb->data) via napi_alloc_skb().
>> This ends up to page_frag_alloc() to allocate skb->data from
>> page_frag_cache->va.
>>
>> During the memory pressure, page_frag_cache->va may be allocated as
>> pfmemalloc page. As a result, the skb->pfmemalloc is always true as
>> skb->data is from page_frag_cache->va. The skb will be dropped if the
>> sock (receiver) does not have SOCK_MEMALLOC. This is expected behaviour
>> under memory pressure.
>>
>> However, once kernel is not under memory pressure any longer (suppose large
>> amount of memory pages are just reclaimed), the page_frag_alloc() may still
>> re-use the prior pfmemalloc page_frag_cache->va to allocate skb->data. As a
>> result, the skb->pfmemalloc is always true unless page_frag_cache->va is
>> re-allocated, even the kernel is not under memory pressure any longer.
>>
>> Here is how kernel runs into issue.
>>
>> 1. The kernel is under memory pressure and allocation of
>> PAGE_FRAG_CACHE_MAX_ORDER in __page_frag_cache_refill() will fail. Instead,
>> the pfmemalloc page is allocated for page_frag_cache->va.
>>
>> 2: All skb->data from page_frag_cache->va (pfmemalloc) will have
>> skb->pfmemalloc=true. The skb will always be dropped by sock without
>> SOCK_MEMALLOC. This is an expected behaviour.
>>
>> 3. Suppose a large amount of pages are reclaimed and kernel is not under
>> memory pressure any longer. We expect skb->pfmemalloc drop will not happen.
>>
>> 4. Unfortunately, page_frag_alloc() does not proactively re-allocate
>> page_frag_alloc->va and will always re-use the prior pfmemalloc page. The
>> skb->pfmemalloc is always true even kernel is not under memory pressure any
>> longer.
>>
>> Therefore, this patch always checks and tries to avoid re-using the
>> pfmemalloc page for page_frag_alloc->va.
>>
>> Cc: Aruna Ramakrishna 
>> Cc: Bert Barbe 
>> Cc: Rama Nichanamatlu 
>> Cc: Venkat Venkatsubra 
>> Cc: Manjunath Patil 
>> Cc: Joe Jin 
>> Cc: SRINIVAS 
>> Signed-off-by: Dongli Zhang 
>> ---
>>  mm/page_alloc.c | 10 ++
>>  1 file changed, 10 insertions(+)
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 23f5066bd4a5..291df2f9f8f3 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -5075,6 +5075,16 @@ void *page_frag_alloc(struct page_frag_cache *nc,
>>  struct page *page;
>>  int offset;
>>  
>> +/*
>> + * Try to avoid re-using pfmemalloc page because kernel may already
>> + * run out of the memory pressure situation at any time.
>> + */
>> +if (unlikely(nc->va && nc->pfmemalloc)) {
>> +page = virt_to_page(nc->va);
>> +__page_frag_cache_drain(page, nc->pagecnt_bias);
>> +nc->va = NULL;
>> +}
> 
> I think this is the wrong way to solve this problem.  Instead, we should
> use up this page, but refuse to recycle it.  How about something like this 
> (not even compile tested):

Thank you very much for the feedback. Yes, the option is to use the same page
until it is used up (offset < 0). Instead of recycling it, the kernel then
frees it and allocates a new one.

This depends on whether we can tolerate the packet drops until this page is
used up.

For virtio-net, the payload (skb->data) is 128 bytes. Padding and alignment
finally make it 512 bytes.

Therefore, for virtio-net, we will have at most 4096/512-1=7 packets dropped
before the page is used up.
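
To double-check that arithmetic, here is a minimal userspace sketch; the
4096-byte page size and the 512-byte per-packet footprint are the assumptions
stated above.

#include <stdio.h>

int main(void)
{
	int page_size = 4096, frag = 512;	/* assumed values from above */
	int offset = page_size - frag;		/* first frag: the skb that hit pfmemalloc */
	int drops = 0;

	while (offset - frag >= 0) {		/* remaining frags served from the same page */
		offset -= frag;
		drops++;
	}
	printf("at most %d further drops\n", drops);	/* prints 7 */
	return 0;
}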

Dongli Zhang

> 
> +++ b/mm/page_alloc.c
> @@ -5139,6 +5139,10 @@ void *page_frag_alloc(struct page_frag_cache *nc,
>  
> if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
> goto refill;
> +   if (nc->pfmemalloc) {
> +   free_the_page(page);
> +   goto refill;
> +   }
>  
>  #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> /* if size can vary use size else just use PAGE_SIZE */
> 


[PATCH 1/1] mm: avoid re-using pfmemalloc page in page_frag_alloc()

2020-11-03 Thread Dongli Zhang
The ethernet driver may allocate the skb (and skb->data) via napi_alloc_skb().
This ends up calling page_frag_alloc() to allocate skb->data from
page_frag_cache->va.

Under memory pressure, page_frag_cache->va may be allocated as a pfmemalloc
page. As a result, skb->pfmemalloc is always true as skb->data is from
page_frag_cache->va. The skb will be dropped if the sock (receiver) does not
have SOCK_MEMALLOC. This is expected behaviour under memory pressure.

However, once the kernel is no longer under memory pressure (suppose a large
amount of memory pages have just been reclaimed), page_frag_alloc() may still
re-use the prior pfmemalloc page_frag_cache->va to allocate skb->data. As a
result, skb->pfmemalloc stays true unless page_frag_cache->va is re-allocated,
even though the kernel is no longer under memory pressure.

Here is how the kernel runs into the issue.

1. The kernel is under memory pressure and the allocation of
PAGE_FRAG_CACHE_MAX_ORDER in __page_frag_cache_refill() fails. Instead,
a pfmemalloc page is allocated for page_frag_cache->va.

2. All skb->data from page_frag_cache->va (pfmemalloc) will have
skb->pfmemalloc=true. The skb will always be dropped by a sock without
SOCK_MEMALLOC. This is expected behaviour.

3. Suppose a large amount of pages are reclaimed and the kernel is no longer
under memory pressure. We expect the skb->pfmemalloc drops to stop.

4. Unfortunately, page_frag_alloc() does not proactively re-allocate
page_frag_cache->va and will always re-use the prior pfmemalloc page.
skb->pfmemalloc stays true even though the kernel is no longer under memory
pressure.

Therefore, this patch always checks and tries to avoid re-using the
pfmemalloc page for page_frag_cache->va.

Cc: Aruna Ramakrishna 
Cc: Bert Barbe 
Cc: Rama Nichanamatlu 
Cc: Venkat Venkatsubra 
Cc: Manjunath Patil 
Cc: Joe Jin 
Cc: SRINIVAS 
Signed-off-by: Dongli Zhang 
---
 mm/page_alloc.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 23f5066bd4a5..291df2f9f8f3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5075,6 +5075,16 @@ void *page_frag_alloc(struct page_frag_cache *nc,
struct page *page;
int offset;
 
+   /*
+* Try to avoid re-using pfmemalloc page because kernel may already
+* run out of the memory pressure situation at any time.
+*/
+   if (unlikely(nc->va && nc->pfmemalloc)) {
+   page = virt_to_page(nc->va);
+   __page_frag_cache_drain(page, nc->pagecnt_bias);
+   nc->va = NULL;
+   }
+
if (unlikely(!nc->va)) {
 refill:
page = __page_frag_cache_refill(nc, gfp_mask);
-- 
2.17.1



[PATCH 1/1] blk-mq: fix comment as bt_tags_for_each() is not only for started request

2020-10-20 Thread Dongli Zhang
Since the introduction of 'BT_TAG_ITER_STARTED' by commit 602380d28e28
("blk-mq: add blk_mq_all_tag_iter"), bt_tags_for_each() is no longer used
only for started requests, e.g., in the below case:

blk_mq_hctx_has_requests()
-> blk_mq_all_tag_iter()
   -> __blk_mq_all_tag_iter() --> only BT_TAG_ITER_STATIC_RQS is set
  -> bt_tags_for_each()
 -> bt_tags_iter()

Signed-off-by: Dongli Zhang 
---
 block/blk-mq-tag.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 9c92053e704d..43af5063fe2e 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -289,10 +289,10 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned 
int bitnr, void *data)
  * @tags:  Tag map to iterate over.
  * @bt:sbitmap to examine. This is either the breserved_tags 
member
  * or the bitmap_tags member of struct blk_mq_tags.
- * @fn:Pointer to the function that will be called for each 
started
- * request. @fn will be called as follows: @fn(rq, @data,
- * @reserved) where rq is a pointer to a request. Return true
- * to continue iterating tags, false to stop.
+ * @fn:Pointer to the function that will be called for each 
request
+ * depending on the value of @flags. @fn will be called as
+ * follows: @fn(rq, @data, @reserved) where rq is a pointer to a
+ * request. Return true to continue iterating tags, false to stop.
  * @data:  Will be passed as second argument to @fn.
  * @flags: BT_TAG_ITER_*
  */
-- 
2.17.1



Re: [PATCH 1/1] nvme-fcloop: verify wwnn and wwpn format

2020-06-10 Thread Dongli Zhang
Hi Christoph,

Would you mind applying this one with the Reviewed-by from James and Sagi?

https://lore.kernel.org/linux-nvme/60df6752-3512-f7a9-b0df-1096b93b8...@broadcom.com/

https://lore.kernel.org/linux-nvme/c4ec2d9e-b08c-19b2-16a5-93520ca13...@grimberg.me/

Thank you very much!

Dongli Zhang

On 6/4/20 8:20 AM, James Smart wrote:
> On 5/25/2020 9:21 PM, Dongli Zhang wrote:
>> The nvme host and target verify the wwnn and wwpn format via
>> nvme_fc_parse_traddr(). For instance, it is required that the length of
>> wwnn to be either 21 ("nn-0x") or 19 (nn-).
>>
>> Add this verification to nvme-fcloop so that the input should always be in
>> hex and the length of input should always be 18.
>>
>> Otherwise, the user may use e.g. 0x2 to create fcloop local port, while
>> 0x0002 is required for nvme host and target. This makes the
>> requirement of format confusing.
>>
>> Signed-off-by: Dongli Zhang 
>> ---
>>   drivers/nvme/target/fcloop.c | 29 +++--
>>   1 file changed, 23 insertions(+), 6 deletions(-)
>>
>>
> 
> Reviewed-by: James Smart 
> 
> Looks good. Sorry for the delay.
> 
> -- james
> 
> 


[PATCH 0/2] nvme: simple cleanup

2020-06-08 Thread Dongli Zhang
Just cleanup without functional change.

Thank you very much!

Dongli Zhang




[PATCH 2/2] nvme-pci: remove empty line following nvme_should_reset()

2020-06-08 Thread Dongli Zhang
Just cleanup by removing the empty line.

Signed-off-by: Dongli Zhang 
---
 drivers/nvme/host/pci.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d690d5593a80..4471e9910906 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1153,7 +1153,6 @@ static void abort_endio(struct request *req, blk_status_t 
error)
 
 static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
 {
-
/* If true, indicates loss of adapter communication, possibly by a
 * NVMe Subsystem reset.
 */
-- 
2.17.1



[PATCH 1/2] nvmet-loop: remove unused 'target_ctrl' in nvme_loop_ctrl

2020-06-08 Thread Dongli Zhang
This field is never used since the introduction of nvme loopback by
commit 3a85a5de29ea ("nvme-loop: add a NVMe loopback host driver").

Signed-off-by: Dongli Zhang 
---
 drivers/nvme/target/loop.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0d54e730cbf2..e8fa70c15143 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -36,7 +36,6 @@ struct nvme_loop_ctrl {
struct nvme_loop_iodasync_event_iod;
struct nvme_ctrlctrl;
 
-   struct nvmet_ctrl   *target_ctrl;
struct nvmet_port   *port;
 };
 
-- 
2.17.1



Re: [PATCH v2] docs: block: Create blk-mq documentation

2020-06-05 Thread Dongli Zhang
ture struct :c:type:`bio`. The block
> +layer will then build a new structure from it, the struct :c:type:`request`
> +that will be used to communicate with the device driver. Each queue has its
> +own lock and the number of queues is defined by a per-CPU or per-node basis.
> +
> +The staging queue can be used to merge requests for adjacent sectors. For
> +instance, requests for sector 3-6, 6-7, 7-9 can become one request for 3-9.
> +Even if random access to SSDs and NVMs have the same time of response 
> compared
> +to sequential access, grouped requests for sequential access decreases the
> +number of individual requests. This technique of merging requests is called
> +plugging.
> +
> +Along with that, the requests can be reordered to ensure fairness of system
> +resources (e.g. to ensure that no application suffers from starvation) 
> and/or to
> +improve IO performance, by an IO scheduler.
> +
> +IO Schedulers
> +^
> +
> +There are several schedulers implemented by the block layer, each one 
> following
> +a heuristic to improve the IO performance. They are "pluggable" (as in plug
> +and play), in the sense of they can be selected at run time using sysfs. You
> +can read more about Linux's IO schedulers `here
> +<https://www.kernel.org/doc/html/latest/block/index.html>`_. The scheduling
> +happens only between requests in the same queue, so it is not possible to 
> merge
> +requests from different queues, otherwise there would be cache trashing and a
> +need to have a lock for each queue. After the scheduling, the requests are
> +eligible to be sent to the hardware. One of the possible schedulers to be
> +selected is the NOOP scheduler, the most straightforward one, that 
> implements a
> +simple FIFO, without performing any reordering. This is useful in the 
> following
> +scenarios: when scheduling will be performed in a next step somewhere in the
> +stack, like block device controllers; the actual sector position of blocks 
> are
> +transparent for the host, meaning it hasn't enough information to take a 
> proper
> +decision; or the overhead of reordering is higher than the handicap of
> +non-sequential accesses.
> +
> +Hardware dispatch queues
> +
> +
> +The hardware queues (represented by struct :c:type:`blk_mq_hw_ctx`) have a 
> 1:1
> +correspondence to the device driver's submission queues, and are the last 
> step

I am not clear about the definition of "submission queues". Is it the device
queue with the DMA ring buffer?

If it is the DMA ring buffer, multiple blk_mq_hw_ctx would map to the same DMA
ring buffer, e.g., multiple nvme namespaces would share the same tagset. This is
no longer 1:1.

> +of the block layer submission code before the low level device driver taking
> +ownership of the request. To run this queue, the block layer removes requests
> +from the associated software queues and tries to dispatch to the hardware.
> +
> +If it's not possible to send the requests directly to hardware, they will be
> +added to a linked list (:c:type:`hctx->dispatch`) of requests. Then,
> +next time the block layer runs a queue, it will send the requests laying at 
> the
> +:c:type:`dispatch` list first, to ensure a fairness dispatch with those
> +requests that were ready to be sent first. The number of hardware queues
> +depends on the number of hardware contexts supported by the hardware and its
> +device driver, but it will not be more than the number of cores of the 
> system.
> +There is no reordering at this stage, and each software queue has a set of
> +hardware queues to send requests for.
> +
> +.. note::
> +
> +Neither the block layer nor the device protocols guarantee
> +the order of completion of requests. This must be handled by
> +higher layers, like the filesystem.
> +
> +Tag-based completion
> +
> +
> +In order to indicate which request has been completed, every request is
> +identified by an integer, ranging from 0 to the dispatch queue size. This tag
> +is generated by the block layer and later reused by the device driver, 
> removing
> +the need to create a redundant identifier. When a request is completed in the
> +drive, the tag is sent back to the block layer to notify it of the 
> finalization.
> +This removes the need to do a linear search to find out which IO has been
> +completed.

Assume I am a beginner who does not know blk-mq well. What I expect is to
expand this section to explain the usage of sbitmap to manage tags, e.g., like
the comments in block/blk-mq-tag.c or block/blk-mq-tag.h.

In addition, I would be interested in how percpu-refcount is used to track the
lifecycle of requests.
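
To illustrate what such a section could start from, below is a minimal
userspace sketch of bitmap-based tag allocation. It is only a toy stand-in for
sbitmap, not the kernel implementation, but it shows the core idea of
tag-based completion: the tag is the request's identity.

#include <stdio.h>
#include <stdint.h>

#define NR_TAGS 8

static uint8_t tag_map;			/* one bit per tag */

static int get_tag(void)
{
	int t;

	for (t = 0; t < NR_TAGS; t++) {
		if (!(tag_map & (1u << t))) {
			tag_map |= 1u << t;	/* tag t now identifies one in-flight request */
			return t;
		}
	}
	return -1;				/* no free tag: caller must wait */
}

static void put_tag(int t)
{
	tag_map &= ~(1u << t);			/* completion returns the tag */
}

int main(void)
{
	int a = get_tag(), b = get_tag();

	printf("submitted requests with tags %d and %d\n", a, b);
	put_tag(a);				/* device reports completion of tag a */
	printf("tag %d can be reused: next get_tag() = %d\n", a, get_tag());
	return 0;
}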

I have no idea how much detail is required for a kernel doc. This is just
feedback from me, assuming the audience is a beginner :)

Thank you very much!

Dongli Zhang


Re: [PATCH 1/1] nvme-fcloop: verify wwnn and wwpn format

2020-06-04 Thread Dongli Zhang
May I get feedback for this?

For the first time I use fcloop, I set:

# echo "wwnn=0x3,wwpn=0x1" > /sys/class/fcloop/ctl/add_target_port

However, I would not be able to move forward if I used "0x3" or "0x1" for the
nvme-fc target or host later. Instead, the address and port should be
0x0003 and 0x0001.

This patch would sync the input format requirements of nvme-fc and
nvme-fcloop, unless this would break an existing test suite (e.g., blktest).
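
For illustration, the intended check can be tried in userspace with the sketch
below. It mirrors only the shape of fcloop_verify_addr() ("0x" prefix plus 16
hex digits, 18 characters total); the actual hex parsing is still left to
match_u64() in the kernel, and the sample inputs are made up.

#include <stdio.h>
#include <string.h>

/* Userspace mirror of the intended check: "0x" + 16 hex digits = 18 chars. */
static int verify_addr(const char *s)
{
	if (strlen(s) != 18 || strncmp(s, "0x", 2))
		return -1;	/* wrong length or missing "0x" prefix */
	return 0;
}

int main(void)
{
	const char *inputs[] = { "0x2", "0x0000000000000002" };	/* made-up samples */
	int i;

	for (i = 0; i < 2; i++)
		printf("%-20s -> %s\n", inputs[i],
		       verify_addr(inputs[i]) ? "rejected" : "accepted");
	return 0;
}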

Thank you very much!

Dongli Zhang

On 5/25/20 9:21 PM, Dongli Zhang wrote:
> The nvme host and target verify the wwnn and wwpn format via
> nvme_fc_parse_traddr(). For instance, it is required that the length of
> wwnn to be either 21 ("nn-0x") or 19 (nn-).
> 
> Add this verification to nvme-fcloop so that the input should always be in
> hex and the length of input should always be 18.
> 
> Otherwise, the user may use e.g. 0x2 to create fcloop local port, while
> 0x0002 is required for nvme host and target. This makes the
> requirement of format confusing.
> 
> Signed-off-by: Dongli Zhang 
> ---
>  drivers/nvme/target/fcloop.c | 29 +++--
>  1 file changed, 23 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
> index f69ce66e2d44..14124e6d4bf2 100644
> --- a/drivers/nvme/target/fcloop.c
> +++ b/drivers/nvme/target/fcloop.c
> @@ -43,6 +43,17 @@ static const match_table_t opt_tokens = {
>   { NVMF_OPT_ERR, NULL}
>  };
>  
> +static int fcloop_verify_addr(substring_t *s)
> +{
> + size_t blen = s->to - s->from + 1;
> +
> + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
> + strncmp(s->from, "0x", 2))
> + return -EINVAL;
> +
> + return 0;
> +}
> +
>  static int
>  fcloop_parse_options(struct fcloop_ctrl_options *opts,
>   const char *buf)
> @@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
>   opts->mask |= token;
>   switch (token) {
>   case NVMF_OPT_WWNN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
>   opts->wwnn = token64;
>   break;
>   case NVMF_OPT_WWPN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
> @@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
>   opts->fcaddr = token;
>   break;
>   case NVMF_OPT_LPWWNN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
>   opts->lpwwnn = token64;
>   break;
>   case NVMF_OPT_LPWWPN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
> @@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, 
> u64 *pname,
>   token = match_token(p, opt_tokens, args);
>   switch (token) {
>   case NVMF_OPT_WWNN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
>   *nname = token64;
>   break;
>   case NVMF_OPT_WWPN:
> - if (match_u64(args, )) {
> + if (fcloop_verify_addr(args) ||
> + match_u64(args, )) {
>   ret = -EINVAL;
>   goto out_free_options;
>   }
> 


Re: [PATCH 1/1] blk-mq: get ctx in order to handle BLK_MQ_S_INACTIVE in blk_mq_get_tag()

2020-06-03 Thread Dongli Zhang
Hi John,

On 6/3/20 4:59 AM, John Garry wrote:
> On 02/06/2020 07:17, Dongli Zhang wrote:
>> When scheduler is set, we hit below page fault when we offline cpu.
>>
>> [ 1061.007725] BUG: kernel NULL pointer dereference, address: 
>> 0040
>> [ 1061.008710] #PF: supervisor read access in kernel mode
>> [ 1061.009492] #PF: error_code(0x) - not-present page
>> [ 1061.010241] PGD 0 P4D 0
>> [ 1061.010614] Oops:  [#1] SMP PTI
>> [ 1061.011130] CPU: 0 PID: 122 Comm: kworker/0:1H Not tainted 5.7.0-rc7+ #2'
>> ... ...
>> [ 1061.013760] Workqueue: kblockd blk_mq_run_work_fn
>> [ 1061.014446] RIP: 0010:blk_mq_put_tag+0xf/0x30
>> ... ...
>> [ 1061.017726] RSP: 0018:a5c18037fc70 EFLAGS: 00010287
>> [ 1061.018475] RAX:  RBX: a5c18037fcf0 RCX: 
>> 0004
>> [ 1061.019507] RDX:  RSI:  RDI: 
>> 911535dc1180
>> ... ...
>> [ 1061.028454] Call Trace:
>> [ 1061.029307]  blk_mq_get_tag+0x26e/0x280
>> [ 1061.029866]  ? wait_woken+0x80/0x80
>> [ 1061.030378]  blk_mq_get_driver_tag+0x99/0x110
>> [ 1061.031009]  blk_mq_dispatch_rq_list+0x107/0x5e0
>> [ 1061.031672]  ? elv_rb_del+0x1a/0x30
>> [ 1061.032178]  blk_mq_do_dispatch_sched+0xe2/0x130
>> [ 1061.032844]  __blk_mq_sched_dispatch_requests+0xcc/0x150
>> [ 1061.033638]  blk_mq_sched_dispatch_requests+0x2b/0x50
>> [ 1061.034239]  __blk_mq_run_hw_queue+0x75/0x110
>> [ 1061.034867]  process_one_work+0x15c/0x370
>> [ 1061.035450]  worker_thread+0x44/0x3d0
>> [ 1061.035980]  kthread+0xf3/0x130
>> [ 1061.036440]  ? max_active_store+0x80/0x80
>> [ 1061.037018]  ? kthread_bind+0x10/0x10
>> [ 1061.037554]  ret_from_fork+0x35/0x40
>> [ 1061.038073] Modules linked in:
>> [ 1061.038543] CR2: 0040
>> [ 1061.038962] ---[ end trace d20e1df7d028e69f ]---
>>
>> This is because blk_mq_get_driver_tag() would be used to allocate tag once
>> scheduler (e.g., mq-deadline) is set. 
> 
> I tried mq-deadline and I did not see this. Anyway else special or specific
> about your test?
> 

I think you just hit the issue mentioned in another thread.

To reproduce the issue, we need to hit the condition that:

1. blk_mq_hctx_notify_offline() sets BLK_MQ_S_INACTIVE with the barrier ...

... while ...

2. blk_mq_get_tag() gets the tag but BLK_MQ_S_INACTIVE is already set.
Therefore, it would put the tag back to release it.
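
That release path is where the NULL dereference comes from: blk_mq_put_tag()
needs data->ctx, which blk_mq_get_driver_tag() did not fill in before the fix.
Below is a minimal userspace sketch of that shape; the structures and values
are simplified stand-ins, not the real blk-mq types.

#include <stdio.h>

/* Simplified stand-ins for the blk-mq structures involved. */
struct ctx { int cpu; };
struct alloc_data { struct ctx *ctx; };

/* Mirrors the shape of blk_mq_put_tag(): it dereferences data->ctx. */
static void put_tag(struct alloc_data *data, int tag)
{
	printf("release tag %d on cpu %d\n", tag, data->ctx->cpu);
}

/* Mirrors the BLK_MQ_S_INACTIVE branch in blk_mq_get_tag(). */
static int get_tag(struct alloc_data *data, int hctx_inactive)
{
	int tag = 42;			/* pretend the sbitmap handed out a tag */

	if (hctx_inactive) {		/* raced with blk_mq_hctx_notify_offline() */
		put_tag(data, tag);	/* needs a valid data->ctx here */
		return -1;
	}
	return tag;
}

int main(void)
{
	struct ctx this_ctx = { .cpu = 0 };
	struct alloc_data good = { .ctx = &this_ctx };	/* with .ctx = rq->mq_ctx (the fix) */
	struct alloc_data bad  = { .ctx = NULL };	/* blk_mq_get_driver_tag() before the fix */

	get_tag(&good, 1);	/* fine: the tag is released on the right ctx */
	get_tag(&bad, 1);	/* NULL dereference, like the reported oops */
	return 0;
}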

Dongli Zhang


[PATCH 1/1] blk-mq: get ctx in order to handle BLK_MQ_S_INACTIVE in blk_mq_get_tag()

2020-06-02 Thread Dongli Zhang
When a scheduler is set, we hit the below page fault when we offline a cpu.

[ 1061.007725] BUG: kernel NULL pointer dereference, address: 0040
[ 1061.008710] #PF: supervisor read access in kernel mode
[ 1061.009492] #PF: error_code(0x) - not-present page
[ 1061.010241] PGD 0 P4D 0
[ 1061.010614] Oops:  [#1] SMP PTI
[ 1061.011130] CPU: 0 PID: 122 Comm: kworker/0:1H Not tainted 5.7.0-rc7+ #2'
... ...
[ 1061.013760] Workqueue: kblockd blk_mq_run_work_fn
[ 1061.014446] RIP: 0010:blk_mq_put_tag+0xf/0x30
... ...
[ 1061.017726] RSP: 0018:a5c18037fc70 EFLAGS: 00010287
[ 1061.018475] RAX:  RBX: a5c18037fcf0 RCX: 0004
[ 1061.019507] RDX:  RSI:  RDI: 911535dc1180
... ...
[ 1061.028454] Call Trace:
[ 1061.029307]  blk_mq_get_tag+0x26e/0x280
[ 1061.029866]  ? wait_woken+0x80/0x80
[ 1061.030378]  blk_mq_get_driver_tag+0x99/0x110
[ 1061.031009]  blk_mq_dispatch_rq_list+0x107/0x5e0
[ 1061.031672]  ? elv_rb_del+0x1a/0x30
[ 1061.032178]  blk_mq_do_dispatch_sched+0xe2/0x130
[ 1061.032844]  __blk_mq_sched_dispatch_requests+0xcc/0x150
[ 1061.033638]  blk_mq_sched_dispatch_requests+0x2b/0x50
[ 1061.034239]  __blk_mq_run_hw_queue+0x75/0x110
[ 1061.034867]  process_one_work+0x15c/0x370
[ 1061.035450]  worker_thread+0x44/0x3d0
[ 1061.035980]  kthread+0xf3/0x130
[ 1061.036440]  ? max_active_store+0x80/0x80
[ 1061.037018]  ? kthread_bind+0x10/0x10
[ 1061.037554]  ret_from_fork+0x35/0x40
[ 1061.038073] Modules linked in:
[ 1061.038543] CR2: 0040
[ 1061.038962] ---[ end trace d20e1df7d028e69f ]---

This is because blk_mq_get_driver_tag() would be used to allocate the tag once
a scheduler (e.g., mq-deadline) is set. However, in order to handle
BLK_MQ_S_INACTIVE in blk_mq_get_tag(), we need to set data->ctx for
blk_mq_put_tag().

Fixes: bf0beec0607db3c6 ("blk-mq: drain I/O when all CPUs in a hctx are 
offline")
Signed-off-by: Dongli Zhang 
---
This is based on for-next because the pull request for v5.8 has not been
picked up by mainline yet.

 block/blk-mq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9a36ac1c1fa1..8bf6c06a86c1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1056,6 +1056,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
 {
struct blk_mq_alloc_data data = {
.q = rq->q,
+   .ctx = rq->mq_ctx,
.hctx = rq->mq_hctx,
.flags = BLK_MQ_REQ_NOWAIT,
.cmd_flags = rq->cmd_flags,
-- 
2.17.1



[PATCH 2/2] nbd: remove unused 'NBD_MAGIC'

2020-05-31 Thread Dongli Zhang
Remove 'NBD_MAGIC' as it is not used since commit 5ea8d10802ec ("nbd:
separate out the config information").

Signed-off-by: Dongli Zhang 
---
 drivers/block/nbd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 74c1363702f5..83435ce141a8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -141,8 +141,6 @@ static struct dentry *nbd_dbg_dir;
 
 #define nbd_name(nbd) ((nbd)->disk->disk_name)
 
-#define NBD_MAGIC 0x68797548
-
 #define NBD_DEF_BLKSIZE 1024
 
 static unsigned int nbds_max = 16;
-- 
2.17.1



[PATCH 0/2] nbd: two cleanup

2020-05-31 Thread Dongli Zhang
This is just cleanup without functional change.

Thank you very much!

Dongli Zhang




[PATCH 1/2] nbd: append module param and description following corresponding variables

2020-05-31 Thread Dongli Zhang
A lot of drivers append the module parameter and its description following
the corresponding variables (e.g., 'g_submit_queues' in null or
'admin_timeout' in nvme).

This patch would do the same for 'nbds_max' and 'max_part' in nbd driver.
This makes it much more friendly to cscope when reading the code.

Signed-off-by: Dongli Zhang 
---
 drivers/block/nbd.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 43cff01a5a67..74c1363702f5 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -146,7 +146,13 @@ static struct dentry *nbd_dbg_dir;
 #define NBD_DEF_BLKSIZE 1024
 
 static unsigned int nbds_max = 16;
+module_param(nbds_max, int, 0444);
+MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize 
(default: 16)");
+
 static int max_part = 16;
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
+
 static int part_shift;
 
 static int nbd_dev_dbg_init(struct nbd_device *nbd);
@@ -2444,8 +2450,3 @@ module_exit(nbd_cleanup);
 
 MODULE_DESCRIPTION("Network Block Device");
 MODULE_LICENSE("GPL");
-
-module_param(nbds_max, int, 0444);
-MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize 
(default: 16)");
-module_param(max_part, int, 0444);
-MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
-- 
2.17.1



[PATCH for-next 1/1] null_blk: force complete for timeout request

2020-05-29 Thread Dongli Zhang
Commit 7b11eab041da ("blk-mq: blk-mq: provide forced completion
method") exports a new API to force a request to complete without error
injection.

There should be no error injection when completing a request from the timeout
handler.

Otherwise, the below would hang because the timeout handler fails.

echo 100 > /sys/kernel/debug/fail_io_timeout/probability
echo 1000 > /sys/kernel/debug/fail_io_timeout/times
echo 1 > /sys/block/nullb0/io-timeout-fail
dd if=/dev/zero of=/dev/nullb0 bs=512 count=1 oflag=direct

With this patch, the timeout handler is able to complete the IO.

Signed-off-by: Dongli Zhang 
---
 drivers/block/null_blk_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 6126f771ae99..87b31f9ca362 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1423,7 +1423,7 @@ static bool should_requeue_request(struct request *rq)
 static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
 {
pr_info("rq %p timed out\n", rq);
-   blk_mq_complete_request(rq);
+   blk_mq_force_complete_rq(rq);
return BLK_EH_DONE;
 }
 
-- 
2.17.1



[PATCH v2 1/1] nvme-pci: avoid race between nvme_reap_pending_cqes() and nvme_poll()

2020-05-27 Thread Dongli Zhang
There may be a race between nvme_reap_pending_cqes() and nvme_poll(), e.g.,
when doing live reset while polling the nvme device.

 CPU X                              CPU Y
                                    nvme_poll()
 nvme_dev_disable()
 -> nvme_stop_queues()
 -> nvme_suspend_io_queues()
 -> nvme_suspend_queue()
                                    -> spin_lock(&nvmeq->cq_poll_lock);
 -> nvme_reap_pending_cqes()
    -> nvme_process_cq()            -> nvme_process_cq()

In the above scenario, the nvme_process_cq() for the same queue may be
running on both CPU X and CPU Y concurrently.

It is much easier to reproduce the issue when CONFIG_PREEMPT is enabled in
the kernel. When CONFIG_PREEMPT is disabled, it takes a longer time for
nvme_stop_queues()-->blk_mq_quiesce_queue() to wait for the grace period.

This patch protects nvme_process_cq() with nvmeq->cq_poll_lock in
nvme_reap_pending_cqes().
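
For illustration only, the locking pattern of the fix can be modeled in
userspace with the pthread sketch below; the structures and names are
simplified stand-ins, not the nvme driver code. Both paths serialize on the
same per-queue lock, so the processing routine never runs concurrently for one
queue.

#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_spinlock_t cq_poll_lock;
	int cq_head;				/* stand-in for per-queue CQ state */
};

static void process_cq(struct queue *q)
{
	q->cq_head++;				/* must never run concurrently for one queue */
}

static void *poll_path(void *arg)		/* models nvme_poll() */
{
	struct queue *q = arg;

	pthread_spin_lock(&q->cq_poll_lock);
	process_cq(q);
	pthread_spin_unlock(&q->cq_poll_lock);
	return NULL;
}

static void reap_path(struct queue *q)		/* models the reap path after the fix */
{
	pthread_spin_lock(&q->cq_poll_lock);
	process_cq(q);
	pthread_spin_unlock(&q->cq_poll_lock);
}

int main(void)
{
	struct queue q = { .cq_head = 0 };
	pthread_t t;

	pthread_spin_init(&q.cq_poll_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&t, NULL, poll_path, &q);
	reap_path(&q);
	pthread_join(t, NULL);
	printf("cq_head = %d\n", q.cq_head);	/* always 2, no lost update */
	return 0;
}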

Fixes: fa46c6fb5d61 ("nvme/pci: move cqe check after device shutdown")
Signed-off-by: Dongli Zhang 
Reviewed-by: Ming Lei 
Reviewed-by: Keith Busch 
---
Changed since v1:
  - Add "Fixes" tag

 drivers/nvme/host/pci.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3726dc780d15..cc46e250fcac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1382,16 +1382,19 @@ static void nvme_disable_admin_queue(struct nvme_dev 
*dev, bool shutdown)
 
 /*
  * Called only on a device that has been disabled and after all other threads
- * that can check this device's completion queues have synced. This is the
- * last chance for the driver to see a natural completion before
- * nvme_cancel_request() terminates all incomplete requests.
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
  */
 static void nvme_reap_pending_cqes(struct nvme_dev *dev)
 {
int i;
 
-   for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+   for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+   spin_lock(&dev->queues[i].cq_poll_lock);
    nvme_process_cq(&dev->queues[i]);
+   spin_unlock(&dev->queues[i].cq_poll_lock);
+   }
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
-- 
2.17.1



[PATCH 1/1] nvme-pci: avoid race between nvme_reap_pending_cqes() and nvme_poll()

2020-05-26 Thread Dongli Zhang
There may be a race between nvme_reap_pending_cqes() and nvme_poll(), e.g.,
when doing live reset while polling the nvme device.

 CPU X                              CPU Y
                                    nvme_poll()
 nvme_dev_disable()
 -> nvme_stop_queues()
 -> nvme_suspend_io_queues()
 -> nvme_suspend_queue()
                                    -> spin_lock(&nvmeq->cq_poll_lock);
 -> nvme_reap_pending_cqes()
    -> nvme_process_cq()            -> nvme_process_cq()

In the above scenario, the nvme_process_cq() for the same queue may be
running on both CPU X and CPU Y concurrently.

It is much easier to reproduce the issue when CONFIG_PREEMPT is enabled in
the kernel. When CONFIG_PREEMPT is disabled, it takes a longer time for
nvme_stop_queues()-->blk_mq_quiesce_queue() to wait for the grace period.

This patch protects nvme_process_cq() with nvmeq->cq_poll_lock in
nvme_reap_pending_cqes().

Signed-off-by: Dongli Zhang 
---
 drivers/nvme/host/pci.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3726dc780d15..cc46e250fcac 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1382,16 +1382,19 @@ static void nvme_disable_admin_queue(struct nvme_dev 
*dev, bool shutdown)
 
 /*
  * Called only on a device that has been disabled and after all other threads
- * that can check this device's completion queues have synced. This is the
- * last chance for the driver to see a natural completion before
- * nvme_cancel_request() terminates all incomplete requests.
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
  */
 static void nvme_reap_pending_cqes(struct nvme_dev *dev)
 {
int i;
 
-   for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+   for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+   spin_lock(&dev->queues[i].cq_poll_lock);
    nvme_process_cq(&dev->queues[i]);
+   spin_unlock(&dev->queues[i].cq_poll_lock);
+   }
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
-- 
2.17.1



[PATCH 1/1] nvme-fcloop: verify wwnn and wwpn format

2020-05-25 Thread Dongli Zhang
The nvme host and target verify the wwnn and wwpn format via
nvme_fc_parse_traddr(). For instance, it is required that the length of
wwnn be either 21 ("nn-0x") or 19 (nn-).

Add this verification to nvme-fcloop so that the input is always in
hex and the length of the input is always 18.

Otherwise, the user may use e.g. 0x2 to create the fcloop local port, while
0x0002 is required for the nvme host and target. This makes the
format requirement confusing.

Signed-off-by: Dongli Zhang 
---
 drivers/nvme/target/fcloop.c | 29 +++--
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index f69ce66e2d44..14124e6d4bf2 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -43,6 +43,17 @@ static const match_table_t opt_tokens = {
{ NVMF_OPT_ERR, NULL}
 };
 
+static int fcloop_verify_addr(substring_t *s)
+{
+   size_t blen = s->to - s->from + 1;
+
+   if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 ||
+   strncmp(s->from, "0x", 2))
+   return -EINVAL;
+
+   return 0;
+}
+
 static int
 fcloop_parse_options(struct fcloop_ctrl_options *opts,
const char *buf)
@@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->mask |= token;
switch (token) {
case NVMF_OPT_WWNN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
opts->wwnn = token64;
break;
case NVMF_OPT_WWPN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
@@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts,
opts->fcaddr = token;
break;
case NVMF_OPT_LPWWNN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
opts->lpwwnn = token64;
break;
case NVMF_OPT_LPWWPN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
@@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, 
u64 *pname,
token = match_token(p, opt_tokens, args);
switch (token) {
case NVMF_OPT_WWNN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
*nname = token64;
break;
case NVMF_OPT_WWPN:
-   if (match_u64(args, )) {
+   if (fcloop_verify_addr(args) ||
+   match_u64(args, )) {
ret = -EINVAL;
goto out_free_options;
}
-- 
2.17.1



[PATCH 1/1] xen/manage: enable C_A_D to force reboot

2020-05-13 Thread Dongli Zhang
systemd may be configured to mask ctrl-alt-del via "systemctl mask
ctrl-alt-del.target". As a result, the PV reboot would not work, as the signal
is ignored.

This patch always enables C_A_D before the call to ctrl_alt_del() in order
to force the reboot.
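
For context, a minimal userspace sketch of the decision ctrl_alt_del() makes
is below; it is a simplified model, not the kernel code in kernel/reboot.c.

#include <stdio.h>

static int C_A_D = 0;	/* 0: signal the cad_pid (init); 1: reboot from the kernel */

/* Simplified model of ctrl_alt_del(). */
static void ctrl_alt_del(void)
{
	if (C_A_D)
		printf("schedule kernel reboot\n");
	else
		printf("send SIGINT to cad_pid (systemd may have masked the target)\n");
}

int main(void)
{
	ctrl_alt_del();		/* default: depends on userspace handling the signal */
	C_A_D = 1;		/* what the patch does right before ctrl_alt_del() */
	ctrl_alt_del();		/* now the reboot no longer depends on userspace */
	return 0;
}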

Reported-by: Rose Wang 
Cc: Joe Jin 
Cc: Boris Ostrovsky 
Signed-off-by: Dongli Zhang 
---
 drivers/xen/manage.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index cd046684e0d1..3190d0ecb52e 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -204,6 +204,13 @@ static void do_poweroff(void)
 static void do_reboot(void)
 {
shutting_down = SHUTDOWN_POWEROFF; /* ? */
+   /*
+* The systemd may be configured to mask ctrl-alt-del via
+* "systemctl mask ctrl-alt-del.target". As a result, the pv reboot
+* would not work. To enable C_A_D would force the reboot.
+*/
+   C_A_D = 1;
+
ctrl_alt_del();
 }
 
-- 
2.17.1



[PATCH v2 1/1] xen-netfront: do not use ~0U as error return value for xennet_fill_frags()

2019-10-01 Thread Dongli Zhang
xennet_fill_frags() uses ~0U as its return value when the sk_buff is not able
to cache extra fragments. This is incorrect because the return type of
xennet_fill_frags() is RING_IDX and 0xffffffff is an expected value for a
ring buffer index.

In the situation when the rsp_cons is approaching 0xffffffff, the return
value of xennet_fill_frags() may become 0xffffffff, which xennet_poll() (the
caller) would regard as an error. As a result, queue->rx.rsp_cons is set
incorrectly because it is updated only when there is an error. If there is no
error, xennet_poll() is responsible for updating queue->rx.rsp_cons.
Finally, queue->rx.rsp_cons would point to the rx ring buffer entries whose
queue->rx_skbs[i] and queue->grant_rx_ref[i] are already cleared to NULL.
This leads to a NULL pointer access in the next iteration that processes rx
ring buffer entries.

The symptom is similar to the one fixed in
commit 00b368502d18 ("xen-netfront: do not assume sk_buff_head list is
empty in error handling").

This patch changes the return type of xennet_fill_frags() to indicate
whether it succeeded or failed. queue->rx.rsp_cons will always be
updated inside this function.
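
The sentinel problem itself can be seen in a tiny userspace sketch, shown here
only for illustration; the type and function are simplified stand-ins for the
netfront code.

#include <stdio.h>
#include <stdint.h>

typedef uint32_t RING_IDX;	/* free-running ring index, wraps naturally */

/* Old convention: return the new consumer index, or ~0U on error. */
static RING_IDX fill_frags_old(RING_IDX cons, unsigned int nfrags)
{
	return cons + nfrags;	/* may legitimately land on 0xffffffff */
}

int main(void)
{
	RING_IDX cons = 0xfffffffd;	/* consumer index close to wrap-around */
	RING_IDX ret = fill_frags_old(cons, 2);

	if (ret == ~0U)
		printf("0x%x wrongly treated as an error\n", ret);
	else
		printf("0x%x is a normal index\n", ret);
	return 0;
}

Returning an explicit error code, as the patch does, removes this ambiguity.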

Fixes: ad4f15dc2c70 ("xen/netfront: don't bug in case of too many frags")
Signed-off-by: Dongli Zhang 
---
Changed since v1:
  - Always update queue->rx.rsp_cons inside xennet_fill_frags() so we do
not need to add extra argument to xennet_fill_frags().

 drivers/net/xen-netfront.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index e14ec75..482c6c8 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -887,9 +887,9 @@ static int xennet_set_skb_gso(struct sk_buff *skb,
return 0;
 }
 
-static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
- struct sk_buff *skb,
- struct sk_buff_head *list)
+static int xennet_fill_frags(struct netfront_queue *queue,
+struct sk_buff *skb,
+struct sk_buff_head *list)
 {
RING_IDX cons = queue->rx.rsp_cons;
struct sk_buff *nskb;
@@ -908,7 +908,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue 
*queue,
if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
queue->rx.rsp_cons = ++cons + skb_queue_len(list);
kfree_skb(nskb);
-   return ~0U;
+   return -ENOENT;
}
 
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
@@ -919,7 +919,9 @@ static RING_IDX xennet_fill_frags(struct netfront_queue 
*queue,
kfree_skb(nskb);
}
 
-   return cons;
+   queue->rx.rsp_cons = cons;
+
+   return 0;
 }
 
 static int checksum_setup(struct net_device *dev, struct sk_buff *skb)
@@ -1045,8 +1047,7 @@ static int xennet_poll(struct napi_struct *napi, int 
budget)
skb->data_len = rx->status;
skb->len += rx->status;
 
-   i = xennet_fill_frags(queue, skb, );
-   if (unlikely(i == ~0U))
+   if (unlikely(xennet_fill_frags(queue, skb, )))
goto err;
 
if (rx->flags & XEN_NETRXF_csum_blank)
@@ -1056,7 +1057,7 @@ static int xennet_poll(struct napi_struct *napi, int 
budget)
 
__skb_queue_tail(, skb);
 
-   queue->rx.rsp_cons = ++i;
+   i = ++queue->rx.rsp_cons;
work_done++;
}
 
-- 
2.7.4



[PATCH 1/1] xen-netfront: do not use ~0U as error return value for xennet_fill_frags()

2019-09-30 Thread Dongli Zhang
xennet_fill_frags() uses ~0U as its return value when the sk_buff is not able
to cache extra fragments. This is incorrect because the return type of
xennet_fill_frags() is RING_IDX and 0xffffffff is an expected value for a
ring buffer index.

In the situation when the rsp_cons is approaching 0xffffffff, the return
value of xennet_fill_frags() may become 0xffffffff, which xennet_poll() (the
caller) would regard as an error. As a result, queue->rx.rsp_cons is set
incorrectly because it is updated only when there is an error. If there is no
error, xennet_poll() is responsible for updating queue->rx.rsp_cons.
Finally, queue->rx.rsp_cons would point to the rx ring buffer entries whose
queue->rx_skbs[i] and queue->grant_rx_ref[i] are already cleared to NULL.
This leads to a NULL pointer access in the next iteration that processes rx
ring buffer entries.

The symptom is similar to the one fixed in
commit 00b368502d18 ("xen-netfront: do not assume sk_buff_head list is
empty in error handling").

This patch uses an extra argument to report whether there is an error in
xennet_fill_frags().

Fixes: ad4f15dc2c70 ("xen/netfront: don't bug in case of too many frags")
Signed-off-by: Dongli Zhang 
---
 drivers/net/xen-netfront.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index e14ec75..c2a1e09 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -889,11 +889,14 @@ static int xennet_set_skb_gso(struct sk_buff *skb,
 
 static RING_IDX xennet_fill_frags(struct netfront_queue *queue,
  struct sk_buff *skb,
- struct sk_buff_head *list)
+ struct sk_buff_head *list,
+ int *errno)
 {
RING_IDX cons = queue->rx.rsp_cons;
struct sk_buff *nskb;
 
+   *errno = 0;
+
while ((nskb = __skb_dequeue(list))) {
struct xen_netif_rx_response *rx =
RING_GET_RESPONSE(>rx, ++cons);
@@ -908,6 +911,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue 
*queue,
if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
queue->rx.rsp_cons = ++cons + skb_queue_len(list);
kfree_skb(nskb);
+   *errno = -ENOENT;
return ~0U;
}
 
@@ -1009,6 +1013,8 @@ static int xennet_poll(struct napi_struct *napi, int 
budget)
i = queue->rx.rsp_cons;
work_done = 0;
while ((i != rp) && (work_done < budget)) {
+   int errno;
+
memcpy(rx, RING_GET_RESPONSE(>rx, i), sizeof(*rx));
memset(extras, 0, sizeof(rinfo.extras));
 
@@ -1045,8 +1051,8 @@ static int xennet_poll(struct napi_struct *napi, int 
budget)
skb->data_len = rx->status;
skb->len += rx->status;
 
-   i = xennet_fill_frags(queue, skb, );
-   if (unlikely(i == ~0U))
+   i = xennet_fill_frags(queue, skb, , );
+   if (unlikely(errno == -ENOENT))
goto err;
 
if (rx->flags & XEN_NETRXF_csum_blank)
-- 
2.7.4



[PATCH 1/1] xen-netfront: do not assume sk_buff_head list is empty in error handling

2019-09-15 Thread Dongli Zhang
When skb_shinfo(skb) is not able to cache an extra fragment (that is,
skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS), xennet_fill_frags() assumes
the sk_buff_head list is already empty. As a result, cons is increased only
by 1 and the code returns to the error handling path in xennet_poll().

However, if the sk_buff_head list is not empty, queue->rx.rsp_cons may be
set incorrectly. That is, queue->rx.rsp_cons would point to the rx ring
buffer entries whose queue->rx_skbs[i] and queue->grant_rx_ref[i] are
already cleared to NULL. This leads to NULL pointer access in the next
iteration to process rx ring buffer entries.

Below is how xennet_poll() does its error handling. All remaining entries in
tmpq are accounted to queue->rx.rsp_cons without assuming how many
outstanding skbs remain in the list.

 985 static int xennet_poll(struct napi_struct *napi, int budget)
... ...
1032   if (unlikely(xennet_set_skb_gso(skb, gso))) {
1033   __skb_queue_head(, skb);
1034   queue->rx.rsp_cons += skb_queue_len();
1035   goto err;
1036   }

It is better to always do the error handling in the same way.

Fixes: ad4f15dc2c70 ("xen/netfront: don't bug in case of too many frags")
Signed-off-by: Dongli Zhang 
---
 drivers/net/xen-netfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 8d33970..5f5722b 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -906,7 +906,7 @@ static RING_IDX xennet_fill_frags(struct netfront_queue 
*queue,
__pskb_pull_tail(skb, pull_to - skb_headlen(skb));
}
if (unlikely(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) {
-   queue->rx.rsp_cons = ++cons;
+   queue->rx.rsp_cons = ++cons + skb_queue_len(list);
kfree_skb(nskb);
return ~0U;
}
-- 
2.7.4



[PATCH 1/1] scsi: virtio_scsi: remove unused 'affinity_hint_set'

2019-06-19 Thread Dongli Zhang
The 'affinity_hint_set' is not used any longer since
commit 0d9f0a52c8b9 ("virtio_scsi: use virtio IRQ affinity").

Signed-off-by: Dongli Zhang 
---
 drivers/scsi/virtio_scsi.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 13f1b3b..1705398 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -74,9 +74,6 @@ struct virtio_scsi {
 
u32 num_queues;
 
-   /* If the affinity hint is set for virtqueues */
-   bool affinity_hint_set;
-
struct hlist_node node;
 
/* Protected by event_vq lock */
-- 
2.7.4



Re: [5.2-rc1 regression]: nvme vs. hibernation

2019-05-24 Thread Dongli Zhang
Hi Jiri,

Looks like this has been discussed in the past.

http://lists.infradead.org/pipermail/linux-nvme/2019-April/023234.html

I created a fix for one case, but it is not good enough.

http://lists.infradead.org/pipermail/linux-nvme/2019-April/023277.html

Perhaps people would have a better solution.

Dongli Zhang

On 05/25/2019 06:27 AM, Jiri Kosina wrote:
> On Fri, 24 May 2019, Keith Busch wrote:
> 
>>> Something is broken in Linus' tree (4dde821e429) with respec to 
>>> hibernation on my thinkpad x270, and it seems to be nvme related.
>>>
>>> I reliably see the warning below during hibernation, and then sometimes 
>>> resume sort of works but the machine misbehaves here and there (seems like 
>>> lost IRQs), sometimes it never comes back from the hibernated state.
>>>
>>> I will not have too much have time to look into this over weekend, so I am 
>>> sending this out as-is in case anyone has immediate idea. Otherwise I'll 
>>> bisect it on monday (I don't even know at the moment what exactly was the 
>>> last version that worked reliably, I'll have to figure that out as well 
>>> later).
>>
>> I believe the warning call trace was introduced when we converted nvme to
>> lock-less completions. On device shutdown, we'll check queues for any
>> pending completions, and we temporarily disable the interrupts to make
>> sure that queues interrupt handler can't run concurrently.
> 
> Yeah, the completion changes were the primary reason why I brought this up 
> with all of you guys in CC.
> 
>> On hibernation, most CPUs are offline, and the interrupt re-enabling
>> is hitting this warning that says the IRQ is not associated with any
>> online CPUs.
>>
>> I'm sure we can find a way to fix this warning, but I'm not sure that
>> explains the rest of the symptoms you're describing though.
> 
> It seems to be more or less reliable enough for bisect. I'll try that on 
> monday and will let you know.
> 
> Thanks,
> 


Re: [PATCH 1/1] swiotlb: save io_tlb_used to local variable before leaving critical section

2019-04-15 Thread Dongli Zhang



On 04/15/2019 07:26 PM, Konrad Rzeszutek Wilk wrote:
> 
> 
> On Mon, Apr 15, 2019, 2:50 AM Dongli Zhang wrote:
> 
> As the patch to be fixed is still in Konrad's own tree, I will send the 
> v2 for
> the patch to be fixed, instead of an incremental fix.
> 
> 
> I squashed it in.

Thank you very much!

I saw it is at:

https://git.kernel.org/pub/scm/linux/kernel/git/konrad/swiotlb.git/commit/?h=devel/for-linus-5.1=53b29c336830db48ad3dc737f88b8c065b1f0851

Dongli Zhang


Re: [PATCH] block/mq: blk map queues by core id

2019-03-22 Thread Dongli Zhang



On 03/22/2019 06:09 PM, luferry wrote:
> under virtual machine environment, cpu topology may differ from normal
> physical server.

Would you mind sharing the name of the virtual machine monitor, the command
line if available, and which device to use to reproduce?

For instance, I am not able to reproduce with qemu nvme or virtio-blk, as I
assume they use the pci- or virtio-specific mapper to establish the mapping.

E.g., with qemu and nvme: -smp 8,sockets=1,cores=4,threads=2

Indeed I use three queues instead of two, as one is reserved for admin.

# ls /sys/block/nvme0n1/mq/*
/sys/block/nvme0n1/mq/0:
cpu0  cpu1  cpu2  cpu3  cpu_list  nr_reserved_tags  nr_tags

/sys/block/nvme0n1/mq/1:
cpu4  cpu5  cpu6  cpu7  cpu_list  nr_reserved_tags  nr_tags


Thank you very much!

Dongli Zhang

> for example (machine with 4 cores, 2 threads per core):
> 
> normal physical server:
> core-id   thread-0-id  thread-1-id
> 0 04
> 1 15
> 2 26
> 3 37
> 
> virtual machine:
> core-id   thread-0-id  thread-1-id
> 0 01
> 1 23
> 2 45
> 3 67
> 
> When attach disk with two queues, all the even numbered cpus will be
> mapped to queue 0. Under virtual machine, all the cpus is followed by
> its sibling cpu.Before this patch, all the odd numbered cpus will also
> be mapped to queue 0, can cause serious imbalance.this will lead to
> performance impact on system IO
> 
> So suggest to allocate cpu map by core id, this can be more currency
> 
> Signed-off-by: luferry 
> ---
>  block/blk-mq-cpumap.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
> 
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 03a534820271..4125e8e77679 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -35,7 +35,7 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>  {
>   unsigned int *map = qmap->mq_map;
>   unsigned int nr_queues = qmap->nr_queues;
> - unsigned int cpu, first_sibling;
> + unsigned int cpu, first_sibling, core = 0;
>  
>   for_each_possible_cpu(cpu) {
>   /*
> @@ -48,9 +48,10 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
>   map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
>   } else {
>   first_sibling = get_first_sibling(cpu);
> - if (first_sibling == cpu)
> - map[cpu] = cpu_to_queue_index(qmap, nr_queues, 
> cpu);
> - else
> + if (first_sibling == cpu) {
> + map[cpu] = cpu_to_queue_index(qmap, nr_queues, 
> core);
> + core++;
> + } else
>   map[cpu] = map[first_sibling];
>   }
>   }
> 
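
As a follow-up to the question above, the imbalance described in the patch can
be simulated in userspace with the sketch below. It is a simplified model of
the pre-patch blk_mq_map_queues() logic; the two sibling layouts are the
hypothetical 8-cpu/2-thread topologies from the commit message, not data
collected from real machines.

#include <stdio.h>

#define NR_CPUS   8
#define NR_QUEUES 2

/* Simplified model of the pre-patch blk_mq_map_queues() logic. */
static void map_queues(const int *first_sibling, int *map)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu < NR_QUEUES)
			map[cpu] = cpu % NR_QUEUES;		/* direct mapping first */
		else if (first_sibling[cpu] == cpu)
			map[cpu] = cpu % NR_QUEUES;		/* cpu_to_queue_index() */
		else
			map[cpu] = map[first_sibling[cpu]];	/* follow the sibling */
	}
}

static void show(const char *name, const int *map)
{
	int q0 = 0, cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		q0 += (map[cpu] == 0);
	printf("%s: %d cpus on queue 0, %d cpus on queue 1\n", name, q0, NR_CPUS - q0);
}

int main(void)
{
	/* physical server: thread 1 of core N is cpu N+4 */
	int phys[NR_CPUS] = { 0, 1, 2, 3, 0, 1, 2, 3 };
	/* virtual machine: siblings are adjacent (0/1, 2/3, 4/5, 6/7) */
	int vm[NR_CPUS]   = { 0, 0, 2, 2, 4, 4, 6, 6 };
	int map[NR_CPUS];

	map_queues(phys, map);
	show("physical", map);		/* 4 vs 4: balanced */
	map_queues(vm, map);
	show("vm      ", map);		/* 7 vs 1: imbalanced */
	return 0;
}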


Re: virtio-blk: should num_vqs be limited by num_possible_cpus()?

2019-03-20 Thread Dongli Zhang



On 3/20/19 8:53 PM, Jason Wang wrote:
> 
> On 2019/3/19 10:22 AM, Dongli Zhang wrote:
>> Hi Jason,
>>
>> On 3/18/19 3:47 PM, Jason Wang wrote:
>>> On 2019/3/15 8:41 PM, Cornelia Huck wrote:
>>>> On Fri, 15 Mar 2019 12:50:11 +0800
>>>> Jason Wang  wrote:
>>>>
>>>>> Or something like I proposed several years ago?
>>>>> https://do-db2.lkml.org/lkml/2014/12/25/169
>>>>>
>>>>> Btw, for virtio-net, I think we actually want to go for having a maximum
>>>>> number of supported queues like what hardware did. This would be useful
>>>>> for e.g cpu hotplug or XDP (requires per cpu TX queue). But the current
>>>>> vector allocation doesn't support this which will results all virtqueues
>>>>> to share a single vector. We may indeed need more flexible policy here.
>>>> I think it should be possible for the driver to give the transport
>>>> hints how to set up their queues/interrupt structures. (The driver
>>>> probably knows best about its requirements.) Perhaps whether a queue is
>>>> high or low frequency, or whether it should be low latency, or even
>>>> whether two queues could share a notification mechanism without
>>>> drawbacks. It's up to the transport to make use of that information, if
>>>> possible.
>>>
>>> Exactly and it was what the above series tried to do by providing hints of 
>>> e.g
>>> which queues want to share a notification.
>>>
>> I read about your patch set on providing more flexibility of queue-to-vector
>> mapping.
>>
>> One use case of the patch set is we would be able to enable more queues when
>> there is limited number of vectors.
>>
>> Another use case we may classify queues as hight priority or low priority as
>> mentioned by Cornelia.
>>
>> For virtio-blk, we may extend virtio-blk based on this patch set to enable
>> something similar to write_queues/poll_queues in nvme, when (set->nr_maps != 
>> 1).
>>
>>
>> Yet, the question I am asking in this email thread is for a difference 
>> scenario.
>>
>> The issue is not we are not having enough vectors (although this is why only 
>> 1
>> vector is allocated for all virtio-blk queues). As so far virtio-blk has
>> (set->nr_maps == 1), block layer would limit the number of hw queues by
>> nr_cpu_ids, we indeed do not need more than nr_cpu_ids hw queues in 
>> virtio-blk.
>>
>> That's why I ask why not change the flow as below options when the number of
>> supported hw queues is more than nr_cpu_ids (and set->nr_maps == 1. 
>> virtio-blk
>> does not set nr_maps and block layer would set it to 1 when the driver does 
>> not
>> specify with a value):
>>
>> option 1:
>> As what nvme and xen-netfront do, limit the hw queue number by nr_cpu_ids.
> 
> 
> How do they limit the hw queue number? A command?

The max #queue is also limited by other factors, e.g., kernel param
configuration, xen dom0 configuration or nvme hardware support.

Here we would ignore those factors for simplicity and only talk about the
relation between #queue and #cpu.


About nvme pci:

Regardless of the new write_queues and poll_queues, the default queue type number
is limited by num_possible_cpus(), as at lines 2120 and 252 below.

2113 static int nvme_setup_io_queues(struct nvme_dev *dev)
2114 {
2115 struct nvme_queue *adminq = >queues[0];
2116 struct pci_dev *pdev = to_pci_dev(dev->dev);
2117 int result, nr_io_queues;
2118 unsigned long size;
2119
2120 nr_io_queues = max_io_queues();
2121 result = nvme_set_queue_count(>ctrl, _io_queues);

 250 static unsigned int max_io_queues(void)
 251 {
 252 return num_possible_cpus() + write_queues + poll_queues;
 253 }

The con of this is that there might be many unused hw queues and vectors when
num_possible_cpus() is very large while only a small number of cpus are
online. I am looking into whether there is a way to improve this.



About xen-blkfront:

Indeed the max #queue is limited by num_online_cpus() when the xen-blkfront
module is loaded, as at lines 2733 and 2736 below.

2707 static int __init xlblk_init(void)
... ...
2710 int nr_cpus = num_online_cpus();
... ...
2733 if (xen_blkif_max_queues > nr_cpus) {
2734 pr_info("Invalid max_queues (%d), will use default max: 
%d.\n",
2735 xen_blkif_max_queues, nr_cpus);
2736 xen_blkif_max_queues = nr_cpus;
2737     }

The con of this is that the number of hw queues/hctxs is limited and cannot
increase after cpu hotplug. I am looking into whether there is a way to
improve this.

Re: virtio-blk: should num_vqs be limited by num_possible_cpus()?

2019-03-14 Thread Dongli Zhang



On 03/14/2019 08:32 PM, Michael S. Tsirkin wrote:
> On Tue, Mar 12, 2019 at 10:22:46AM -0700, Dongli Zhang wrote:
>> I observed that there is one msix vector for config and one shared vector
>> for all queues in below qemu cmdline, when the num-queues for virtio-blk
>> is more than the number of possible cpus:
>>
>> qemu: "-smp 4" while "-device 
>> virtio-blk-pci,drive=drive-0,id=virtblk0,num-queues=6"
> 
> So why do this?

I observed this when I was testing virtio-blk and the block layer.

I just assigned a very large 'num-queues' to virtio-blk and kept changing the
number of vcpus in order to study blk-mq.

The num-queues for nvme (qemu) is by default 64, while it is 1 for virtio-blk.

> 
>> # cat /proc/interrupts 
>>CPU0   CPU1   CPU2   CPU3
>> ... ...
>>  24:  0  0  0  0   PCI-MSI 65536-edge  
>> virtio0-config
>>  25:  0  0  0 59   PCI-MSI 65537-edge  
>> virtio0-virtqueues
>> ... ...
>>
>>
>> However, when num-queues is the same as number of possible cpus:
>>
>> qemu: "-smp 4" while "-device 
>> virtio-blk-pci,drive=drive-0,id=virtblk0,num-queues=4"
>>
>> # cat /proc/interrupts 
>>CPU0   CPU1   CPU2   CPU3
>> ... ... 
>>  24:  0  0  0  0   PCI-MSI 65536-edge  
>> virtio0-config
>>  25:  2  0  0  0   PCI-MSI 65537-edge  
>> virtio0-req.0
>>  26:  0 35  0  0   PCI-MSI 65538-edge  
>> virtio0-req.1
>>  27:  0  0 32  0   PCI-MSI 65539-edge  
>> virtio0-req.2
>>  28:  0  0  0  0   PCI-MSI 65540-edge  
>> virtio0-req.3
>> ... ...
>>
>> In above case, there is one msix vector per queue.
>>
>>
>> This is because the max number of queues is not limited by the number of
>> possible cpus.
>>
>> By default, nvme (regardless about write_queues and poll_queues) and
>> xen-blkfront limit the number of queues with num_possible_cpus().
>>
>>
>> Is this by design on purpose, or can we fix with below?
>>
>>
>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 4bc083b..df95ce3 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -513,6 +513,8 @@ static int init_vq(struct virtio_blk *vblk)
>>  if (err)
>>  num_vqs = 1;
>>  
>> +num_vqs = min(num_possible_cpus(), num_vqs);
>> +
>>  vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
>>  if (!vblk->vqs)
>>  return -ENOMEM;
>> --
>>
>>
>> PS: The same issue is applicable to virtio-scsi as well.
>>
>> Thank you very much!
>>
>> Dongli Zhang
> 
> I don't think this will address the issue if there's vcpu hotplug though.
> Because it's not about num_possible_cpus it's about the # of active VCPUs,
> right? Does block hangle CPU hotplug generally?
> We could maybe address that by switching vq to msi vector mapping in
> a cpu hotplug notifier...
> 

It looks like it is about num_possible_cpus/nr_cpu_ids for cpu hotplug.


For instance, below is when only 2 out of 6 cpus are initialized while
virtio-blk has 6 queues.

"-smp 2,maxcpus=6" and "-device
virtio-blk-pci,drive=drive0,id=disk0,num-queues=6,iothread=io1"

# cat /sys/devices/system/cpu/present
0-1
# cat /sys/devices/system/cpu/possible
0-5
# cat /proc/interrupts  | grep virtio
 24:  0  0   PCI-MSI 65536-edge  virtio0-config
 25:   1864  0   PCI-MSI 65537-edge  virtio0-req.0
 26:  0   1069   PCI-MSI 65538-edge  virtio0-req.1
 27:  0  0   PCI-MSI 65539-edge  virtio0-req.2
 28:  0  0   PCI-MSI 65540-edge  virtio0-req.3
 29:  0  0   PCI-MSI 65541-edge  virtio0-req.4
 30:  0  0   PCI-MSI 65542-edge  virtio0-req.5

6 + 1 irqs are assigned even though 4 out of 6 cpus are still offline.


Below is about the nvme emulated by qemu. While only 2 out of 6 cpus are
initially assigned, nvme has 64 queues by default.

"-smp 2,maxcpus=6" and "-device nvme,drive=drive1,serial=deadbeaf1"

# cat /sys/devices/system/cpu/present
0-1
# cat /sys/devices/system/cpu/possible
0-5
# cat /proc/interrupts | grep nvme
 31:  0 16   PCI-MSI 81920-edge  nvme0q0
 32: 35  0   PCI-MSI 81921-edge  nvme0q1
 33:  0 42   PCI-MSI 81922-edge  nvme0q2
 34:   

Re: [Xen-devel] [PATCH v4.9 1/1] jiffies: use jiffies64_to_nsecs() to fix 100% steal usage for xen vcpu hotplug

2019-03-05 Thread Dongli Zhang
Thanks to Joe Jin's reminder, this patch is applicable to the mainline linux
kernel, although this kind of bug causes no issue in the mainline kernel.

Therefore, can I first submit this patch to the mainline kernel and then backport it
to stable linux with a more detailed explanation of how the issue is reproduced on
xen?

This would help keep stable better synchronized with mainline.

Thank you very much!

Dongli Zhang

On 3/5/19 3:59 PM, Dongli Zhang wrote:
> [ Not relevant upstream, therefore no upstream commit. ]
> 
> To fix, use jiffies64_to_nsecs() directly instead of deriving the result
> according to jiffies_to_usecs().
> 
> As the return type of jiffies_to_usecs() is 'unsigned int', when the return
> value is more than the size of 'unsigned int', the leading 32 bits would be
> discarded.
> 
> Suppose USEC_PER_SEC=1000000L and HZ=1000, below are the expected and
> actual incorrect result of jiffies_to_usecs(0x7770ef70):
> 
> - expected  : jiffies_to_usecs(0x7770ef70) = 0x01d291274d80
> - incorrect : jiffies_to_usecs(0x7770ef70) = 0x91274d80
> 
> The leading 0x01d2 is discarded.
> 
> After xen vcpu hotplug and when the new vcpu steal clock is calculated for
> the first time, the result of this_rq()->prev_steal_time in
> steal_account_process_tick() would be far smaller than the expected
> value, due to that jiffies_to_usecs() discards the leading 32 bits.
> 
> As a result, the diff between current steal and this_rq()->prev_steal_time
> is always very large. Steal usage would become 100% when the initial steal
> clock obtained from xen hypervisor is very large during xen vcpu hotplug,
> that is, when the guest is already up for a long time.
> 
> The bug can be detected by doing the following:
> 
> * Boot xen guest with vcpus=2 and maxvcpus=4
> * Leave the guest running for a month so that the initial steal clock for
>   the new vcpu would be very large
> * Hotplug 2 extra vcpus
> * The steal time of new vcpus in /proc/stat would increase abnormally and
>   sometimes steal usage in top can become 100%
> 
> This was incidentally fixed in the patch set starting by
> commit 93825f2ec736 ("jiffies: Reuse TICK_NSEC instead of NSEC_PER_JIFFY")
> and ended with
> commit b672592f0221 ("sched/cputime: Remove generic asm headers").
> 
> This version applies to the v4.9 series.
> 
> Link: https://lkml.org/lkml/2019/2/28/1373
> Suggested-by: Juergen Gross 
> Signed-off-by: Dongli Zhang 
> ---
>  include/linux/jiffies.h | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
> index 734377a..94aff43 100644
> --- a/include/linux/jiffies.h
> +++ b/include/linux/jiffies.h
> @@ -287,13 +287,13 @@ extern unsigned long preset_lpj;
>  extern unsigned int jiffies_to_msecs(const unsigned long j);
>  extern unsigned int jiffies_to_usecs(const unsigned long j);
>  
> +extern u64 jiffies64_to_nsecs(u64 j);
> +
>  static inline u64 jiffies_to_nsecs(const unsigned long j)
>  {
> - return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
> + return jiffies64_to_nsecs(j);
>  }
>  
> -extern u64 jiffies64_to_nsecs(u64 j);
> -
>  extern unsigned long __msecs_to_jiffies(const unsigned int m);
>  #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
>  /*
> 


[PATCH v4.9 1/1] jiffies: use jiffies64_to_nsecs() to fix 100% steal usage for xen vcpu hotplug

2019-03-04 Thread Dongli Zhang
[ Not relevant upstream, therefore no upstream commit. ]

To fix, use jiffies64_to_nsecs() directly instead of deriving the result
according to jiffies_to_usecs().

As the return type of jiffies_to_usecs() is 'unsigned int', when the result
exceeds the range of 'unsigned int', the upper 32 bits are
discarded.

Suppose USEC_PER_SEC=1000000L and HZ=1000, below are the expected and
actual incorrect results of jiffies_to_usecs(0x7770ef70):

- expected  : jiffies_to_usecs(0x7770ef70) = 0x01d291274d80
- incorrect : jiffies_to_usecs(0x7770ef70) = 0x91274d80

The leading 0x01d2 is discarded.

After xen vcpu hotplug, when the new vcpu steal clock is calculated for
the first time, the result of this_rq()->prev_steal_time in
steal_account_process_tick() would be far smaller than the expected
value, because jiffies_to_usecs() discards the upper 32 bits.

As a result, the diff between current steal and this_rq()->prev_steal_time
is always very large. Steal usage would become 100% when the initial steal
clock obtained from xen hypervisor is very large during xen vcpu hotplug,
that is, when the guest is already up for a long time.

The bug can be detected by doing the following:

* Boot xen guest with vcpus=2 and maxvcpus=4
* Leave the guest running for a month so that the initial steal clock for
  the new vcpu would be very large
* Hotplug 2 extra vcpus
* The steal time of new vcpus in /proc/stat would increase abnormally and
  sometimes steal usage in top can become 100%

This was incidentally fixed in the patch set starting with
commit 93825f2ec736 ("jiffies: Reuse TICK_NSEC instead of NSEC_PER_JIFFY")
and ending with
commit b672592f0221 ("sched/cputime: Remove generic asm headers").

This version applies to the v4.9 series.

Link: https://lkml.org/lkml/2019/2/28/1373
Suggested-by: Juergen Gross 
Signed-off-by: Dongli Zhang 
---
 include/linux/jiffies.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 734377a..94aff43 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -287,13 +287,13 @@ extern unsigned long preset_lpj;
 extern unsigned int jiffies_to_msecs(const unsigned long j);
 extern unsigned int jiffies_to_usecs(const unsigned long j);
 
+extern u64 jiffies64_to_nsecs(u64 j);
+
 static inline u64 jiffies_to_nsecs(const unsigned long j)
 {
-   return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
+   return jiffies64_to_nsecs(j);
 }
 
-extern u64 jiffies64_to_nsecs(u64 j);
-
 extern unsigned long __msecs_to_jiffies(const unsigned int m);
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
 /*
-- 
2.7.4



Re: [BUG linux-4.9.x] xen hotplug cpu leads to 100% steal usage

2019-03-04 Thread Dongli Zhang
Hi Thomas,

On 3/2/19 7:43 AM, Thomas Gleixner wrote:
> On Thu, 28 Feb 2019, Dongli Zhang wrote:
>>
>> The root cause is that the return type of jiffies_to_usecs() is 'unsigned 
>> int',
>> but not 'unsigned long'. As a result, the leading 32 bits are discarded.
> 
> Errm. No. The root cause is that jiffies_to_usecs() is used for that in the
> first place. The function has been that way forever and all usage sites
> (except a broken dev_debug print in infiniband) feed delta values. Yes, it
> could have documentation

Thank you very much for the explanation. It would help developers clarify
the usage of jiffies_to_usecs() (which should always be fed a delta value)
if there were a comment above it.

Indeed, the input value in this bug is also a delta value. Because of the
special mechanism xen uses to account the steal clock, the initial delta value
is very large, but only when the new cpu is added after the VM has already been up
for a very long time.

Dongli Zhang


> 
>> jiffies_to_usecs() is indirectly triggered by cputime_to_nsecs() at line 264.
>> If guest is already up for long time, the initial steal time for new vcpu 
>> might
>> be large and the leading 32 bits of jiffies_to_usecs() would be discarded.
> 
>> So far, I have two solutions:
>>
>> 1. Change the return type from 'unsigned int' to 'unsigned long' as in above
>> link and I am afraid it would bring side effect. The return type in latest
>> mainline kernel is still 'unsigned int'.
> 
> Changing it to unsigned long would just solve the issue for 64bit.
> 
> Thanks,
> 
>   tglx
> 


Re: [BUG linux-4.9.x] xen hotplug cpu leads to 100% steal usage

2019-03-04 Thread Dongli Zhang
Hi Juergen,

On 3/4/19 4:14 PM, Juergen Gross wrote:
> On 01/03/2019 03:35, Dongli Zhang wrote:
>> This issue is only for stable 4.9.x (e.g., 4.9.160), while the root cause is
>> still in the lasted mainline kernel.
>>
>> This is obviated by new feature patch set ended with b672592f0221
>> ("sched/cputime: Remove generic asm headers").
>>
>> After xen guest is up for long time, once we hotplug new vcpu, the 
>> corresponding
>> steal usage might become 100% and the steal time from /proc/stat would 
>> increase
>> abnormally.
>>
>> As we cannot wait for long time to reproduce the issue, here is how I 
>> reproduce
>> it on purpose by accounting a large initial steal clock for new vcpu 2 and 3.
>>
>> 1. Apply the below patch to guest 4.9.160 to account large initial steal 
>> clock
>> for new vcpu 2 and 3:
>>
>> diff --git a/drivers/xen/time.c b/drivers/xen/time.c
>> index ac5f23f..3cf629e 100644
>> --- a/drivers/xen/time.c
>> +++ b/drivers/xen/time.c
>> @@ -85,7 +85,14 @@ u64 xen_steal_clock(int cpu)
>> struct vcpu_runstate_info state;
>>  
>> xen_get_runstate_snapshot_cpu(&state, cpu);
>> -   return state.time[RUNSTATE_runnable] + state.time[RUNSTATE_offline];
>> +
>> +   if (cpu == 2 || cpu == 3)
>> +   return state.time[RUNSTATE_runnable]
>> +  + state.time[RUNSTATE_offline]
>> +  + 0x00071e87e677aa12;
>> +   else
>> +   return state.time[RUNSTATE_runnable]
>> +  + state.time[RUNSTATE_offline];
>>  }
>>  
>>  void xen_setup_runstate_info(int cpu)
>>
>>
>> 2. Boot hvm guest with "vcpus=2" and "maxvcpus=4". By default, VM boot with
>> vcpu 0 and 1.
>>
>> 3. Hotplug vcpu 2 and 3 via "xl vcpu-set  4" on dom0.
>>
>> In my env, the steal becomes 100% within 10s after the "xl vcpu-set" command 
>> on
>> dom0.
>>
>> I can reproduce on kvm with similar method. However, as the initial steal 
>> clock
>> on kvm guest is always 0, I do not think it is easy to hit this issue on kvm.
>>
>> 
>>
>> The root cause is that the return type of jiffies_to_usecs() is 'unsigned 
>> int',
>> but not 'unsigned long'. As a result, the leading 32 bits are discarded.
>>
>> jiffies_to_usecs() is indirectly triggered by cputime_to_nsecs() at line 264.
>> If guest is already up for long time, the initial steal time for new vcpu 
>> might
>> be large and the leading 32 bits of jiffies_to_usecs() would be discarded.
>>
>> As a result, the steal at line 259 is always large and the
>> this_rq()->prev_steal_time at line 264 is always small. The difference at 
>> line
>> 260 is always large during each time steal_account_process_time() is 
>> involved.
>> Finally, the steal time in /proc/stat would increase abnormally.
>>
>> 252 static __always_inline cputime_t steal_account_process_time(cputime_t 
>> maxtime)
>> 253 {
>> 254 #ifdef CONFIG_PARAVIRT
>> 255 if (static_key_false(&paravirt_steal_enabled)) {
>> 256 cputime_t steal_cputime;
>> 257 u64 steal;
>> 258 
>> 259 steal = paravirt_steal_clock(smp_processor_id());
>> 260 steal -= this_rq()->prev_steal_time;
>> 261 
>> 262 steal_cputime = min(nsecs_to_cputime(steal), maxtime);
>> 263 account_steal_time(steal_cputime);
>> 264 this_rq()->prev_steal_time += 
>> cputime_to_nsecs(steal_cputime);
>> 265 
>> 266 return steal_cputime;
>> 267 }
>> 268 #endif
>> 269 return 0;
>> 270 }
>>
>> 
>>
>> I have emailed the kernel mailing list about the return type of
>> jiffies_to_usecs() and jiffies_to_msecs():
>>
>> https://lkml.org/lkml/2019/2/26/899
>>
>>
>> So far, I have two solutions:
>>
>> 1. Change the return type from 'unsigned int' to 'unsigned long' as in above
>> link and I am afraid it would bring side effect. The return type in latest
>> mainline kernel is still 'unsigned int'.
>>
>> 2. Something like below based on stable 4.9.160:
> 
> 3. use jiffies64_to_nsecs() instead of trying to open code it.

Thank you very much for the suggestion!

I have tested that jiffies64_to_nsecs() works well by reproducing the issue in
a kvm guest.

[BUG linux-4.9.x] xen hotplug cpu leads to 100% steal usage

2019-02-28 Thread Dongli Zhang
>> HZ_TO_USEC_SHR32;
+# else
+   return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
+# endif
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs64);
+
+
 /**
  * timespec_trunc - Truncate timespec to a granularity
  * @t: Timespec

People may dislike the 2nd solution.

3. Backport the patch set ending with b672592f0221 ("sched/cputime:
Remove generic asm headers").

This is not reasonable for the stable branch as the patch set involves
lots of changes.


Would you please let me know if there is any suggestion on this issue?

Thank you very much!

Dongli Zhang


[PATCH for-next 1/1] blk-mq: use HCTX_TYPE_DEFAULT but not 0 to index blk_mq_tag_set->map

2019-02-27 Thread Dongli Zhang
Replace set->map[0] with set->map[HCTX_TYPE_DEFAULT] to avoid hardcoding.

Signed-off-by: Dongli Zhang 
---
 block/blk-mq.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 54535f4..4e502db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2069,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct 
blk_mq_tag_set *set,
struct blk_mq_tags *tags;
int node;
 
-   node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+   node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
 
@@ -2125,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct 
blk_mq_tags *tags,
size_t rq_size, left;
int node;
 
-   node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
+   node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
 
@@ -2424,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 * If the cpu isn't present, the cpu is mapped to first hctx.
 */
for_each_possible_cpu(i) {
-   hctx_idx = set->map[0].mq_map[i];
+   hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
!__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2434,7 +2434,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 * case, remap the current ctx to hctx[0] which
 * is guaranteed to always have tags allocated
 */
-   set->map[0].mq_map[i] = 0;
+   set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
}
 
ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2741,7 +2741,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set 
*set,
int node;
struct blk_mq_hw_ctx *hctx;
 
-   node = blk_mq_hw_queue_to_node(&set->map[0], i);
+   node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
/*
 * If the hw queue has been mapped to another numa node,
 * we need to realloc the hctx. If allocation fails, fallback
@@ -2972,7 +2972,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set 
*set)
return set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
-   return blk_mq_map_queues(&set->map[0]);
+   return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
 }
 
@@ -3242,7 +3242,7 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
pr_warn("Increasing nr_hw_queues to %d fails, fallback 
to %d\n",
nr_hw_queues, prev_nr_hw_queues);
set->nr_hw_queues = prev_nr_hw_queues;
-   blk_mq_map_queues(&set->map[0]);
+   blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
-- 
2.7.4

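For context on the substitution in the patch above: HCTX_TYPE_DEFAULT is the first
member of the hctx type enum introduced for multiple queue maps, which is why index 0
and HCTX_TYPE_DEFAULT address the same map. A sketch of the enum as I recall it around
this kernel version (member names are illustrative; check include/linux/blk-mq.h of the
target tree):

enum hctx_type {
	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
	HCTX_TYPE_READ,		/* just for READ I/O */
	HCTX_TYPE_POLL,		/* polled I/O of any kind */

	HCTX_MAX_TYPES,
};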


Why the return type of jiffies_to_usecs() is 'unsigned int' but not 'unsigned long'

2019-02-26 Thread Dongli Zhang
Hi,

I am writing to ask why the return type of jiffies_to_usecs() and
jiffies_to_msecs() is 'unsigned int' rather than 'unsigned long'.

For instance, about jiffies_to_usecs(), suppose the input is 0x7770ef70
(2003890032), that is, jiffies_to_usecs(0x7770ef70).

Suppose USEC_PER_SEC=1000000L and HZ=1000, we expect the result to be
0x01d291274d80. However, as the return type is 'unsigned int', the upper
32 bits are discarded and the actual result is 0x91274d80.

expected jiffies_to_usecs(0x7770ef70) : 0x01d291274d80
current jiffies_to_usecs(0x7770ef70)  : 0x91274d80

I think jiffies_to_msecs() would have the same issue.
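
To make the truncation concrete, below is a minimal userspace sketch (an illustration
only, assuming a 64-bit build and the USEC_PER_SEC/HZ values above) that mimics how the
64-bit product is narrowed by the 'unsigned int' return type:

#include <stdio.h>

#define USEC_PER_SEC 1000000L
#define HZ 1000

/* Same shape as the helper discussed here: the 64-bit product is
 * implicitly narrowed to 'unsigned int', so the upper 32 bits are dropped. */
static unsigned int demo_jiffies_to_usecs(const unsigned long j)
{
	return (USEC_PER_SEC / HZ) * j;
}

int main(void)
{
	unsigned long j = 0x7770ef70UL;

	printf("truncated: %#x\n", demo_jiffies_to_usecs(j));   /* 0x91274d80 */
	printf("expected : %#lx\n", (USEC_PER_SEC / HZ) * j);   /* 0x1d291274d80 */
	return 0;
}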

As jiffies_to_usecs() and jiffies_to_msecs() are called in total from 680
locations in the latest mainline linux kernel, the patch below would generate
lots of warnings. Almost all of the warnings are due to printk or trace events.



diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index fa92824..d961b30 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -288,8 +288,8 @@ extern unsigned long preset_lpj;
 /*
  * Convert various time units to each other:
  */
-extern unsigned int jiffies_to_msecs(const unsigned long j);
-extern unsigned int jiffies_to_usecs(const unsigned long j);
+extern unsigned long jiffies_to_msecs(const unsigned long j);
+extern unsigned long jiffies_to_usecs(const unsigned long j);

 static inline u64 jiffies_to_nsecs(const unsigned long j)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2edb508..72c139d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -305,7 +305,7 @@ COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user
*, utp)
  * Avoid unnecessary multiplications/divisions in the
  * two most common HZ cases:
  */
-unsigned int jiffies_to_msecs(const unsigned long j)
+unsigned long jiffies_to_msecs(const unsigned long j)
 {
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
return (MSEC_PER_SEC / HZ) * j;
@@ -322,7 +322,7 @@ unsigned int jiffies_to_msecs(const unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_msecs);

-unsigned int jiffies_to_usecs(const unsigned long j)
+unsigned long jiffies_to_usecs(const unsigned long j)
 {
/*
 * Hz usually doesn't go much further MSEC_PER_SEC.
-- 
2.7.4

--



While I am still working through those warnings, would you please confirm whether the
'unsigned int' return type is by design or a bug as I mentioned
above?

So far the latest 4.9.x stable kernel is affected by a 100% steal usage issue
whose root cause is the truncating return type of jiffies_to_usecs(). I will
discuss that in more detail in another email thread, so that people can
grep/search for the same issue in the future. That issue is no longer
reproducible after I change the return type of jiffies_to_usecs() to 'unsigned
long'.

Thank you very much!

Dongli Zhang


[PATCH v3 0/2] loop: fix two issues introduced by prior commit

2019-02-22 Thread Dongli Zhang
This patch set fixes two issues introduced by prior commits.


[PATCH v3 1/2] loop: do not print warn message if partition scan is successful
[PATCH v3 1/2] fixes d57f3374ba48 ("loop: Move special partition reread
handling in loop_clr_fd()") so that the warn message is not printed when the
partition scan is successful.


[PATCH v3 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()
[PATCH v3 2/2] fixes 0da03cab87e6 ("loop: Fix deadlock when calling
blkdev_reread_part()") so that GENHD_FL_NO_PART_SCAN is not set before the
partition scan when detaching the loop device from the file.

Changed since v1:
  * move the setting of lo->lo_state to Lo_unbound after partscan has finished 
as well
(suggested by Jan Kara)

Changed since v2:
  * Put the code inline in __loop_clr_fd() but not introduce new function
(suggested by Jan Kara)


Dongli Zhang



[PATCH v3 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

2019-02-22 Thread Dongli Zhang
Commit 0da03cab87e6
("loop: Fix deadlock when calling blkdev_reread_part()") moves
blkdev_reread_part() out of the loop_ctl_mutex. However,
GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result,
__blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and
will not rescan the loop device to delete all partitions.

Below are steps to reproduce the issue:

step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100
step2 # losetup -P /dev/loop0 tmp.raw
step3 # parted /dev/loop0 mklabel gpt
step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1
step5 # losetup -d /dev/loop0

Step5 will not be able to delete /dev/loop0p1 (created by step4), and
the kernel warning message below is printed:

[  464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22)

This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part().

Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()")
Signed-off-by: Dongli Zhang 
Reviewed-by: Jan Kara 
---
Changed since v1:
  * move the setting of lo->lo_state to Lo_unbound after partscan has finished 
as well
(suggested by Jan Kara)

Changed since v2:
  * Put the code inline in __loop_clr_fd() but not introduce new function
(suggested by Jan Kara)

 drivers/block/loop.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7908673..7ec3297 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1089,16 +1089,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
mapping_set_gfp_mask(filp->f_mapping, gfp);
-   lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
 
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number;
-   lo->lo_flags = 0;
-   if (!part_shift)
-   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
 out_unlock:
	mutex_unlock(&loop_ctl_mutex);
@@ -1121,6 +1117,23 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
/* Device is gone, no point in returning error */
err = 0;
}
+
+   /*
+* lo->lo_state is set to Lo_unbound here after above partscan has
+* finished.
+*
+* There cannot be anybody else entering __loop_clr_fd() as
+* lo->lo_backing_file is already cleared and Lo_rundown state
+* protects us from all the other places trying to change the 'lo'
+* device.
+*/
+   mutex_lock(&loop_ctl_mutex);
+   lo->lo_flags = 0;
+   if (!part_shift)
+   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+   lo->lo_state = Lo_unbound;
+   mutex_unlock(&loop_ctl_mutex);
+
/*
 * Need not hold loop_ctl_mutex to fput backing file.
 * Calling fput holding loop_ctl_mutex triggers a circular
-- 
2.7.4



[PATCH v3 1/2] loop: do not print warn message if partition scan is successful

2019-02-22 Thread Dongli Zhang
Do not print warn message when the partition scan returns 0.

Fixes: d57f3374ba48 ("loop: Move special partition reread handling in 
loop_clr_fd()")
Signed-off-by: Dongli Zhang 
Reviewed-by: Jan Kara 
---
 drivers/block/loop.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf55389..7908673 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1115,8 +1115,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
err = __blkdev_reread_part(bdev);
else
err = blkdev_reread_part(bdev);
-   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
-   __func__, lo_number, err);
+   if (err)
+   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+   __func__, lo_number, err);
/* Device is gone, no point in returning error */
err = 0;
}
-- 
2.7.4



Re: [PATCH v2 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

2019-02-22 Thread Dongli Zhang



On 02/22/2019 07:47 PM, Jan Kara wrote:
> On Thu 21-02-19 23:33:43, Dongli Zhang wrote:
>> Commit 0da03cab87e6
>> ("loop: Fix deadlock when calling blkdev_reread_part()") moves
>> blkdev_reread_part() out of the loop_ctl_mutex. However,
>> GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result,
>> __blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and
>> will not rescan the loop device to delete all partitions.
>>
>> Below are steps to reproduce the issue:
>>
>> step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100
>> step2 # losetup -P /dev/loop0 tmp.raw
>> step3 # parted /dev/loop0 mklabel gpt
>> step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1
>> step5 # losetup -d /dev/loop0
>>
>> Step5 will not be able to delete /dev/loop0p1 (introduced by step4) and
>> there is below kernel warning message:
>>
>> [  464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22)
>>
>> This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part().
>>
>> Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()")
>> Signed-off-by: Dongli Zhang 
>> ---
>> Changed since v1:
>>   * move the setting of lo->lo_state to Lo_unbound after partscan has 
>> finished as well
>> (suggested by Jan Kara)
>>
>>  drivers/block/loop.c | 26 ++
>>  1 file changed, 22 insertions(+), 4 deletions(-)
> 
> Thanks the patch looks good! Just one nit below:
> 
>> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
>> index 7908673..a13f5dc 100644
>> --- a/drivers/block/loop.c
>> +++ b/drivers/block/loop.c
>> @@ -1034,6 +1034,16 @@ loop_init_xfer(struct loop_device *lo, struct 
>> loop_func_table *xfer,
>>  return err;
>>  }
>>  
>> +static void loop_disable_partscan(struct loop_device *lo)
>> +{
> 
> I don't think there's any benefit in having this small function with a single
> caller and furthermore with the subtle sideeffect that it changes lo_state.
> So I'd just put the code inline in __loop_clr_fd(). With that you can add:
> 
> Reviewed-by: Jan Kara 

Thank you very much!

I will send out v3 with the Reviewed-by and put the code inline in 
__loop_clr_fd().

Dongli Zhang

> 
>   Honza
> 


[PATCH v2 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

2019-02-21 Thread Dongli Zhang
Commit 0da03cab87e6
("loop: Fix deadlock when calling blkdev_reread_part()") moves
blkdev_reread_part() out of the loop_ctl_mutex. However,
GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result,
__blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and
will not rescan the loop device to delete all partitions.

Below are steps to reproduce the issue:

step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100
step2 # losetup -P /dev/loop0 tmp.raw
step3 # parted /dev/loop0 mklabel gpt
step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1
step5 # losetup -d /dev/loop0

Step5 will not be able to delete /dev/loop0p1 (created by step4), and
the kernel warning message below is printed:

[  464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22)

This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part().

Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()")
Signed-off-by: Dongli Zhang 
---
Changed since v1:
  * move the setting of lo->lo_state to Lo_unbound after partscan has finished 
as well
(suggested by Jan Kara)

 drivers/block/loop.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7908673..a13f5dc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1034,6 +1034,16 @@ loop_init_xfer(struct loop_device *lo, struct 
loop_func_table *xfer,
return err;
 }
 
+static void loop_disable_partscan(struct loop_device *lo)
+{
+   mutex_lock(&loop_ctl_mutex);
+   lo->lo_flags = 0;
+   if (!part_shift)
+   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+   lo->lo_state = Lo_unbound;
+   mutex_unlock(&loop_ctl_mutex);
+}
+
 static int __loop_clr_fd(struct loop_device *lo, bool release)
 {
struct file *filp = NULL;
@@ -1089,16 +1099,12 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
	kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
}
mapping_set_gfp_mask(filp->f_mapping, gfp);
-   lo->lo_state = Lo_unbound;
/* This is safe: open() is still holding a reference. */
module_put(THIS_MODULE);
blk_mq_unfreeze_queue(lo->lo_queue);
 
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number;
-   lo->lo_flags = 0;
-   if (!part_shift)
-   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
 out_unlock:
	mutex_unlock(&loop_ctl_mutex);
@@ -1121,6 +1127,18 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
/* Device is gone, no point in returning error */
err = 0;
}
+
+   /*
+* lo->lo_state is set to Lo_unbound inside loop_disable_partscan()
+* here after above partscan has finished.
+*
+* There cannot be anybody else entering __loop_clr_fd() as
+* lo->lo_backing_file is already cleared and Lo_rundown state
+* protects us from all the other places trying to change the 'lo'
+* device.
+*/
+   loop_disable_partscan(lo);
+
/*
 * Need not hold loop_ctl_mutex to fput backing file.
 * Calling fput holding loop_ctl_mutex triggers a circular
-- 
2.7.4



[PATCH v2 0/2] loop: fix two issues introduced by prior commit

2019-02-21 Thread Dongli Zhang
This patch set fixes two issues introduced by prior commits.


[PATCH v2 1/2] loop: do not print warn message if partition scan is successful

[PATCH v2 1/2] fixes d57f3374ba48 ("loop: Move special partition reread
handling in loop_clr_fd()") so that the warn message is not printed when the
partition scan is successful.


[PATCH v2 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

[PATCH v2 2/2] fixes 0da03cab87e6 ("loop: Fix deadlock when calling
blkdev_reread_part()") so that GENHD_FL_NO_PART_SCAN is not set before the
partition scan when detaching the loop device from the file.

Changed since v1:
  * move the setting of lo->lo_state to Lo_unbound after partscan has finished 
as well
(suggested by Jan Kara)


Dongli Zhang



[PATCH v2 1/2] loop: do not print warn message if partition scan is successful

2019-02-21 Thread Dongli Zhang
Do not print warn message when the partition scan returns 0.

Fixes: d57f3374ba48 ("loop: Move special partition reread handling in 
loop_clr_fd()")
Signed-off-by: Dongli Zhang 
Reviewed-by: Jan Kara 
---
 drivers/block/loop.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf55389..7908673 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1115,8 +1115,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
err = __blkdev_reread_part(bdev);
else
err = blkdev_reread_part(bdev);
-   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
-   __func__, lo_number, err);
+   if (err)
+   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+   __func__, lo_number, err);
/* Device is gone, no point in returning error */
err = 0;
}
-- 
2.7.4



Re: [PATCH 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

2019-02-21 Thread Dongli Zhang



On 02/21/2019 07:30 PM, Jan Kara wrote:
> On Thu 21-02-19 12:17:35, Dongli Zhang wrote:
>> Commit 0da03cab87e6
>> ("loop: Fix deadlock when calling blkdev_reread_part()") moves
>> blkdev_reread_part() out of the loop_ctl_mutex. However,
>> GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result,
>> __blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and
>> will not rescan the loop device to delete all partitions.
>>
>> Below are steps to reproduce the issue:
>>
>> step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100
>> step2 # losetup -P /dev/loop0 tmp.raw
>> step3 # parted /dev/loop0 mklabel gpt
>> step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1
>> step5 # losetup -d /dev/loop0
> 
> Can you perhaps write a blktest for this? Thanks!

I will write a blktest for the above case. Thanks for the suggestion.

> 
>> Step5 will not be able to delete /dev/loop0p1 (introduced by step4) and
>> there is below kernel warning message:
>>
>> [  464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22)
>>
>> This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part().
>>
>> Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()")
>> Signed-off-by: Dongli Zhang 
>> ---
>>  drivers/block/loop.c | 15 ---
>>  1 file changed, 12 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
>> index 7908673..736e55b 100644
>> --- a/drivers/block/loop.c
>> +++ b/drivers/block/loop.c
>> @@ -1034,6 +1034,15 @@ loop_init_xfer(struct loop_device *lo, struct 
>> loop_func_table *xfer,
>>  return err;
>>  }
>>  
>> +static void loop_disable_partscan(struct loop_device *lo)
>> +{
>> +mutex_lock(_ctl_mutex);
>> +lo->lo_flags = 0;
>> +if (!part_shift)
>> +lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
>> +mutex_unlock(_ctl_mutex);
>> +}
>> +
>>  static int __loop_clr_fd(struct loop_device *lo, bool release)
>>  {
>>  struct file *filp = NULL;
>> @@ -1096,9 +1105,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
>> release)
>>  
>>  partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
>>  lo_number = lo->lo_number;
>> -lo->lo_flags = 0;
>> -if (!part_shift)
>> -lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
>>  loop_unprepare_queue(lo);
>>  out_unlock:
>>  mutex_unlock(_ctl_mutex);
>> @@ -1121,6 +1127,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
>> release)
>>  /* Device is gone, no point in returning error */
>>  err = 0;
>>  }
>> +
>> +loop_disable_partscan(lo);
>> +
>>  /*
>>   * Need not hold loop_ctl_mutex to fput backing file.
>>   * Calling fput holding loop_ctl_mutex triggers a circular
> 
> So I don't think this change is actually correct. The problem is that once
> lo->lo_state is set to Lo_unbound and loop_ctl_mutex is unlocked, the loop
> device structure can be reused for a new device (bound to a new file). So
> you cannot safely manipulate flags on lo->lo_disk anymore. But I think we
> can just move the setting of lo->lo_state to Lo_unbound after partscan has
> finished as well. There cannot be anybody else entering __loop_clr_fd() as
> lo->lo_backing_file is already cleared and Lo_rundown state protects us
> from all the other places trying to change the 'lo' device (please make
> this last sentence into a comment in the code explaining why setting
> lo->lo_state so late is fine). Thanks!

I will set lo->lo_state to Lo_unbound after partscan in v2.

Thank you very much!

Dongli Zhang

> 
>   Honza
> 


[PATCH 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

2019-02-20 Thread Dongli Zhang
Commit 0da03cab87e6
("loop: Fix deadlock when calling blkdev_reread_part()") moves
blkdev_reread_part() out of the loop_ctl_mutex. However,
GENHD_FL_NO_PART_SCAN is set before __blkdev_reread_part(). As a result,
__blkdev_reread_part() will fail the check of GENHD_FL_NO_PART_SCAN and
will not rescan the loop device to delete all partitions.

Below are steps to reproduce the issue:

step1 # dd if=/dev/zero of=tmp.raw bs=1M count=100
step2 # losetup -P /dev/loop0 tmp.raw
step3 # parted /dev/loop0 mklabel gpt
step4 # parted -a none -s /dev/loop0 mkpart primary 64s 1
step5 # losetup -d /dev/loop0

Step5 will not be able to delete /dev/loop0p1 (created by step4), and
the kernel warning message below is printed:

[  464.414043] __loop_clr_fd: partition scan of loop0 failed (rc=-22)

This patch sets GENHD_FL_NO_PART_SCAN after blkdev_reread_part().

Fixes: 0da03cab87e6 ("loop: Fix deadlock when calling blkdev_reread_part()")
Signed-off-by: Dongli Zhang 
---
 drivers/block/loop.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 7908673..736e55b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1034,6 +1034,15 @@ loop_init_xfer(struct loop_device *lo, struct 
loop_func_table *xfer,
return err;
 }
 
+static void loop_disable_partscan(struct loop_device *lo)
+{
+   mutex_lock(&loop_ctl_mutex);
+   lo->lo_flags = 0;
+   if (!part_shift)
+   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+   mutex_unlock(&loop_ctl_mutex);
+}
+
 static int __loop_clr_fd(struct loop_device *lo, bool release)
 {
struct file *filp = NULL;
@@ -1096,9 +1105,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
 
partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
lo_number = lo->lo_number;
-   lo->lo_flags = 0;
-   if (!part_shift)
-   lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
loop_unprepare_queue(lo);
 out_unlock:
	mutex_unlock(&loop_ctl_mutex);
@@ -1121,6 +1127,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
/* Device is gone, no point in returning error */
err = 0;
}
+
+   loop_disable_partscan(lo);
+
/*
 * Need not hold loop_ctl_mutex to fput backing file.
 * Calling fput holding loop_ctl_mutex triggers a circular
-- 
2.7.4



[PATCH 0/2] loop: fix two issues introduced by prior commit

2019-02-20 Thread Dongli Zhang
This patch set fixes two issues introduced by prior commits.


[PATCH 1/2] loop: do not print warn message if partition scan is successful

[PATCH 1/2] fixes d57f3374ba48 ("loop: Move special partition reread
handling in loop_clr_fd()") so that the warn message is not printed when the
partition scan is successful.

[PATCH 2/2] loop: set GENHD_FL_NO_PART_SCAN after blkdev_reread_part()

[PATCH 2/2] fixes 0da03cab87e6 ("loop: Fix deadlock when calling
blkdev_reread_part()") so that GENHD_FL_NO_PART_SCAN is not set before the
partition scan when detaching the loop device from the file.

Thank you very much!

Dongli Zhang



[PATCH 1/2] loop: do not print warn message if partition scan is successful

2019-02-20 Thread Dongli Zhang
Do not print warn message when the partition scan returns 0.

Fixes: d57f3374ba48 ("loop: Move special partition reread handling in 
loop_clr_fd()")
Signed-off-by: Dongli Zhang 
---
 drivers/block/loop.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cf55389..7908673 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1115,8 +1115,9 @@ static int __loop_clr_fd(struct loop_device *lo, bool 
release)
err = __blkdev_reread_part(bdev);
else
err = blkdev_reread_part(bdev);
-   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
-   __func__, lo_number, err);
+   if (err)
+   pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+   __func__, lo_number, err);
/* Device is gone, no point in returning error */
err = 0;
}
-- 
2.7.4



Re: [Xen-devel] [PATCH v6 2/2] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront

2019-02-18 Thread Dongli Zhang
Hi Konrad,

On 1/17/19 11:29 PM, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 15, 2019 at 09:20:36AM +0100, Roger Pau Monné wrote:
>> On Tue, Jan 15, 2019 at 12:41:44AM +0800, Dongli Zhang wrote:
>>> The xenstore 'ring-page-order' is used globally for each blkback queue and
>>> therefore should be read from xenstore only once. However, it is obtained
>>> in read_per_ring_refs() which might be called multiple times during the
>>> initialization of each blkback queue.
>>>
>>> If the blkfront is malicious and the 'ring-page-order' is set in different
>>> value by blkfront every time before blkback reads it, this may end up at
>>> the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in
>>> xen_blkif_disconnect() when frontend is destroyed.
>>>
>>> This patch reworks connect_ring() to read xenstore 'ring-page-order' only
>>> once.
>>>
>>> Signed-off-by: Dongli Zhang 
>>
>> LGTM:
>>
>> Reviewed-by: Roger Pau Monné 
> 
> Applied.
> 
> Will push out to Jens in a couple of days. Thank you!
>>
>> Thanks!

I only see PATCH 1/2 (xen/blkback: add stack variable 'blkif' in
connect_ring()) at the top of the branch below for Jens; would you please apply this
one (PATCH 2/2) as well?

https://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen.git/log/?h=linux-next

1/2 is only a prerequisite for 2/2.

Thank you very much!

Dongli Zhang


[PATCH v3 1/3] swiotlb: fix comment on swiotlb_bounce()

2019-01-17 Thread Dongli Zhang
Fix the comment: swiotlb_bounce() is used to copy from the original dma
location to the swiotlb buffer during swiotlb_tbl_map_single(), and to
copy from the swiotlb buffer back to the original dma location during
swiotlb_tbl_unmap_single().

Signed-off-by: Dongli Zhang 
---
 kernel/dma/swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1fb6fd6..1d8b377 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -385,7 +385,7 @@ void __init swiotlb_exit(void)
 }
 
 /*
- * Bounce: copy the swiotlb buffer back to the original dma location
+ * Bounce: copy the swiotlb buffer from or back to the original dma location
  */
 static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
   size_t size, enum dma_data_direction dir)
-- 
2.7.4



[PATCH v3 2/3] swiotlb: add debugfs to track swiotlb buffer usage

2019-01-17 Thread Dongli Zhang
The device driver will not be able to do dma operations once the swiotlb buffer
is full, either because the driver keeps too many IO TLB blocks inflight,
or because there is a memory leak in the device driver. Exporting the
swiotlb buffer usage via debugfs helps the user estimate the size of the
swiotlb buffer to pre-allocate, or analyze a device driver memory leak.

Signed-off-by: Dongli Zhang 
---
Changed since v1:
  * init debugfs with late_initcall (suggested by Robin Murphy)
  * create debugfs entries with debugfs_create_ulong(suggested by Robin Murphy)

Changed since v2:
  * some #ifdef folding (suggested by Konrad Rzeszutek Wilk)

 kernel/dma/swiotlb.c | 44 
 1 file changed, 44 insertions(+)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 1d8b377..bedc9f9 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -34,6 +34,9 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_DEBUG_FS
+#include 
+#endif
 
 #include 
 #include 
@@ -73,6 +76,11 @@ phys_addr_t io_tlb_start, io_tlb_end;
 static unsigned long io_tlb_nslabs;
 
 /*
+ * The number of used IO TLB block
+ */
+static unsigned long io_tlb_used;
+
+/*
  * This is a free list describing the number of free entries available from
  * each index
  */
@@ -524,6 +532,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", 
size);
return DMA_MAPPING_ERROR;
 found:
+   io_tlb_used += nslots;
	spin_unlock_irqrestore(&io_tlb_lock, flags);
 
/*
@@ -584,6 +593,8 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, 
phys_addr_t tlb_addr,
 */
for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != 
IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--)
io_tlb_list[i] = ++count;
+
+   io_tlb_used -= nslots;
}
	spin_unlock_irqrestore(&io_tlb_lock, flags);
 }
@@ -662,3 +673,36 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
 {
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
+
+#ifdef CONFIG_DEBUG_FS
+
+static int __init swiotlb_create_debugfs(void)
+{
+   static struct dentry *d_swiotlb_usage;
+   struct dentry *ent;
+
+   d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL);
+
+   if (!d_swiotlb_usage)
+   return -ENOMEM;
+
+   ent = debugfs_create_ulong("io_tlb_nslabs", 0400,
+  d_swiotlb_usage, &io_tlb_nslabs);
+   if (!ent)
+   goto fail;
+
+   ent = debugfs_create_ulong("io_tlb_used", 0400,
+  d_swiotlb_usage, &io_tlb_used);
+   if (!ent)
+   goto fail;
+
+   return 0;
+
+fail:
+   debugfs_remove_recursive(d_swiotlb_usage);
+   return -ENOMEM;
+}
+
+late_initcall(swiotlb_create_debugfs);
+
+#endif
-- 
2.7.4

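As a usage illustration for the counters exported by the patch above, here is a minimal
userspace sketch (assuming debugfs is mounted at the conventional /sys/kernel/debug; the
directory and file names come from the patch):

#include <stdio.h>

static unsigned long read_counter(const char *path)
{
	unsigned long val = 0;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%lu", &val) != 1)
			val = 0;
		fclose(f);
	}
	return val;
}

int main(void)
{
	unsigned long nslabs = read_counter("/sys/kernel/debug/swiotlb/io_tlb_nslabs");
	unsigned long used = read_counter("/sys/kernel/debug/swiotlb/io_tlb_used");

	printf("swiotlb: %lu of %lu slabs in use\n", used, nslabs);
	return 0;
}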


[PATCH v3 3/3] swiotlb: checking whether swiotlb buffer is full with io_tlb_used

2019-01-17 Thread Dongli Zhang
This patch uses io_tlb_used to help check whether the swiotlb buffer is full.
io_tlb_used is no longer used only for debugfs. It is also used to help
optimize swiotlb_tbl_map_single().

Suggested-by: Joe Jin 
Signed-off-by: Dongli Zhang 
---
Changed since v2:
  * move #ifdef folding to previous patch (suggested by Konrad Rzeszutek Wilk)

 kernel/dma/swiotlb.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index bedc9f9..a01b83e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -483,6 +483,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
 * request and allocate a buffer from that IO TLB pool.
 */
	spin_lock_irqsave(&io_tlb_lock, flags);
+
+   if (unlikely(nslots > io_tlb_nslabs - io_tlb_used))
+   goto not_found;
+
index = ALIGN(io_tlb_index, stride);
if (index >= io_tlb_nslabs)
index = 0;
-- 
2.7.4



Re: [Xen-devel] [PATCH 1/1] xen-blkback: do not wake up shutdown_wq after xen_blkif_schedule() is stopped

2019-01-17 Thread Dongli Zhang
Hi Roger,

On 01/17/2019 04:20 PM, Roger Pau Monné wrote:
> On Thu, Jan 17, 2019 at 10:10:00AM +0800, Dongli Zhang wrote:
>> Hi Roger,
>>
>> On 2019/1/16 下午10:52, Roger Pau Monné wrote:
>>> On Wed, Jan 16, 2019 at 09:47:41PM +0800, Dongli Zhang wrote:
>>>> There is no need to wake up xen_blkif_schedule() as kthread_stop() is able
>>>> to already wake up the kernel thread.
>>>>
>>>> Signed-off-by: Dongli Zhang 
>>>> ---
>>>>  drivers/block/xen-blkback/xenbus.c | 4 +---
>>>>  1 file changed, 1 insertion(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/block/xen-blkback/xenbus.c 
>>>> b/drivers/block/xen-blkback/xenbus.c
>>>> index a4bc74e..37ac59e 100644
>>>> --- a/drivers/block/xen-blkback/xenbus.c
>>>> +++ b/drivers/block/xen-blkback/xenbus.c
>>>> @@ -254,10 +254,8 @@ static int xen_blkif_disconnect(struct xen_blkif 
>>>> *blkif)
>>>>if (!ring->active)
>>>>continue;
>>>>  
>>>> -  if (ring->xenblkd) {
>>>> +  if (ring->xenblkd)
>>>>kthread_stop(ring->xenblkd);
>>>> -  wake_up(>shutdown_wq);
>>>
>>> I've now realized that shutdown_wq is basically useless, and should be
>>> removed, could you please do it in this patch?
>>
>> I do not think shutdown_wq is useless.
>>
>> It is used to halt the xen_blkif_schedule() kthread when
>> RING_REQUEST_PROD_OVERFLOW() returns true in __do_block_io_op():
>>
>> 1121 static int
>> 1122 __do_block_io_op(struct xen_blkif_ring *ring)
>> ... ...
>> 1134 if (RING_REQUEST_PROD_OVERFLOW(_rings->common, rp)) {
>> 1135 rc = blk_rings->common.rsp_prod_pvt;
>> 1136 pr_warn("Frontend provided bogus ring requests (%d - %d 
>> =
>> %d). Halting ring processing on dev=%04x\n",
>> 1137 rp, rc, rp - rc, ring->blkif->vbd.pdevice);
>> 1138 return -EACCES;
>> 1139 }
>>
>>
>> If there is bogus/invalid ring requests, __do_block_io_op() would return 
>> -EACCES
>> without modifying prod/cons index.
>>
>> Without shutdown_wq (just simply assuming we remove the below code without
>> handling -EACCES in xen_blkif_schedule()), the kernel thread would continue 
>> the
>> while loop.
>>
>> 648 if (ret == -EACCES)
>> 649 wait_event_interruptible(ring->shutdown_wq,
>> 650  kthread_should_stop());
>>
>>
>> If xen_blkif_be_int() is triggered again (let's assume there is no 
>> optimization
>> on guest part and guest would send event for every request it puts on ring
>> buffer), we may come to do_block_io_op() again.
>>
>>
>> As the prod/cons index are not modified last time the code runs into
>> do_block_io_op() to process bogus request, we would hit the bogus request 
>> issue
>> again.
>>
>>
>> With shutdown_wq, the kernel kthread is blocked forever until such 
>> queue/ring is
>> destroyed. If we remove shutdown_wq, we are changing the policy to handle 
>> bogus
>> requests on ring buffer?
> 
> AFAICT the only wakeup call to shutdown_wq is removed in this patch,
> hence waiting on it seems useless. I would replace the
> wait_event_interruptible call in xen_blkif_schedule with a break, so
> that the kthread ends as soon as a bogus request is found. I think
> there's no point in waiting for xen_blkif_disconnect to stop the
> kthread.
> 
> Thanks, Roger.
> 

My fault, shutdown_wq itself is indeed useless. I meant to say that
wait_event_interruptible() is useful, as it is used to halt the kthread when
there is a bogus request.

It is fine to replace wait_event_interruptible() with a break to exit immediately
when there is a bogus request.

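For illustration, the change being agreed on here amounts to something like the
following in xen_blkif_schedule() (a rough sketch based only on the lines quoted in
this thread, not necessarily the patch as eventually posted):

	if (ret == -EACCES)
		break;	/* bogus ring: stop the kthread instead of parking on shutdown_wq */
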
Thank you very much!

Dongli Zhang


Re: [Xen-devel] [PATCH 1/1] xen-blkback: do not wake up shutdown_wq after xen_blkif_schedule() is stopped

2019-01-16 Thread Dongli Zhang
Hi Roger,

On 2019/1/16 下午10:52, Roger Pau Monné wrote:
> On Wed, Jan 16, 2019 at 09:47:41PM +0800, Dongli Zhang wrote:
>> There is no need to wake up xen_blkif_schedule() as kthread_stop() is able
>> to already wake up the kernel thread.
>>
>> Signed-off-by: Dongli Zhang 
>> ---
>>  drivers/block/xen-blkback/xenbus.c | 4 +---
>>  1 file changed, 1 insertion(+), 3 deletions(-)
>>
>> diff --git a/drivers/block/xen-blkback/xenbus.c 
>> b/drivers/block/xen-blkback/xenbus.c
>> index a4bc74e..37ac59e 100644
>> --- a/drivers/block/xen-blkback/xenbus.c
>> +++ b/drivers/block/xen-blkback/xenbus.c
>> @@ -254,10 +254,8 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
>>  if (!ring->active)
>>  continue;
>>  
>> -if (ring->xenblkd) {
>> +if (ring->xenblkd)
>>  kthread_stop(ring->xenblkd);
>> -wake_up(>shutdown_wq);
> 
> I've now realized that shutdown_wq is basically useless, and should be
> removed, could you please do it in this patch?

I do not think shutdown_wq is useless.

It is used to halt the xen_blkif_schedule() kthread when
RING_REQUEST_PROD_OVERFLOW() returns true in __do_block_io_op():

1121 static int
1122 __do_block_io_op(struct xen_blkif_ring *ring)
... ...
1134 if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
1135 rc = blk_rings->common.rsp_prod_pvt;
1136 pr_warn("Frontend provided bogus ring requests (%d - %d =
%d). Halting ring processing on dev=%04x\n",
1137 rp, rc, rp - rc, ring->blkif->vbd.pdevice);
1138 return -EACCES;
1139 }


If there are bogus/invalid ring requests, __do_block_io_op() returns -EACCES
without modifying the prod/cons index.

Without shutdown_wq (simply assuming we remove the code below without
handling -EACCES in xen_blkif_schedule()), the kernel thread would continue the
while loop.

648 if (ret == -EACCES)
649 wait_event_interruptible(ring->shutdown_wq,
650  kthread_should_stop());


If xen_blkif_be_int() is triggered again (let's assume there is no optimization
on the guest side and the guest sends an event for every request it puts on the ring
buffer), we may come to do_block_io_op() again.


As the prod/cons index was not modified the last time the code ran into
do_block_io_op() to process the bogus request, we would hit the bogus request issue
again.


With shutdown_wq, the kernel thread is blocked until the queue/ring is
destroyed. If we remove shutdown_wq, are we changing the policy for handling bogus
requests on the ring buffer?

Please correct me if my understanding is wrong.

Thank you very much!

Dongli Zhang


Re: [Xen-devel] [PATCH v5 2/2] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront

2019-01-16 Thread Dongli Zhang



On 2019/1/17 上午12:32, Konrad Rzeszutek Wilk wrote:
> On Tue, Jan 08, 2019 at 04:24:32PM +0800, Dongli Zhang wrote:
>> oops. Please ignore this v5 patch.
>>
>> I just realized Linus suggested in an old email not use BUG()/BUG_ON() in 
>> the code.
>>
>> I will switch to the WARN() solution and resend again.
> 
> OK. Did I miss it?

Hi Konrad,

If you are talking about the new patch set:

https://lists.xenproject.org/archives/html/xen-devel/2019-01/msg00977.html
https://lists.xenproject.org/archives/html/xen-devel/2019-01/msg00978.html

About Linus' suggestion:

http://lkml.iu.edu/hypermail/linux/kernel/1610.0/00878.html

Dongli Zhang

> 
> ___
> Xen-devel mailing list
> xen-de...@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
> 


[PATCH 1/1] xen-blkback: do not wake up shutdown_wq after xen_blkif_schedule() is stopped

2019-01-16 Thread Dongli Zhang
There is no need to wake up xen_blkif_schedule(), as kthread_stop() already
wakes up the kernel thread.

Signed-off-by: Dongli Zhang 
---
 drivers/block/xen-blkback/xenbus.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index a4bc74e..37ac59e 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -254,10 +254,8 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
if (!ring->active)
continue;
 
-   if (ring->xenblkd) {
+   if (ring->xenblkd)
kthread_stop(ring->xenblkd);
-   wake_up(&ring->shutdown_wq);
-   }
 
/* The above kthread_stop() guarantees that at this point we
 * don't have any discard_io or other_io requests. So, checking
-- 
2.7.4



[PATCH v6 2/2] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront

2019-01-14 Thread Dongli Zhang
The xenstore 'ring-page-order' is used globally for each blkback queue and
therefore should be read from xenstore only once. However, it is obtained
in read_per_ring_refs() which might be called multiple times during the
initialization of each blkback queue.

If the blkfront is malicious and 'ring-page-order' is set to a different
value by blkfront every time before blkback reads it, this may end up hitting
the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in
xen_blkif_disconnect() when the frontend is destroyed.

This patch reworks connect_ring() to read xenstore 'ring-page-order' only
once.

Signed-off-by: Dongli Zhang 
---
Changed since v1:
  * change the order of xenstore read in read_per_ring_refs
  * use xenbus_read_unsigned() in connect_ring()

Changed since v2:
  * simplify the condition check as "(err != 1 && nr_grefs > 1)"
  * avoid setting err as -EINVAL to remove extra one line of code

Changed since v3:
  * exit at the beginning if !nr_grefs
  * change the if statements to avoid test (err != 1) twice
  * initialize a 'blkif' stack variable (refer to PATCH 1/2)

Changed since v4:
  * use BUG_ON() when (nr_grefs == 0) to reminder the developer
  * set err = -EINVAL before xenbus_dev_fatal()

Changed since v5:
  * use WARN_ON() instead of BUG_ON()

 drivers/block/xen-blkback/xenbus.c | 72 +++---
 1 file changed, 43 insertions(+), 29 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index a4aadac..0878e55 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring *ring, 
const char *dir)
int err, i, j;
struct xen_blkif *blkif = ring->blkif;
struct xenbus_device *dev = blkif->be->dev;
-   unsigned int ring_page_order, nr_grefs, evtchn;
+   unsigned int nr_grefs, evtchn;
 
err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
  &evtchn);
@@ -936,43 +936,42 @@ static int read_per_ring_refs(struct xen_blkif_ring 
*ring, const char *dir)
return err;
}
 
-   err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
- &ring_page_order);
-   if (err != 1) {
-   err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", 
&ring_ref[0]);
+   nr_grefs = blkif->nr_ring_pages;
+
+   if (unlikely(!nr_grefs)) {
+   WARN_ON(true);
+   return -EINVAL;
+   }
+
+   for (i = 0; i < nr_grefs; i++) {
+   char ring_ref_name[RINGREF_NAME_LEN];
+
+   snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+   err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
+  "%u", &ring_ref[i]);
+
if (err != 1) {
+   if (nr_grefs == 1)
+   break;
+
err = -EINVAL;
-   xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
+   xenbus_dev_fatal(dev, err, "reading %s/%s",
+dir, ring_ref_name);
return err;
}
-   nr_grefs = 1;
-   } else {
-   unsigned int i;
+   }
+
+   if (err != 1) {
+   WARN_ON(nr_grefs != 1);
 
-   if (ring_page_order > xen_blkif_max_ring_order) {
+   err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u",
+  &ring_ref[0]);
+   if (err != 1) {
err = -EINVAL;
-   xenbus_dev_fatal(dev, err, "%s/request %d ring page 
order exceed max:%d",
-dir, ring_page_order,
-xen_blkif_max_ring_order);
+   xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
return err;
}
-
-   nr_grefs = 1 << ring_page_order;
-   for (i = 0; i < nr_grefs; i++) {
-   char ring_ref_name[RINGREF_NAME_LEN];
-
-   snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", 
i);
-   err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
-  "%u", &ring_ref[i]);
-   if (err != 1) {
-   err = -EINVAL;
-   xenbus_dev_fatal(dev, err, "reading %s/%s",
-dir, ring_ref_name);
-   return err;
-   }
-   }
}
-   blkif->nr_ring_pag

[PATCH v6 1/2] xen/blkback: add stack variable 'blkif' in connect_ring()

2019-01-14 Thread Dongli Zhang
As 'be->blkif' is used many times in connect_ring(), the stack variable
'blkif' is added to substitute for 'be->blkif'.

Suggested-by: Paul Durrant 
Signed-off-by: Dongli Zhang 
Reviewed-by: Paul Durrant 
Reviewed-by: Roger Pau Monné 
---
 drivers/block/xen-blkback/xenbus.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index a4bc74e..a4aadac 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -1023,6 +1023,7 @@ static int read_per_ring_refs(struct xen_blkif_ring 
*ring, const char *dir)
 static int connect_ring(struct backend_info *be)
 {
struct xenbus_device *dev = be->dev;
+   struct xen_blkif *blkif = be->blkif;
unsigned int pers_grants;
char protocol[64] = "";
int err, i;
@@ -1033,25 +1034,25 @@ static int connect_ring(struct backend_info *be)
 
pr_debug("%s %s\n", __func__, dev->otherend);
 
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+   blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol",
   "%63s", protocol);
if (err <= 0)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+   blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+   blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+   blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
else {
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -ENOSYS;
}
pers_grants = xenbus_read_unsigned(dev->otherend, "feature-persistent",
   0);
-   be->blkif->vbd.feature_gnt_persistent = pers_grants;
-   be->blkif->vbd.overflow_max_grants = 0;
+   blkif->vbd.feature_gnt_persistent = pers_grants;
+   blkif->vbd.overflow_max_grants = 0;
 
/*
 * Read the number of hardware queues from frontend.
@@ -1067,16 +1068,16 @@ static int connect_ring(struct backend_info *be)
requested_num_queues, xenblk_max_queues);
return -ENOSYS;
}
-   be->blkif->nr_rings = requested_num_queues;
-   if (xen_blkif_alloc_rings(be->blkif))
+   blkif->nr_rings = requested_num_queues;
+   if (xen_blkif_alloc_rings(blkif))
return -ENOMEM;
 
pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
-be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+blkif->nr_rings, blkif->blk_protocol, protocol,
 pers_grants ? "persistent grants" : "");
 
-   if (be->blkif->nr_rings == 1)
-   return read_per_ring_refs(>blkif->rings[0], dev->otherend);
+   if (blkif->nr_rings == 1)
+   return read_per_ring_refs(>rings[0], dev->otherend);
else {
xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
xspath = kmalloc(xspathsize, GFP_KERNEL);
@@ -1085,10 +1086,10 @@ static int connect_ring(struct backend_info *be)
return -ENOMEM;
}
 
-   for (i = 0; i < be->blkif->nr_rings; i++) {
+   for (i = 0; i < blkif->nr_rings; i++) {
memset(xspath, 0, xspathsize);
snprintf(xspath, xspathsize, "%s/queue-%u", 
dev->otherend, i);
-   err = read_per_ring_refs(>blkif->rings[i], xspath);
+   err = read_per_ring_refs(>rings[i], xspath);
if (err) {
kfree(xspath);
return err;
-- 
2.7.4



Re: [Xen-devel] [PATCH v4 2/2] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront

2019-01-08 Thread Dongli Zhang
Hi Roger,

On 01/07/2019 11:27 PM, Roger Pau Monné wrote:
> On Mon, Jan 07, 2019 at 10:07:34PM +0800, Dongli Zhang wrote:
>>
>>
>> On 01/07/2019 10:05 PM, Dongli Zhang wrote:
>>>
>>>
>>> On 01/07/2019 08:01 PM, Roger Pau Monné wrote:
>>>> On Mon, Jan 07, 2019 at 01:35:59PM +0800, Dongli Zhang wrote:
>>>>> The xenstore 'ring-page-order' is used globally for each blkback queue and
>>>>> therefore should be read from xenstore only once. However, it is obtained
>>>>> in read_per_ring_refs() which might be called multiple times during the
>>>>> initialization of each blkback queue.
>>>>>
>>>>> If the blkfront is malicious and the 'ring-page-order' is set in different
>>>>> value by blkfront every time before blkback reads it, this may end up at
>>>>> the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in
>>>>> xen_blkif_disconnect() when frontend is destroyed.
>>>>>
>>>>> This patch reworks connect_ring() to read xenstore 'ring-page-order' only
>>>>> once.
>>>>>
>>>>> Signed-off-by: Dongli Zhang 
>>>>> ---
>>>>> Changed since v1:
>>>>>   * change the order of xenstore read in read_per_ring_refs
>>>>>   * use xenbus_read_unsigned() in connect_ring()
>>>>>
>>>>> Changed since v2:
>>>>>   * simplify the condition check as "(err != 1 && nr_grefs > 1)"
>>>>>   * avoid setting err as -EINVAL to remove extra one line of code
>>>>>
>>>>> Changed since v3:
>>>>>   * exit at the beginning if !nr_grefs
>>>>>   * change the if statements to avoid test (err != 1) twice
>>>>>   * initialize a 'blkif' stack variable (refer to PATCH 1/2)
>>>>>
>>>>>  drivers/block/xen-blkback/xenbus.c | 76 
>>>>> +-
>>>>>  1 file changed, 43 insertions(+), 33 deletions(-)
>>>>>
>>>>> diff --git a/drivers/block/xen-blkback/xenbus.c 
>>>>> b/drivers/block/xen-blkback/xenbus.c
>>>>> index a4aadac..a2acbc9 100644
>>>>> --- a/drivers/block/xen-blkback/xenbus.c
>>>>> +++ b/drivers/block/xen-blkback/xenbus.c
>>>>> @@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring 
>>>>> *ring, const char *dir)
>>>>>   int err, i, j;
>>>>>   struct xen_blkif *blkif = ring->blkif;
>>>>>   struct xenbus_device *dev = blkif->be->dev;
>>>>> - unsigned int ring_page_order, nr_grefs, evtchn;
>>>>> + unsigned int nr_grefs, evtchn;
>>>>>  
>>>>>   err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
>>>>> );
>>>>> @@ -936,43 +936,38 @@ static int read_per_ring_refs(struct xen_blkif_ring 
>>>>> *ring, const char *dir)
>>>>>   return err;
>>>>>   }
>>>>>  
>>>>> - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
>>>>> -   _page_order);
>>>>> - if (err != 1) {
>>>>> - err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", 
>>>>> _ref[0]);
>>>>> + nr_grefs = blkif->nr_ring_pages;
>>>>> +
>>>>> + if (unlikely(!nr_grefs))
>>>>> + return -EINVAL;
>>>>
>>>> Is this even possible? AFAICT read_per_ring_refs will always be called
>>>> with blkif->nr_ring_pages != 0?
>>>>
>>>> If so, I would consider turning this into a BUG_ON/WARN_ON.
>>>
>>> It used to be "WARN_ON(!nr_grefs);" in v3 of the patch.
>>>
>>> I would turn it into WARN_ON if it is fine with both Paul and you.
>>
>> To clarify, I would use WARN_ON() before exiting with -EINVAL (when
>> blkif->nr_ring_pages is 0).
> 
> Given that this function will never be called with nr_ring_pages == 0
> I would be fine with just using a BUG_ON, getting here with
> nr_ring_pages == 0 would imply memory corruption or some other severe
> issue has happened, and there's no possible recovery.
> 
> If you want to instead keep the return, please use plain WARN instead
> of WARN_ON.
> 
> Thanks, Roger.
> 

Is there any reason to use WARN() rather than WARN_ON()? Is it because of the
message printed by WARN(), something like below?

        if (unlikely(!nr_grefs)) {
                WARN(!nr_grefs,
                     "read_per_ring_refs() should be called with blkif->nr_ring_pages != 0");
                return -EINVAL;
        }

Thank you very much.

Dongli Zhang

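[Editorial sketch] For reference, a minimal illustration of the difference, not
taken from any of the patches: both WARN_ON() and WARN() evaluate and return
their condition, so either can drive the early exit, while WARN() additionally
accepts a printf-style message, which is presumably why it was suggested here.
Assuming the 'nr_grefs' local from read_per_ring_refs():

        /* Print a warning (message plus backtrace), then bail out gracefully. */
        if (WARN(!nr_grefs, "%s: called with nr_ring_pages == 0\n", __func__))
                return -EINVAL;
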

Re: [Xen-devel] [PATCH v5 2/2] xen/blkback: rework connect_ring() to avoid inconsistent xenstore 'ring-page-order' set by malicious blkfront

2019-01-08 Thread Dongli Zhang
Oops, please ignore this v5 patch.

I just realized Linus suggested in an old email not to use BUG()/BUG_ON() in
the code.

I will switch to the WARN() solution and resend again.

Sorry for the junk email.

Dongli Zhang

On 2019/1/8 4:15 PM, Dongli Zhang wrote:
> The xenstore 'ring-page-order' is used globally for each blkback queue and
> therefore should be read from xenstore only once. However, it is obtained
> in read_per_ring_refs() which might be called multiple times during the
> initialization of each blkback queue.
> 
> If the blkfront is malicious and the 'ring-page-order' is set in different
> value by blkfront every time before blkback reads it, this may end up at
> the "WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));" in
> xen_blkif_disconnect() when frontend is destroyed.
> 
> This patch reworks connect_ring() to read xenstore 'ring-page-order' only
> once.
> 
> Signed-off-by: Dongli Zhang 
> ---
> Changed since v1:
>   * change the order of xenstore read in read_per_ring_refs
>   * use xenbus_read_unsigned() in connect_ring()
> 
> Changed since v2:
>   * simplify the condition check as "(err != 1 && nr_grefs > 1)"
>   * avoid setting err as -EINVAL to remove extra one line of code
> 
> Changed since v3:
>   * exit at the beginning if !nr_grefs
>   * change the if statements to avoid test (err != 1) twice
>   * initialize a 'blkif' stack variable (refer to PATCH 1/2)
> 
> Changed since v4:
>   * use BUG_ON() when (nr_grefs == 0) to remind the developer
>   * set err = -EINVAL before xenbus_dev_fatal()
> 
>  drivers/block/xen-blkback/xenbus.c | 69 
> ++
>  1 file changed, 40 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/block/xen-blkback/xenbus.c 
> b/drivers/block/xen-blkback/xenbus.c
> index a4aadac..f6146cd 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -926,7 +926,7 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   int err, i, j;
>   struct xen_blkif *blkif = ring->blkif;
>   struct xenbus_device *dev = blkif->be->dev;
> - unsigned int ring_page_order, nr_grefs, evtchn;
> + unsigned int nr_grefs, evtchn;
>  
>   err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
> );
> @@ -936,43 +936,39 @@ static int read_per_ring_refs(struct xen_blkif_ring 
> *ring, const char *dir)
>   return err;
>   }
>  
> - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
> -   _page_order);
> - if (err != 1) {
> - err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", 
> _ref[0]);
> + nr_grefs = blkif->nr_ring_pages;
> +
> + BUG_ON(!nr_grefs);
> +
> + for (i = 0; i < nr_grefs; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
> +
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
> +"%u", _ref[i]);
> +
>   if (err != 1) {
> + if (nr_grefs == 1)
> + break;
> +
>   err = -EINVAL;
> - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
> + xenbus_dev_fatal(dev, err, "reading %s/%s",
> +  dir, ring_ref_name);
>   return err;
>   }
> - nr_grefs = 1;
> - } else {
> - unsigned int i;
> + }
>  
> - if (ring_page_order > xen_blkif_max_ring_order) {
> + if (err != 1) {
> + WARN_ON(nr_grefs != 1);
> +
> + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u",
> +_ref[0]);
> + if (err != 1) {
>   err = -EINVAL;
> - xenbus_dev_fatal(dev, err, "%s/request %d ring page 
> order exceed max:%d",
> -  dir, ring_page_order,
> -  xen_blkif_max_ring_order);
> + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir);
>   return err;
>   }
> -
> - nr_grefs = 1 << ring_page_order;
> - for (i = 0; i < nr_grefs; i++) {
> - char ring_ref_name[RINGREF_NAME_LEN];
> -
> - snprintf(r

[PATCH v5 1/2] xen/blkback: add stack variable 'blkif' in connect_ring()

2019-01-08 Thread Dongli Zhang
As 'be->blkif' is used many times in connect_ring(), the stack variable
'blkif' is added to substitute for 'be->blkif'.

Suggested-by: Paul Durrant 
Signed-off-by: Dongli Zhang 
Reviewed-by: Paul Durrant 
Reviewed-by: Roger Pau Monné 
---
 drivers/block/xen-blkback/xenbus.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/block/xen-blkback/xenbus.c 
b/drivers/block/xen-blkback/xenbus.c
index a4bc74e..a4aadac 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -1023,6 +1023,7 @@ static int read_per_ring_refs(struct xen_blkif_ring 
*ring, const char *dir)
 static int connect_ring(struct backend_info *be)
 {
struct xenbus_device *dev = be->dev;
+   struct xen_blkif *blkif = be->blkif;
unsigned int pers_grants;
char protocol[64] = "";
int err, i;
@@ -1033,25 +1034,25 @@ static int connect_ring(struct backend_info *be)
 
pr_debug("%s %s\n", __func__, dev->otherend);
 
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+   blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol",
   "%63s", protocol);
if (err <= 0)
strcpy(protocol, "unspecified, assuming default");
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+   blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+   blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
-   be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+   blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
else {
xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
return -ENOSYS;
}
pers_grants = xenbus_read_unsigned(dev->otherend, "feature-persistent",
   0);
-   be->blkif->vbd.feature_gnt_persistent = pers_grants;
-   be->blkif->vbd.overflow_max_grants = 0;
+   blkif->vbd.feature_gnt_persistent = pers_grants;
+   blkif->vbd.overflow_max_grants = 0;
 
/*
 * Read the number of hardware queues from frontend.
@@ -1067,16 +1068,16 @@ static int connect_ring(struct backend_info *be)
requested_num_queues, xenblk_max_queues);
return -ENOSYS;
}
-   be->blkif->nr_rings = requested_num_queues;
-   if (xen_blkif_alloc_rings(be->blkif))
+   blkif->nr_rings = requested_num_queues;
+   if (xen_blkif_alloc_rings(blkif))
return -ENOMEM;
 
pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
-be->blkif->nr_rings, be->blkif->blk_protocol, protocol,
+blkif->nr_rings, blkif->blk_protocol, protocol,
 pers_grants ? "persistent grants" : "");
 
-   if (be->blkif->nr_rings == 1)
-   return read_per_ring_refs(>blkif->rings[0], dev->otherend);
+   if (blkif->nr_rings == 1)
+   return read_per_ring_refs(>rings[0], dev->otherend);
else {
xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
xspath = kmalloc(xspathsize, GFP_KERNEL);
@@ -1085,10 +1086,10 @@ static int connect_ring(struct backend_info *be)
return -ENOMEM;
}
 
-   for (i = 0; i < be->blkif->nr_rings; i++) {
+   for (i = 0; i < blkif->nr_rings; i++) {
memset(xspath, 0, xspathsize);
snprintf(xspath, xspathsize, "%s/queue-%u", 
dev->otherend, i);
-   err = read_per_ring_refs(>blkif->rings[i], xspath);
+   err = read_per_ring_refs(>rings[i], xspath);
if (err) {
kfree(xspath);
return err;
-- 
2.7.4


