netif_schedule uses a bit, __QDISC_STATE_SCHED, to tell the qdisc layer
whether a run of the qdisc has been scheduled. This is important when
tearing down qdisc instances. We can rcu_free an instance, for example,
when it's possible that we might still have outstanding references to it.

Perhaps more importantly, in the per-cpu lockless case we need to
schedule a run of the qdisc for every qdisc that is enqueueing packets
and hitting the gso_skb requeue logic, or else the skb may get stuck
on the gso_skb queue with nothing left to finish the xmit.

This patch uses a per-CPU copy of the state bit instead of a single
shared bit to account for the multiple CPUs.
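
In rough pseudocode, the lockless (TCQ_F_NOLOCK) paths below reduce to
the following sketch (simplified from the diff; local variable names are
illustrative only):

	/* scheduling a run: mark this CPU's copy of the SCHED state */
	unsigned long *s = this_cpu_ptr(q->cpu_state);

	if (!test_and_set_bit(__QDISC_STATE_SCHED, s))
		__netif_reschedule(q);

	/* teardown: the qdisc is busy if any CPU still has a run scheduled */
	for_each_possible_cpu(i) {
		if (test_bit(__QDISC_STATE_SCHED, per_cpu_ptr(q->cpu_state, i)))
			return true;	/* some_qdisc_is_busy() */
	}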
---
 include/net/sch_generic.h |    1 +
 net/core/dev.c            |   32 +++++++++++++++++++++++---------
 net/sched/sch_api.c       |    5 +++++
 net/sched/sch_generic.c   |   16 +++++++++++++++-
 4 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index cc28af0..2e0e5b0 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -94,6 +94,7 @@ struct Qdisc {
        seqcount_t              running;
        struct gnet_stats_queue qstats;
        unsigned long           state;
+       unsigned long __percpu  *cpu_state;
        struct Qdisc            *next_sched;
        struct sk_buff          *skb_bad_txq;
        struct rcu_head         rcu_head;
diff --git a/net/core/dev.c b/net/core/dev.c
index 5db395d..f491845 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2272,8 +2272,14 @@ static void __netif_reschedule(struct Qdisc *q)
 
 void __netif_schedule(struct Qdisc *q)
 {
-       if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
+       if (q->flags & TCQ_F_NOLOCK) {
+               unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+               if (!test_and_set_bit(__QDISC_STATE_SCHED, s))
+                       __netif_reschedule(q);
+       } else if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) {
                __netif_reschedule(q);
+       }
 }
 EXPORT_SYMBOL(__netif_schedule);
 
@@ -3925,15 +3931,23 @@ static void net_tx_action(struct softirq_action *h)
                        if (!(q->flags & TCQ_F_NOLOCK)) {
                                root_lock = qdisc_lock(q);
                                spin_lock(root_lock);
-                       }
-                       /* We need to make sure head->next_sched is read
-                        * before clearing __QDISC_STATE_SCHED
-                        */
-                       smp_mb__before_atomic();
-                       clear_bit(__QDISC_STATE_SCHED, &q->state);
-                       qdisc_run(q);
-                       if (!(q->flags & TCQ_F_NOLOCK))
+
+                               /* We need to make sure head->next_sched is read
+                                * before clearing __QDISC_STATE_SCHED
+                                */
+                               smp_mb__before_atomic();
+                               clear_bit(__QDISC_STATE_SCHED, &q->state);
+
+                               qdisc_run(q);
+
                                spin_unlock(root_lock);
+                       } else {
+                               unsigned long *s = this_cpu_ptr(q->cpu_state);
+
+                               smp_mb__before_atomic();
+                               clear_bit(__QDISC_STATE_SCHED, s);
+                               __qdisc_run(q);
+                       }
                }
        }
 }
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 6c5bf13..89989a6 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -975,6 +975,10 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
                                alloc_percpu(struct bad_txq_cell);
                        if (!sch->skb_bad_txq_cpu)
                                goto err_out4;
+
+                       sch->cpu_state = alloc_percpu(unsigned long);
+                       if (!sch->cpu_state)
+                               goto err_out4;
                }
 
                if (tca[TCA_STAB]) {
@@ -1027,6 +1031,7 @@ err_out4:
        free_percpu(sch->cpu_qstats);
        free_percpu(sch->gso_cpu_skb);
        free_percpu(sch->skb_bad_txq_cpu);
+       free_percpu(sch->cpu_state);
        /*
         * Any broken qdiscs that would require a ops->reset() here?
         * The qdisc was never in action so it shouldn't be necessary.
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index d10b762..f5b7254 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -171,6 +171,7 @@ static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
                        if (qdisc_is_percpu_stats(q)) {
                                qdisc_qstats_cpu_backlog_inc(q, nskb);
                                qdisc_qstats_cpu_qlen_inc(q);
+                               set_thread_flag(TIF_NEED_RESCHED);
                        } else {
                                qdisc_qstats_backlog_inc(q, nskb);
                                q->q.qlen++;
@@ -768,6 +769,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                sch->skb_bad_txq_cpu = alloc_percpu(struct bad_txq_cell);
                if (!sch->skb_bad_txq_cpu)
                        goto errout;
+
+               sch->cpu_state = alloc_percpu(unsigned long);
+               if (!sch->cpu_state)
+                       goto errout;
        }
 
        return sch;
@@ -1037,7 +1042,16 @@ static bool some_qdisc_is_busy(struct net_device *dev)
                q = dev_queue->qdisc_sleeping;
 
                if (q->flags & TCQ_F_NOLOCK) {
-                       val = test_bit(__QDISC_STATE_SCHED, &q->state);
+                       int i;
+
+                       for_each_possible_cpu(i) {
+                               unsigned long *s;
+
+                               s = per_cpu_ptr(q->cpu_state, i);
+                               val = test_bit(__QDISC_STATE_SCHED, s);
+                               if (val)
+                                       break;
+                       }
                } else {
                        root_lock = qdisc_lock(q);
                        spin_lock_bh(root_lock);
