[GIT PULL] scheduler changes for v4.12

Ingo Molnar Mon, 01 May 2017 03:08:35 -0700

Linus,

Please pull the latest sched-core-for-linus git tree from:


   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
sched-core-for-linus

   # HEAD: 21173d0b4d2a0b9e9e5f3155cf2cfc5781a6f4b1 sched/x86: Update 
reschedule warning text

The main changes in this cycle were:

 - Another round of rq-clock handling debugging, robustization and fixes

 - PELT accounting improvements

 - CPU hotplug related ->cpus_allowed affinity handling fixes all around the 
tree

 - ... plus misc fixes, cleanups and updates.

 Thanks,

        Ingo

------------------>
NeilBrown (1):
      sched/core: Remove 'task' parameter and rename tsk_restore_flags() to 
current_restore_flags()

Peter Zijlstra (14):
      sched/core: Add WARNING for multiple update_rq_clock() calls
      sched/core: Add rq->lock wrappers
      sched/core: Add {EN,DE}QUEUE_NOCLOCK flags
      sched/core: Add ENQUEUE_NOCLOCK to ENQUEUE_RESTORE
      sched/core: Make sched_ttwu_pending() atomic in time
      sched/core: Simplify update_rq_clock() in __schedule()
      sched/core: Avoid obvious double update_rq_clock()
      sched/core: Fix double update_rq_clock) calls in 
attach_task()/detach_task()
      sched/core: Avoid double update_rq_clock() in move_queued_task()
      sched/fair: Explicitly generate __update_load_avg() instances
      sched/fair: Fix corner case in __accumulate_sum()
      sched/fair: Fix comments
      sched/fair: Increase PELT accuracy for small tasks
      sched/fair: Move the PELT constants into a generated header

Prarit Bhargava (1):
      sched/x86: Update reschedule warning text

Srikar Dronamraju (1):
      sched/fair: Prefer sibiling only if local group is under-utilized

Steven Rostedt (VMware) (1):
      sched/rt: Add comments describing the RT IPI pull method

Thomas Gleixner (13):
      ia64/topology: Remove cpus_allowed manipulation
      workqueue: Provide work_on_cpu_safe()
      ia64/salinfo: Replace racy task affinity logic
      ia64/sn/hwperf: Replace racy task affinity logic
      powerpc/smp: Replace open coded task affinity logic
      sparc/sysfs: Replace racy task affinity logic
      ACPI/processor: Fix error handling in __acpi_processor_start()
      ACPI/processor: Replace racy task affinity logic
      cpufreq/ia64: Replace racy task affinity logic
      cpufreq/sh: Replace racy task affinity logic
      cpufreq/sparc-us3: Replace racy task affinity logic
      cpufreq/sparc-us2e: Replace racy task affinity logic
      crypto: N2 - Replace racy task affinity logic

Vincent Guittot (1):
      sched/fair: Fix FTQ noise bench regression

Wanpeng Li (1):
      sched/core: Fix rq lock pinning warning after call balance callbacks

Yuyang Du (2):
      sched/fair: Optimize ___update_sched_avg()
      sched/Documentation: Add 'sched-pelt' tool


 Documentation/scheduler/sched-pelt.c | 108 +++++++++
 arch/ia64/kernel/salinfo.c           |  31 +--
 arch/ia64/kernel/topology.c          |   6 -
 arch/ia64/sn/kernel/sn2/sn_hwperf.c  |  17 +-
 arch/powerpc/kernel/smp.c            |  26 +--
 arch/sparc/kernel/sysfs.c            |  39 +---
 arch/x86/kernel/smp.c                |   2 +-
 drivers/acpi/processor_driver.c      |  10 +-
 drivers/acpi/processor_throttling.c  |  62 +++---
 drivers/block/nbd.c                  |   2 +-
 drivers/cpufreq/ia64-acpi-cpufreq.c  |  92 ++++----
 drivers/cpufreq/sh-cpufreq.c         |  45 ++--
 drivers/cpufreq/sparc-us2e-cpufreq.c |  45 ++--
 drivers/cpufreq/sparc-us3-cpufreq.c  |  46 ++--
 drivers/crypto/n2_core.c             |  31 +--
 drivers/scsi/iscsi_tcp.c             |   2 +-
 fs/nfsd/vfs.c                        |   2 +-
 include/linux/sched.h                |   6 +-
 include/linux/workqueue.h            |   5 +
 kernel/sched/core.c                  | 201 +++++++++--------
 kernel/sched/fair.c                  | 418 +++++++++++++++++++----------------
 kernel/sched/features.h              |   7 +
 kernel/sched/rt.c                    |  81 +++++++
 kernel/sched/sched-pelt.h            |  13 ++
 kernel/sched/sched.h                 |  65 +++++-
 kernel/softirq.c                     |   2 +-
 kernel/workqueue.c                   |  23 ++
 net/core/dev.c                       |   2 +-
 net/core/sock.c                      |   2 +-
 29 files changed, 847 insertions(+), 544 deletions(-)
 create mode 100644 Documentation/scheduler/sched-pelt.c
 create mode 100644 kernel/sched/sched-pelt.h

diff --git a/Documentation/scheduler/sched-pelt.c 
b/Documentation/scheduler/sched-pelt.c
new file mode 100644
index 000000000000..e4219139386a
--- /dev/null
+++ b/Documentation/scheduler/sched-pelt.c
@@ -0,0 +1,108 @@
+/*
+ * The following program is used to generate the constants for
+ * computing sched averages.
+ *
+ * ==============================================================
+ *             C program (compile with -lm)
+ * ==============================================================
+ */
+
+#include <math.h>
+#include <stdio.h>
+
+#define HALFLIFE 32
+#define SHIFT 32
+
+double y;
+
+void calc_runnable_avg_yN_inv(void)
+{
+       int i;
+       unsigned int x;
+
+       printf("static const u32 runnable_avg_yN_inv[] = {");
+       for (i = 0; i < HALFLIFE; i++) {
+               x = ((1UL<<32)-1)*pow(y, i);
+
+               if (i % 6 == 0) printf("\n\t");
+               printf("0x%8x, ", x);
+       }
+       printf("\n};\n\n");
+}
+
+int sum = 1024;
+
+void calc_runnable_avg_yN_sum(void)
+{
+       int i;
+
+       printf("static const u32 runnable_avg_yN_sum[] = {\n\t    0,");
+       for (i = 1; i <= HALFLIFE; i++) {
+               if (i == 1)
+                       sum *= y;
+               else
+                       sum = sum*y + 1024*y;
+
+               if (i % 11 == 0)
+                       printf("\n\t");
+
+               printf("%5d,", sum);
+       }
+       printf("\n};\n\n");
+}
+
+int n = -1;
+/* first period */
+long max = 1024;
+
+void calc_converged_max(void)
+{
+       long last = 0, y_inv = ((1UL<<32)-1)*y;
+
+       for (; ; n++) {
+               if (n > -1)
+                       max = ((max*y_inv)>>SHIFT) + 1024;
+                       /*
+                        * This is the same as:
+                        * max = max*y + 1024;
+                        */
+
+               if (last == max)
+                       break;
+
+               last = max;
+       }
+       n--;
+       printf("#define LOAD_AVG_PERIOD %d\n", HALFLIFE);
+       printf("#define LOAD_AVG_MAX %ld\n", max);
+//     printf("#define LOAD_AVG_MAX_N %d\n\n", n);
+}
+
+void calc_accumulated_sum_32(void)
+{
+       int i, x = sum;
+
+       printf("static const u32 __accumulated_sum_N32[] = {\n\t     0,");
+       for (i = 1; i <= n/HALFLIFE+1; i++) {
+               if (i > 1)
+                       x = x/2 + sum;
+
+               if (i % 6 == 0)
+                       printf("\n\t");
+
+               printf("%6d,", x);
+       }
+       printf("\n};\n\n");
+}
+
+void main(void)
+{
+       printf("/* Generated by Documentation/scheduler/sched-pelt; do not 
modify. */\n\n");
+
+       y = pow(0.5, 1/(double)HALFLIFE);
+
+       calc_runnable_avg_yN_inv();
+//     calc_runnable_avg_yN_sum();
+       calc_converged_max();
+//     calc_accumulated_sum_32();
+}
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index d194d5c83d32..63dc9cdc95c5 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -179,14 +179,14 @@ struct salinfo_platform_oemdata_parms {
        const u8 *efi_guid;
        u8 **oemdata;
        u64 *oemdata_size;
-       int ret;
 };
 
-static void
+static long
 salinfo_platform_oemdata_cpu(void *context)
 {
        struct salinfo_platform_oemdata_parms *parms = context;
-       parms->ret = salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, 
parms->oemdata_size);
+
+       return salinfo_platform_oemdata(parms->efi_guid, parms->oemdata, 
parms->oemdata_size);
 }
 
 static void
@@ -380,16 +380,7 @@ salinfo_log_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static void
-call_on_cpu(int cpu, void (*fn)(void *), void *arg)
-{
-       cpumask_t save_cpus_allowed = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-       (*fn)(arg);
-       set_cpus_allowed_ptr(current, &save_cpus_allowed);
-}
-
-static void
+static long
 salinfo_log_read_cpu(void *context)
 {
        struct salinfo_data *data = context;
@@ -399,6 +390,7 @@ salinfo_log_read_cpu(void *context)
        /* Clear corrected errors as they are read from SAL */
        if (rh->severity == sal_log_severity_corrected)
                ia64_sal_clear_state_info(data->type);
+       return 0;
 }
 
 static void
@@ -430,7 +422,7 @@ salinfo_log_new_read(int cpu, struct salinfo_data *data)
        spin_unlock_irqrestore(&data_saved_lock, flags);
 
        if (!data->saved_num)
-               call_on_cpu(cpu, salinfo_log_read_cpu, data);
+               work_on_cpu_safe(cpu, salinfo_log_read_cpu, data);
        if (!data->log_size) {
                data->state = STATE_NO_DATA;
                cpumask_clear_cpu(cpu, &data->cpu_event);
@@ -459,11 +451,13 @@ salinfo_log_read(struct file *file, char __user *buffer, 
size_t count, loff_t *p
        return simple_read_from_buffer(buffer, count, ppos, buf, bufsize);
 }
 
-static void
+static long
 salinfo_log_clear_cpu(void *context)
 {
        struct salinfo_data *data = context;
+
        ia64_sal_clear_state_info(data->type);
+       return 0;
 }
 
 static int
@@ -486,7 +480,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu)
        rh = (sal_log_record_header_t *)(data->log_buffer);
        /* Corrected errors have already been cleared from SAL */
        if (rh->severity != sal_log_severity_corrected)
-               call_on_cpu(cpu, salinfo_log_clear_cpu, data);
+               work_on_cpu_safe(cpu, salinfo_log_clear_cpu, data);
        /* clearing a record may make a new record visible */
        salinfo_log_new_read(cpu, data);
        if (data->state == STATE_LOG_RECORD) {
@@ -531,9 +525,8 @@ salinfo_log_write(struct file *file, const char __user 
*buffer, size_t count, lo
                                .oemdata = &data->oemdata,
                                .oemdata_size = &data->oemdata_size
                        };
-                       call_on_cpu(cpu, salinfo_platform_oemdata_cpu, &parms);
-                       if (parms.ret)
-                               count = parms.ret;
+                       count = work_on_cpu_safe(cpu, 
salinfo_platform_oemdata_cpu,
+                                                &parms);
                } else
                        data->oemdata_size = 0;
        } else
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index 1a68f012a6dc..d76529cbff20 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -355,18 +355,12 @@ static int cache_add_dev(unsigned int cpu)
        unsigned long i, j;
        struct cache_info *this_object;
        int retval = 0;
-       cpumask_t oldmask;
 
        if (all_cpu_cache_info[cpu].kobj.parent)
                return 0;
 
-       oldmask = current->cpus_allowed;
-       retval = set_cpus_allowed_ptr(current, cpumask_of(cpu));
-       if (unlikely(retval))
-               return retval;
 
        retval = cpu_cache_sysfs_init(cpu);
-       set_cpus_allowed_ptr(current, &oldmask);
        if (unlikely(retval < 0))
                return retval;
 
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c 
b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index 52704f199dd6..55febd65911a 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -598,12 +598,17 @@ static void sn_hwperf_call_sal(void *info)
        op_info->ret = r;
 }
 
+static long sn_hwperf_call_sal_work(void *info)
+{
+       sn_hwperf_call_sal(info);
+       return 0;
+}
+
 static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
 {
        u32 cpu;
        u32 use_ipi;
        int r = 0;
-       cpumask_t save_allowed;
        
        cpu = (op_info->a->arg & SN_HWPERF_ARG_CPU_MASK) >> 32;
        use_ipi = op_info->a->arg & SN_HWPERF_ARG_USE_IPI_MASK;
@@ -629,13 +634,9 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info 
*op_info)
                        /* use an interprocessor interrupt to call SAL */
                        smp_call_function_single(cpu, sn_hwperf_call_sal,
                                op_info, 1);
-               }
-               else {
-                       /* migrate the task before calling SAL */ 
-                       save_allowed = current->cpus_allowed;
-                       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-                       sn_hwperf_call_sal(op_info);
-                       set_cpus_allowed_ptr(current, &save_allowed);
+               } else {
+                       /* Call on the target CPU */
+                       work_on_cpu_safe(cpu, sn_hwperf_call_sal_work, op_info);
                }
        }
        r = op_info->ret;
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 46f89e66a273..d68ed1f004a3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -787,24 +787,21 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
        { NULL, },
 };
 
-void __init smp_cpus_done(unsigned int max_cpus)
+static __init long smp_setup_cpu_workfn(void *data __always_unused)
 {
-       cpumask_var_t old_mask;
+       smp_ops->setup_cpu(boot_cpuid);
+       return 0;
+}
 
-       /* We want the setup_cpu() here to be called from CPU 0, but our
-        * init thread may have been "borrowed" by another CPU in the meantime
-        * se we pin us down to CPU 0 for a short while
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+       /*
+        * We want the setup_cpu() here to be called on the boot CPU, but
+        * init might run on any CPU, so make sure it's invoked on the boot
+        * CPU.
         */
-       alloc_cpumask_var(&old_mask, GFP_NOWAIT);
-       cpumask_copy(old_mask, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(boot_cpuid));
-       
        if (smp_ops && smp_ops->setup_cpu)
-               smp_ops->setup_cpu(boot_cpuid);
-
-       set_cpus_allowed_ptr(current, old_mask);
-
-       free_cpumask_var(old_mask);
+               work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
 
        if (smp_ops && smp_ops->bringup_done)
                smp_ops->bringup_done();
@@ -812,7 +809,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
        dump_numa_cpu_topology();
 
        set_sched_topology(powerpc_topology);
-
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c
index d63fc613e7a9..5fd352b759af 100644
--- a/arch/sparc/kernel/sysfs.c
+++ b/arch/sparc/kernel/sysfs.c
@@ -98,27 +98,7 @@ static struct attribute_group mmu_stat_group = {
        .name = "mmu_stats",
 };
 
-/* XXX convert to rusty's on_one_cpu */
-static unsigned long run_on_cpu(unsigned long cpu,
-                               unsigned long (*func)(unsigned long),
-                               unsigned long arg)
-{
-       cpumask_t old_affinity;
-       unsigned long ret;
-
-       cpumask_copy(&old_affinity, &current->cpus_allowed);
-       /* should return -EINVAL to userspace */
-       if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
-               return 0;
-
-       ret = func(arg);
-
-       set_cpus_allowed_ptr(current, &old_affinity);
-
-       return ret;
-}
-
-static unsigned long read_mmustat_enable(unsigned long junk)
+static long read_mmustat_enable(void *data __maybe_unused)
 {
        unsigned long ra = 0;
 
@@ -127,11 +107,11 @@ static unsigned long read_mmustat_enable(unsigned long 
junk)
        return ra != 0;
 }
 
-static unsigned long write_mmustat_enable(unsigned long val)
+static long write_mmustat_enable(void *data)
 {
-       unsigned long ra, orig_ra;
+       unsigned long ra, orig_ra, *val = data;
 
-       if (val)
+       if (*val)
                ra = __pa(&per_cpu(mmu_stats, smp_processor_id()));
        else
                ra = 0UL;
@@ -142,7 +122,8 @@ static unsigned long write_mmustat_enable(unsigned long val)
 static ssize_t show_mmustat_enable(struct device *s,
                                struct device_attribute *attr, char *buf)
 {
-       unsigned long val = run_on_cpu(s->id, read_mmustat_enable, 0);
+       long val = work_on_cpu(s->id, read_mmustat_enable, NULL);
+
        return sprintf(buf, "%lx\n", val);
 }
 
@@ -150,13 +131,15 @@ static ssize_t store_mmustat_enable(struct device *s,
                        struct device_attribute *attr, const char *buf,
                        size_t count)
 {
-       unsigned long val, err;
-       int ret = sscanf(buf, "%lu", &val);
+       unsigned long val;
+       long err;
+       int ret;
 
+       ret = sscanf(buf, "%lu", &val);
        if (ret != 1)
                return -EINVAL;
 
-       err = run_on_cpu(s->id, write_mmustat_enable, val);
+       err = work_on_cpu(s->id, write_mmustat_enable, &val);
        if (err)
                return -EIO;
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index d3c66a15bbde..3cab8415389a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -124,7 +124,7 @@ static bool smp_no_nmi_ipi = false;
 static void native_smp_send_reschedule(int cpu)
 {
        if (unlikely(cpu_is_offline(cpu))) {
-               WARN_ON(1);
+               WARN(1, "sched: Unexpected reschedule of offline CPU#%d!\n", 
cpu);
                return;
        }
        apic->send_IPI(cpu, RESCHEDULE_VECTOR);
diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index 9d5f0c7ed3f7..8697a82bd465 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -251,6 +251,9 @@ static int __acpi_processor_start(struct acpi_device 
*device)
        if (ACPI_SUCCESS(status))
                return 0;
 
+       result = -ENODEV;
+       acpi_pss_perf_exit(pr, device);
+
 err_power_exit:
        acpi_processor_power_exit(pr);
        return result;
@@ -259,11 +262,16 @@ static int __acpi_processor_start(struct acpi_device 
*device)
 static int acpi_processor_start(struct device *dev)
 {
        struct acpi_device *device = ACPI_COMPANION(dev);
+       int ret;
 
        if (!device)
                return -ENODEV;
 
-       return __acpi_processor_start(device);
+       /* Protect against concurrent CPU hotplug operations */
+       get_online_cpus();
+       ret = __acpi_processor_start(device);
+       put_online_cpus();
+       return ret;
 }
 
 static int acpi_processor_stop(struct device *dev)
diff --git a/drivers/acpi/processor_throttling.c 
b/drivers/acpi/processor_throttling.c
index a12f96cc93ff..3de34633f7f9 100644
--- a/drivers/acpi/processor_throttling.c
+++ b/drivers/acpi/processor_throttling.c
@@ -62,8 +62,8 @@ struct acpi_processor_throttling_arg {
 #define THROTTLING_POSTCHANGE      (2)
 
 static int acpi_processor_get_throttling(struct acpi_processor *pr);
-int acpi_processor_set_throttling(struct acpi_processor *pr,
-                                               int state, bool force);
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+                                          int state, bool force, bool direct);
 
 static int acpi_processor_update_tsd_coord(void)
 {
@@ -891,7 +891,8 @@ static int acpi_processor_get_throttling_ptc(struct 
acpi_processor *pr)
                        ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                                "Invalid throttling state, reset\n"));
                        state = 0;
-                       ret = acpi_processor_set_throttling(pr, state, true);
+                       ret = __acpi_processor_set_throttling(pr, state, true,
+                                                             true);
                        if (ret)
                                return ret;
                }
@@ -901,36 +902,31 @@ static int acpi_processor_get_throttling_ptc(struct 
acpi_processor *pr)
        return 0;
 }
 
-static int acpi_processor_get_throttling(struct acpi_processor *pr)
+static long __acpi_processor_get_throttling(void *data)
 {
-       cpumask_var_t saved_mask;
-       int ret;
+       struct acpi_processor *pr = data;
+
+       return pr->throttling.acpi_processor_get_throttling(pr);
+}
 
+static int acpi_processor_get_throttling(struct acpi_processor *pr)
+{
        if (!pr)
                return -EINVAL;
 
        if (!pr->flags.throttling)
                return -ENODEV;
 
-       if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
-               return -ENOMEM;
-
        /*
-        * Migrate task to the cpu pointed by pr.
+        * This is either called from the CPU hotplug callback of
+        * processor_driver or via the ACPI probe function. In the latter
+        * case the CPU is not guaranteed to be online. Both call sites are
+        * protected against CPU hotplug.
         */
-       cpumask_copy(saved_mask, &current->cpus_allowed);
-       /* FIXME: use work_on_cpu() */
-       if (set_cpus_allowed_ptr(current, cpumask_of(pr->id))) {
-               /* Can't migrate to the target pr->id CPU. Exit */
-               free_cpumask_var(saved_mask);
+       if (!cpu_online(pr->id))
                return -ENODEV;
-       }
-       ret = pr->throttling.acpi_processor_get_throttling(pr);
-       /* restore the previous state */
-       set_cpus_allowed_ptr(current, saved_mask);
-       free_cpumask_var(saved_mask);
 
-       return ret;
+       return work_on_cpu(pr->id, __acpi_processor_get_throttling, pr);
 }
 
 static int acpi_processor_get_fadt_info(struct acpi_processor *pr)
@@ -1080,8 +1076,15 @@ static long acpi_processor_throttling_fn(void *data)
                        arg->target_state, arg->force);
 }
 
-int acpi_processor_set_throttling(struct acpi_processor *pr,
-                                               int state, bool force)
+static int call_on_cpu(int cpu, long (*fn)(void *), void *arg, bool direct)
+{
+       if (direct)
+               return fn(arg);
+       return work_on_cpu(cpu, fn, arg);
+}
+
+static int __acpi_processor_set_throttling(struct acpi_processor *pr,
+                                          int state, bool force, bool direct)
 {
        int ret = 0;
        unsigned int i;
@@ -1130,7 +1133,8 @@ int acpi_processor_set_throttling(struct acpi_processor 
*pr,
                arg.pr = pr;
                arg.target_state = state;
                arg.force = force;
-               ret = work_on_cpu(pr->id, acpi_processor_throttling_fn, &arg);
+               ret = call_on_cpu(pr->id, acpi_processor_throttling_fn, &arg,
+                                 direct);
        } else {
                /*
                 * When the T-state coordination is SW_ALL or HW_ALL,
@@ -1163,8 +1167,8 @@ int acpi_processor_set_throttling(struct acpi_processor 
*pr,
                        arg.pr = match_pr;
                        arg.target_state = state;
                        arg.force = force;
-                       ret = work_on_cpu(pr->id, acpi_processor_throttling_fn,
-                               &arg);
+                       ret = call_on_cpu(pr->id, acpi_processor_throttling_fn,
+                                         &arg, direct);
                }
        }
        /*
@@ -1182,6 +1186,12 @@ int acpi_processor_set_throttling(struct acpi_processor 
*pr,
        return ret;
 }
 
+int acpi_processor_set_throttling(struct acpi_processor *pr, int state,
+                                 bool force)
+{
+       return __acpi_processor_set_throttling(pr, state, force, false);
+}
+
 int acpi_processor_get_throttling_info(struct acpi_processor *pr)
 {
        int result = 0;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d8a23561b4cb..3c9052bf2327 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -244,7 +244,7 @@ static int sock_xmit(struct nbd_device *nbd, int index, int 
send,
                        *sent += result;
        } while (msg_data_left(&msg));
 
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
 
        return result;
 }
diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c 
b/drivers/cpufreq/ia64-acpi-cpufreq.c
index e28a31a40829..a757c0a1e7b5 100644
--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ b/drivers/cpufreq/ia64-acpi-cpufreq.c
@@ -34,6 +34,11 @@ struct cpufreq_acpi_io {
        unsigned int                            resume;
 };
 
+struct cpufreq_acpi_req {
+       unsigned int            cpu;
+       unsigned int            state;
+};
+
 static struct cpufreq_acpi_io  *acpi_io_data[NR_CPUS];
 
 static struct cpufreq_driver acpi_cpufreq_driver;
@@ -83,8 +88,7 @@ processor_get_pstate (
 static unsigned
 extract_clock (
        struct cpufreq_acpi_io *data,
-       unsigned value,
-       unsigned int cpu)
+       unsigned value)
 {
        unsigned long i;
 
@@ -98,60 +102,43 @@ extract_clock (
 }
 
 
-static unsigned int
+static long
 processor_get_freq (
-       struct cpufreq_acpi_io  *data,
-       unsigned int            cpu)
+       void *arg)
 {
-       int                     ret = 0;
-       u32                     value = 0;
-       cpumask_t               saved_mask;
-       unsigned long           clock_freq;
+       struct cpufreq_acpi_req *req = arg;
+       unsigned int            cpu = req->cpu;
+       struct cpufreq_acpi_io  *data = acpi_io_data[cpu];
+       u32                     value;
+       int                     ret;
 
        pr_debug("processor_get_freq\n");
-
-       saved_mask = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
        if (smp_processor_id() != cpu)
-               goto migrate_end;
+               return -EAGAIN;
 
        /* processor_get_pstate gets the instantaneous frequency */
        ret = processor_get_pstate(&value);
-
        if (ret) {
-               set_cpus_allowed_ptr(current, &saved_mask);
                pr_warn("get performance failed with error %d\n", ret);
-               ret = 0;
-               goto migrate_end;
+               return ret;
        }
-       clock_freq = extract_clock(data, value, cpu);
-       ret = (clock_freq*1000);
-
-migrate_end:
-       set_cpus_allowed_ptr(current, &saved_mask);
-       return ret;
+       return 1000 * extract_clock(data, value);
 }
 
 
-static int
+static long
 processor_set_freq (
-       struct cpufreq_acpi_io  *data,
-       struct cpufreq_policy   *policy,
-       int                     state)
+       void *arg)
 {
-       int                     ret = 0;
-       u32                     value = 0;
-       cpumask_t               saved_mask;
-       int                     retval;
+       struct cpufreq_acpi_req *req = arg;
+       unsigned int            cpu = req->cpu;
+       struct cpufreq_acpi_io  *data = acpi_io_data[cpu];
+       int                     ret, state = req->state;
+       u32                     value;
 
        pr_debug("processor_set_freq\n");
-
-       saved_mask = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(policy->cpu));
-       if (smp_processor_id() != policy->cpu) {
-               retval = -EAGAIN;
-               goto migrate_end;
-       }
+       if (smp_processor_id() != cpu)
+               return -EAGAIN;
 
        if (state == data->acpi_data.state) {
                if (unlikely(data->resume)) {
@@ -159,8 +146,7 @@ processor_set_freq (
                        data->resume = 0;
                } else {
                        pr_debug("Already at target state (P%d)\n", state);
-                       retval = 0;
-                       goto migrate_end;
+                       return 0;
                }
        }
 
@@ -171,7 +157,6 @@ processor_set_freq (
         * First we write the target state's 'control' value to the
         * control_register.
         */
-
        value = (u32) data->acpi_data.states[state].control;
 
        pr_debug("Transitioning to state: 0x%08x\n", value);
@@ -179,17 +164,11 @@ processor_set_freq (
        ret = processor_set_pstate(value);
        if (ret) {
                pr_warn("Transition failed with error %d\n", ret);
-               retval = -ENODEV;
-               goto migrate_end;
+               return -ENODEV;
        }
 
        data->acpi_data.state = state;
-
-       retval = 0;
-
-migrate_end:
-       set_cpus_allowed_ptr(current, &saved_mask);
-       return (retval);
+       return 0;
 }
 
 
@@ -197,11 +176,13 @@ static unsigned int
 acpi_cpufreq_get (
        unsigned int            cpu)
 {
-       struct cpufreq_acpi_io *data = acpi_io_data[cpu];
+       struct cpufreq_acpi_req req;
+       long ret;
 
-       pr_debug("acpi_cpufreq_get\n");
+       req.cpu = cpu;
+       ret = work_on_cpu(cpu, processor_get_freq, &req);
 
-       return processor_get_freq(data, cpu);
+       return ret > 0 ? (unsigned int) ret : 0;
 }
 
 
@@ -210,7 +191,12 @@ acpi_cpufreq_target (
        struct cpufreq_policy   *policy,
        unsigned int index)
 {
-       return processor_set_freq(acpi_io_data[policy->cpu], policy, index);
+       struct cpufreq_acpi_req req;
+
+       req.cpu = policy->cpu;
+       req.state = index;
+
+       return work_on_cpu(req.cpu, processor_set_freq, &req);
 }
 
 static int
diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c
index 86628e22b2a3..719c3d9f07fb 100644
--- a/drivers/cpufreq/sh-cpufreq.c
+++ b/drivers/cpufreq/sh-cpufreq.c
@@ -30,54 +30,63 @@
 
 static DEFINE_PER_CPU(struct clk, sh_cpuclk);
 
+struct cpufreq_target {
+       struct cpufreq_policy   *policy;
+       unsigned int            freq;
+};
+
 static unsigned int sh_cpufreq_get(unsigned int cpu)
 {
        return (clk_get_rate(&per_cpu(sh_cpuclk, cpu)) + 500) / 1000;
 }
 
-/*
- * Here we notify other drivers of the proposed change and the final change.
- */
-static int sh_cpufreq_target(struct cpufreq_policy *policy,
-                            unsigned int target_freq,
-                            unsigned int relation)
+static long __sh_cpufreq_target(void *arg)
 {
-       unsigned int cpu = policy->cpu;
+       struct cpufreq_target *target = arg;
+       struct cpufreq_policy *policy = target->policy;
+       int cpu = policy->cpu;
        struct clk *cpuclk = &per_cpu(sh_cpuclk, cpu);
-       cpumask_t cpus_allowed;
        struct cpufreq_freqs freqs;
        struct device *dev;
        long freq;
 
-       cpus_allowed = current->cpus_allowed;
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
-       BUG_ON(smp_processor_id() != cpu);
+       if (smp_processor_id() != cpu)
+               return -ENODEV;
 
        dev = get_cpu_device(cpu);
 
        /* Convert target_freq from kHz to Hz */
-       freq = clk_round_rate(cpuclk, target_freq * 1000);
+       freq = clk_round_rate(cpuclk, target->freq * 1000);
 
        if (freq < (policy->min * 1000) || freq > (policy->max * 1000))
                return -EINVAL;
 
-       dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000);
+       dev_dbg(dev, "requested frequency %u Hz\n", target->freq * 1000);
 
        freqs.old       = sh_cpufreq_get(cpu);
        freqs.new       = (freq + 500) / 1000;
        freqs.flags     = 0;
 
-       cpufreq_freq_transition_begin(policy, &freqs);
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+       cpufreq_freq_transition_begin(target->policy, &freqs);
        clk_set_rate(cpuclk, freq);
-       cpufreq_freq_transition_end(policy, &freqs, 0);
+       cpufreq_freq_transition_end(target->policy, &freqs, 0);
 
        dev_dbg(dev, "set frequency %lu Hz\n", freq);
-
        return 0;
 }
 
+/*
+ * Here we notify other drivers of the proposed change and the final change.
+ */
+static int sh_cpufreq_target(struct cpufreq_policy *policy,
+                            unsigned int target_freq,
+                            unsigned int relation)
+{
+       struct cpufreq_target data = { .policy = policy, .freq = target_freq };
+
+       return work_on_cpu(policy->cpu, __sh_cpufreq_target, &data);
+}
+
 static int sh_cpufreq_verify(struct cpufreq_policy *policy)
 {
        struct clk *cpuclk = &per_cpu(sh_cpuclk, policy->cpu);
diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c 
b/drivers/cpufreq/sparc-us2e-cpufreq.c
index 35ddb6da93aa..90f33efee5fc 100644
--- a/drivers/cpufreq/sparc-us2e-cpufreq.c
+++ b/drivers/cpufreq/sparc-us2e-cpufreq.c
@@ -118,10 +118,6 @@ static void us2e_transition(unsigned long estar, unsigned 
long new_bits,
                            unsigned long clock_tick,
                            unsigned long old_divisor, unsigned long divisor)
 {
-       unsigned long flags;
-
-       local_irq_save(flags);
-
        estar &= ~ESTAR_MODE_DIV_MASK;
 
        /* This is based upon the state transition diagram in the IIe manual.  
*/
@@ -152,8 +148,6 @@ static void us2e_transition(unsigned long estar, unsigned 
long new_bits,
        } else {
                BUG();
        }
-
-       local_irq_restore(flags);
 }
 
 static unsigned long index_to_estar_mode(unsigned int index)
@@ -229,48 +223,51 @@ static unsigned long estar_to_divisor(unsigned long estar)
        return ret;
 }
 
+static void __us2e_freq_get(void *arg)
+{
+       unsigned long *estar = arg;
+
+       *estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
+}
+
 static unsigned int us2e_freq_get(unsigned int cpu)
 {
-       cpumask_t cpus_allowed;
        unsigned long clock_tick, estar;
 
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
        clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-       estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+       if (smp_call_function_single(cpu, __us2e_freq_get, &estar, 1))
+               return 0;
 
        return clock_tick / estar_to_divisor(estar);
 }
 
-static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+static void __us2e_freq_target(void *arg)
 {
-       unsigned int cpu = policy->cpu;
+       unsigned int cpu = smp_processor_id();
+       unsigned int *index = arg;
        unsigned long new_bits, new_freq;
        unsigned long clock_tick, divisor, old_divisor, estar;
-       cpumask_t cpus_allowed;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
        new_freq = clock_tick = sparc64_get_clock_tick(cpu) / 1000;
-       new_bits = index_to_estar_mode(index);
-       divisor = index_to_divisor(index);
+       new_bits = index_to_estar_mode(*index);
+       divisor = index_to_divisor(*index);
        new_freq /= divisor;
 
        estar = read_hbreg(HBIRD_ESTAR_MODE_ADDR);
 
        old_divisor = estar_to_divisor(estar);
 
-       if (old_divisor != divisor)
+       if (old_divisor != divisor) {
                us2e_transition(estar, new_bits, clock_tick * 1000,
                                old_divisor, divisor);
+       }
+}
 
-       set_cpus_allowed_ptr(current, &cpus_allowed);
+static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index)
+{
+       unsigned int cpu = policy->cpu;
 
-       return 0;
+       return smp_call_function_single(cpu, __us2e_freq_target, &index, 1);
 }
 
 static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)
diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c 
b/drivers/cpufreq/sparc-us3-cpufreq.c
index a8d86a449ca1..30645b0118f9 100644
--- a/drivers/cpufreq/sparc-us3-cpufreq.c
+++ b/drivers/cpufreq/sparc-us3-cpufreq.c
@@ -35,22 +35,28 @@ static struct us3_freq_percpu_info *us3_freq_table;
 #define SAFARI_CFG_DIV_32      0x0000000080000000UL
 #define SAFARI_CFG_DIV_MASK    0x00000000C0000000UL
 
-static unsigned long read_safari_cfg(void)
+static void read_safari_cfg(void *arg)
 {
-       unsigned long ret;
+       unsigned long ret, *val = arg;
 
        __asm__ __volatile__("ldxa      [%%g0] %1, %0"
                             : "=&r" (ret)
                             : "i" (ASI_SAFARI_CONFIG));
-       return ret;
+       *val = ret;
 }
 
-static void write_safari_cfg(unsigned long val)
+static void update_safari_cfg(void *arg)
 {
+       unsigned long reg, *new_bits = arg;
+
+       read_safari_cfg(&reg);
+       reg &= ~SAFARI_CFG_DIV_MASK;
+       reg |= *new_bits;
+
        __asm__ __volatile__("stxa      %0, [%%g0] %1\n\t"
                             "membar    #Sync"
                             : /* no outputs */
-                            : "r" (val), "i" (ASI_SAFARI_CONFIG)
+                            : "r" (reg), "i" (ASI_SAFARI_CONFIG)
                             : "memory");
 }
 
@@ -78,29 +84,17 @@ static unsigned long get_current_freq(unsigned int cpu, 
unsigned long safari_cfg
 
 static unsigned int us3_freq_get(unsigned int cpu)
 {
-       cpumask_t cpus_allowed;
        unsigned long reg;
-       unsigned int ret;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
-
-       reg = read_safari_cfg();
-       ret = get_current_freq(cpu, reg);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
 
-       return ret;
+       if (smp_call_function_single(cpu, read_safari_cfg, &reg, 1))
+               return 0;
+       return get_current_freq(cpu, reg);
 }
 
 static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index)
 {
        unsigned int cpu = policy->cpu;
-       unsigned long new_bits, new_freq, reg;
-       cpumask_t cpus_allowed;
-
-       cpumask_copy(&cpus_allowed, &current->cpus_allowed);
-       set_cpus_allowed_ptr(current, cpumask_of(cpu));
+       unsigned long new_bits, new_freq;
 
        new_freq = sparc64_get_clock_tick(cpu) / 1000;
        switch (index) {
@@ -121,15 +115,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, 
unsigned int index)
                BUG();
        }
 
-       reg = read_safari_cfg();
-
-       reg &= ~SAFARI_CFG_DIV_MASK;
-       reg |= new_bits;
-       write_safari_cfg(reg);
-
-       set_cpus_allowed_ptr(current, &cpus_allowed);
-
-       return 0;
+       return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1);
 }
 
 static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)
diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c
index c5aac25a5738..4ecb77aa60e1 100644
--- a/drivers/crypto/n2_core.c
+++ b/drivers/crypto/n2_core.c
@@ -65,6 +65,11 @@ struct spu_queue {
        struct list_head        list;
 };
 
+struct spu_qreg {
+       struct spu_queue        *queue;
+       unsigned long           type;
+};
+
 static struct spu_queue **cpu_to_cwq;
 static struct spu_queue **cpu_to_mau;
 
@@ -1631,31 +1636,27 @@ static void queue_cache_destroy(void)
        kmem_cache_destroy(queue_cache[HV_NCS_QTYPE_CWQ - 1]);
 }
 
-static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+static long spu_queue_register_workfn(void *arg)
 {
-       cpumask_var_t old_allowed;
+       struct spu_qreg *qr = arg;
+       struct spu_queue *p = qr->queue;
+       unsigned long q_type = qr->type;
        unsigned long hv_ret;
 
-       if (cpumask_empty(&p->sharing))
-               return -EINVAL;
-
-       if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-               return -ENOMEM;
-
-       cpumask_copy(old_allowed, &current->cpus_allowed);
-
-       set_cpus_allowed_ptr(current, &p->sharing);
-
        hv_ret = sun4v_ncs_qconf(q_type, __pa(p->q),
                                 CWQ_NUM_ENTRIES, &p->qhandle);
        if (!hv_ret)
                sun4v_ncs_sethead_marker(p->qhandle, 0);
 
-       set_cpus_allowed_ptr(current, old_allowed);
+       return hv_ret ? -EINVAL : 0;
+}
 
-       free_cpumask_var(old_allowed);
+static int spu_queue_register(struct spu_queue *p, unsigned long q_type)
+{
+       int cpu = cpumask_any_and(&p->sharing, cpu_online_mask);
+       struct spu_qreg qr = { .queue = p, .type = q_type };
 
-       return (hv_ret ? -EINVAL : 0);
+       return work_on_cpu_safe(cpu, spu_queue_register_workfn, &qr);
 }
 
 static int spu_queue_setup(struct spu_queue *p)
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 4228aba1f654..bbea8eac9abb 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -387,7 +387,7 @@ static int iscsi_sw_tcp_pdu_xmit(struct iscsi_task *task)
                rc = 0;
        }
 
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
        return rc;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..9aaf6ca77569 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh 
*fhp, struct file *file,
        else
                err = nfserrno(host_err);
        if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
-               tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+               current_restore_flags(pflags, PF_LESS_THROTTLE);
        return err;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee84fd43..0978fb74e45a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1286,10 +1286,10 @@ TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
 TASK_PFA_SET(LMK_WAITING, lmk_waiting)
 
 static inline void
-tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned 
long flags)
+current_restore_flags(unsigned long orig_flags, unsigned long flags)
 {
-       task->flags &= ~flags;
-       task->flags |= orig_flags & flags;
+       current->flags &= ~flags;
+       current->flags |= orig_flags & flags;
 }
 
 extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct 
cpumask *trial);
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bde063cefd04..c102ef65cb64 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void 
*), void *arg)
 {
        return fn(arg);
 }
+static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+       return fn(arg);
+}
 #else
 long work_on_cpu(int cpu, long (*fn)(void *), void *arg);
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc05a0f1..ab9f6ac099a7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,21 +86,6 @@ int sysctl_sched_rt_runtime = 950000;
 cpumask_var_t cpu_isolated_map;
 
 /*
- * this_rq_lock - lock this runqueue and disable interrupts.
- */
-static struct rq *this_rq_lock(void)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       local_irq_disable();
-       rq = this_rq();
-       raw_spin_lock(&rq->lock);
-
-       return rq;
-}
-
-/*
  * __task_rq_lock - lock the rq @p resides on.
  */
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq)
                return;
 
 #ifdef CONFIG_SCHED_DEBUG
+       if (sched_feat(WARN_DOUBLE_CLOCK))
+               SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
        rq->clock_update_flags |= RQCF_UPDATED;
 #endif
+
        delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
        if (delta < 0)
                return;
@@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq)
 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 {
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+       struct rq_flags rf;
 
        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 
        return HRTIMER_NORESTART;
 }
@@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq)
 static void __hrtick_start(void *arg)
 {
        struct rq *rq = arg;
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        __hrtick_restart(rq);
        rq->hrtick_csd_pending = 0;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p)
 
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int 
flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & ENQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & ENQUEUE_RESTORE))
                sched_info_queued(rq, p);
+
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int 
flags)
 {
-       update_rq_clock(rq);
+       if (!(flags & DEQUEUE_NOCLOCK))
+               update_rq_clock(rq);
+
        if (!(flags & DEQUEUE_SAVE))
                sched_info_dequeued(rq, p);
+
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct 
*p, int flags)
  *
  * Returns (locked) new rq. Old rq's lock is released.
  */
-static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int 
new_cpu)
+static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
+                                  struct task_struct *p, int new_cpu)
 {
        lockdep_assert_held(&rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       dequeue_task(rq, p, 0);
+       dequeue_task(rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, new_cpu);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, rf);
 
        rq = cpu_rq(new_cpu);
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, rf);
        BUG_ON(task_cpu(p) != new_cpu);
        enqueue_task(rq, p, 0);
        p->on_rq = TASK_ON_RQ_QUEUED;
@@ -980,7 +977,8 @@ struct migration_arg {
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int 
dest_cpu)
+static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
+                                struct task_struct *p, int dest_cpu)
 {
        if (unlikely(!cpu_active(dest_cpu)))
                return rq;
@@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct 
task_struct *p, int dest_
        if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                return rq;
 
-       rq = move_queued_task(rq, p, dest_cpu);
+       update_rq_clock(rq);
+       rq = move_queued_task(rq, rf, p, dest_cpu);
 
        return rq;
 }
@@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data)
        struct migration_arg *arg = data;
        struct task_struct *p = arg->task;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
        /*
         * The original target CPU might have gone down and we might
@@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data)
        sched_ttwu_pending();
 
        raw_spin_lock(&p->pi_lock);
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        /*
         * If task_rq(p) != rq, it cannot be migrated here, because we're
         * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data)
         */
        if (task_rq(p) == rq) {
                if (task_on_rq_queued(p))
-                       rq = __migrate_task(rq, p, arg->dest_cpu);
+                       rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
                else
                        p->wake_cpu = arg->dest_cpu;
        }
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
        raw_spin_unlock(&p->pi_lock);
 
        local_irq_enable();
@@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const 
struct cpumask *new_mask)
                 * holding rq->lock.
                 */
                lockdep_assert_held(&rq->lock);
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        }
        if (running)
                put_prev_task(rq, p);
@@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const 
struct cpumask *new_mask)
        p->sched_class->set_cpus_allowed(p, new_mask);
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
 }
@@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                 * OK, since we're going to drop the lock immediately
                 * afterwards anyway.
                 */
-               rq_unpin_lock(rq, &rf);
-               rq = move_queued_task(rq, p, dest_cpu);
-               rq_repin_lock(rq, &rf);
+               rq = move_queued_task(rq, &rf, p, dest_cpu);
        }
 out:
        task_rq_unlock(rq, p, &rf);
@@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, 
int cpu)
 {
        if (task_on_rq_queued(p)) {
                struct rq *src_rq, *dst_rq;
+               struct rq_flags srf, drf;
 
                src_rq = task_rq(p);
                dst_rq = cpu_rq(cpu);
 
+               rq_pin_lock(src_rq, &srf);
+               rq_pin_lock(dst_rq, &drf);
+
                p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src_rq, p, 0);
                set_task_cpu(p, cpu);
                activate_task(dst_rq, p, 0);
                p->on_rq = TASK_ON_RQ_QUEUED;
                check_preempt_curr(dst_rq, p, 0);
+
+               rq_unpin_lock(dst_rq, &drf);
+               rq_unpin_lock(src_rq, &srf);
+
        } else {
                /*
                 * Task isn't running anymore; make it appear like we migrated
@@ -1680,7 +1686,7 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 struct rq_flags *rf)
 {
-       int en_flags = ENQUEUE_WAKEUP;
+       int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
 
        lockdep_assert_held(&rq->lock);
 
@@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void)
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
        struct task_struct *p;
-       unsigned long flags;
        struct rq_flags rf;
 
        if (!llist)
                return;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
-       rq_pin_lock(rq, &rf);
+       rq_lock_irqsave(rq, &rf);
+       update_rq_clock(rq);
 
        while (llist) {
                int wake_flags = 0;
@@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void)
                ttwu_do_activate(rq, p, wake_flags, &rf);
        }
 
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 void scheduler_ipi(void)
@@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int 
cpu, int wake_flags)
 void wake_up_if_idle(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        rcu_read_lock();
 
@@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu)
        if (set_nr_if_polling(rq->idle)) {
                trace_sched_wake_idle_without_ipi(cpu);
        } else {
-               raw_spin_lock_irqsave(&rq->lock, flags);
+               rq_lock_irqsave(rq, &rf);
                if (is_idle_task(rq->curr))
                        smp_send_reschedule(cpu);
                /* Else CPU is not idle, do nothing here: */
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 out:
@@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, 
int wake_flags)
        }
 #endif
 
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        ttwu_do_activate(rq, p, wake_flags, &rf);
-       rq_unpin_lock(rq, &rf);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, 
struct rq_flags *rf)
                 * disabled avoiding further scheduler activity on it and we've
                 * not yet picked a replacement task.
                 */
-               rq_unpin_lock(rq, rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&p->pi_lock);
-               raw_spin_lock(&rq->lock);
-               rq_repin_lock(rq, rf);
+               rq_relock(rq, rf);
        }
 
        if (!(p->state & TASK_NORMAL))
@@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, 
struct rq_flags *rf)
                        delayacct_blkio_end();
                        atomic_dec(&rq->nr_iowait);
                }
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+               ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
        }
 
        ttwu_do_wakeup(rq, p, 0, rf);
@@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p)
        update_rq_clock(rq);
        post_init_entity_util_avg(&p->se);
 
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3093,15 +3094,18 @@ void scheduler_tick(void)
        int cpu = smp_processor_id();
        struct rq *rq = cpu_rq(cpu);
        struct task_struct *curr = rq->curr;
+       struct rq_flags rf;
 
        sched_clock_tick();
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
+
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
        calc_global_load_tick(rq);
-       raw_spin_unlock(&rq->lock);
+
+       rq_unlock(rq, &rf);
 
        perf_event_task_tick();
 
@@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt)
         * done by the caller to avoid the race with signal_wake_up().
         */
        smp_mb__before_spinlock();
-       raw_spin_lock(&rq->lock);
-       rq_pin_lock(rq, &rf);
+       rq_lock(rq, &rf);
 
        /* Promote REQ to ACT */
        rq->clock_update_flags <<= 1;
+       update_rq_clock(rq);
 
        switch_count = &prev->nivcsw;
        if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        prev->state = TASK_RUNNING;
                } else {
-                       deactivate_task(rq, prev, DEQUEUE_SLEEP);
+                       deactivate_task(rq, prev, DEQUEUE_SLEEP | 
DEQUEUE_NOCLOCK);
                        prev->on_rq = 0;
 
                        if (prev->in_iowait) {
@@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt)
                switch_count = &prev->nvcsw;
        }
 
-       if (task_on_rq_queued(prev))
-               update_rq_clock(rq);
-
        next = pick_next_task(rq, prev, &rf);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
@@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt)
                rq = context_switch(rq, prev, next, &rf);
        } else {
                rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
 
        balance_callback(rq);
@@ -3684,7 +3684,8 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int oldprio, queued, running, queue_flag =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        const struct sched_class *prev_class;
        struct rq_flags rf;
        struct rq *rq;
@@ -3805,7 +3806,7 @@ void set_user_nice(struct task_struct *p, long nice)
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
        if (running)
                put_prev_task(rq, p);
 
@@ -3816,7 +3817,7 @@ void set_user_nice(struct task_struct *p, long nice)
        delta = p->prio - old_prio;
 
        if (queued) {
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -4126,7 +4127,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq_flags rf;
        int reset_on_fork;
-       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq *rq;
 
        /* May grab non-irq protected spin_locks: */
@@ -4923,7 +4924,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned 
int, len,
  */
 SYSCALL_DEFINE0(sched_yield)
 {
-       struct rq *rq = this_rq_lock();
+       struct rq_flags rf;
+       struct rq *rq;
+
+       local_irq_disable();
+       rq = this_rq();
+       rq_lock(rq, &rf);
 
        schedstat_inc(rq->yld_count);
        current->sched_class->yield_task(rq);
@@ -4932,9 +4938,8 @@ SYSCALL_DEFINE0(sched_yield)
         * Since we are going to call schedule() anyway, there's
         * no need to preempt or enable interrupts:
         */
-       __release(rq->lock);
-       spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
-       do_raw_spin_unlock(&rq->lock);
+       preempt_disable();
+       rq_unlock(rq, &rf);
        sched_preempt_enable_no_resched();
 
        schedule();
@@ -5514,7 +5519,7 @@ void sched_setnuma(struct task_struct *p, int nid)
        p->numa_preferred_nid = nid;
 
        if (queued)
-               enqueue_task(rq, p, ENQUEUE_RESTORE);
+               enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
        if (running)
                set_curr_task(rq, p);
        task_rq_unlock(rq, p, &rf);
@@ -5579,11 +5584,11 @@ static struct task_struct fake_task = {
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-static void migrate_tasks(struct rq *dead_rq)
+static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 {
        struct rq *rq = dead_rq;
        struct task_struct *next, *stop = rq->stop;
-       struct rq_flags rf;
+       struct rq_flags orf = *rf;
        int dest_cpu;
 
        /*
@@ -5602,9 +5607,7 @@ static void migrate_tasks(struct rq *dead_rq)
         * class method both need to have an up-to-date
         * value of rq->clock[_task]
         */
-       rq_pin_lock(rq, &rf);
        update_rq_clock(rq);
-       rq_unpin_lock(rq, &rf);
 
        for (;;) {
                /*
@@ -5617,8 +5620,7 @@ static void migrate_tasks(struct rq *dead_rq)
                /*
                 * pick_next_task() assumes pinned rq->lock:
                 */
-               rq_repin_lock(rq, &rf);
-               next = pick_next_task(rq, &fake_task, &rf);
+               next = pick_next_task(rq, &fake_task, rf);
                BUG_ON(!next);
                next->sched_class->put_prev_task(rq, next);
 
@@ -5631,10 +5633,9 @@ static void migrate_tasks(struct rq *dead_rq)
                 * because !cpu_active at this point, which means load-balance
                 * will not interfere. Also, stop-machine.
                 */
-               rq_unpin_lock(rq, &rf);
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, rf);
                raw_spin_lock(&next->pi_lock);
-               raw_spin_lock(&rq->lock);
+               rq_relock(rq, rf);
 
                /*
                 * Since we're inside stop-machine, _nothing_ should have
@@ -5648,12 +5649,12 @@ static void migrate_tasks(struct rq *dead_rq)
 
                /* Find suitable destination for @next, with force if needed. */
                dest_cpu = select_fallback_rq(dead_rq->cpu, next);
-
-               rq = __migrate_task(rq, next, dest_cpu);
+               rq = __migrate_task(rq, rf, next, dest_cpu);
                if (rq != dead_rq) {
-                       raw_spin_unlock(&rq->lock);
+                       rq_unlock(rq, rf);
                        rq = dead_rq;
-                       raw_spin_lock(&rq->lock);
+                       *rf = orf;
+                       rq_relock(rq, rf);
                }
                raw_spin_unlock(&next->pi_lock);
        }
@@ -5766,7 +5767,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 int sched_cpu_activate(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        set_cpu_active(cpu, true);
 
@@ -5784,12 +5785,12 @@ int sched_cpu_activate(unsigned int cpu)
         * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
         *    domains.
         */
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_online(rq);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 
        update_max_interval();
 
@@ -5847,18 +5848,20 @@ int sched_cpu_starting(unsigned int cpu)
 int sched_cpu_dying(unsigned int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       struct rq_flags rf;
 
        /* Handle pending wakeups and then migrate everything off */
        sched_ttwu_pending();
-       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       rq_lock_irqsave(rq, &rf);
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
        }
-       migrate_tasks(rq);
+       migrate_tasks(rq, &rf);
        BUG_ON(rq->nr_running != 1);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
+
        calc_load_migrate(rq);
        update_max_interval();
        nohz_balance_exit_idle(cpu);
@@ -6412,7 +6415,8 @@ static void sched_change_group(struct task_struct *tsk, 
int type)
  */
 void sched_move_task(struct task_struct *tsk)
 {
-       int queued, running;
+       int queued, running, queue_flags =
+               DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
        struct rq_flags rf;
        struct rq *rq;
 
@@ -6423,14 +6427,14 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+               dequeue_task(rq, tsk, queue_flags);
        if (running)
                put_prev_task(rq, tsk);
 
        sched_change_group(tsk, TASK_MOVE_GROUP);
 
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+               enqueue_task(rq, tsk, queue_flags);
        if (running)
                set_curr_task(rq, tsk);
 
@@ -7008,14 +7012,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, 
u64 period, u64 quota)
        for_each_online_cpu(i) {
                struct cfs_rq *cfs_rq = tg->cfs_rq[i];
                struct rq *rq = cfs_rq->rq;
+               struct rq_flags rf;
 
-               raw_spin_lock_irq(&rq->lock);
+               rq_lock_irq(rq, &rf);
                cfs_rq->runtime_enabled = runtime_enabled;
                cfs_rq->runtime_remaining = 0;
 
                if (cfs_rq->throttled)
                        unthrottle_cfs_rq(cfs_rq);
-               raw_spin_unlock_irq(&rq->lock);
+               rq_unlock_irq(rq, &rf);
        }
        if (runtime_was_enabled && !runtime_enabled)
                cfs_bandwidth_usage_dec();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dea138964b91..a903276fcb62 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 }
 
 #ifdef CONFIG_SMP
+
+#include "sched-pelt.h"
+
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
-/*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
- */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
-
 /* Give new sched_entity start runnable values to heavy its load in infant 
time */
 void init_entity_runnable_average(struct sched_entity *se)
 {
@@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct 
sched_entity *se)
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
-/* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
-       0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
-       0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
-       0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
-       0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
-       0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
-       0x85aac367, 0x82cd8698,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
- * over-estimates when re-combining.
- */
-static const u32 runnable_avg_yN_sum[] = {
-           0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
-        9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
-       17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
-};
-
-/*
- * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers. See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
-           0, 23371, 35056, 40899, 43820, 45281,
-       46011, 46376, 46559, 46650, 46696, 46719,
-};
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
        unsigned int local_n;
 
-       if (!n)
-               return val;
-       else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+       if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;
 
        /* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
        return val;
 }
 
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+       u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+       /*
+        * c1 = d1 y^p
+        */
+       c1 = decay_load((u64)d1, periods);
+
+       /*
+        *            p-1
+        * c2 = 1024 \Sum y^n
+        *            n=1
+        *
+        *              inf        inf
+        *    = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+        *              n=0        n=p
+        */
+       c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+       return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                           p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ *                           n=1
  *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
+ *    = u y^p +                                        (Step 1)
+ *
+ *                     p-1
+ *      d1 y^p + 1024 \Sum y^n + d3 y^0                (Step 2)
+ *                     n=1
  */
-static u32 __compute_runnable_contrib(u64 n)
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+              unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u32 contrib = 0;
+       unsigned long scale_freq, scale_cpu;
+       u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+       u64 periods;
 
-       if (likely(n <= LOAD_AVG_PERIOD))
-               return runnable_avg_yN_sum[n];
-       else if (unlikely(n >= LOAD_AVG_MAX_N))
-               return LOAD_AVG_MAX;
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
-       /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-       contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-       n %= LOAD_AVG_PERIOD;
-       contrib = decay_load(contrib, n);
-       return contrib + runnable_avg_yN_sum[n];
-}
+       delta += sa->period_contrib;
+       periods = delta / 1024; /* A period is 1024us (~1ms) */
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+       /*
+        * Step 1: decay old *_sum if we crossed period boundaries.
+        */
+       if (periods) {
+               sa->load_sum = decay_load(sa->load_sum, periods);
+               if (cfs_rq) {
+                       cfs_rq->runnable_load_sum =
+                               decay_load(cfs_rq->runnable_load_sum, periods);
+               }
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+               /*
+                * Step 2
+                */
+               delta %= 1024;
+               contrib = __accumulate_pelt_segments(periods,
+                               1024 - sa->period_contrib, delta);
+       }
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * contrib;
+               if (cfs_rq)
+                       cfs_rq->runnable_load_sum += weight * contrib;
+       }
+       if (running)
+               sa->util_sum += contrib * scale_cpu;
+
+       return periods;
+}
 
 /*
  * We can represent the historical contribution to runnable average as the
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-       u64 delta, scaled_delta, periods;
-       u32 contrib;
-       unsigned int delta_w, scaled_delta_w, decayed = 0;
-       unsigned long scale_freq, scale_cpu;
+       u64 delta;
 
        delta = now - sa->last_update_time;
        /*
@@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg 
*sa,
        delta >>= 10;
        if (!delta)
                return 0;
-       sa->last_update_time = now;
-
-       scale_freq = arch_scale_freq_capacity(NULL, cpu);
-       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-       /* delta_w is the amount already accumulated against our next period */
-       delta_w = sa->period_contrib;
-       if (delta + delta_w >= 1024) {
-               decayed = 1;
 
-               /* how much left for next period will start over, we don't know 
yet */
-               sa->period_contrib = 0;
+       sa->last_update_time += delta << 10;
 
-               /*
-                * Now that we know we're crossing a period boundary, figure
-                * out how much from delta we need to complete the current
-                * period and accrue it.
-                */
-               delta_w = 1024 - delta_w;
-               scaled_delta_w = cap_scale(delta_w, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * scaled_delta_w;
-                       if (cfs_rq) {
-                               cfs_rq->runnable_load_sum +=
-                                               weight * scaled_delta_w;
-                       }
-               }
-               if (running)
-                       sa->util_sum += scaled_delta_w * scale_cpu;
-
-               delta -= delta_w;
-
-               /* Figure out how many additional periods this update spans */
-               periods = delta / 1024;
-               delta %= 1024;
+       /*
+        * Now we know we crossed measurement unit boundaries. The *_avg
+        * accrues by two steps:
+        *
+        * Step 1: accumulate *_sum since last_update_time. If we haven't
+        * crossed period boundaries, finish.
+        */
+       if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+               return 0;
 
-               sa->load_sum = decay_load(sa->load_sum, periods + 1);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_sum =
-                               decay_load(cfs_rq->runnable_load_sum, periods + 
1);
-               }
-               sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-               /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-               contrib = __compute_runnable_contrib(periods);
-               contrib = cap_scale(contrib, scale_freq);
-               if (weight) {
-                       sa->load_sum += weight * contrib;
-                       if (cfs_rq)
-                               cfs_rq->runnable_load_sum += weight * contrib;
-               }
-               if (running)
-                       sa->util_sum += contrib * scale_cpu;
+       /*
+        * Step 2: update *_avg.
+        */
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+       if (cfs_rq) {
+               cfs_rq->runnable_load_avg =
+                       div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
        }
+       sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-       /* Remainder of delta accrued against u_0` */
-       scaled_delta = cap_scale(delta, scale_freq);
-       if (weight) {
-               sa->load_sum += weight * scaled_delta;
-               if (cfs_rq)
-                       cfs_rq->runnable_load_sum += weight * scaled_delta;
-       }
-       if (running)
-               sa->util_sum += scaled_delta * scale_cpu;
+       return 1;
+}
 
-       sa->period_contrib += delta;
+static int
+__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+}
 
-       if (decayed) {
-               sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-               if (cfs_rq) {
-                       cfs_rq->runnable_load_avg =
-                               div_u64(cfs_rq->runnable_load_sum, 
LOAD_AVG_MAX);
-               }
-               sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
-       }
+static int
+__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct 
sched_entity *se)
+{
+       return ___update_load_avg(now, cpu, &se->avg,
+                                 se->on_rq * scale_load_down(se->load.weight),
+                                 cfs_rq->curr == se, NULL);
+}
 
-       return decayed;
+static int
+__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
+{
+       return ___update_load_avg(now, cpu, &cfs_rq->avg,
+                       scale_load_down(cfs_rq->load.weight),
+                       cfs_rq->curr != NULL, cfs_rq);
 }
 
 /*
@@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq 
*cfs_rq, int force)
 void set_task_rq_fair(struct sched_entity *se,
                      struct cfs_rq *prev, struct cfs_rq *next)
 {
+       u64 p_last_update_time;
+       u64 n_last_update_time;
+
        if (!sched_feat(ATTACH_AGE_LOAD))
                return;
 
@@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se,
         * time. This will result in the wakee task is less decayed, but giving
         * the wakee more load sounds not bad.
         */
-       if (se->avg.last_update_time && prev) {
-               u64 p_last_update_time;
-               u64 n_last_update_time;
+       if (!(se->avg.last_update_time && prev))
+               return;
 
 #ifndef CONFIG_64BIT
+       {
                u64 p_last_update_time_copy;
                u64 n_last_update_time_copy;
 
@@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se,
 
                } while (p_last_update_time != p_last_update_time_copy ||
                         n_last_update_time != n_last_update_time_copy);
+       }
 #else
-               p_last_update_time = prev->avg.last_update_time;
-               n_last_update_time = next->avg.last_update_time;
+       p_last_update_time = prev->avg.last_update_time;
+       n_last_update_time = next->avg.last_update_time;
 #endif
-               __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
-                                 &se->avg, 0, 0, NULL);
-               se->avg.last_update_time = n_last_update_time;
-       }
+       __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), 
se);
+       se->avg.last_update_time = n_last_update_time;
 }
 
 /* Take into account change of utilization of a child task group */
@@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct 
sched_entity *se)
        return 1;
 }
 
+/*
+ * Check if we need to update the load and the utilization of a blocked
+ * group_entity:
+ */
+static inline bool skip_blocked_update(struct sched_entity *se)
+{
+       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+
+       /*
+        * If sched_entity still have not zero load or utilization, we have to
+        * decay it:
+        */
+       if (se->avg.load_avg || se->avg.util_avg)
+               return false;
+
+       /*
+        * If there is a pending propagation, we have to update the load and
+        * the utilization of the sched_entity:
+        */
+       if (gcfs_rq->propagate_avg)
+               return false;
+
+       /*
+        * Otherwise, the load and the utilization of the sched_entity is
+        * already zero and there is no pending propagation, so it will be a
+        * waste of time to try to decay it:
+        */
+       return true;
+}
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
 
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
@@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, 
bool update_freq)
                set_tg_cfs_propagate(cfs_rq);
        }
 
-       decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-               scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, 
cfs_rq);
+       decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
        smp_wmb();
@@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity 
*se, int flags)
         * Track task load average for carrying it to new CPU after migrated, 
and
         * track group sched_entity load average for task_h_load calc in 
migration
         */
-       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
-               __update_load_avg(now, cpu, &se->avg,
-                         se->on_rq * scale_load_down(se->load.weight),
-                         cfs_rq->curr == se, NULL);
-       }
+       if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+               __update_load_avg_se(now, cpu, cfs_rq, se);
 
        decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
        decayed |= propagate_entity_load_avg(se);
@@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se)
        u64 last_update_time;
 
        last_update_time = cfs_rq_last_update_time(cfs_rq);
-       __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 
0, NULL);
+       __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), 
se);
 }
 
 /*
@@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
        list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
                                throttled_list) {
                struct rq *rq = rq_of(cfs_rq);
+               struct rq_flags rf;
 
-               raw_spin_lock(&rq->lock);
+               rq_lock(rq, &rf);
                if (!cfs_rq_throttled(cfs_rq))
                        goto next;
 
@@ -4289,7 +4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
                        unthrottle_cfs_rq(cfs_rq);
 
 next:
-               raw_spin_unlock(&rq->lock);
+               rq_unlock(rq, &rf);
 
                if (!remaining)
                        break;
@@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void)
        unsigned long curr_jiffies = READ_ONCE(jiffies);
        struct rq *this_rq = this_rq();
        unsigned long load;
+       struct rq_flags rf;
 
        if (curr_jiffies == this_rq->last_load_update_tick)
                return;
 
        load = weighted_cpuload(cpu_of(this_rq));
-       raw_spin_lock(&this_rq->lock);
+       rq_lock(this_rq, &rf);
        update_rq_clock(this_rq);
        cpu_load_update_nohz(this_rq, curr_jiffies, load);
-       raw_spin_unlock(&this_rq->lock);
+       rq_unlock(this_rq, &rf);
 }
 #else /* !CONFIG_NO_HZ_COMMON */
 static inline void cpu_load_update_nohz(struct rq *this_rq,
@@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct 
lb_env *env)
        lockdep_assert_held(&env->src_rq->lock);
 
        p->on_rq = TASK_ON_RQ_MIGRATING;
-       deactivate_task(env->src_rq, p, 0);
+       deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
        set_task_cpu(p, env->dst_cpu);
 }
 
@@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct 
*p)
        lockdep_assert_held(&rq->lock);
 
        BUG_ON(task_rq(p) != rq);
-       activate_task(rq, p, 0);
+       activate_task(rq, p, ENQUEUE_NOCLOCK);
        p->on_rq = TASK_ON_RQ_QUEUED;
        check_preempt_curr(rq, p, 0);
 }
@@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct 
task_struct *p)
  */
 static void attach_one_task(struct rq *rq, struct task_struct *p)
 {
-       raw_spin_lock(&rq->lock);
+       struct rq_flags rf;
+
+       rq_lock(rq, &rf);
+       update_rq_clock(rq);
        attach_task(rq, p);
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env)
 {
        struct list_head *tasks = &env->tasks;
        struct task_struct *p;
+       struct rq_flags rf;
 
-       raw_spin_lock(&env->dst_rq->lock);
+       rq_lock(env->dst_rq, &rf);
+       update_rq_clock(env->dst_rq);
 
        while (!list_empty(tasks)) {
                p = list_first_entry(tasks, struct task_struct, se.group_node);
@@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env)
                attach_task(env->dst_rq, p);
        }
 
-       raw_spin_unlock(&env->dst_rq->lock);
+       rq_unlock(env->dst_rq, &rf);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
        /*
@@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu)
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
+               struct sched_entity *se;
+
                /* throttled entities do not contribute to load */
                if (throttled_hierarchy(cfs_rq))
                        continue;
@@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu)
                if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, 
true))
                        update_tg_load_avg(cfs_rq, 0);
 
-               /* Propagate pending load changes to the parent */
-               if (cfs_rq->tg->se[cpu])
-                       update_load_avg(cfs_rq->tg->se[cpu], 0);
+               /* Propagate pending load changes to the parent, if any: */
+               se = cfs_rq->tg->se[cpu];
+               if (se && !skip_blocked_update(se))
+                       update_load_avg(se, 0);
        }
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 /*
@@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        struct cfs_rq *cfs_rq = &rq->cfs;
-       unsigned long flags;
+       struct rq_flags rf;
 
-       raw_spin_lock_irqsave(&rq->lock, flags);
+       rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       rq_unlock_irqrestore(rq, &rf);
 }
 
 static unsigned long task_h_load(struct task_struct *p)
@@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, 
struct sd_lb_stats *sd
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
+       struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
        bool overload = false;
@@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, 
struct sd_lb_stats *sd
                local_group = cpumask_test_cpu(env->dst_cpu, 
sched_group_cpus(sg));
                if (local_group) {
                        sds->local = sg;
-                       sgs = &sds->local_stat;
+                       sgs = local;
 
                        if (env->idle != CPU_NEWLY_IDLE ||
                            time_after_eq(jiffies, sg->sgc->next_update))
@@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, 
struct sd_lb_stats *sd
                 * the tasks on the system).
                 */
                if (prefer_sibling && sds->local &&
-                   group_has_capacity(env, &sds->local_stat) &&
-                   (sgs->sum_nr_running > 1)) {
+                   group_has_capacity(env, local) &&
+                   (sgs->sum_nr_running > local->sum_nr_running + 1)) {
                        sgs->group_no_capacity = 1;
                        sgs->group_type = group_classify(sg, sgs);
                }
@@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
-       unsigned long flags;
+       struct rq_flags rf;
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
 
        struct lb_env env = {
@@ -8105,7 +8139,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                env.loop_max  = min(sysctl_sched_nr_migrate, 
busiest->nr_running);
 
 more_balance:
-               raw_spin_lock_irqsave(&busiest->lock, flags);
+               rq_lock_irqsave(busiest, &rf);
                update_rq_clock(busiest);
 
                /*
@@ -8122,14 +8156,14 @@ static int load_balance(int this_cpu, struct rq 
*this_rq,
                 * See task_rq_lock() family for the details.
                 */
 
-               raw_spin_unlock(&busiest->lock);
+               rq_unlock(busiest, &rf);
 
                if (cur_ld_moved) {
                        attach_tasks(&env);
                        ld_moved += cur_ld_moved;
                }
 
-               local_irq_restore(flags);
+               local_irq_restore(rf.flags);
 
                if (env.flags & LBF_NEED_BREAK) {
                        env.flags &= ~LBF_NEED_BREAK;
@@ -8207,6 +8241,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
+                       unsigned long flags;
+
                        raw_spin_lock_irqsave(&busiest->lock, flags);
 
                        /* don't kick the active_load_balance_cpu_stop,
@@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data)
        struct rq *target_rq = cpu_rq(target_cpu);
        struct sched_domain *sd;
        struct task_struct *p = NULL;
+       struct rq_flags rf;
 
-       raw_spin_lock_irq(&busiest_rq->lock);
+       rq_lock_irq(busiest_rq, &rf);
 
        /* make sure the requested cpu hasn't gone down in the meantime */
        if (unlikely(busiest_cpu != smp_processor_id() ||
@@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data)
        rcu_read_unlock();
 out_unlock:
        busiest_rq->active_balance = 0;
-       raw_spin_unlock(&busiest_rq->lock);
+       rq_unlock(busiest_rq, &rf);
 
        if (p)
                attach_one_task(target_rq, p);
@@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
                 * do the balance.
                 */
                if (time_after_eq(jiffies, rq->next_balance)) {
-                       raw_spin_lock_irq(&rq->lock);
+                       struct rq_flags rf;
+
+                       rq_lock_irq(rq, &rf);
                        update_rq_clock(rq);
                        cpu_load_update_idle(rq);
-                       raw_spin_unlock_irq(&rq->lock);
+                       rq_unlock_irq(rq, &rf);
+
                        rebalance_domains(rq, CPU_IDLE);
                }
 
@@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p)
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se, *curr;
        struct rq *rq = this_rq();
+       struct rq_flags rf;
 
-       raw_spin_lock(&rq->lock);
+       rq_lock(rq, &rf);
        update_rq_clock(rq);
 
        cfs_rq = task_cfs_rq(current);
@@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p)
        }
 
        se->vruntime -= cfs_rq->min_vruntime;
-       raw_spin_unlock(&rq->lock);
+       rq_unlock(rq, &rf);
 }
 
 /*
@@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex);
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
        int i;
-       unsigned long flags;
 
        /*
         * We can't change the weight of the root cgroup.
@@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, 
unsigned long shares)
        tg->shares = shares;
        for_each_possible_cpu(i) {
                struct rq *rq = cpu_rq(i);
-               struct sched_entity *se;
+               struct sched_entity *se = tg->se[i];
+               struct rq_flags rf;
 
-               se = tg->se[i];
                /* Propagate contribution to hierarchy */
-               raw_spin_lock_irqsave(&rq->lock, flags);
-
-               /* Possible calls to update_curr() need rq clock */
+               rq_lock_irqsave(rq, &rf);
                update_rq_clock(rq);
                for_each_sched_entity(se) {
                        update_load_avg(se, UPDATE_TG);
                        update_cfs_shares(se);
                }
-               raw_spin_unlock_irqrestore(&rq->lock, flags);
+               rq_unlock_irqrestore(rq, &rf);
        }
 
 done:
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1b3c8189b286..11192e0cb122 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true)
  */
 SCHED_FEAT(SIS_AVG_CPU, false)
 
+/*
+ * Issue a WARN when we do multiple update_rq_clock() calls
+ * in a single rq->lock section. Default disabled because the
+ * annotations are not complete.
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
 #ifdef HAVE_RT_PUSH_IPI
 /*
  * In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec..979b7341008a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
 #define RT_PUSH_IPI_EXECUTING          1
 #define RT_PUSH_IPI_RESTART            2
 
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case it will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks, would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that are of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ *    (None or zero):          No IPI is going around for the current rq
+ *    RT_PUSH_IPI_EXECUTING:   An IPI for the rq is being passed around
+ *    RT_PUSH_IPI_RESTART:     The priority of the running task for the rq
+ *                             has changed, and the IPI should restart
+ *                             circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When a rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice IPI_RESTART is set, will clear that flag and then
+ * send an IPI to the next overloaded CPU after the rq->cpu and not the next
+ * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
+ * schedules a lower priority task, and the IPI_RESTART gets set while the
+ * handling is being done on CPU 5, it will clear the flag and send it back to
+ * CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
 static void tell_cpu_to_push(struct rq *rq)
 {
        int cpu;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 000000000000..cd200d16529e
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
+
+static const u32 runnable_avg_yN_inv[] = {
+       0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+       0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+       0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+       0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+       0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+       0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad8..de4b934ba974 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SLEEP          0x01
 #define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK                0x08 /* matches ENQUEUE_NOCLOCK */
 
 #define ENQUEUE_WAKEUP         0x01
 #define ENQUEUE_RESTORE                0x02
 #define ENQUEUE_MOVE           0x04
+#define ENQUEUE_NOCLOCK                0x08
 
-#define ENQUEUE_HEAD           0x08
-#define ENQUEUE_REPLENISH      0x10
+#define ENQUEUE_HEAD           0x10
+#define ENQUEUE_REPLENISH      0x20
 #ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED       0x20
+#define ENQUEUE_MIGRATED       0x40
 #else
 #define ENQUEUE_MIGRATED       0x00
 #endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(rq->lock);
+
 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
        __acquires(p->pi_lock)
        __acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, 
struct rq_flags *rf)
        raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irqsave(&rq->lock, rf->flags);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock_irq(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+       __acquires(rq->lock)
+{
+       raw_spin_lock(&rq->lock);
+       rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+       __releases(rq->lock)
+{
+       rq_unpin_lock(rq, rf);
+       raw_spin_unlock(&rq->lock);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae0..4e09821f9d9e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
        account_irq_exit_time(current);
        __local_bh_enable(SOFTIRQ_OFFSET);
        WARN_ON_ONCE(in_interrupt());
-       tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+       current_restore_flags(old_flags, PF_MEMALLOC);
 }
 
 asmlinkage __visible void do_softirq(void)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c0168b7da1ea..5bf1be018628 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4735,6 +4735,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
        return wfc.ret;
 }
 EXPORT_SYMBOL_GPL(work_on_cpu);
+
+/**
+ * work_on_cpu_safe - run a function in thread context on a particular cpu
+ * @cpu: the cpu to run on
+ * @fn:  the function to run
+ * @arg: the function argument
+ *
+ * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
+ * any locks which would prevent @fn from completing.
+ *
+ * Return: The value @fn returns.
+ */
+long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
+{
+       long ret = -ENODEV;
+
+       get_online_cpus();
+       if (cpu_online(cpu))
+               ret = work_on_cpu(cpu, fn, arg);
+       put_online_cpus();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(work_on_cpu_safe);
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FREEZER
diff --git a/net/core/dev.c b/net/core/dev.c
index 7869ae3837ca..e8a366387a99 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4240,7 +4240,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
                 */
                current->flags |= PF_MEMALLOC;
                ret = __netif_receive_skb_core(skb, true);
-               tsk_restore_flags(current, pflags, PF_MEMALLOC);
+               current_restore_flags(pflags, PF_MEMALLOC);
        } else
                ret = __netif_receive_skb_core(skb, false);
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 2c4f574168fb..b416a537bd0a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -325,7 +325,7 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 
        current->flags |= PF_MEMALLOC;
        ret = sk->sk_backlog_rcv(sk, skb);
-       tsk_restore_flags(current, pflags, PF_MEMALLOC);
+       current_restore_flags(pflags, PF_MEMALLOC);
 
        return ret;
 }

[GIT PULL] scheduler changes for v4.12

Reply via email to