[PATCH v4 1/2] kheaders: Move from proc to sysfs

2019-05-15 Thread Joel Fernandes (Google)
The kheaders archive, consisting of the kernel headers used for compiling
bpf programs, currently lives in /proc. However, there is concern that keeping
it there will make it a permanent part of the /proc ABI. Let us move it to
/sys/kernel as discussed [1].

[1] https://lore.kernel.org/patchwork/patch/1067310/#1265969

Suggested-by: Steven Rostedt 
Signed-off-by: Joel Fernandes (Google) 
---
This patch applies on top of the previous patch that was applied to the
driver tree:
https://lore.kernel.org/patchwork/patch/1067310/

v2->v3: Fixed sysfs file mode nit (Greg).
v1->v2: Fixed some kconfig nits (Masami).

 init/Kconfig| 17 +
 kernel/Makefile |  4 +--
 kernel/{gen_ikh_data.sh => gen_kheaders.sh} |  2 +-
 kernel/kheaders.c   | 40 +
 4 files changed, 27 insertions(+), 36 deletions(-)
 rename kernel/{gen_ikh_data.sh => gen_kheaders.sh} (98%)

diff --git a/init/Kconfig b/init/Kconfig
index 8b9ffe236e4f..16a7540d60c8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -579,15 +579,14 @@ config IKCONFIG_PROC
  This option enables access to the kernel configuration file
  through /proc/config.gz.
 
-config IKHEADERS_PROC
-   tristate "Enable kernel header artifacts through /proc/kheaders.tar.xz"
-   depends on PROC_FS
-   help
- This option enables access to the kernel header and other artifacts that
- are generated during the build process. These can be used to build eBPF
- tracing programs, or similar programs.  If you build the headers as a
- module, a module called kheaders.ko is built which can be loaded on-demand
- to get access to the headers.
+config IKHEADERS
+   tristate "Enable kernel headers through /sys/kernel/kheaders.tar.xz"
+   depends on SYSFS
+   help
+ This option enables access to the in-kernel headers that are generated during
+ the build process. These can be used to build eBPF tracing programs,
+ or similar programs.  If you build the headers as a module, a module called
+ kheaders.ko is built which can be loaded on-demand to get access to headers.
 
 config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
diff --git a/kernel/Makefile b/kernel/Makefile
index 33824f0385b3..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
+obj-$(CONFIG_IKHEADERS) += kheaders.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
 
 quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@
+cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
 $(obj)/kheaders_data.tar.xz: FORCE
$(call cmd,genikh)
 
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_kheaders.sh
similarity index 98%
rename from kernel/gen_ikh_data.sh
rename to kernel/gen_kheaders.sh
index 591a94f7b387..581b83534587 100755
--- a/kernel/gen_ikh_data.sh
+++ b/kernel/gen_kheaders.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 # This script generates an archive consisting of kernel headers
-# for CONFIG_IKHEADERS_PROC.
+# for CONFIG_IKHEADERS.
 set -e
 spath="$(dirname "$(readlink -f "$0")")"
 kroot="$spath/.."
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 70ae6052920d..8f69772af77b 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -8,9 +8,8 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/proc_fs.h>
+#include <linux/kobject.h>
 #include <linux/init.h>
-#include <linux/uaccess.h>
 
 /*
  * Define kernel_headers_data and kernel_headers_data_end, within which the
@@ -31,39 +30,32 @@ extern char kernel_headers_data;
 extern char kernel_headers_data_end;
 
 static ssize_t
-ikheaders_read_current(struct file *file, char __user *buf,
- size_t len, loff_t *offset)
+ikheaders_read(struct file *file,  struct kobject *kobj,
+  struct bin_attribute *bin_attr,
+  char *buf, loff_t off, size_t len)
 {
-   return simple_read_from_buffer(buf, len, offset,
-  &kernel_headers_data,
-  &kernel_headers_data_end -
-  &kernel_headers_data);
+   memcpy(buf, &kernel_headers_data + off, len);
+   return len;
 }
 
-static const struct file_operations ikheaders_file_ops = {
-   .read = ikheaders_read_current,
-   .llseek = default_llseek,
+static struct bin_attribute kheaders_attr __ro_after_init = {
+   .attr = {
+   .name = "kheaders.tar.
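
For illustration, a minimal userspace sketch (not part of the patch) of how a
tracing tool might copy the archive this patch exposes; the sysfs path comes
from the patch, everything else (file names, buffer size) is illustrative. If
built as a module, kheaders may need to be loaded first:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	FILE *in = fopen("/sys/kernel/kheaders.tar.xz", "rb");
	FILE *out = fopen("kheaders.tar.xz", "wb");
	char buf[4096];
	size_t n;

	if (!in || !out) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* sysfs binary attributes read like regular files */
	while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
		fwrite(buf, 1, n, out);
	fclose(in);
	fclose(out);
	return 0;
}

The resulting tarball can then be extracted and used as the build directory
for an out-of-tree module, as described in the earlier kheaders postings.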

[PATCH -manpage 1/2] fcntl.2: Update manpage with new memfd F_SEAL_FUTURE_WRITE seal

2019-03-14 Thread Joel Fernandes (Google)
More details of the seal can be found in the LKML patch:
https://lore.kernel.org/lkml/20181120052137.74317-1-j...@joelfernandes.org/T/#t

Signed-off-by: Joel Fernandes (Google) 
---
 man2/fcntl.2 | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/man2/fcntl.2 b/man2/fcntl.2
index fce4f4c2b3bd..e01e2c075b5b 100644
--- a/man2/fcntl.2
+++ b/man2/fcntl.2
@@ -1525,6 +1525,21 @@ Furthermore, if there are any asynchronous I/O operations
 .RB ( io_submit (2))
 pending on the file,
 all outstanding writes will be discarded.
+.TP
+.BR F_SEAL_FUTURE_WRITE
+If this seal is set, the contents of the file can be modified only from
+existing writable mappings that were created prior to the seal being set.
+Any attempt to create a new writable mapping on the memfd via
+.BR mmap (2)
+will fail with
+.BR EPERM .
+Likewise, any attempt to write to the memfd via
+.BR write (2)
+will fail with
+.BR EPERM .
+This is useful in situations where existing writable mapped regions need to be
+kept intact while preventing any future writes. For example, to share a
+read-only memory buffer with other processes that only the sender can write to.
 .\"
 .SS File read/write hints
 Write lifetime hints can be used to inform the kernel about the relative
-- 
2.21.0.360.g471c308f928-goog
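
For illustration, a minimal userspace sketch (not part of the patch) of the
semantics described above: an existing writable mapping keeps working after
F_SEAL_FUTURE_WRITE is applied, while new writable mappings and write(2) fail
with EPERM. The fallback #define uses the value proposed in the kernel patch
and is only needed if the libc headers do not yet carry the new seal:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE	0x0010
#endif

int main(void)
{
	int fd = memfd_create("buf", MFD_ALLOW_SEALING);
	char *map;

	ftruncate(fd, 4096);
	map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* From here on, new writable mappings and write(2) fail with EPERM,
	 * but the mapping created above keeps working. */
	fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);

	strcpy(map, "still writable through the pre-existing mapping");

	if (write(fd, "x", 1) < 0)
		perror("write after F_SEAL_FUTURE_WRITE");	/* EPERM */
	if (mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		 MAP_SHARED, fd, 0) == MAP_FAILED)
		perror("new writable mmap");			/* EPERM */
	return 0;
}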



[PATCH -manpage 0/2]

2019-03-14 Thread Joel Fernandes (Google)
This documentation is for F_SEAL_FUTURE_WRITE patches that are in linux-next.

Joel Fernandes (Google) (2):
fcntl.2: Update manpage with new memfd F_SEAL_FUTURE_WRITE seal
memfd_create.2: Update manpage with new memfd F_SEAL_FUTURE_WRITE seal

man2/fcntl.2| 15 +++
man2/memfd_create.2 | 15 ++-
2 files changed, 29 insertions(+), 1 deletion(-)

--
2.21.0.360.g471c308f928-goog



[PATCH -manpage 2/2] memfd_create.2: Update manpage with new memfd F_SEAL_FUTURE_WRITE seal

2019-03-14 Thread Joel Fernandes (Google)
More details of the seal can be found in the LKML patch:
https://lore.kernel.org/lkml/20181120052137.74317-1-j...@joelfernandes.org/T/#t

Signed-off-by: Joel Fernandes (Google) 
---
 man2/memfd_create.2 | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/man2/memfd_create.2 b/man2/memfd_create.2
index 15b1362f5525..3b7f032407ed 100644
--- a/man2/memfd_create.2
+++ b/man2/memfd_create.2
@@ -280,7 +280,15 @@ in order to restrict further modifications on the file.
 (If placing the seal
 .BR F_SEAL_WRITE ,
 then it will be necessary to first unmap the shared writable mapping
-created in the previous step.)
+created in the previous step. Otherwise, behavior similar to
+.BR F_SEAL_WRITE
+can be achieved by using
+.BR F_SEAL_FUTURE_WRITE ,
+which will prevent future writes via
+.BR mmap (2)
+and
+.BR write (2)
+from succeeding while keeping existing shared writable mappings intact.)
 .IP 4.
 A second process obtains a file descriptor for the
 .BR tmpfs (5)
@@ -425,6 +433,7 @@ main(int argc, char *argv[])
 fprintf(stderr, "\et\etg \- F_SEAL_GROW\en");
 fprintf(stderr, "\et\ets \- F_SEAL_SHRINK\en");
 fprintf(stderr, "\et\etw \- F_SEAL_WRITE\en");
+fprintf(stderr, "\et\etW \- F_SEAL_FUTURE_WRITE\en");
 fprintf(stderr, "\et\etS \- F_SEAL_SEAL\en");
 exit(EXIT_FAILURE);
 }
@@ -463,6 +472,8 @@ main(int argc, char *argv[])
 seals |= F_SEAL_SHRINK;
 if (strchr(seals_arg, \(aqw\(aq) != NULL)
 seals |= F_SEAL_WRITE;
+if (strchr(seals_arg, \(aqW\(aq) != NULL)
+seals |= F_SEAL_FUTURE_WRITE;
 if (strchr(seals_arg, \(aqS\(aq) != NULL)
 seals |= F_SEAL_SEAL;
 
@@ -518,6 +529,8 @@ main(int argc, char *argv[])
 printf(" GROW");
 if (seals & F_SEAL_WRITE)
 printf(" WRITE");
+if (seals & F_SEAL_FUTURE_WRITE)
+printf(" FUTURE_WRITE");
 if (seals & F_SEAL_SHRINK)
 printf(" SHRINK");
 printf("\en");
-- 
2.21.0.360.g471c308f928-goog
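
For illustration, a small sketch (not part of the patch) of how a process can
query and decode the seals on a memfd, mirroring the decoding added to the
manpage example program above; the helper name is illustrative and the
fallback #define is only needed with older libc headers:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE	0x0010
#endif

static void print_seals(int fd)
{
	int seals = fcntl(fd, F_GET_SEALS);

	if (seals < 0) {
		perror("F_GET_SEALS");
		return;
	}
	if (seals & F_SEAL_SEAL)
		printf(" SEAL");
	if (seals & F_SEAL_SHRINK)
		printf(" SHRINK");
	if (seals & F_SEAL_GROW)
		printf(" GROW");
	if (seals & F_SEAL_WRITE)
		printf(" WRITE");
	if (seals & F_SEAL_FUTURE_WRITE)
		printf(" FUTURE_WRITE");
	printf("\n");
}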



[PATCH v2 1/4] lockdep: Add assertion to check if in an interrupt

2019-03-26 Thread Joel Fernandes (Google)
In rcu_is_cpu_rrupt_from_idle(), we want to check that it is called from
within an interrupt, but we want to do such checking only for debug builds.
lockdep already tracks when we enter an interrupt. Let us expose that
tracking as an assertion macro so it can be used to assert this.

Suggested-by: Steven Rostedt 
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 

---
 include/linux/lockdep.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index c5335df2372f..d24f564823d3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -601,11 +601,18 @@ do {  \
  "IRQs not disabled as expected\n");   \
} while (0)
 
+#define lockdep_assert_in_irq() do {   \
+   WARN_ONCE(debug_locks && !current->lockdep_recursion && \
+ !current->hardirq_context,\
+ "Not in hardirq as expected\n");  \
+   } while (0)
+
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
+# define lockdep_assert_in_irq() do { } while (0)
 #endif
 
 #ifdef CONFIG_LOCKDEP
-- 
2.21.0.392.gf8f6787159e-goog
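
For illustration, a sketch (not from this patch) of how a caller might use the
new assertion to document, and with lockdep enabled verify, that a function
must run in hardirq context; the function name is illustrative:

#include <linux/lockdep.h>

static void my_hardirq_only_work(void)
{
	/* WARN_ONCE() when lockdep's debug checks are enabled, a no-op otherwise */
	lockdep_assert_in_irq();

	/* ... work that is only safe from a hard interrupt handler ... */
}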



[PATCH v2 3/4] rcutorture: Select from only online CPUs

2019-03-26 Thread Joel Fernandes (Google)
The rcutorture jitter.sh script selects a random CPU but does not check
whether it is online or offline, which leads to frequent taskset errors. On
my machine, hyper-threading is disabled, so half the CPUs are offline,
causing taskset errors much of the time. Let us fix this by selecting only
from the CPUs that are online on the system.

Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/rcutorture/bin/jitter.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 3633828375e3..47bd9829dc55 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -47,10 +47,11 @@ do
exit 0;
fi
 
-   # Set affinity to randomly selected CPU
-   cpus=`ls /sys/devices/system/cpu/*/online |
+   # Set affinity to randomly selected online CPU
+   cpus=`grep 1 /sys/devices/system/cpu/*/online |
sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
grep -v '^0*$'`
+
cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
srand(n + me + systime());
ncpus = split(cpus, ca);
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 2/4] rcutree: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle

2019-03-26 Thread Joel Fernandes (Google)
In the future we would like to combine the dynticks and dynticks_nesting
counters, which would simplify the code. At the moment we cannot do that due
to concerns about usermode upcalls appearing to RCU as half of an interrupt.
Byungchul tried to do it in [1] but the "half-interrupt" concern was raised.
It is "half" because RCU expects a matched rcu_irq_enter()/rcu_irq_exit()
pair when the usermode exception happens, yet only rcu_irq_enter() is
observed. This concern may not be valid anymore, but at least it used to be
the case.

Out of an abundance of caution, Paul added warnings [2] to the RCU code
which, if they have not fired by 2021, may allow us to assume that such a
half-interrupt scenario can no longer happen, which in turn would allow this
code to be simplified.

A summary of the changes follows:

(1) In preparation for combining the counters in the future, we first need
to be sure that rcu_is_cpu_rrupt_from_idle() cannot be called from anywhere
but a hard interrupt, because the comments previously suggested otherwise.
We discussed this here [3]. We use the services of lockdep to accomplish
this.

(2) Further, rcu_is_cpu_rrupt_from_idle() is not explicit about how it uses
the counters, which can lead to subtle future bugs. This patch therefore
makes the specific counter values being tested explicit.

(3) Lastly, we check for counter underflows just to be sure they are not
happening, because the previous code in rcu_is_cpu_rrupt_from_idle() allowed
the counters to underflow while the function still returned true. Now that
we are checking for specific values, let us gain additional confidence that
such underflows do not happen. In any case, if they do, we should fix them,
and a loud warning is appropriate. All of these checks are NOOPs if
PROVE_RCU and PROVE_LOCKING are disabled.

[1] https://lore.kernel.org/patchwork/patch/952349/
[2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts")
[3] https://lore.kernel.org/lkml/20190312150514.gb249...@google.com/

Cc: byungchul.p...@lge.com
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..dbff8a274c46 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -381,16 +381,29 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void)
 }
 
 /**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
+ * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
  *
- * If the current CPU is idle or running at a first-level (not nested)
+ * If the current CPU is idle and running at a first-level (not nested)
  * interrupt from idle, return true.  The caller must have at least
  * disabled preemption.
  */
 static int rcu_is_cpu_rrupt_from_idle(void)
 {
-   return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
-  __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+   /* Called only from within the scheduling-clock interrupt */
+   lockdep_assert_in_irq();
+
+   /* Check for counter underflows */
+   RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0,
+"RCU dynticks_nesting counter underflow!");
+   RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0,
+"RCU dynticks_nmi_nesting counter underflow/zero!");
+
+   /* Are we at first interrupt nesting level? */
+   if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+   return false;
+
+   /* Does CPU appear to be idle from an RCU standpoint? */
+   return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
 }
 
 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 4/4] rcutorture: Add cpu0 to the set of CPUs to add jitter

2019-03-26 Thread Joel Fernandes (Google)
jitter.sh currently does not add CPU0 to the list of CPUs used for injecting
jitter. Let us add it to this list even when it is not hot-pluggable (and
therefore has no "online" file in sysfs).

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/rcutorture/bin/jitter.sh | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 47bd9829dc55..26faf5824a1f 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -49,8 +49,12 @@ do
 
# Set affinity to randomly selected online CPU
cpus=`grep 1 /sys/devices/system/cpu/*/online |
-   sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
-   grep -v '^0*$'`
+   sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
+
+   # Do not leave out poor old cpu0 which may not be hot-pluggable
+   if [ ! -f "/sys/devices/system/cpu/cpu0/online" ]; then
+   cpus="0 $cpus"
+   fi
 
cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
srand(n + me + systime());
-- 
2.21.0.392.gf8f6787159e-goog
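
For illustration, a small userspace sketch (not part of the patch) of the
enumeration issue the two jitter.sh changes address: CPUs report their state
in /sys/devices/system/cpu/cpu*/online, but cpu0 is often not hot-pluggable
and then has no "online" file at all, so it must be added back explicitly:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64];
	int cpu;

	/* cpu0 without an "online" file is implicitly online */
	if (access("/sys/devices/system/cpu/cpu0/online", F_OK) != 0)
		printf("0\n");

	for (cpu = 0; cpu < 4096; cpu++) {
		FILE *f;
		int online = 0;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/online", cpu);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%d", &online) == 1 && online)
			printf("%d\n", cpu);
		fclose(f);
	}
	return 0;
}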



[PATCH] Convert struct pid count to refcount_t

2019-03-27 Thread Joel Fernandes (Google)
struct pid's count is an atomic_t field used as a refcount. Use refcount_t
for it, which is basically atomic_t but does additional checking to help
prevent use-after-free bugs. There is no change in behavior if
CONFIG_REFCOUNT_FULL=n.

Cc: keesc...@chromium.org
Cc: kernel-t...@android.com
Cc: kernel-harden...@lists.openwall.com
Signed-off-by: Joel Fernandes (Google) 

---
 include/linux/pid.h | 5 +++--
 kernel/pid.c| 8 
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..8cb86d377ff5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include <linux/rculist.h>
+#include <linux/refcount.h>
 
 enum pid_type
 {
@@ -56,7 +57,7 @@ struct upid {
 
 struct pid
 {
-   atomic_t count;
+   refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
 static inline struct pid *get_pid(struct pid *pid)
 {
if (pid)
-   atomic_inc(&pid->count);
+   refcount_inc(&pid->count);
return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..2095c7da644d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 
@@ -106,8 +106,8 @@ void put_pid(struct pid *pid)
return;
 
ns = pid->numbers[pid->level].ns;
-   if ((atomic_read(&pid->count) == 1) ||
-        atomic_dec_and_test(&pid->count)) {
+   if ((refcount_read(&pid->count) == 1) ||
+        refcount_dec_and_test(&pid->count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
 
get_pid_ns(ns);
-   atomic_set(&pid->count, 1);
+   refcount_set(&pid->count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]);
 
-- 
2.21.0.392.gf8f6787159e-goog
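
For illustration, a sketch (not from the patch) of the generic get/put pattern
this conversion follows; the struct and function names are illustrative, only
the refcount_* calls are the real API:

#include <linux/refcount.h>
#include <linux/slab.h>

struct my_obj {
	refcount_t count;
};

static struct my_obj *my_obj_alloc(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		refcount_set(&obj->count, 1);	/* initialize, do not assign */
	return obj;
}

static struct my_obj *my_obj_get(struct my_obj *obj)
{
	if (obj)
		refcount_inc(&obj->count);	/* checked increment */
	return obj;
}

static void my_obj_put(struct my_obj *obj)
{
	if (obj && refcount_dec_and_test(&obj->count))
		kfree(obj);			/* last reference dropped */
}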



[PATCH v3 2/2] Add selftests for module build using in-kernel headers

2019-02-27 Thread Joel Fernandes (Google)
This test tries to build a module successfully using the in-kernel
headers found in /proc/kheaders.tar.xz.

Verified pass and fail scenarios by running:
make -C tools/testing/selftests TARGETS=kheaders run_tests

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/Makefile  |  1 +
 tools/testing/selftests/kheaders/Makefile |  5 +
 tools/testing/selftests/kheaders/config   |  1 +
 .../kheaders/run_kheaders_modbuild.sh | 18 +
 .../selftests/kheaders/testmod/Makefile   |  3 +++
 .../testing/selftests/kheaders/testmod/test.c | 20 +++
 6 files changed, 48 insertions(+)
 create mode 100644 tools/testing/selftests/kheaders/Makefile
 create mode 100644 tools/testing/selftests/kheaders/config
 create mode 100755 tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
 create mode 100644 tools/testing/selftests/kheaders/testmod/Makefile
 create mode 100644 tools/testing/selftests/kheaders/testmod/test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 400ee81a3043..5a9287fddd0d 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += intel_pstate
 TARGETS += ipc
 TARGETS += ir
 TARGETS += kcmp
+TARGETS += kheaders
 TARGETS += kvm
 TARGETS += lib
 TARGETS += membarrier
diff --git a/tools/testing/selftests/kheaders/Makefile b/tools/testing/selftests/kheaders/Makefile
new file mode 100644
index ..51035ab0732b
--- /dev/null
+++ b/tools/testing/selftests/kheaders/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := run_kheaders_modbuild.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/kheaders/config b/tools/testing/selftests/kheaders/config
new file mode 100644
index ..5221f9fb5e79
--- /dev/null
+++ b/tools/testing/selftests/kheaders/config
@@ -0,0 +1 @@
+CONFIG_IKHEADERS_PROC=y
diff --git a/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
new file mode 100755
index ..f001568e08b0
--- /dev/null
+++ b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+HEADERS_XZ=/proc/kheaders.tar.xz
+TMP_DIR_HEADERS=$(mktemp -d)
+TMP_DIR_MODULE=$(mktemp -d)
+SPATH="$(dirname "$(readlink -f "$0")")"
+
+tar -xvf $HEADERS_XZ -C $TMP_DIR_HEADERS > /dev/null
+
+cp -r $SPATH/testmod $TMP_DIR_MODULE/
+
+pushd $TMP_DIR_MODULE/testmod > /dev/null
+make -C $TMP_DIR_HEADERS M=$(pwd) modules
+popd > /dev/null
+
+rm -rf $TMP_DIR_HEADERS
+rm -rf $TMP_DIR_MODULE
diff --git a/tools/testing/selftests/kheaders/testmod/Makefile b/tools/testing/selftests/kheaders/testmod/Makefile
new file mode 100644
index ..7083e28706e8
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-m += test.o
diff --git a/tools/testing/selftests/kheaders/testmod/test.c b/tools/testing/selftests/kheaders/testmod/test.c
new file mode 100644
index ..6eb0b8492ffa
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/test.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+
+static int __init hello_init(void)
+{
+   printk(KERN_INFO "Hello, world\n");
+   return 0;
+}
+
+static void __exit hello_exit(void)
+{
+   printk(KERN_INFO "Goodbye, world\n");
+}
+
+module_init(hello_init);
+module_exit(hello_exit);
+MODULE_LICENSE("GPL v2");
-- 
2.21.0.rc2.261.ga7da99ff1b-goog



[PATCH v3 1/2] Provide in-kernel headers for making it easy to extend the kernel

2019-02-27 Thread Joel Fernandes (Google)
Introduce in-kernel headers and other artifacts which are made available
as an archive through proc (/proc/kheaders.tar.xz file). This archive makes
it possible to build kernel modules, run eBPF programs, and other
tracing programs that need to extend the kernel for tracing purposes
without any dependency on the file system having headers and build
artifacts.

On Android and embedded systems, it is common to switch kernels but not
have kernel headers available on the file system. Raw kernel headers
also cannot be copied into the filesystem like they can be on other
distros, due to licensing and other issues. There's no linux-headers
package on Android. Further once a different kernel is booted, any
headers stored on the file system will no longer be useful. By storing
the headers as a compressed archive within the kernel, we can avoid these
issues that have been a hindrance for a long time.

The feature is also buildable as a module, in case the user does not want it
to be part of the kernel image. This makes it possible to load
and unload the headers on demand. A tracing program, or a kernel module
builder can load the module, do its operations, and then unload the
module to save kernel memory. The total memory needed is 3.8MB.

The code to read the headers is based on /proc/config.gz code and uses
the same technique to embed the headers.

To build a module, the below steps have been tested on an x86 machine:
modprobe kheaders
rm -rf $HOME/headers
mkdir -p $HOME/headers
tar -xvf /proc/kheaders.tar.xz -C $HOME/headers >/dev/null
cd my-kernel-module
make -C $HOME/headers M=$(pwd) modules
rmmod kheaders

Additional notes:
(1)
A limitation of module building with this is that, since Module.symvers is
not available in the archive (due to a cyclic dependency with building the
archive into the kernel or module binaries), modules built using the archive
will not contain symbol versioning (modversions). This is
usually not an issue since the idea of this patch is to build a kernel
module on the fly and load it into the same kernel. An appropriate
warning is already printed by the kernel to alert the user of modules
not having modversions when built using the archive. For building with
modversions, the user can use traditional header packages. For our
tracing usecases, we build modules on the fly with this so it is not a
concern.

(2) I have left IKHD_ST and IKHD_ED markers as is to facilitate
future patches that would extract the headers from a kernel or module
image.

Signed-off-by: Joel Fernandes (Google) 
---

Changes since v2:
(Thanks to Masahiro Yamada for several excellent suggestions)
- Added support for out of tree builds.
- Added incremental build support bringing down build time of
  incremental builds from 50 seconds to 5 seconds.
- Fixed various small nits / cleanups.
- clean ups to kheaders.c pointed by Alexey Dobriyan.
- Fixed MODULE_LICENSE in test module and kheaders.c
- Dropped Module.symvers from archive due to circular dependency.

Changes since v1:
- removed IKH_EXTRA variable, not needed (Masahiro Yamada)
- small fix ups to selftest
   - added target to main Makefile etc
   - added MODULE_LICENSE to test module
   - made selftest more quiet

Changes since RFC:
Both changes bring size down to 3.8MB:
- use xz for compression
- strip comments except SPDX lines
- Call out the module name in Kconfig
- Also added selftests in second patch to ensure headers are always
working.

Other notes:
By the way I still see this error (without the patch) when doing a clean
build: Makefile:594: include/config/auto.conf: No such file or directory

It appears to be because of commit 0a16d2e8cb7e ("kbuild: use 'include'
directive to load auto.conf from top Makefile")

 Documentation/dontdiff|  1 +
 init/Kconfig  | 11 ++
 kernel/.gitignore |  3 ++
 kernel/Makefile   | 36 +++
 kernel/kheaders.c | 72 +
 scripts/gen_ikh_data.sh   | 76 +++
 scripts/strip-comments.pl |  8 +
 7 files changed, 207 insertions(+)
 create mode 100644 kernel/kheaders.c
 create mode 100755 scripts/gen_ikh_data.sh
 create mode 100755 scripts/strip-comments.pl

diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 2228fcc8e29f..05a2319ee2a2 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -151,6 +151,7 @@ int8.c
 kallsyms
 kconfig
 keywords.c
+kheaders_data.h*
 ksym.c*
 ksym.h*
 kxgettext
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..63ff0990ae55 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -563,6 +563,17 @@ config IKCONFIG_PROC
  This option enables access to the kernel configuration file
  through /proc/config.gz.
 
+config IKHEADERS_PROC
+   tristate "Enable kernel header artifacts through /proc/kheaders.tar.xz"
+  

[PATCH 1/2] module: Prepare for addition of new ro_after_init sections

2019-04-09 Thread Joel Fernandes (Google)
For the purpose of hardening modules by marking more sections as
ro_after_init, prepare for the addition of new ro_after_init entries, which
we do in future patches. Create a table to which new entries can be added
later. This makes it less error prone and reduces code duplication.

Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Suggested-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 42 --
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 524da609c884..f9221381d076 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3300,11 +3300,28 @@ static bool blacklisted(const char *module_name)
 }
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
+/*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+static char *ro_after_init_sections[] = {
+   ".data..ro_after_init",
+
+   /*
+* __jump_table structures are never modified, with the exception of
+* entries that refer to code in the __init section, which are
+* annotated as such at module load time.
+*/
+   "__jump_table",
+   NULL
+};
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
struct module *mod;
unsigned int ndx;
-   int err;
+   int err, i;
 
err = check_modinfo(info->mod, info, flags);
if (err)
@@ -3319,23 +3336,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
-   /*
-* Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-* layout_sections() can put it in the right place.
-* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-*/
-   ndx = find_sec(info, ".data..ro_after_init");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-   /*
-* Mark the __jump_table section as ro_after_init as well: these data
-* structures are never modified, with the exception of entries that
-* refer to code in the __init section, which are annotated as such
-* at module load time.
-*/
-   ndx = find_sec(info, "__jump_table");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   /* Set sh_flags for read-only after init sections */
+   for (i = 0; ro_after_init_sections[i]; i++) {
+   ndx = find_sec(info, ro_after_init_sections[i]);
+   if (ndx)
+   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   }
 
/* Determine total sizes, and put offsets in sh_entsize.  For now
   this is done generically; there doesn't appear to be any
-- 
2.21.0.392.gf8f6787159e-goog
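
For illustration, a sketch (not from the patch) of the kind of data that ends
up in the ".data..ro_after_init" section handled by the new table: a variable
that is written during init and must be read-only afterwards. The variable
and function names are illustrative:

#include <linux/cache.h>
#include <linux/init.h>

static unsigned long my_tuning_value __ro_after_init = 42;

static int __init my_mod_init(void)
{
	my_tuning_value = 100;	/* fine: init is still running */
	return 0;
}

/* After init completes, the section is mapped read-only; a later write to
 * my_tuning_value would fault. */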



[PATCH 2/2] module: Make srcu_struct ptr array as read-only post init

2019-04-09 Thread Joel Fernandes (Google)
Since the commit titled ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array of
srcu_struct pointers which is used by the srcu code to initialize and clean
up these structures.

There is no reason for this array of pointers to be writable, and leaving it
writable can cause security or other hidden bugs. Mark it as read-only after
the module init has completed.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/module.c b/kernel/module.c
index f9221381d076..ed1f2612aebc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3301,7 +3301,7 @@ static bool blacklisted(const char *module_name)
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
 /*
- * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * These are section names marked with SHF_RO_AFTER_INIT so that
  * layout_sections() can put it in the right place.
  * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
  */
@@ -3314,6 +3314,13 @@ static char *ro_after_init_sections[] = {
 * annotated as such at module load time.
 */
"__jump_table",
+
+   /*
+* Used for SRCU structures which need to be initialized/cleaned up
+* by the SRCU notifiers
+*/
+   "___srcu_struct_ptrs",
+
NULL
 };
 
-- 
2.21.0.392.gf8f6787159e-goog
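
For illustration, a sketch (not from the patch) of the module-side usage that
creates the ___srcu_struct_ptrs entry being marked read-only here; the names
my_srcu and my_reader are illustrative:

#include <linux/module.h>
#include <linux/srcu.h>

/* In a module, this also emits a pointer to my_srcu into the
 * ___srcu_struct_ptrs section so the loader can initialize and clean it up. */
DEFINE_STATIC_SRCU(my_srcu);

static void my_reader(void)
{
	int idx = srcu_read_lock(&my_srcu);

	/* ... read-side critical section ... */

	srcu_read_unlock(&my_srcu, idx);
}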



[PATCH v2 2/3] module: Make srcu_struct ptr array as read-only post init

2019-04-10 Thread Joel Fernandes (Google)
Since the commit titled ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array of
srcu_struct pointers which is used by the srcu code to initialize and clean
up these structures.

There is no reason for this array of pointers to be writable, and leaving it
writable can cause security or other hidden bugs. Mark it as read-only after
the module init has completed.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Acked-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 1acddb93282a..8b9631e789f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3305,7 +3305,7 @@ core_param(module_blacklist, module_blacklist, charp, 0400);
  * layout_sections() can put it in the right place.
  * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
  */
-static char *ro_after_init_sections[] = {
+static const char * const ro_after_init_sections[] = {
".data..ro_after_init",
 
/*
@@ -3314,6 +3314,12 @@ static char *ro_after_init_sections[] = {
 * annotated as such at module load time.
 */
"__jump_table",
+
+   /*
+* Used for SRCU structures which need to be initialized/cleaned up
+* by the SRCU notifiers
+*/
+   "___srcu_struct_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
@@ -3336,7 +3342,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
/* Set sh_flags for read-only after init sections */
-   for (i = 0; ro_after_init_sections[i]; i++) {
+   for (i = 0; i < ARRAY_SIZE(ro_after_init_sections); i++) {
ndx = find_sec(info, ro_after_init_sections[i]);
if (ndx)
info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 1/3] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Joel Fernandes (Google)
For the purpose of hardening modules by marking more sections as
ro_after_init, prepare for the addition of new ro_after_init entries, which
we do in future patches. Create a table to which new entries can be added
later. This makes it less error prone and reduces code duplication.

Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Suggested-by: keesc...@chromium.org
Reviewed-by: keesc...@chromium.org
Acked-by: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 524da609c884..1acddb93282a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3300,11 +3300,27 @@ static bool blacklisted(const char *module_name)
 }
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
+/*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+static char *ro_after_init_sections[] = {
+   ".data..ro_after_init",
+
+   /*
+* __jump_table structures are never modified, with the exception of
+* entries that refer to code in the __init section, which are
+* annotated as such at module load time.
+*/
+   "__jump_table",
+};
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
struct module *mod;
unsigned int ndx;
-   int err;
+   int err, i;
 
err = check_modinfo(info->mod, info, flags);
if (err)
@@ -3319,23 +3335,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
-   /*
-* Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-* layout_sections() can put it in the right place.
-* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-*/
-   ndx = find_sec(info, ".data..ro_after_init");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-   /*
-* Mark the __jump_table section as ro_after_init as well: these data
-* structures are never modified, with the exception of entries that
-* refer to code in the __init section, which are annotated as such
-* at module load time.
-*/
-   ndx = find_sec(info, "__jump_table");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   /* Set sh_flags for read-only after init sections */
+   for (i = 0; ro_after_init_sections[i]; i++) {
+   ndx = find_sec(info, ro_after_init_sections[i]);
+   if (ndx)
+   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   }
 
/* Determine total sizes, and put offsets in sh_entsize.  For now
   this is done generically; there doesn't appear to be any
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Joel Fernandes (Google)
This series hardens the tracepoints in modules by making the array of
pointers referring to the tracepoints read-only. This array is needed during
module unloading to verify that the tracepoints are quiescent. There is no
reason for the array to be writable after init, and leaving it writable can
cause security or other hidden bugs. Mark it as ro_after_init.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Suggested-by: mathieu.desnoy...@efficios.com
Cc: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 8b9631e789f0..be980aaa8804 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = {
 * by the SRCU notifiers
 */
"___srcu_struct_ptrs",
+
+   /*
+* Array of tracepoint pointers used for checking if tracepoints are
+* quiescent during unloading.
+*/
+   "__tracepoints_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v3 3/3] module: Make __tracepoints_ptrs as read-only

2019-04-10 Thread Joel Fernandes (Google)
This series hardens the tracepoints in modules by making the array of
pointers referring to the tracepoints read-only. This array is needed during
module unloading to verify that the tracepoints are quiescent. There is no
reason for the array to be writable after init, and leaving it writable can
cause security or other hidden bugs. Mark it as ro_after_init.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Suggested-by: mathieu.desnoy...@efficios.com
Cc: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 8b9631e789f0..be980aaa8804 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3320,6 +3320,12 @@ static const char * const ro_after_init_sections[] = {
 * by the SRCU notifiers
 */
"___srcu_struct_ptrs",
+
+   /*
+* Array of tracepoint pointers used for checking if tracepoints are
+* quiescent during unloading.
+*/
+   "__tracepoints_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v3 1/3] module: Prepare for addition of new ro_after_init sections

2019-04-10 Thread Joel Fernandes (Google)
For the purpose of hardening modules by marking more sections as
ro_after_init, prepare for the addition of new ro_after_init entries, which
we do in future patches. Create a table to which new entries can be added
later. This makes it less error prone and reduces code duplication.

Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Suggested-by: keesc...@chromium.org
Reviewed-by: keesc...@chromium.org
Acked-by: rost...@goodmis.org
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/module.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index 524da609c884..42e4e289d6c7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3300,11 +3300,27 @@ static bool blacklisted(const char *module_name)
 }
 core_param(module_blacklist, module_blacklist, charp, 0400);
 
+/*
+ * Mark ro_after_init section with SHF_RO_AFTER_INIT so that
+ * layout_sections() can put it in the right place.
+ * Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
+ */
+static const char * const ro_after_init_sections[] = {
+   ".data..ro_after_init",
+
+   /*
+* __jump_table structures are never modified, with the exception of
+* entries that refer to code in the __init section, which are
+* annotated as such at module load time.
+*/
+   "__jump_table",
+};
+
 static struct module *layout_and_allocate(struct load_info *info, int flags)
 {
struct module *mod;
unsigned int ndx;
-   int err;
+   int err, i;
 
err = check_modinfo(info->mod, info, flags);
if (err)
@@ -3319,23 +3335,12 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* We will do a special allocation for per-cpu sections later. */
info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
 
-   /*
-* Mark ro_after_init section with SHF_RO_AFTER_INIT so that
-* layout_sections() can put it in the right place.
-* Note: ro_after_init sections also have SHF_{WRITE,ALLOC} set.
-*/
-   ndx = find_sec(info, ".data..ro_after_init");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
-   /*
-* Mark the __jump_table section as ro_after_init as well: these data
-* structures are never modified, with the exception of entries that
-* refer to code in the __init section, which are annotated as such
-* at module load time.
-*/
-   ndx = find_sec(info, "__jump_table");
-   if (ndx)
-   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   /* Set sh_flags for read-only after init sections */
+   for (i = 0; i < ARRAY_SIZE(ro_after_init_sections); i++) {
+   ndx = find_sec(info, ro_after_init_sections[i]);
+   if (ndx)
+   info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT;
+   }
 
/* Determine total sizes, and put offsets in sh_entsize.  For now
   this is done generically; there doesn't appear to be any
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v3 2/3] module: Make srcu_struct ptr array as read-only post init

2019-04-10 Thread Joel Fernandes (Google)
Since the commit titled ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array of
srcu_struct pointers which is used by the srcu code to initialize and clean
up these structures.

There is no reason for this array of pointers to be writable, and leaving it
writable can cause security or other hidden bugs. Mark it as read-only after
the module init has completed.

Suggested-by: paul...@linux.vnet.ibm.com
Suggested-by: keesc...@chromium.org
Acked-by: keesc...@chromium.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/module.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 42e4e289d6c7..8b9631e789f0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3314,6 +3314,12 @@ static const char * const ro_after_init_sections[] = {
 * annotated as such at module load time.
 */
"__jump_table",
+
+   /*
+* Used for SRCU structures which need to be initialized/cleaned up
+* by the SRCU notifiers
+*/
+   "___srcu_struct_ptrs",
 };
 
 static struct module *layout_and_allocate(struct load_info *info, int flags)
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH RFC 2/2] Add selftests for pidfd polling

2019-04-11 Thread Joel Fernandes (Google)
Other than verifying pidfd-based polling, the tests make sure that the wait
semantics are preserved with the pidfd poll. Notably the 2 cases:
1. If a thread group leader exits while other threads are still running,
   then no pidfd poll notification should happen.
2. If a non-thread-group leader does an execve, then the thread group leader
   is signaled to exit and is replaced with the execing thread as the new
   leader; however, the parent is not notified in this case.

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/pidfd/Makefile |   2 +-
 tools/testing/selftests/pidfd/pidfd_test.c | 216 -
 2 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index deaf8073bc06..4b31c14f273c 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -1,4 +1,4 @@
-CFLAGS += -g -I../../../../usr/include/
+CFLAGS += -g -I../../../../usr/include/ -lpthread
 
 TEST_GEN_PROGS := pidfd_test
 
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
index d59378a93782..4d5206280091 100644
--- a/tools/testing/selftests/pidfd/pidfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -4,18 +4,26 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
+#include 
 #include 
 
 #include "../kselftest.h"
 
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
+#define MAX_EVENTS 5
+#define __NR_pidfd_send_signal 424
+
 static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
unsigned int flags)
 {
@@ -30,6 +38,22 @@ static void set_signal_received_on_sigusr1(int sig)
signal_received = 1;
 }
 
+static int open_pidfd(const char *test_name, pid_t pid)
+{
+   char buf[256];
+   int pidfd;
+
+   snprintf(buf, sizeof(buf), "/proc/%d", pid);
+   pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+
+   if (pidfd < 0)
+   ksft_exit_fail_msg(
+   "%s test: Failed to open process file descriptor\n",
+   test_name);
+
+   return pidfd;
+}
+
 /*
  * Straightforward test to see whether pidfd_send_signal() works is to send
  * a signal to ourself.
@@ -87,7 +111,6 @@ static int wait_for_pid(pid_t pid)
 static int test_pidfd_send_signal_exited_fail(void)
 {
int pidfd, ret, saved_errno;
-   char buf[256];
pid_t pid;
const char *test_name = "pidfd_send_signal signal exited process";
 
@@ -99,17 +122,10 @@ static int test_pidfd_send_signal_exited_fail(void)
if (pid == 0)
_exit(EXIT_SUCCESS);
 
-   snprintf(buf, sizeof(buf), "/proc/%d", pid);
-
-   pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+   pidfd = open_pidfd(test_name, pid);
 
(void)wait_for_pid(pid);
 
-   if (pidfd < 0)
-   ksft_exit_fail_msg(
-   "%s test: Failed to open process file descriptor\n",
-   test_name);
-
ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
saved_errno = errno;
close(pidfd);
@@ -368,10 +384,192 @@ static int test_pidfd_send_signal_syscall_support(void)
return 0;
 }
 
+void *test_pidfd_poll_exec_thread(void *priv)
+{
+   char waittime[256];
+
+   ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
+   getpid(), syscall(SYS_gettid));
+   ksft_print_msg("Child Thread: doing exec of sleep\n");
+
+   sprintf(waittime, "%d", CHILD_THREAD_MIN_WAIT);
+   execl("/bin/sleep", "sleep", waittime, (char *)NULL);
+
+   ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
+   getpid(), syscall(SYS_gettid));
+   return NULL;
+}
+
+static int poll_pidfd(const char *test_name, int pidfd)
+{
+   int c;
+   int epoll_fd = epoll_create1(0);
+   struct epoll_event event, events[MAX_EVENTS];
+
+   if (epoll_fd == -1)
+   ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor\n",
+  test_name);
+
+   event.events = EPOLLIN;
+   event.data.fd = pidfd;
+
+   if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
+   ksft_print_msg("%s test: Failed to add epoll file descriptor: Skipping\n",
+  test_name);
+   _exit(PIDFD_SKIP);
+   }
+
+   c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
+   if (c != 1 || !(events[0].events & EPOLLIN))
+   ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x)\n",
+  test_name, c, events[0].events);

[PATCH RFC 1/2] Add polling support to pidfd

2019-04-11 Thread Joel Fernandes (Google)
pidfds are /proc/pid directory file descriptors referring to a task group
leader. The Android low memory killer (LMK) needs pidfd polling support to
replace code that currently checks for the existence of /proc/pid to know
whether a process that was signalled to be killed has died, which is both
racy and slow. The pidfd poll approach is race-free, and also allows the LMK
to do other things (such as polling on other fds) while waiting for the
process being killed to die.

It prevents a situation where a PID is reused between when LMK sends a kill
signal and when it checks for the existence of the PID, since in that case
the wrong PID could be checked for existence.

In this patch, we follow the same mechanism used when the parent of the task
group is to be notified: the tasks waiting on a poll of the pidfd are also
awakened at that point.

We have decided to include the waitqueue in struct pid for the following
reasons:
1. The wait queue has to survive for the lifetime of the poll. Including it
in task_struct would not be an option in this case because the task can be
reaped and destroyed before the poll returns.

2. Keeping the waitqueue in struct pid means that, during exec, the thread
doing de_thread() automatically gets the new waitqueue/pid even though its
task_struct is different.

Appropriate test cases are added in the second patch to provide coverage
of all the cases the patch is handling.

Andy had a similar patch [1] in the past which was a good reference; however,
this patch tries to handle the various situations related to thread group
existence properly, and how/where it notifies. It also solves other bugs
(task_struct existence). Daniel had a similar patch [2] recently which this
patch supersedes.

[1] https://lore.kernel.org/patchwork/patch/345098/
[2] https://lore.kernel.org/lkml/20181029175322.189042-1-dan...@google.com/

Cc: l...@amacapital.net
Cc: rost...@goodmis.org
Cc: dan...@google.com
Cc: christ...@brauner.io
Cc: ja...@google.com
Cc: sur...@google.com
Cc: torva...@linux-foundation.org
Co-developed-by: Daniel Colascione 
Signed-off-by: Joel Fernandes (Google) 

---
 fs/proc/base.c  | 39 +++
 include/linux/pid.h |  3 +++
 kernel/exit.c   |  1 -
 kernel/pid.c|  2 ++
 kernel/signal.c | 14 ++
 5 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6a803a0b75df..879900082647 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3069,8 +3069,47 @@ static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
 }
 
+static unsigned int proc_tgid_base_poll(struct file *file, struct poll_table_struct *pts)
+{
+   int poll_flags = 0;
+   struct task_struct *task;
+   struct pid *pid;
+
+   task = get_proc_task(file->f_path.dentry->d_inode);
+
+   WARN_ON_ONCE(task && !thread_group_leader(task));
+
+   /*
+* tasklist_lock must be held because to avoid racing with
+* changes in exit_state and wake up. Basically to avoid:
+*
+* P0: read exit_state = 0
+* P1: write exit_state = EXIT_DEAD
+* P1: Do a wake up - wq is empty, so do nothing
+* P0: Queue for polling - wait forever.
+*/
+   read_lock(&tasklist_lock);
+   if (!task)
+   poll_flags = POLLIN | POLLRDNORM | POLLERR;
+   else if (task->exit_state == EXIT_DEAD)
+   poll_flags = POLLIN | POLLRDNORM;
+   else if (task->exit_state == EXIT_ZOMBIE && thread_group_empty(task))
+   poll_flags = POLLIN | POLLRDNORM;
+
+   if (!poll_flags) {
+   pid = proc_pid(file->f_path.dentry->d_inode);
+   poll_wait(file, &pid->wait_pidfd, pts);
+   }
+   read_unlock(&tasklist_lock);
+
+   if (task)
+   put_task_struct(task);
+   return poll_flags;
+}
+
 static const struct file_operations proc_tgid_base_operations = {
.read   = generic_read_dir,
+   .poll   = proc_tgid_base_poll,
.iterate_shared = proc_tgid_base_readdir,
.llseek = generic_file_llseek,
 };
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b6f4ba16065a..2e0dcbc6d14e 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include 
+#include 
 
 enum pid_type
 {
@@ -60,6 +61,8 @@ struct pid
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
+   /* wait queue for pidfd pollers */
+   wait_queue_head_t wait_pidfd;
struct rcu_head rcu;
struct upid numbers[1];
 };
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..c386ec52687d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -181,7 +181,6 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(tsk);
 }
 
-
 vo
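
For illustration, a minimal userspace sketch (not part of the patch) of the
usage described above: open /proc/<pid> as a pidfd and block in epoll until
the process exits; this mirrors what a low memory killer could do:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	struct epoll_event ev = { .events = EPOLLIN };
	char path[64];
	int pidfd, epfd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return EXIT_FAILURE;
	}
	snprintf(path, sizeof(path), "/proc/%s", argv[1]);
	pidfd = open(path, O_DIRECTORY | O_CLOEXEC);
	if (pidfd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}

	epfd = epoll_create1(0);
	ev.data.fd = pidfd;
	epoll_ctl(epfd, EPOLL_CTL_ADD, pidfd, &ev);

	/* With this patch, returns once the process is dead (EPOLLIN set) */
	if (epoll_wait(epfd, &ev, 1, -1) == 1 && (ev.events & EPOLLIN))
		printf("process %s has exited\n", argv[1]);
	return 0;
}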

[PATCH] srcu: Remove unused vmlinux srcu linker entries

2019-04-07 Thread Joel Fernandes (Google)
The SRCU-for-modules optimization introduced vmlinux linker entries which
are unused, since they apply only to the built-in vmlinux. So remove them to
prevent any space usage due to the 8-byte alignment they added.

Tested with SRCU torture_type and rcutorture. Added prints to the module
loader to confirm it is able to find and initialize the srcu structures.

Cc: kernel-t...@android.com
Cc: paul...@linux.vnet.ibm.com
Signed-off-by: Joel Fernandes (Google) 

---
 include/asm-generic/vmlinux.lds.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c2d919a1566e..f8f6f04c4453 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -338,10 +338,6 @@
KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \
__stop___tracepoints_ptrs = .;  \
*(__tracepoints_strings)/* Tracepoints: strings */  \
-   . = ALIGN(8);   \
-   __start___srcu_struct = .;  \
-   *(___srcu_struct_ptrs)  \
-   __end___srcu_struct = .;\
}   \
\
.rodata1  : AT(ADDR(.rodata1) - LOAD_OFFSET) {  \
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH v2] srcu: Remove unused vmlinux srcu linker entries

2019-04-07 Thread Joel Fernandes (Google)
The SRCU-for-modules optimization (commit title "srcu: Allocate per-CPU data
for DEFINE_SRCU() in modules") introduced vmlinux linker entries which are
unused, since they apply only to the built-in vmlinux. So remove them to
prevent any space usage due to the 8-byte alignment they added.
vmlinux.lds.h has no effect on module loading and is not used for building
the module object, so the changes were not needed in the first place, since
the optimization is specific to modules.

Tested with SRCU torture_type and rcutorture. Added prints to the module
loader to confirm it is able to find and initialize the srcu structures.

Cc: Josh Triplett 
Cc: Steven Rostedt 
Cc: Mathieu Desnoyers 
Cc: Lai Jiangshan 
Cc: kernel-t...@android.com
Cc: paul...@linux.vnet.ibm.com
Signed-off-by: Joel Fernandes (Google) 
---
v1->v2: Added more context to change log.

 include/asm-generic/vmlinux.lds.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c2d919a1566e..f8f6f04c4453 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -338,10 +338,6 @@
KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \
__stop___tracepoints_ptrs = .;  \
*(__tracepoints_strings)/* Tracepoints: strings */  \
-   . = ALIGN(8);   \
-   __start___srcu_struct = .;  \
-   *(___srcu_struct_ptrs)  \
-   __end___srcu_struct = .;\
}   \
\
.rodata1  : AT(ADDR(.rodata1) - LOAD_OFFSET) {  \
-- 
2.21.0.392.gf8f6787159e-goog


[PATCH v2 0/4] RCU fixes for rcu_assign_pointer() usage

2019-03-20 Thread Joel Fernandes (Google)
This is just a resend with scheduler patches split from the driver fixes and
Paul's Reviewed-by(s) added.

These patches fix various sparse errors caused by the recent check that adds
rcu_check_sparse() to rcu_assign_pointer(). The errors are due to missing
annotations. The annotations added in the series can also help avoid future
incorrect usages and bugs, so they are a good idea to add in any case.

RFC v1 -> Patch v2:
Made changes based on Peter Zijlstra review.

Joel Fernandes (Google) (4):
sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu
sched_domain: Annotate RCU pointers properly
rcuwait: Annotate task_struct with __rcu
sched: Annotate perf_domain pointer with __rcu

include/linux/rcuwait.h|  2 +-
include/linux/sched/topology.h |  4 ++--
kernel/sched/cpufreq.c |  2 +-
kernel/sched/sched.h   | 18 +-
kernel/sched/topology.c| 10 +-
5 files changed, 18 insertions(+), 18 deletions(-)

--
2.21.0.225.g810b269d1ac-goog
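
For illustration, a sketch (not from these patches) of the annotation pattern
they apply: marking a shared pointer __rcu lets sparse verify that it is only
accessed through the RCU accessors. The struct, variable, and function names
are illustrative:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct cfg {
	int val;
	struct rcu_head rcu;
};

static struct cfg __rcu *cur_cfg;	/* __rcu: checked by sparse */
static DEFINE_SPINLOCK(cfg_lock);

static int cfg_read_val(void)
{
	struct cfg *c;
	int val = -1;

	rcu_read_lock();
	c = rcu_dereference(cur_cfg);	/* reader side */
	if (c)
		val = c->val;
	rcu_read_unlock();
	return val;
}

static void cfg_update(struct cfg *newc)
{
	struct cfg *old;

	spin_lock(&cfg_lock);
	old = rcu_dereference_protected(cur_cfg, lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(cur_cfg, newc);	/* sparse warns if cur_cfg lacked __rcu */
	spin_unlock(&cfg_lock);
	if (old)
		kfree_rcu(old, rcu);		/* free after a grace period */
}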



[PATCH v2 2/4] sched_domain: Annotate RCU pointers properly

2019-03-20 Thread Joel Fernandes (Google)
The scheduler uses the RCU API in various places to access sched_domain
pointers. These cause sparse errors, as shown below.

Many new errors show up because of an annotation check I added to
rcu_assign_pointer(). Let us annotate the pointers correctly, which will
also help sparse catch potential future bugs.

This fixes the following sparse errors:

rt.c:1681:9: error: incompatible types in comparison expression
deadline.c:1904:9: error: incompatible types in comparison expression
core.c:519:9: error: incompatible types in comparison expression
core.c:1634:17: error: incompatible types in comparison expression
fair.c:6193:14: error: incompatible types in comparison expression
fair.c:9883:22: error: incompatible types in comparison expression
fair.c:9897:9: error: incompatible types in comparison expression
sched.h:1287:9: error: incompatible types in comparison expression
topology.c:612:9: error: incompatible types in comparison expression
topology.c:615:9: error: incompatible types in comparison expression
sched.h:1300:9: error: incompatible types in comparison expression
topology.c:618:9: error: incompatible types in comparison expression
sched.h:1287:9: error: incompatible types in comparison expression
topology.c:621:9: error: incompatible types in comparison expression
sched.h:1300:9: error: incompatible types in comparison expression
topology.c:624:9: error: incompatible types in comparison expression
topology.c:671:9: error: incompatible types in comparison expression
stats.c:45:17: error: incompatible types in comparison expression
fair.c:5998:15: error: incompatible types in comparison expression
fair.c:5989:15: error: incompatible types in comparison expression
fair.c:5998:15: error: incompatible types in comparison expression
fair.c:5989:15: error: incompatible types in comparison expression
fair.c:6120:19: error: incompatible types in comparison expression
fair.c:6506:14: error: incompatible types in comparison expression
fair.c:6515:14: error: incompatible types in comparison expression
fair.c:6623:9: error: incompatible types in comparison expression
fair.c:5970:17: error: incompatible types in comparison expression
fair.c:8642:21: error: incompatible types in comparison expression
fair.c:9253:9: error: incompatible types in comparison expression
fair.c:9331:9: error: incompatible types in comparison expression
fair.c:9519:15: error: incompatible types in comparison expression
fair.c:9533:14: error: incompatible types in comparison expression
fair.c:9542:14: error: incompatible types in comparison expression
fair.c:9567:14: error: incompatible types in comparison expression
fair.c:9597:14: error: incompatible types in comparison expression
fair.c:9421:16: error: incompatible types in comparison expression
fair.c:9421:16: error: incompatible types in comparison expression

[From an RCU perspective]
Reviewed-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/sched/topology.h |  4 ++--
 kernel/sched/sched.h   | 14 +++---
 kernel/sched/topology.c| 10 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 57c7ed3fe465..cfc0a89a7159 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -76,8 +76,8 @@ struct sched_domain_shared {
 
 struct sched_domain {
/* These fields must be setup */
-   struct sched_domain *parent;/* top domain must be null terminated */
-   struct sched_domain *child; /* bottom domain must be null 
terminated */
+   struct sched_domain __rcu *parent;  /* top domain must be null 
terminated */
+   struct sched_domain __rcu *child;   /* bottom domain must be null 
terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 713715dd00cf..2b452d68ab2e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -869,8 +869,8 @@ struct rq {
atomic_tnr_iowait;
 
 #ifdef CONFIG_SMP
-   struct root_domain  *rd;
-   struct sched_domain *sd;
+   struct root_domain  *rd;
+   struct sched_domain __rcu   *sd;
 
unsigned long   cpu_capacity;
unsigned long   cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain 
*lowest_flag_domain(int cpu, int flag)
return sd;
 }
 
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing

[PATCH v2 3/4] rcuwait: Annotate task_struct with __rcu

2019-03-20 Thread Joel Fernandes (Google)
This suppresses a sparse error generated due to the recently added
rcu_assign_pointer sparse check.

percpu-rwsem.c:162:9: sparse: error: incompatible types in comparison expression
exit.c:316:16: sparse: error: incompatible types in comparison expression

[From an RCU perspective]
Reviewed-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rcuwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 90bfa3279a01..563290fc194f 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -18,7 +18,7 @@
  * awoken.
  */
 struct rcuwait {
-   struct task_struct *task;
+   struct task_struct __rcu *task;
 };
 
 #define __RCUWAIT_INITIALIZER(name)\
-- 
2.21.0.225.g810b269d1ac-goog



[PATCH v2 1/4] sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu

2019-03-20 Thread Joel Fernandes (Google)
Recently I added an RCU annotation check to rcu_assign_pointer(). All
pointers assigned to RCU protected data are to be annotated with __rcu
in order to be able to use rcu_assign_pointer(), similar to the checks
in other RCU APIs.

This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse:
error: incompatible types in comparison expression (different address
spaces)

Fix this by annotating the cpufreq_update_util_data pointer with __rcu.
This will also help sparse catch any future RCU misuse bugs.

[From an RCU perspective]
Reviewed-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/cpufreq.c | 2 +-
 kernel/sched/sched.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 835671f0f917..b5dcd1d83c7f 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
  */
 #include "sched.h"
 
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efa686eeff26..713715dd00cf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_update_util - Take a note about CPU utilization changes.
-- 
2.21.0.225.g810b269d1ac-goog



[PATCH v2 4/4] sched: Annotate perf_domain pointer with __rcu

2019-03-20 Thread Joel Fernandes (Google)
This fixes the following sparse errors in sched/fair.c:

fair.c:6506:14: error: incompatible types in comparison expression
fair.c:8642:21: error: incompatible types in comparison expression

Using __rcu will also help sparse catch any future bugs.

[From an RCU perspective]
Reviewed-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b452d68ab2e..b52ed1ada0be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
 * NULL-terminated list of performance domains intersecting with the
 * CPUs of the rd. Protected by RCU.
 */
-   struct perf_domain  *pd;
+   struct perf_domain __rcu *pd;
 };
 
 extern struct root_domain def_root_domain;
-- 
2.21.0.225.g810b269d1ac-goog



[PATCH] module: Make srcu_struct ptr array as read-only

2019-04-11 Thread Joel Fernandes (Google)
Since the commit ("srcu: Allocate per-CPU data for DEFINE_SRCU() in
modules"), modules that call DEFINE_{STATIC,}SRCU will have a new array
of srcu_struct pointers, which is used by SRCU code to initialize and
clean up these structures and to save valuable per-cpu reserved space.

There is no reason for this array of pointers to be writable, and leaving
it writable can cause security or other hidden bugs. Mark it read-only
after module init has completed.

Tested with the following diff to ensure the array is not writable:

(diff is a bit reduced to avoid patch command getting confused)
 a/kernel/module.c
 b/kernel/module.c
  -3506,6 +3506,14  static noinline int do_init_module [snip]
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
 #endif
module_enable_ro(mod, true);
+
+   if (mod->srcu_struct_ptrs) {
+   // Check if srcu_struct_ptrs access is possible
+   char x = *(char *)mod->srcu_struct_ptrs;
+   *(char *)mod->srcu_struct_ptrs = 0;
+   *(char *)mod->srcu_struct_ptrs = x;
+   }
+
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
module_arch_freeing_init(mod);

Cc: Rasmus Villemoes 
Cc: paul...@linux.vnet.ibm.com
Cc: rost...@goodmis.org
Cc: mathieu.desnoy...@efficios.com
Cc: r...@vger.kernel.org
Cc: kernel-harden...@lists.openwall.com
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 

---
This single patch supersedes the patches at:
https://lore.kernel.org/patchwork/patch/1060298/

 include/linux/srcutree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h
index 8af1824c46a8..9cfcc8a756ae 100644
--- a/include/linux/srcutree.h
+++ b/include/linux/srcutree.h
@@ -123,7 +123,7 @@ struct srcu_struct {
 #ifdef MODULE
 # define __DEFINE_SRCU(name, is_static)
\
is_static struct srcu_struct name;  \
-   struct srcu_struct *__srcu_struct_##name\
+   struct srcu_struct * const __srcu_struct_##name \
__section("___srcu_struct_ptrs") = &name
 #else
 # define __DEFINE_SRCU(name, is_static)
\
-- 
2.21.0.392.gf8f6787159e-goog


[RFC 1/2] lockdep: Add assertion to check if in an interrupt

2019-03-22 Thread Joel Fernandes (Google)
In rcu_rrupt_from_idle(), we want to check that it is called from within
an interrupt, but we want to do such checking only for debug builds.
lockdep already tracks when we enter an interrupt. Let us expose that as
an assertion macro so it can be used to assert this.
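
For context, the intended use is a one-line debug assertion at the top of
code that must only run in hard-interrupt context; a hedged sketch of a
caller (the function below is made up for illustration):

/* Requires <linux/lockdep.h>; compiles away when lockdep is disabled. */
static void example_hardirq_only(void)
{
	/* Warn once if we are not in hardirq context. */
	lockdep_assert_in_irq();

	/* ... work that is only valid from a hard interrupt ... */
}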

Suggested-by: Steven Rostedt 
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/lockdep.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index c5335df2372f..d24f564823d3 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -601,11 +601,18 @@ do {  
\
  "IRQs not disabled as expected\n");   \
} while (0)
 
+#define lockdep_assert_in_irq() do {   \
+   WARN_ONCE(debug_locks && !current->lockdep_recursion && \
+ !current->hardirq_context,\
+ "Not in hardirq as expected\n");  \
+   } while (0)
+
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
+# define lockdep_assert_in_irq() do { } while (0)
 #endif
 
 #ifdef CONFIG_LOCKDEP
-- 
2.21.0.392.gf8f6787159e-goog



[RFC 2/2] rcutree: Add checks for dynticks counters in rcu_is_cpu_rrupt_from_idle

2019-03-22 Thread Joel Fernandes (Google)
In the future we would like to combine the dynticks and dynticks_nesting
counters, thus simplifying the code. At the moment we cannot do that due
to concerns about usermode upcalls appearing to RCU as half of an
interrupt. Byungchul tried to do it in [1] but the "half-interrupt"
concern was raised. It is "half" because what RCU expects is an
rcu_irq_enter() and rcu_irq_exit() pair when the usermode exception
happens, yet only rcu_irq_enter() is observed. This concern may not be
valid anymore, but at least it used to be the case.
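
To make the pairing expectation concrete, a hedged illustration of what RCU
normally sees versus the worrying "half" case:

/* Normal interrupt: RCU sees a matched pair. */
rcu_irq_enter();
/* ... handler runs ... */
rcu_irq_exit();

/* The usermode-upcall concern: only the first half is observed. */
rcu_irq_enter();
/* ... no matching rcu_irq_exit() ... */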

Out of an abundance of caution, Paul added warnings [2] in the RCU code
which, if not fired by 2021, may allow us to assume that such a
half-interrupt scenario cannot happen any more, which can lead to
simplification of this code.

Summary of the changes are the following:

(1) In preparation for this combination of counters in the future, we
first need to be sure that rcu_rrupt_from_idle cannot be called from
anywhere but a hard interrupt, because the comments previously suggested
otherwise, so let us be sure. We discussed this here [3]. We use the
services of lockdep to accomplish this.

(2) Further, rcu_rrupt_from_idle() is not explicit about how it uses the
counters, which can lead to weird future bugs. This patch therefore makes
the specific counter values being tested explicit.

(3) Lastly, we check for counter underflows just to be sure these are
not happening, because the previous code in rcu_rrupt_from_idle() allowed
cases where the counters could underflow while the function would still
return true. Now that we are checking for specific values, let us gain
confidence through additional checking that such underflows do not
happen. In any case, if they do, we should fix them, and the screaming
warning is appropriate. All these checks are NOOPs if PROVE_RCU and
PROVE_LOCKING are disabled.

[1] https://lore.kernel.org/patchwork/patch/952349/
[2] Commit e11ec65cc8d6 ("rcu: Add warning to detect half-interrupts")
[3] https://lore.kernel.org/lkml/20190312150514.gb249...@google.com/

Cc: byungchul.p...@lge.com
Cc: kernel-t...@android.com
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..d94c8ed29f6b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -381,16 +381,29 @@ static void __maybe_unused 
rcu_momentary_dyntick_idle(void)
 }
 
 /**
- * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from 
idle
+ * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle
  *
- * If the current CPU is idle or running at a first-level (not nested)
+ * If the current CPU is idle and running at a first-level (not nested)
  * interrupt from idle, return true.  The caller must have at least
  * disabled preemption.
  */
 static int rcu_is_cpu_rrupt_from_idle(void)
 {
-   return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 &&
-  __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1;
+   /* Called only from within the scheduling-clock interrupt */
+   lockdep_assert_in_irq();
+
+   /* Check for counter underflows */
+   RCU_LOCKDEP_WARN(
+   (__this_cpu_read(rcu_data.dynticks_nesting) < 0) &&
+   (__this_cpu_read(rcu_data.dynticks_nmi_nesting) < 0),
+   "RCU dynticks nesting counters underflow!");
+
+   /* Are we at first interrupt nesting level? */
+   if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1)
+   return false;
+
+   /* Does CPU appear to be idle from an RCU standpoint? */
+   return __this_cpu_read(rcu_data.dynticks_nesting) == 0;
 }
 
 #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */
-- 
2.21.0.392.gf8f6787159e-goog



[PATCH] rcutorture: Select from only online CPUs

2019-03-22 Thread Joel Fernandes (Google)
The rcutorture jitter.sh script selects a random CPU but does not check
whether it is online or offline. This leads to frequent taskset errors.
On my machine, hyper-threading is disabled, so half the cores are offline,
causing taskset errors much of the time. Let us fix this by selecting
only from the CPUs that are currently online.

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/rcutorture/bin/jitter.sh | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh 
b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 3633828375e3..53bf9d99b5cd 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -47,10 +47,19 @@ do
exit 0;
fi
 
-   # Set affinity to randomly selected CPU
+   # Set affinity to randomly selected online CPU
cpus=`ls /sys/devices/system/cpu/*/online |
sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//' |
grep -v '^0*$'`
+
+   for c in $cpus; do
+   if [ "$(cat /sys/devices/system/cpu/cpu$c/online)" == "1" ];
+   then
+   cpus_tmp="$cpus_tmp $c"
+   fi
+   done
+   cpus=$cpus_tmp
+
cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
srand(n + me + systime());
ncpus = split(cpus, ca);
-- 
2.21.0.392.gf8f6787159e-goog



[RFC 2/5] ixgbe: Fix incorrect RCU API usage

2019-02-19 Thread Joel Fernandes (Google)
From: Joel Fernandes 

Recently, I added an RCU annotation check in rcu_assign_pointer. This
caused a sparse error to be reported by the ixgbe driver.

Further looking, it seems the adapter->xdp_prog pointer is not annotated
with __rcu. Annotating it fixed the error, but caused a bunch of other
warnings.

This patch tries to fix all warnings by using RCU API properly. This
makes sense to do because not using RCU properly can result in various
hard to find bugs. This is a best effort fix and is only build tested.
The sparse errors and warnings go away with the change. I request
maintainers / developers in this area to test it properly.
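
The reader side of the fix follows the standard shape once xdp_prog carries
__rcu; a hedged, simplified sketch (not the driver's exact code):

static u32 run_xdp_sketch(struct ixgbe_ring *rx_ring, struct xdp_buff *xdp)
{
	struct bpf_prog *prog;
	u32 act = XDP_PASS;

	rcu_read_lock();
	prog = rcu_dereference(rx_ring->xdp_prog);	/* instead of READ_ONCE() */
	if (prog)
		act = bpf_prog_run_xdp(prog, xdp);
	rcu_read_unlock();

	return act;
}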

Signed-off-by: Joel Fernandes 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 16 +++-
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 08d85e336bd4..3b14daf27516 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -311,7 +311,7 @@ struct ixgbe_ring {
struct ixgbe_ring *next;/* pointer to next ring in q_vector */
struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
struct net_device *netdev;  /* netdev ring belongs to */
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct device *dev; /* device for DMA mapping */
void *desc; /* descriptor ring memory */
union {
@@ -560,7 +560,7 @@ struct ixgbe_adapter {
unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
/* OS defined structs */
struct net_device *netdev;
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct pci_dev *pdev;
struct mii_bus *mii_bus;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index daff8183534b..aad7b800aacd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2199,7 +2199,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter 
*adapter,
u32 act;
 
rcu_read_lock();
-   xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+   xdp_prog = rcu_dereference(rx_ring->xdp_prog);
 
if (!xdp_prog)
goto xdp_out;
@@ -6547,7 +6547,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter 
*adapter,
 rx_ring->queue_index) < 0)
goto err;
 
-   rx_ring->xdp_prog = adapter->xdp_prog;
+   rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog);
 
return 0;
 err:
@@ -10246,7 +10246,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
if (nr_cpu_ids > MAX_XDP_QUEUES)
return -ENOMEM;
 
-   old_prog = xchg(&adapter->xdp_prog, prog);
+   old_prog = rcu_dereference(adapter->xdp_prog);
+   rcu_assign_pointer(adapter->xdp_prog, prog);
 
/* If transitioning XDP modes reconfigure rings */
if (!!prog != !!old_prog) {
@@ -10271,13 +10272,18 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
 static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct bpf_prog *prog;
+   int ret;
 
switch (xdp->command) {
case XDP_SETUP_PROG:
return ixgbe_xdp_setup(dev, xdp->prog);
case XDP_QUERY_PROG:
-   xdp->prog_id = adapter->xdp_prog ?
-   adapter->xdp_prog->aux->id : 0;
+   rcu_read_lock();
+   prog = rcu_dereference(adapter->xdp_prog);
+   xdp->prog_id = prog ? prog->aux->id : 0;
+   rcu_read_unlock();
+
return 0;
case XDP_QUERY_XSK_UMEM:
return ixgbe_xsk_umem_query(adapter, &xdp->xsk.umem,
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[RFC 1/5] net: rtnetlink: Fix incorrect RCU API usage

2019-02-19 Thread Joel Fernandes (Google)
From: Joel Fernandes 

rtnl_register_internal() and rtnl_unregister_all() try to directly
dereference an RCU protected pointer outside of an RCU read-side section.
While this is OK to do since a lock is held, let us use the correct
API to avoid programmer bugs in the future.
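
For reference, rtnl_dereference() is the lock-held flavour of the accessor,
roughly (paraphrasing include/linux/rtnetlink.h) an rcu_dereference_protected()
keyed on the RTNL lock:

#define rtnl_dereference(p)					\
	rcu_dereference_protected(p, lockdep_rtnl_is_held())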

This also fixes sparse warnings arising from not using RCU API.

net/core/rtnetlink.c:332:13: warning: incorrect type in assignment (different address spaces)
net/core/rtnetlink.c:332:13:    expected struct rtnl_link **tab
net/core/rtnetlink.c:332:13:    got struct rtnl_link *[noderef] *

Signed-off-by: Joel Fernandes 
---
 net/core/rtnetlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ea1bed08ede..98be4b4818a9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -188,7 +188,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -329,7 +329,7 @@ void rtnl_unregister_all(int protocol)
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (!tab) {
rtnl_unlock();
return;
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[RFC 0/5] RCU fixes for rcu_assign_pointer usage

2019-02-19 Thread Joel Fernandes (Google)
These patches fix various RCU API usage issues found due to sparse errors as a
result of the recent check to add rcu_check_sparse() to rcu_assign_pointer().

This is at a very early RFC stage, and is only build tested. I am also only
sending it to the RCU group for initial review before sending to LKML.
Thanks for any feedback!

There are still more usages that cause errors, such as rbtree, which I am
looking into.

Joel Fernandes (5):
net: rtnetlink: Fix incorrect RCU API usage
ixgbe: Fix incorrect RCU API usage
sched/cpufreq: Fix incorrect RCU API usage
sched/topology: Use smp_store_release() instead of rcu_assign_pointer
rcuwait: Replace rcu_assign_pointer with smp_store_release

drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 16 +++-
include/linux/rcuwait.h   |  2 +-
kernel/sched/cpufreq.c|  8 ++--
kernel/sched/sched.h  |  2 +-
kernel/sched/topology.c   | 16 
net/core/rtnetlink.c  |  4 ++--
7 files changed, 31 insertions(+), 21 deletions(-)

--
2.21.0.rc0.258.g878e2cd30e-goog



[RFC 4/5] sched/topology: Use smp_store_release() instead of rcu_assign_pointer

2019-02-19 Thread Joel Fernandes (Google)
From: Joel Fernandes 

The scheduler's topology code seems to want to use rcu_assign_pointer()
to initialize various pointers for no apparent reason.

With a guess that what was needed here is smp_store_release(), I am
replacing it with that. This suppresses the new sparse errors caused by
an annotation check I added to rcu_assign_pointer(). Let us avoid
(ab)using the RCU API and be explicit about what we want.
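
For background, rcu_assign_pointer() is essentially a sparse-checked release
store, so where no reader uses rcu_dereference() a plain release store
expresses the same ordering without requiring an __rcu annotation. A hedged
sketch; struct thing and both pointers are made up for illustration:

struct thing;

static struct thing __rcu *rcu_ptr;	/* needed for rcu_assign_pointer() */
static struct thing *plain_ptr;		/* enough for smp_store_release() */

static void publish_examples(struct thing *p)
{
	rcu_assign_pointer(rcu_ptr, p);		/* release store + __rcu sparse check */
	smp_store_release(&plain_ptr, p);	/* same release ordering, no RCU semantics */
}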

Fixes sparse errors:
kernel//sched/topology.c:206:1: sparse: warning: symbol
'sched_energy_mutex' was not declared. Should it be static?
kernel//sched/topology.c:207:6: sparse: warning: symbol
'sched_energy_update' was not declared. Should it be static?  >>
kernel//sched/topology.c:378:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:387:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:612:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:615:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:618:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:621:9: sparse: error: incompatible types in
comparison expression (different address spaces)

Signed-off-by: Joel Fernandes 
---
 kernel/sched/topology.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3f35ba1d8fde..e7a424d8de8e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -375,7 +375,7 @@ static bool build_perf_domains(const struct cpumask 
*cpu_map)
 
/* Attach the new list of performance domains to the root domain. */
tmp = rd->pd;
-   rcu_assign_pointer(rd->pd, pd);
+   smp_store_release(&rd->pd, pd);
if (tmp)
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
 
@@ -384,7 +384,7 @@ static bool build_perf_domains(const struct cpumask 
*cpu_map)
 free:
free_pd(pd);
tmp = rd->pd;
-   rcu_assign_pointer(rd->pd, NULL);
+   smp_store_release(&rd->pd, NULL);
if (tmp)
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
 
@@ -609,19 +609,19 @@ static void update_top_cache_domain(int cpu)
sds = sd->shared;
}
 
-   rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+   smp_store_release(&per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
-   rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
+   smp_store_release(&per_cpu(sd_llc_shared, cpu), sds);
 
sd = lowest_flag_domain(cpu, SD_NUMA);
-   rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
+   smp_store_release(&per_cpu(sd_numa, cpu), sd);
 
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
-   rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
+   smp_store_release(&per_cpu(sd_asym_packing, cpu), sd);
 
sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
-   rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
+   smp_store_release(&per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
 /*
@@ -668,7 +668,7 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
 
rq_attach_root(rq, rd);
tmp = rq->sd;
-   rcu_assign_pointer(rq->sd, sd);
+   smp_store_release(&rq->sd, sd);
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
 
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[RFC 3/5] sched/cpufreq: Fix incorrect RCU API usage

2019-02-19 Thread Joel Fernandes (Google)
From: Joel Fernandes 

Recently I added an RCU annotation check to rcu_assign_pointer(). All
pointers assigned to RCU protected data are to be annotated with __rcu
in order to be able to use rcu_assign_pointer(), similar to the checks
in other RCU APIs.

This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse:
error: incompatible types in comparison expression (different address
spaces)

Fix this by using the correct APIs for RCU accesses. This will
potentially avoid any future bugs in the code. If it is felt that RCU
protection is not needed here, then the rcu_assign_pointer call can be
dropped and replaced with, say, WRITE_ONCE or smp_store_release. Or
maybe we add a new API to do it. But calling rcu_assign_pointer() seems
an abuse of the RCU API.

Signed-off-by: Joel Fernandes 
---
 kernel/sched/cpufreq.c | 8 ++--
 kernel/sched/sched.h   | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 22bd8980f32f..c9aeb3bf5dc2 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
  */
 #include "sched.h"
 
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
@@ -34,8 +34,12 @@ void cpufreq_add_update_util_hook(int cpu, struct 
update_util_data *data,
if (WARN_ON(!data || !func))
return;
 
-   if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+   rcu_read_lock();
+   if (WARN_ON(rcu_dereference(per_cpu(cpufreq_update_util_data, cpu)))) {
+   rcu_read_unlock();
return;
+   }
+   rcu_read_unlock();
 
data->func = func;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..2ab545d40381 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2166,7 +2166,7 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_update_util - Take a note about CPU utilization changes.
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[RFC 5/5] rcuwait: Replace rcu_assign_pointer with smp_store_release

2019-02-19 Thread Joel Fernandes (Google)
From: Joel Fernandes 

This suppresses a sparse error generated due to the recently added
rcu_assign_pointer sparse check:

>> kernel//locking/percpu-rwsem.c:162:9: sparse: error: incompatible
types in comparison expression (different address spaces)

Signed-off-by: Joel Fernandes 
---
 include/linux/rcuwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 90bfa3279a01..da613efee45c 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -44,7 +44,7 @@ extern void rcuwait_wake_up(struct rcuwait *w);
 */ \
WARN_ON(current->exit_state);   \
\
-   rcu_assign_pointer((w)->task, current); \
+   smp_store_release(&((w)->task), current);   \
for (;;) {  \
/*  \
 * Implicit barrier (A) pairs with (B) in   \
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 0/5] RCU fixes for rcu_assign_pointer() usage

2019-02-20 Thread Joel Fernandes (Google)
These patches fix various RCU API usage issues found due to sparse errors as a
result of the recent check to add rcu_check_sparse() to rcu_assign_pointer().
The errors in many cases seem to indicate either incorrect API usage or
missing annotations. The annotations added can also help avoid future incorrect
usages and bugs, so they are a good idea in any case.

These are only build/boot tested and I request for feedback from maintainers
and developers in the various areas the patches touch. Thanks for any feedback!

(There are still errors in rbtree.h but I have kept those for a later time
since fixing them is a bit more involved).

Joel Fernandes (Google) (5):
net: rtnetlink: Fix incorrect RCU API usage
ixgbe: Fix incorrect RCU API usage
sched/cpufreq: Fix incorrect RCU API usage
sched/topology: Annotate RCU pointers properly
rcuwait: Replace rcu_assign_pointer() with WRITE_ONCE

drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 -
include/linux/rcuwait.h   |  2 +-
kernel/sched/cpufreq.c|  8 ++--
kernel/sched/sched.h  | 14 +++---
kernel/sched/topology.c   | 12 ++--
net/core/rtnetlink.c  |  4 ++--
7 files changed, 36 insertions(+), 25 deletions(-)

--
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 2/5] ixgbe: Fix incorrect RCU API usage

2019-02-20 Thread Joel Fernandes (Google)
Recently, I added an RCU annotation check in rcu_assign_pointer. This
caused a sparse error to be reported by the ixgbe driver.

Further looking, it seems the adapter->xdp_prog pointer is not annotated
with __rcu. Annotating it fixed the error, but caused a bunch of other
warnings.

This patch tries to fix all warnings by using RCU API properly. This
makes sense to do because not using RCU properly can result in various
hard to find bugs. This is a best effort fix and is only build tested.
The sparse errors and warnings go away with the change. I request
maintainers / developers in this area to test it properly.

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 17 -
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 08d85e336bd4..3b14daf27516 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -311,7 +311,7 @@ struct ixgbe_ring {
struct ixgbe_ring *next;/* pointer to next ring in q_vector */
struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
struct net_device *netdev;  /* netdev ring belongs to */
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct device *dev; /* device for DMA mapping */
void *desc; /* descriptor ring memory */
union {
@@ -560,7 +560,7 @@ struct ixgbe_adapter {
unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
/* OS defined structs */
struct net_device *netdev;
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct pci_dev *pdev;
struct mii_bus *mii_bus;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index daff8183534b..6aa59bb13a14 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2199,7 +2199,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter 
*adapter,
u32 act;
 
rcu_read_lock();
-   xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+   xdp_prog = rcu_dereference(rx_ring->xdp_prog);
 
if (!xdp_prog)
goto xdp_out;
@@ -6547,7 +6547,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter 
*adapter,
 rx_ring->queue_index) < 0)
goto err;
 
-   rx_ring->xdp_prog = adapter->xdp_prog;
+   rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog);
 
return 0;
 err:
@@ -10246,7 +10246,10 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
if (nr_cpu_ids > MAX_XDP_QUEUES)
return -ENOMEM;
 
-   old_prog = xchg(&adapter->xdp_prog, prog);
+   rcu_read_lock();
+   old_prog = rcu_dereference(adapter->xdp_prog);
+   rcu_assign_pointer(adapter->xdp_prog, prog);
+   rcu_read_unlock();
 
/* If transitioning XDP modes reconfigure rings */
if (!!prog != !!old_prog) {
@@ -10271,13 +10274,17 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
 static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct bpf_prog *prog;
 
switch (xdp->command) {
case XDP_SETUP_PROG:
return ixgbe_xdp_setup(dev, xdp->prog);
case XDP_QUERY_PROG:
-   xdp->prog_id = adapter->xdp_prog ?
-   adapter->xdp_prog->aux->id : 0;
+   rcu_read_lock();
+   prog = rcu_dereference(adapter->xdp_prog);
+   xdp->prog_id = prog ? prog->aux->id : 0;
+   rcu_read_unlock();
+
return 0;
case XDP_QUERY_XSK_UMEM:
return ixgbe_xsk_umem_query(adapter, &xdp->xsk.umem,
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 4/5] sched/topology: Annotate RCU pointers properly

2019-02-20 Thread Joel Fernandes (Google)
The scheduler's topology code uses rcu_assign_pointer() to initialize
various pointers.

Let us annotate the pointers correctly, which will also help avoid future
bugs. This suppresses the new sparse errors caused by an annotation
check I added to rcu_assign_pointer().

Also replace rcu_assign_pointer call on rq->sd with WRITE_ONCE. This
should be sufficient for the rq->sd initialization.

This fixes sparse errors:
kernel//sched/topology.c:378:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:387:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:612:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:615:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:618:9: sparse: error: incompatible types in
comparison expression (different address spaces)
kernel//sched/topology.c:621:9: sparse: error: incompatible types in
comparison expression (different address spaces)

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/sched.h| 12 ++--
 kernel/sched/topology.c | 12 ++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2ab545d40381..806703afd4b0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
 * NULL-terminated list of performance domains intersecting with the
 * CPUs of the rd. Protected by RCU.
 */
-   struct perf_domain  *pd;
+   struct perf_domain __rcu *pd;
 };
 
 extern struct root_domain def_root_domain;
@@ -1305,13 +1305,13 @@ static inline struct sched_domain 
*lowest_flag_domain(int cpu, int flag)
return sd;
 }
 
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
 
 struct sched_group_capacity {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 3f35ba1d8fde..2eab2e16ded5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -586,13 +586,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
  * the cpumask of the domain), this allows us to quickly tell if
  * two CPUs are in the same cache domain, see cpus_share_cache().
  */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -668,7 +668,7 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
 
rq_attach_root(rq, rd);
tmp = rq->sd;
-   rcu_assign_pointer(rq->sd, sd);
+   WRITE_ONCE(rq->sd, sd);
dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
 
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 1/5] net: rtnetlink: Fix incorrect RCU API usage

2019-02-20 Thread Joel Fernandes (Google)
rtnl_register_internal() and rtnl_unregister_all() try to directly
dereference an RCU protected pointer outside of an RCU read-side section.
While this is OK to do since a lock is held, let us use the correct
API to avoid programmer bugs in the future.

This also fixes sparse warnings arising from not using RCU API.

net/core/rtnetlink.c:332:13: warning: incorrect type in assignment (different address spaces)
net/core/rtnetlink.c:332:13:    expected struct rtnl_link **tab
net/core/rtnetlink.c:332:13:    got struct rtnl_link *[noderef] *

Signed-off-by: Joel Fernandes (Google) 
---
 net/core/rtnetlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ea1bed08ede..98be4b4818a9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -188,7 +188,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -329,7 +329,7 @@ void rtnl_unregister_all(int protocol)
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (!tab) {
rtnl_unlock();
return;
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 3/5] sched/cpufreq: Fix incorrect RCU API usage

2019-02-20 Thread Joel Fernandes (Google)
Recently I added an RCU annotation check to rcu_assign_pointer(). All
pointers assigned to RCU protected data are to be annotated with __rcu
in order to be able to use rcu_assign_pointer(), similar to the checks
in other RCU APIs.

This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse:
error: incompatible types in comparison expression (different address
spaces)

Fix this by using the correct APIs for RCU accesses. This will
potentially avoid any future bugs in the code. If it is felt that RCU
protection is not needed here, then the rcu_assign_pointer call can be
dropped and replaced with, say, WRITE_ONCE or smp_store_release. Or
maybe we add a new API to do it. But calling rcu_assign_pointer() seems
an abuse of the RCU API unless RCU is actually being used.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/cpufreq.c | 8 ++--
 kernel/sched/sched.h   | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 22bd8980f32f..c9aeb3bf5dc2 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
  */
 #include "sched.h"
 
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
@@ -34,8 +34,12 @@ void cpufreq_add_update_util_hook(int cpu, struct 
update_util_data *data,
if (WARN_ON(!data || !func))
return;
 
-   if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+   rcu_read_lock();
+   if (WARN_ON(rcu_dereference(per_cpu(cpufreq_update_util_data, cpu)))) {
+   rcu_read_unlock();
return;
+   }
+   rcu_read_unlock();
 
data->func = func;
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..2ab545d40381 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2166,7 +2166,7 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_update_util - Take a note about CPU utilization changes.
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH RFC 5/5] rcuwait: Replace rcu_assign_pointer() with WRITE_ONCE

2019-02-20 Thread Joel Fernandes (Google)
This suppresses a sparse error generated due to the recently added
rcu_assign_pointer sparse check below. It seems WRITE_ONCE should be
sufficient here.
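
A hedged note on the distinction being relied on here: WRITE_ONCE() is a
single once-store with no ordering of earlier writes, unlike
rcu_assign_pointer()/smp_store_release(); the macro can use the weaker store
because the implicit barrier (A) just below the assignment already provides
the required ordering. Fragment for comparison only:

WRITE_ONCE((w)->task, current);		/* store only, no implied release */
smp_store_release(&(w)->task, current);	/* store + orders all prior writes */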

>> kernel//locking/percpu-rwsem.c:162:9: sparse: error: incompatible
types in comparison expression (different address spaces)

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rcuwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 90bfa3279a01..9e5b4760e6c2 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -44,7 +44,7 @@ extern void rcuwait_wake_up(struct rcuwait *w);
 */ \
WARN_ON(current->exit_state);   \
\
-   rcu_assign_pointer((w)->task, current); \
+   WRITE_ONCE((w)->task, current); \
for (;;) {  \
/*  \
 * Implicit barrier (A) pairs with (B) in   \
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 2/6] ixgbe: Fix incorrect RCU API usage

2019-02-22 Thread Joel Fernandes (Google)
Recently, I added an RCU annotation check in rcu_assign_pointer. This
caused a sparse error to be reported by the ixgbe driver.

Further looking, it seems the adapter->xdp_prog pointer is not annotated
with __rcu. Annotating it fixed the error, but caused a bunch of other
warnings.

This patch tries to fix all warnings by using RCU API properly. This
makes sense to do because not using RCU properly can result in various
hard to find bugs. This is a best effort fix and is only build tested.
The sparse errors and warnings go away with the change. I request
maintainers / developers in this area to review / test it properly.

The sparse error fixed is:
ixgbe_main.c:10256:25: error: incompatible types in comparison expression

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 15 ++-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 08d85e336bd4..3b14daf27516 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -311,7 +311,7 @@ struct ixgbe_ring {
struct ixgbe_ring *next;/* pointer to next ring in q_vector */
struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
struct net_device *netdev;  /* netdev ring belongs to */
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct device *dev; /* device for DMA mapping */
void *desc; /* descriptor ring memory */
union {
@@ -560,7 +560,7 @@ struct ixgbe_adapter {
unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
/* OS defined structs */
struct net_device *netdev;
-   struct bpf_prog *xdp_prog;
+   struct bpf_prog __rcu *xdp_prog;
struct pci_dev *pdev;
struct mii_bus *mii_bus;
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index daff8183534b..408a312aa6ba 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2199,7 +2199,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter 
*adapter,
u32 act;
 
rcu_read_lock();
-   xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+   xdp_prog = rcu_dereference(rx_ring->xdp_prog);
 
if (!xdp_prog)
goto xdp_out;
@@ -6547,7 +6547,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter 
*adapter,
 rx_ring->queue_index) < 0)
goto err;
 
-   rx_ring->xdp_prog = adapter->xdp_prog;
+   rcu_assign_pointer(rx_ring->xdp_prog, adapter->xdp_prog);
 
return 0;
 err:
@@ -10246,7 +10246,8 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
if (nr_cpu_ids > MAX_XDP_QUEUES)
return -ENOMEM;
 
-   old_prog = xchg(&adapter->xdp_prog, prog);
+   old_prog = rcu_access_pointer(adapter->xdp_prog);
+   rcu_assign_pointer(adapter->xdp_prog, prog);
 
/* If transitioning XDP modes reconfigure rings */
if (!!prog != !!old_prog) {
@@ -10271,13 +10272,17 @@ static int ixgbe_xdp_setup(struct net_device *dev, 
struct bpf_prog *prog)
 static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 {
struct ixgbe_adapter *adapter = netdev_priv(dev);
+   struct bpf_prog *prog;
 
switch (xdp->command) {
case XDP_SETUP_PROG:
return ixgbe_xdp_setup(dev, xdp->prog);
case XDP_QUERY_PROG:
-   xdp->prog_id = adapter->xdp_prog ?
-   adapter->xdp_prog->aux->id : 0;
+   rcu_read_lock();
+   prog = rcu_dereference(adapter->xdp_prog);
+   xdp->prog_id = prog ? prog->aux->id : 0;
+   rcu_read_unlock();
+
return 0;
case XDP_QUERY_XSK_UMEM:
return ixgbe_xsk_umem_query(adapter, &xdp->xsk.umem,
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 0/6] RCU fixes for rcu_assign_pointer() usage

2019-02-22 Thread Joel Fernandes (Google)
These patches fix various sparse errors found as a result of the recent check
to add rcu_check_sparse() to rcu_assign_pointer(). The errors in some cases
seem to indicate either incorrect API usage or missing annotations. The
annotations added in the series can also help avoid future incorrect usages
and bugs, so they are a good idea in any case.

RFC v1 -> Patch v2:
Made changes to various scheduler patches (Peter Zijlstra)

Joel Fernandes (Google) (6):
net: rtnetlink: Fix incorrect RCU API usage
ixgbe: Fix incorrect RCU API usage
sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu
sched_domain: Annotate RCU pointers properly
rcuwait: Annotate task_struct with __rcu
sched: Annotate perf_domain pointer with __rcu

drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  4 ++--
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 15 ++-
include/linux/rcuwait.h   |  2 +-
include/linux/sched/topology.h|  4 ++--
kernel/sched/cpufreq.c|  2 +-
kernel/sched/sched.h  | 18 +-
kernel/sched/topology.c   | 10 +-
net/core/rtnetlink.c  |  4 ++--
8 files changed, 32 insertions(+), 27 deletions(-)

--
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 3/6] sched/cpufreq: Annotate cpufreq_update_util_data pointer with __rcu

2019-02-22 Thread Joel Fernandes (Google)
Recently I added an RCU annotation check to rcu_assign_pointer(). All
pointers assigned to RCU protected data are to be annotated with __rcu
in order to be able to use rcu_assign_pointer(), similar to the checks
in other RCU APIs.

This resulted in a sparse error: kernel//sched/cpufreq.c:41:9: sparse:
error: incompatible types in comparison expression (different address
spaces)

Fix this by annotating the cpufreq_update_util_data pointer with __rcu.
This will also help sparse catch any future RCU misuse bugs.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/cpufreq.c | 2 +-
 kernel/sched/sched.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 22bd8980f32f..e316ee7bb2e5 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
  */
 #include "sched.h"
 
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d04530bf251f..2ab545d40381 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2166,7 +2166,7 @@ static inline u64 irq_time_read(int cpu)
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
 
 /**
  * cpufreq_update_util - Take a note about CPU utilization changes.
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 4/6] sched_domain: Annotate RCU pointers properly

2019-02-22 Thread Joel Fernandes (Google)
The scheduler uses RCU API in various places to access sched_domain
pointers. These cause sparse errors as below.

Many new errors show up because of an annotation check I added to
rcu_assign_pointer(). Let us annotate the pointers correctly, which will
also help sparse catch any potential future bugs.

This fixes the following sparse errors:

rt.c:1681:9: error: incompatible types in comparison expression
deadline.c:1904:9: error: incompatible types in comparison expression
core.c:519:9: error: incompatible types in comparison expression
core.c:1634:17: error: incompatible types in comparison expression
fair.c:6193:14: error: incompatible types in comparison expression
fair.c:9883:22: error: incompatible types in comparison expression
fair.c:9897:9: error: incompatible types in comparison expression
sched.h:1287:9: error: incompatible types in comparison expression
topology.c:612:9: error: incompatible types in comparison expression
topology.c:615:9: error: incompatible types in comparison expression
sched.h:1300:9: error: incompatible types in comparison expression
topology.c:618:9: error: incompatible types in comparison expression
sched.h:1287:9: error: incompatible types in comparison expression
topology.c:621:9: error: incompatible types in comparison expression
sched.h:1300:9: error: incompatible types in comparison expression
topology.c:624:9: error: incompatible types in comparison expression
topology.c:671:9: error: incompatible types in comparison expression
stats.c:45:17: error: incompatible types in comparison expression
fair.c:5998:15: error: incompatible types in comparison expression
fair.c:5989:15: error: incompatible types in comparison expression
fair.c:5998:15: error: incompatible types in comparison expression
fair.c:5989:15: error: incompatible types in comparison expression
fair.c:6120:19: error: incompatible types in comparison expression
fair.c:6506:14: error: incompatible types in comparison expression
fair.c:6515:14: error: incompatible types in comparison expression
fair.c:6623:9: error: incompatible types in comparison expression
fair.c:5970:17: error: incompatible types in comparison expression
fair.c:8642:21: error: incompatible types in comparison expression
fair.c:9253:9: error: incompatible types in comparison expression
fair.c:9331:9: error: incompatible types in comparison expression
fair.c:9519:15: error: incompatible types in comparison expression
fair.c:9533:14: error: incompatible types in comparison expression
fair.c:9542:14: error: incompatible types in comparison expression
fair.c:9567:14: error: incompatible types in comparison expression
fair.c:9597:14: error: incompatible types in comparison expression
fair.c:9421:16: error: incompatible types in comparison expression
fair.c:9421:16: error: incompatible types in comparison expression

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/sched/topology.h |  4 ++--
 kernel/sched/sched.h   | 14 +++---
 kernel/sched/topology.c| 10 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index c31d3a47a47c..4819c9e01e42 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -76,8 +76,8 @@ struct sched_domain_shared {
 
 struct sched_domain {
/* These fields must be setup */
-   struct sched_domain *parent;/* top domain must be null terminated */
-   struct sched_domain *child; /* bottom domain must be null 
terminated */
+   struct sched_domain __rcu *parent;  /* top domain must be null 
terminated */
+   struct sched_domain __rcu *child;   /* bottom domain must be null 
terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2ab545d40381..ca6a79f57e7a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -866,8 +866,8 @@ struct rq {
atomic_tnr_iowait;
 
 #ifdef CONFIG_SMP
-   struct root_domain  *rd;
-   struct sched_domain *sd;
+   struct root_domain  *rd;
+   struct sched_domain __rcu   *sd;
 
unsigned long   cpu_capacity;
unsigned long   cpu_capacity_orig;
@@ -1305,13 +1305,13 @@ static inline struct sched_domain 
*lowest_flag_domain(int cpu, int flag)
return sd;
 }
 
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity

[PATCH v2 5/6] rcuwait: Annotate task_struct with __rcu

2019-02-22 Thread Joel Fernandes (Google)
This suppresses a sparse error generated due to the recently added
rcu_assign_pointer sparse check.

percpu-rwsem.c:162:9: sparse: error: incompatible types in comparison expression
exit.c:316:16: sparse: error: incompatible types in comparison expression

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rcuwait.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 90bfa3279a01..563290fc194f 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -18,7 +18,7 @@
  * awoken.
  */
 struct rcuwait {
-   struct task_struct *task;
+   struct task_struct __rcu *task;
 };
 
 #define __RCUWAIT_INITIALIZER(name)\
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 6/6] sched: Annotate perf_domain pointer with __rcu

2019-02-22 Thread Joel Fernandes (Google)
This fixes the following sparse errors in sched/fair.c:

fair.c:6506:14: error: incompatible types in comparison expression
fair.c:8642:21: error: incompatible types in comparison expression

Using __rcu will also help sparse catch any future bugs.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ca6a79f57e7a..c8e6514433a9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
 * NULL-terminated list of performance domains intersecting with the
 * CPUs of the rd. Protected by RCU.
 */
-   struct perf_domain  *pd;
+   struct perf_domain __rcu *pd;
 };
 
 extern struct root_domain def_root_domain;
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v2 1/6] net: rtnetlink: Fix incorrect RCU API usage

2019-02-22 Thread Joel Fernandes (Google)
rtnl_register_internal() and rtnl_unregister_all() try to directly
dereference an RCU-protected pointer outside of an RCU read-side section.
While this is OK to do since a lock is held, let us use the correct
API to avoid programmer bugs in the future.

This also fixes sparse warnings arising from not using the RCU API.

net/core/rtnetlink.c:332:13: warning: incorrect type in assignment (different address spaces)
net/core/rtnetlink.c:332:13:    expected struct rtnl_link **tab
net/core/rtnetlink.c:332:13:    got struct rtnl_link *[noderef] *
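
For reference, the two accessors differ only in what they assert about the
caller's context; a minimal sketch (illustrative only, 'item' and 'handler'
are made-up names, not part of this patch):

	#include <linux/rtnetlink.h>

	struct item;
	static struct item __rcu *handler;	/* made-up RCU-protected pointer */

	static struct item *get_under_rtnl(void)
	{
		ASSERT_RTNL();				/* caller holds rtnl_lock()  */
		return rtnl_dereference(handler);	/* update-side accessor      */
	}

	static struct item *get_under_rcu(void)
	{
		return rcu_dereference(handler);	/* inside rcu_read_lock()    */
	}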

Signed-off-by: Joel Fernandes (Google) 
---
 net/core/rtnetlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ea1bed08ede..98be4b4818a9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -188,7 +188,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -329,7 +329,7 @@ void rtnl_unregister_all(int protocol)
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
 
rtnl_lock();
-   tab = rtnl_msg_handlers[protocol];
+   tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (!tab) {
rtnl_unlock();
return;
-- 
2.21.0.rc0.258.g878e2cd30e-goog



[PATCH v4 2/2] Add selftests for module build using in-kernel headers

2019-03-01 Thread Joel Fernandes (Google)
This test tries to build a module successfully using the in-kernel
headers found in /proc/kheaders.tar.xz.

Verified pass and fail scenarios by running:
make -C tools/testing/selftests TARGETS=kheaders run_tests

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/Makefile  |  1 +
 tools/testing/selftests/kheaders/Makefile |  5 +
 tools/testing/selftests/kheaders/config   |  1 +
 .../kheaders/run_kheaders_modbuild.sh | 18 +
 .../selftests/kheaders/testmod/Makefile   |  3 +++
 .../testing/selftests/kheaders/testmod/test.c | 20 +++
 6 files changed, 48 insertions(+)
 create mode 100644 tools/testing/selftests/kheaders/Makefile
 create mode 100644 tools/testing/selftests/kheaders/config
 create mode 100755 tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
 create mode 100644 tools/testing/selftests/kheaders/testmod/Makefile
 create mode 100644 tools/testing/selftests/kheaders/testmod/test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 400ee81a3043..5a9287fddd0d 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += intel_pstate
 TARGETS += ipc
 TARGETS += ir
 TARGETS += kcmp
+TARGETS += kheaders
 TARGETS += kvm
 TARGETS += lib
 TARGETS += membarrier
diff --git a/tools/testing/selftests/kheaders/Makefile b/tools/testing/selftests/kheaders/Makefile
new file mode 100644
index ..51035ab0732b
--- /dev/null
+++ b/tools/testing/selftests/kheaders/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := run_kheaders_modbuild.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/kheaders/config b/tools/testing/selftests/kheaders/config
new file mode 100644
index ..5221f9fb5e79
--- /dev/null
+++ b/tools/testing/selftests/kheaders/config
@@ -0,0 +1 @@
+CONFIG_IKHEADERS_PROC=y
diff --git a/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
new file mode 100755
index ..f001568e08b0
--- /dev/null
+++ b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+HEADERS_XZ=/proc/kheaders.tar.xz
+TMP_DIR_HEADERS=$(mktemp -d)
+TMP_DIR_MODULE=$(mktemp -d)
+SPATH="$(dirname "$(readlink -f "$0")")"
+
+tar -xvf $HEADERS_XZ -C $TMP_DIR_HEADERS > /dev/null
+
+cp -r $SPATH/testmod $TMP_DIR_MODULE/
+
+pushd $TMP_DIR_MODULE/testmod > /dev/null
+make -C $TMP_DIR_HEADERS M=$(pwd) modules
+popd > /dev/null
+
+rm -rf $TMP_DIR_HEADERS
+rm -rf $TMP_DIR_MODULE
diff --git a/tools/testing/selftests/kheaders/testmod/Makefile b/tools/testing/selftests/kheaders/testmod/Makefile
new file mode 100644
index ..7083e28706e8
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-m += test.o
diff --git a/tools/testing/selftests/kheaders/testmod/test.c b/tools/testing/selftests/kheaders/testmod/test.c
new file mode 100644
index ..6eb0b8492ffa
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/test.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int __init hello_init(void)
+{
+   printk(KERN_INFO "Hello, world\n");
+   return 0;
+}
+
+static void __exit hello_exit(void)
+{
+   printk(KERN_INFO "Goodbye, world\n");
+}
+
+module_init(hello_init);
+module_exit(hello_exit);
+MODULE_LICENSE("GPL v2");
-- 
2.21.0.352.gf09ad66450-goog



[PATCH v4 1/2] Provide in-kernel headers for making it easy to extend the kernel

2019-03-01 Thread Joel Fernandes (Google)
Introduce in-kernel headers and other artifacts, which are made available
as an archive through proc (the /proc/kheaders.tar.xz file). This archive
makes it possible to build kernel modules and to run eBPF and other
tracing programs that need to extend the kernel, without any dependency
on the file system having headers and build artifacts.

On Android and embedded systems, it is common to switch kernels but not
have kernel headers available on the file system. Raw kernel headers
also cannot be copied into the filesystem like they can be on other
distros, due to licensing and other issues. There's no linux-headers
package on Android. Further once a different kernel is booted, any
headers stored on the file system will no longer be useful. By storing
the headers as a compressed archive within the kernel, we can avoid these
issues that have been a hindrance for a long time.

The feature is also buildable as a module, in case the user does not want
it to be part of the kernel image. This makes it possible to load and
unload the headers on demand. A tracing program or a kernel module builder
can load the module, do its operations, and then unload the module to save
kernel memory. The total memory needed is 3.8MB.

The code to read the headers is based on /proc/config.gz code and uses
the same technique to embed the headers.

To build a module, the below steps have been tested on an x86 machine:
modprobe kheaders
rm -rf $HOME/headers
mkdir -p $HOME/headers
tar -xvf /proc/kheaders.tar.xz -C $HOME/headers >/dev/null
cd my-kernel-module
make -C $HOME/headers M=$(pwd) modules
rmmod kheaders

Additional notes:
(1) external modules must be built on the same arch as the host that
built vmlinux. This can be done either in a qemu-emulated chroot on the
target, or natively. This is due to the host-arch dependency of the
kernel scripts.

(2)
A limitation of module building with this is that, since Module.symvers
is not available in the archive (due to a cyclic dependency with building
the archive into the kernel or module binaries), modules built using the
archive will not contain symbol versioning (modversions). This is usually
not an issue since the idea of this patch is to build a kernel module on
the fly and load it into the same kernel. The kernel already prints an
appropriate warning to alert the user that modules built using the
archive lack modversions. For building with modversions, the user can use
traditional header packages. For our tracing use cases, we build modules
on the fly with this, so it is not a concern.

(3) I have left IKHD_ST and IKHD_ED markers as is to facilitate
future patches that would extract the headers from a kernel or module
image.

Signed-off-by: Joel Fernandes (Google) 
---

Changes since v3:
- A blank tar was being generated because of a one-line change I
  forgot to push. It is updated now.
- Added module.lds since arm64 needs it to build modules.

Changes since v2:
(Thanks to Masahiro Yamada for several excellent suggestions)
- Added support for out of tree builds.
- Added incremental build support bringing down build time of
  incremental builds from 50 seconds to 5 seconds.
- Fixed various small nits / cleanups.
- Cleanups to kheaders.c pointed out by Alexey Dobriyan.
- Fixed MODULE_LICENSE in test module and kheaders.c
- Dropped Module.symvers from archive due to circular dependency.

Changes since v1:
- removed IKH_EXTRA variable, not needed (Masahiro Yamada)
- small fix ups to selftest
   - added target to main Makefile etc
   - added MODULE_LICENSE to test module
   - made selftest more quiet

Changes since RFC:
Both changes bring size down to 3.8MB:
- use xz for compression
- strip comments except SPDX lines
- Call out the module name in Kconfig
- Also added selftests in second patch to ensure headers are always
working.

Other notes:
By the way I still see this error (without the patch) when doing a clean
build: Makefile:594: include/config/auto.conf: No such file or directory

It appears to be because of commit 0a16d2e8cb7e ("kbuild: use 'include'
directive to load auto.conf from top Makefile")

 Documentation/dontdiff|  1 +
 init/Kconfig  | 11 ++
 kernel/.gitignore |  3 ++
 kernel/Makefile   | 37 +++
 kernel/kheaders.c | 72 
 scripts/gen_ikh_data.sh   | 78 +++
 scripts/strip-comments.pl |  8 
 7 files changed, 210 insertions(+)
 create mode 100644 kernel/kheaders.c
 create mode 100755 scripts/gen_ikh_data.sh
 create mode 100755 scripts/strip-comments.pl

diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 2228fcc8e29f..05a2319ee2a2 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -151,6 +151,7 @@ int8.c
 kallsyms
 kconfig
 keywords.c
+kheaders_data.h*
 ksym.c*
 ksym.h*
kxgettext

[PATCH v2 1/2] Provide in-kernel headers for making it easy to extend the kernel

2019-02-11 Thread Joel Fernandes (Google)
Introduce in-kernel headers and other artifacts, which are made available
as an archive through proc (the /proc/kheaders.txz file). This archive
makes it possible to build kernel modules and to run eBPF and other
tracing programs that need to extend the kernel, without any dependency
on the file system having headers and build artifacts.

On Android and embedded systems, it is common to switch kernels but not
have kernel headers available on the file system. Raw kernel headers
also cannot be copied into the filesystem like they can be on other
distros, due to licensing and other issues. There's no linux-headers
package on Android. Further once a different kernel is booted, any
headers stored on the file system will no longer be useful. By storing
the headers as a compressed archive within the kernel, we can avoid these
issues that have been a hindrance for a long time.

The feature is also buildable as a module, in case the user does not want
it to be part of the kernel image. This makes it possible to load and
unload the headers on demand. A tracing program or a kernel module builder
can load the module, do its operations, and then unload the module to save
kernel memory. The total memory needed is 3.8MB.

The code to read the headers is based on /proc/config.gz code and uses
the same technique to embed the headers.

To build a module, the below steps have been tested on an x86 machine:
modprobe kheaders
rm -rf $HOME/headers
mkdir -p $HOME/headers
tar -xvf /proc/kheaders.txz -C $HOME/headers >/dev/null
cd my-kernel-module
make -C $HOME/headers M=$(pwd) modules
rmmod kheaders

Signed-off-by: Joel Fernandes (Google) 
---

Changes since v1:
- removed IKH_EXTRA variable, not needed (Masahiro Yamada)
- small fix ups to selftest
   - added target to main Makefile etc
   - added MODULE_LICENSE to test module
   - made selftest more quiet

Changes since RFC:
Both changes bring size down to 3.8MB:
- use xz for compression
- strip comments except SPDX lines
- Call out the module name in Kconfig
- Also added selftests in second patch to ensure headers are always
working.

 Documentation/dontdiff|  1 +
 init/Kconfig  | 11 ++
 kernel/.gitignore |  2 ++
 kernel/Makefile   | 27 ++
 kernel/kheaders.c | 74 +++
 scripts/gen_ikh_data.sh   | 19 ++
 scripts/strip-comments.pl |  8 +
 7 files changed, 142 insertions(+)
 create mode 100644 kernel/kheaders.c
 create mode 100755 scripts/gen_ikh_data.sh
 create mode 100755 scripts/strip-comments.pl

diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 2228fcc8e29f..05a2319ee2a2 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -151,6 +151,7 @@ int8.c
 kallsyms
 kconfig
 keywords.c
+kheaders_data.h*
 ksym.c*
 ksym.h*
 kxgettext
diff --git a/init/Kconfig b/init/Kconfig
index c9386a365eea..9fbf4f73d98c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -563,6 +563,17 @@ config IKCONFIG_PROC
  This option enables access to the kernel configuration file
  through /proc/config.gz.
 
+config IKHEADERS_PROC
+   tristate "Enable kernel header artifacts through /proc/kheaders.txz"
+   select BUILD_BIN2C
+   depends on PROC_FS
+   help
+  This option enables access to the kernel header and other artifacts that
+  are generated during the build process. These can be used to build kernel
+  modules, and other in-kernel programs such as those generated by eBPF
+  and systemtap tools. If you build the headers as a module, a module
+  called kheaders.ko is built which can be loaded to get access to them.
+
 config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
range 12 25
diff --git a/kernel/.gitignore b/kernel/.gitignore
index b3097bde4e9c..6acf71acbdcb 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -3,5 +3,7 @@
 #
 config_data.h
 config_data.gz
+kheaders_data.h
+kheaders_data.txz
 timeconst.h
 hz.bc
diff --git a/kernel/Makefile b/kernel/Makefile
index 6aa7543bcdb2..1d13a7a6c537 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -70,6 +70,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o
 obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -130,3 +131,29 @@ filechk_ikconfiggz = \
 targets += config_data.h
 $(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
+
+# Build a list of in-kernel headers for building kernel modules
+ikh_file_list := include/
+ikh_file_list += arch/$(ARCH)/Makefile
+ikh_file_list += arch/$(ARCH)/include/
+ikh_file_list += scripts/
+ikh_file_list += Makefile
+ikh_file_list += Module.symvers
+ifeq (

[PATCH v2 2/2] Add selftests for module build using in-kernel headers

2019-02-11 Thread Joel Fernandes (Google)
This test tries to build a module successfully using the in-kernel
headers found in /proc/kheaders.txz.

Verified pass and fail scenarios by running:
make -C tools/testing/selftests TARGETS=kheaders run_tests

Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/Makefile  |  1 +
 tools/testing/selftests/kheaders/Makefile |  5 +
 tools/testing/selftests/kheaders/config   |  1 +
 .../kheaders/run_kheaders_modbuild.sh | 18 +
 .../selftests/kheaders/testmod/Makefile   |  3 +++
 .../testing/selftests/kheaders/testmod/test.c | 20 +++
 6 files changed, 48 insertions(+)
 create mode 100644 tools/testing/selftests/kheaders/Makefile
 create mode 100644 tools/testing/selftests/kheaders/config
 create mode 100755 tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
 create mode 100644 tools/testing/selftests/kheaders/testmod/Makefile
 create mode 100644 tools/testing/selftests/kheaders/testmod/test.c

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 400ee81a3043..5a9287fddd0d 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -20,6 +20,7 @@ TARGETS += intel_pstate
 TARGETS += ipc
 TARGETS += ir
 TARGETS += kcmp
+TARGETS += kheaders
 TARGETS += kvm
 TARGETS += lib
 TARGETS += membarrier
diff --git a/tools/testing/selftests/kheaders/Makefile b/tools/testing/selftests/kheaders/Makefile
new file mode 100644
index ..51035ab0732b
--- /dev/null
+++ b/tools/testing/selftests/kheaders/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS := run_kheaders_modbuild.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/kheaders/config b/tools/testing/selftests/kheaders/config
new file mode 100644
index ..5221f9fb5e79
--- /dev/null
+++ b/tools/testing/selftests/kheaders/config
@@ -0,0 +1 @@
+CONFIG_IKHEADERS_PROC=y
diff --git a/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
new file mode 100755
index ..69d6fa237661
--- /dev/null
+++ b/tools/testing/selftests/kheaders/run_kheaders_modbuild.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+HEADERS_XZ=/proc/kheaders.txz
+TMP_DIR_HEADERS=$(mktemp -d)
+TMP_DIR_MODULE=$(mktemp -d)
+SPATH="$(dirname "$(readlink -f "$0")")"
+
+tar -xvf $HEADERS_XZ -C $TMP_DIR_HEADERS > /dev/null
+
+cp -r $SPATH/testmod $TMP_DIR_MODULE/
+
+pushd $TMP_DIR_MODULE/testmod > /dev/null
+make -C $TMP_DIR_HEADERS M=$(pwd) modules
+popd > /dev/null
+
+rm -rf $TMP_DIR_HEADERS
+rm -rf $TMP_DIR_MODULE
diff --git a/tools/testing/selftests/kheaders/testmod/Makefile b/tools/testing/selftests/kheaders/testmod/Makefile
new file mode 100644
index ..7083e28706e8
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-m += test.o
diff --git a/tools/testing/selftests/kheaders/testmod/test.c b/tools/testing/selftests/kheaders/testmod/test.c
new file mode 100644
index ..9178bf6f0cc8
--- /dev/null
+++ b/tools/testing/selftests/kheaders/testmod/test.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+static int __init hello_init(void)
+{
+   printk(KERN_INFO "Hello, world\n");
+   return 0;
+}
+
+static void __exit hello_exit(void)
+{
+   printk(KERN_INFO "Goodbye, world\n");
+}
+
+module_init(hello_init);
+module_exit(hello_exit);
+MODULE_LICENSE("GPL");
-- 
2.20.1.791.gb4d0f1c61a-goog



[RFC] rcu: Avoid unnecessary softirq when system is idle

2019-01-19 Thread Joel Fernandes (Google)
When there are no callbacks pending on an idle system, I noticed that
the RCU softirq is continuously firing. During this, cpu_no_qs is set to
false and core_needs_qs is set to true indefinitely. This causes
rcu_process_callbacks to be called repeatedly, even though the node
corresponding to the CPU has that CPU's mask bit cleared and the system
is idle. I believe the race happens when such mask clearing is done
during the idle-CPU scan of the quiescent-state forcing stage in the
kthread instead of the softirq. Since the rnp mask is cleared but the
flags on the CPU's rdp are not cleared, the CPU thinks it still needs to
report to core RCU.

Cure this by clearing the core_needs_qs flag when the CPU detects that
its node is already updated, which avoids the unwanted softirq raises,
to the benefit of real-time systems.

Test: Ran rcutorture for various tree RCU configs.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9180158756d2..96ad80c76b15 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2273,6 +2273,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)
}
mask = rdp->grpmask;
if ((rnp->qsmask & mask) == 0) {
+   rdp->core_needs_qs = false;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
rdp->core_needs_qs = false;
-- 
2.20.1.321.g9e740568ce-goog



[PATCH RFC v2 0/3] cleanups for pstore and ramoops

2018-11-03 Thread Joel Fernandes (Google)
Here are some simple cleanups and fixes for ramoops in pstore. Let me know
what you think, thanks.

Joel Fernandes (Google) (3):
pstore: map pstore types to names
pstore: simplify ramoops_get_next_prz arguments
pstore: do not treat empty buffers as valid

fs/pstore/inode.c  | 53 +-
fs/pstore/ram.c| 52 +++--
fs/pstore/ram_core.c   |  2 +-
include/linux/pstore.h | 37 ++
include/linux/pstore_ram.h |  2 ++
5 files changed, 67 insertions(+), 79 deletions(-)

--
2.19.1.930.g4563a0d9d0-goog



[PATCH RFC v2 1/3] pstore: map pstore types to names

2018-11-03 Thread Joel Fernandes (Google)
In later patches we will need to map types to names, so create a table
for that, which can be used and reused in different parts of the old and
new code. Also use it to save the type in the PRZ, which will be useful
in later patches.
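
As a rough sketch of the idea (illustrative only; the exact table layout,
entries, and helper names in the patch may differ), the mapping boils down
to an array indexed by enum pstore_type_id plus small lookup helpers:

	/* Illustrative sketch, not the exact code added by this patch. */
	static const char * const pstore_names[] = {
		[PSTORE_TYPE_DMESG]	= "dmesg",
		[PSTORE_TYPE_CONSOLE]	= "console",
		[PSTORE_TYPE_FTRACE]	= "ftrace",
		[PSTORE_TYPE_PMSG]	= "pmsg",
		/* ... one entry per pstore_type_id ... */
	};

	static const char *type_to_name(enum pstore_type_id type)
	{
		if (type < ARRAY_SIZE(pstore_names) && pstore_names[type])
			return pstore_names[type];
		return "unknown";
	}

	static enum pstore_type_id name_to_type(const char *name)
	{
		enum pstore_type_id i;

		for (i = 0; i < ARRAY_SIZE(pstore_names); i++)
			if (pstore_names[i] && !strcmp(name, pstore_names[i]))
				return i;
		return PSTORE_TYPE_UNKNOWN;
	}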

Signed-off-by: Joel Fernandes (Google) 
---
 fs/pstore/inode.c  | 53 +-
 fs/pstore/ram.c|  4 ++-
 include/linux/pstore.h | 37 ++
 include/linux/pstore_ram.h |  2 ++
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 8cf2218b46a7..c5c6b8b4b70a 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -304,6 +304,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
struct dentry   *dentry;
struct inode   *inode;
int rc = 0;
+   enum pstore_type_id type;
char   name[PSTORE_NAMELEN];
struct pstore_private   *private, *pos;
unsigned long   flags;
@@ -335,53 +336,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
goto fail_alloc;
private->record = record;
 
-   switch (record->type) {
-   case PSTORE_TYPE_DMESG:
-   scnprintf(name, sizeof(name), "dmesg-%s-%llu%s",
- record->psi->name, record->id,
- record->compressed ? ".enc.z" : "");
-   break;
-   case PSTORE_TYPE_CONSOLE:
-   scnprintf(name, sizeof(name), "console-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_FTRACE:
-   scnprintf(name, sizeof(name), "ftrace-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_MCE:
-   scnprintf(name, sizeof(name), "mce-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_PPC_RTAS:
-   scnprintf(name, sizeof(name), "rtas-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_PPC_OF:
-   scnprintf(name, sizeof(name), "powerpc-ofw-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_PPC_COMMON:
-   scnprintf(name, sizeof(name), "powerpc-common-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_PMSG:
-   scnprintf(name, sizeof(name), "pmsg-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_PPC_OPAL:
-   scnprintf(name, sizeof(name), "powerpc-opal-%s-%llu",
- record->psi->name, record->id);
-   break;
-   case PSTORE_TYPE_UNKNOWN:
-   scnprintf(name, sizeof(name), "unknown-%s-%llu",
- record->psi->name, record->id);
-   break;
-   default:
-   scnprintf(name, sizeof(name), "type%d-%s-%llu",
- record->type, record->psi->name, record->id);
-   break;
-   }
+   scnprintf(name, sizeof(name), "%s-%s-%llu%s",
+   pstore_type_to_name(record->type),
+   record->psi->name, record->id,
+   (record->type == PSTORE_TYPE_DMESG
+&& record->compressed) ? ".enc.z" : "");
 
dentry = d_alloc_name(root, name);
if (!dentry)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 10ac4d23c423..b174d0fc009f 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -611,6 +611,7 @@ static int ramoops_init_przs(const char *name,
goto fail;
}
*paddr += zone_sz;
+   prz_ar[i]->type = pstore_name_to_type(name);
}
 
*przs = prz_ar;
@@ -650,6 +651,7 @@ static int ramoops_init_prz(const char *name,
}
 
*paddr += sz;
+   (*prz)->type = pstore_name_to_type(name);
 
return 0;
 }
@@ -785,7 +787,7 @@ static int ramoops_probe(struct platform_device *pdev)
 
dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
- cxt->pmsg_size;
-   err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr,
+   err = ramoops_init_przs("dmesg", dev, cxt, &cxt->dprzs, &paddr,
dump_mem_sz, cxt->record_size,
>

[PATCH RFC v2 3/3] pstore: do not treat empty buffers as valid

2018-11-03 Thread Joel Fernandes (Google)
pstore currently calls persistent_ram_save_old even if a buffer is
empty. While this appears to work, it does not seem like the right
thing to do and could lead to future bugs, so let's avoid that. It also
prevents misleading prints in the logs which claim the buffer is valid.

I got something like:
found existing buffer, size 0, start 0

When I was expecting:
no valid data in buffer (sig = ...)

Signed-off-by: Joel Fernandes (Google) 
---
Note that if you feel this patch is not necessary, then feel free to
drop it. I would say it is harmless and is a good clean up.

 fs/pstore/ram_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index e6375439c5ac..196e4fd7ba8c 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -510,7 +510,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
 
sig ^= PERSISTENT_RAM_SIG;
 
-   if (prz->buffer->sig == sig) {
+   if (prz->buffer->sig == sig && buffer_size(prz)) {
if (buffer_size(prz) > prz->buffer_size ||
buffer_start(prz) > buffer_size(prz)) {
pr_info("found existing invalid buffer, size %zu, start %zu\n",
-- 
2.19.1.930.g4563a0d9d0-goog


[PATCH RFC v2 2/3] pstore: simplify ramoops_get_next_prz arguments

2018-11-03 Thread Joel Fernandes (Google)
(1) remove type argument from ramoops_get_next_prz

Since we store the type of the prz when we initialize it, we no longer
need to pass it again to ramoops_get_next_prz, since we can just use that
to set up the pstore record. So let's remove it from the argument list.

(2) remove max argument from ramoops_get_next_prz

From the code flow, the 'max' checks are already being done on the prz
passed to ramoops_get_next_prz. Let's remove it to simplify this function
and reduce its arguments.

(3) further reduce ramoops_get_next_prz arguments by passing record

Both the id and type fields of a pstore_record are set by
ramoops_get_next_prz. So we can just pass a pointer to the pstore_record
instead of passing individual elements. This results in cleaner, more
readable code and fewer lines.

In addition, let's also remove the 'update' argument, since we can detect
it. Changes are squashed into a single patch to reduce fixup conflicts.
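
Taken together, the three changes shrink the function's prototype roughly
like this (condensed from the diff below):

	/* before */
	static struct persistent_ram_zone *
	ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c,
			     uint max, u64 *id, enum pstore_type_id *typep,
			     enum pstore_type_id type, bool update);

	/* after: the prz index and the pstore_record carry everything else */
	static struct persistent_ram_zone *
	ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
			     struct pstore_record *record);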

Signed-off-by: Joel Fernandes (Google) 
---
 fs/pstore/ram.c | 48 ++--
 1 file changed, 18 insertions(+), 30 deletions(-)

diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index b174d0fc009f..202eaa82bcc6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -124,19 +124,17 @@ static int ramoops_pstore_open(struct pstore_info *psi)
 }
 
 static struct persistent_ram_zone *
-ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
-u64 *id,
-enum pstore_type_id *typep, enum pstore_type_id type,
-bool update)
+ramoops_get_next_prz(struct persistent_ram_zone *przs[], int id,
+struct pstore_record *record)
 {
struct persistent_ram_zone *prz;
-   int i = (*c)++;
+   bool update = (record->type == PSTORE_TYPE_DMESG);
 
/* Give up if we never existed or have hit the end. */
-   if (!przs || i >= max)
+   if (!przs)
return NULL;
 
-   prz = przs[i];
+   prz = przs[id];
if (!prz)
return NULL;
 
@@ -147,8 +145,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
if (!persistent_ram_old_size(prz))
return NULL;
 
-   *typep = type;
-   *id = i;
+   record->type = prz->type;
+   record->id = id;
 
return prz;
 }
@@ -255,10 +253,8 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
 
/* Find the next valid persistent_ram_zone for DMESG */
while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
-   prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
-  cxt->max_dump_cnt, &record->id,
-  &record->type,
-  PSTORE_TYPE_DMESG, 1);
+   prz = ramoops_get_next_prz(cxt->dprzs, cxt->dump_read_cnt++,
+  record);
if (!prz_ok(prz))
continue;
header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
@@ -272,22 +268,18 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
}
}
 
-   if (!prz_ok(prz))
-   prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
-  1, &record->id, &record->type,
-  PSTORE_TYPE_CONSOLE, 0);
+   if (!prz_ok(prz) && !cxt->console_read_cnt++)
+   prz = ramoops_get_next_prz(&cxt->cprz, 0 /* single */, record);
 
-   if (!prz_ok(prz))
-   prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
-  1, &record->id, &record->type,
-  PSTORE_TYPE_PMSG, 0);
+   if (!prz_ok(prz) && !cxt->pmsg_read_cnt++)
+   prz = ramoops_get_next_prz(&cxt->mprz, 0 /* single */, record);
 
/* ftrace is last since it may want to dynamically allocate memory. */
if (!prz_ok(prz)) {
-   if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
-   prz = ramoops_get_next_prz(cxt->fprzs,
-   &cxt->ftrace_read_cnt, 1, &record->id,
-   &record->type, PSTORE_TYPE_FTRACE, 0);
+   if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) &&
+   !cxt->ftrace_read_cnt++) {
+   prz = ramoops_get_next_prz(cxt->fprzs, 0 /* single */,
+  record);
} else {
/*
 * Build a new dummy record which combines all the
@@ -303,11 +295,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record)
 
while (cxt->ftrace_read_cnt < cxt->m

[PATCH v3 resend 1/2] mm: Add an F_SEAL_FUTURE_WRITE seal to memfd

2018-11-07 Thread Joel Fernandes (Google)
Android uses ashmem for sharing memory regions. We are looking forward
to migrating all usecases of ashmem to memfd so that we can possibly
remove the ashmem driver in the future from staging while also
benefiting from using memfd and contributing to it. Note staging drivers
are also not ABI and generally can be removed at any time.

One of the main usecases Android has is the ability to create a region
and mmap it as writeable, then add protection against making any
"future" writes while keeping the existing already mmap'ed
writeable-region active.  This allows us to implement a usecase where
receivers of the shared memory buffer can get a read-only view, while
the sender continues to write to the buffer.
See CursorWindow documentation in Android for more details:
https://developer.android.com/reference/android/database/CursorWindow

This usecase cannot be implemented with the existing F_SEAL_WRITE seal.
To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal
which prevents any future mmap and write syscalls from succeeding while
keeping the existing mmap active. The following program shows the seal
working in action:

 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #include 
 #define F_SEAL_FUTURE_WRITE 0x0010
 #define REGION_SIZE (5 * 1024 * 1024)

int memfd_create_region(const char *name, size_t size)
{
int ret;
int fd = syscall(__NR_memfd_create, name, MFD_ALLOW_SEALING);
if (fd < 0) return fd;
ret = ftruncate(fd, size);
if (ret < 0) { close(fd); return ret; }
return fd;
}

int main() {
int ret, fd;
void *addr, *addr2, *addr3, *addr1;
ret = memfd_create_region("test_region", REGION_SIZE);
printf("ret=%d\n", ret);
fd = ret;

// Create map
addr = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED)
printf("map 0 failed\n");
else
printf("map 0 passed\n");

if ((ret = write(fd, "test", 4)) != 4)
printf("write failed even though no future-write seal "
   "(ret=%d errno =%d)\n", ret, errno);
else
printf("write passed\n");

addr1 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr1 == MAP_FAILED)
perror("map 1 prot-write failed even though no seal\n");
else
printf("map 1 prot-write passed as expected\n");

ret = fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE |
 F_SEAL_GROW |
 F_SEAL_SHRINK);
if (ret == -1)
printf("fcntl failed, errno: %d\n", errno);
else
printf("future-write seal now active\n");

if ((ret = write(fd, "test", 4)) != 4)
printf("write failed as expected due to future-write seal\n");
else
printf("write passed (unexpected)\n");

addr2 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr2 == MAP_FAILED)
perror("map 2 prot-write failed as expected due to seal\n");
else
printf("map 2 passed\n");

addr3 = mmap(0, REGION_SIZE, PROT_READ, MAP_SHARED, fd, 0);
if (addr3 == MAP_FAILED)
perror("map 3 failed\n");
else
printf("map 3 prot-read passed as expected\n");
}

The output of running this program is as follows:
ret=3
map 0 passed
write passed
map 1 prot-write passed as expected
future-write seal now active
write failed as expected due to future-write seal
map 2 prot-write failed as expected due to seal
: Permission denied
map 3 prot-read passed as expected

Cc: jr...@google.com
Cc: john.stu...@linaro.org
Cc: tk...@google.com
Cc: gre...@linuxfoundation.org
Cc: h...@infradead.org
Reviewed-by: John Stultz 
Signed-off-by: Joel Fernandes (Google) 
---
v1->v2: No change, just added selftests to the series. manpages are
ready and I'll submit them once the patches are accepted.

v2->v3: Updated commit message to have more support code (John Stultz)
Renamed seal from F_SEAL_FS_WRITE to F_SEAL_FUTURE_WRITE
(Christoph Hellwig)
Allow for this seal only if grow/shrink seals are also
either previous set, or are requested along with this seal.
(Christoph Hellwig)
Added locking to synchronize access to file->f_mode.
(Christoph Hellwig)

 include/uapi/linux/fcntl.h |  1 +
 mm/memfd.c | 22 +-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..a2f8658f1c55 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -41,6 +41,7

[PATCH v3 resend 2/2] selftests/memfd: Add tests for F_SEAL_FUTURE_WRITE seal

2018-11-07 Thread Joel Fernandes (Google)
Add tests to verify sealing memfds with the F_SEAL_FUTURE_WRITE works as
expected.

Cc: dan...@google.com
Cc: minc...@kernel.org
Reviewed-by: John Stultz 
Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/memfd/memfd_test.c | 74 ++
 1 file changed, 74 insertions(+)

diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
index 10baa1652fc2..32b207ca7372 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -692,6 +692,79 @@ static void test_seal_write(void)
close(fd);
 }
 
+/*
+ * Test SEAL_FUTURE_WRITE
+ * Test whether SEAL_FUTURE_WRITE actually prevents modifications.
+ */
+static void test_seal_future_write(void)
+{
+   int fd;
+   void *p;
+
+   printf("%s SEAL-FUTURE-WRITE\n", memfd_str);
+
+   fd = mfd_assert_new("kern_memfd_seal_future_write",
+   mfd_def_size,
+   MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+   p = mfd_assert_mmap_shared(fd);
+
+   mfd_assert_has_seals(fd, 0);
+   /* Not adding grow/shrink seals makes the future write
+* seal fail to get added
+*/
+   mfd_fail_add_seals(fd, F_SEAL_FUTURE_WRITE);
+
+   mfd_assert_add_seals(fd, F_SEAL_GROW);
+   mfd_assert_has_seals(fd, F_SEAL_GROW);
+
+   /* Should still fail since shrink seal has
+* not yet been added
+*/
+   mfd_fail_add_seals(fd, F_SEAL_FUTURE_WRITE);
+
+   mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+   mfd_assert_has_seals(fd, F_SEAL_GROW |
+F_SEAL_SHRINK);
+
+   /* Now should succeed, also verifies that the seal
+* could be added with an existing writable mmap
+*/
+   mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE);
+   mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+F_SEAL_GROW |
+F_SEAL_FUTURE_WRITE);
+
+   /* read should pass, writes should fail */
+   mfd_assert_read(fd);
+   mfd_fail_write(fd);
+
+   munmap(p, mfd_def_size);
+   close(fd);
+
+   /* Test adding all seals (grow, shrink, future write) at once */
+   fd = mfd_assert_new("kern_memfd_seal_future_write2",
+   mfd_def_size,
+   MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+   p = mfd_assert_mmap_shared(fd);
+
+   mfd_assert_has_seals(fd, 0);
+   mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+F_SEAL_GROW |
+F_SEAL_FUTURE_WRITE);
+   mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+F_SEAL_GROW |
+F_SEAL_FUTURE_WRITE);
+
+   /* read should pass, writes should fail */
+   mfd_assert_read(fd);
+   mfd_fail_write(fd);
+
+   munmap(p, mfd_def_size);
+   close(fd);
+}
+
 /*
  * Test SEAL_SHRINK
  * Test whether SEAL_SHRINK actually prevents shrinking
@@ -945,6 +1018,7 @@ int main(int argc, char **argv)
test_basic();
 
test_seal_write();
+   test_seal_future_write();
test_seal_shrink();
test_seal_grow();
test_seal_resize();
-- 
2.19.1.930.g4563a0d9d0-goog



[PATCH 2/7] dmaengine: stm32-dma: fix incomplete configuration in cyclic mode

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

When in cyclic mode, the configuration is updated after having started the
DMA hardware (STM32_DMA_SCR_EN) leading to incomplete configuration of
SMxAR registers.

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Hugues Fruchet 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 4099948b6914..fae7de54f00a 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -441,6 +441,8 @@ static void stm32_dma_dump_reg(struct stm32_dma_chan *chan)
dev_dbg(chan2dev(chan), "SFCR:  0x%08x\n", sfcr);
 }
 
+static void stm32_dma_configure_next_sg(struct stm32_dma_chan *chan);
+
 static void stm32_dma_start_transfer(struct stm32_dma_chan *chan)
 {
struct stm32_dma_device *dmadev = stm32_dma_get_dev(chan);
@@ -483,6 +485,9 @@ static void stm32_dma_start_transfer(struct stm32_dma_chan *chan)
if (status)
stm32_dma_irq_clear(chan, status);
 
+   if (chan->desc->cyclic)
+   stm32_dma_configure_next_sg(chan);
+
stm32_dma_dump_reg(chan);
 
/* Start DMA */
@@ -576,8 +581,7 @@ static void stm32_dma_issue_pending(struct dma_chan *c)
if (vchan_issue_pending(&chan->vchan) && !chan->desc && !chan->busy) {
dev_dbg(chan2dev(chan), "vchan %p: issued\n", >vchan);
stm32_dma_start_transfer(chan);
-   if (chan->desc->cyclic)
-   stm32_dma_configure_next_sg(chan);
+
}
spin_unlock_irqrestore(&chan->vchan.lock, flags);
 }
-- 
2.19.0.605.g01d371f741-goog



[PATCH 0/7] NULL pointer deref fix for stm32-dma

2018-10-08 Thread Joel Fernandes (Google)
Hi Greg,

While looking at android-4.14, I found a NULL pointer deref in the
stm32-dma driver via Coccicheck errors. I found that upstream had a
bunch of patches for stm32-dma that fix this and other issues, and I
applied these patches cleanly onto Android 4.14. I believe these should
go to stable and flow into Android 4.14 from there, but I haven't tested
this since I have no hardware to do so.

At least I can say that the coccicheck error below goes away when running:
make coccicheck MODE=report
./drivers/dma/stm32-dma.c:567:18-24: ERROR: chan -> desc is NULL but dereferenced.

Anyway, please consider this series for 4.14 stable, I have CC'd the
author and others, thanks.

Pierre Yves MORDRET (7):
  dmaengine: stm32-dma: threshold manages with bitfield feature
  dmaengine: stm32-dma: fix incomplete configuration in cyclic mode
  dmaengine: stm32-dma: fix typo and reported checkpatch warnings
  dmaengine: stm32-dma: Improve memory burst management
  dmaengine: stm32-dma: fix DMA IRQ status handling
  dmaengine: stm32-dma: fix max items per transfer
  dmaengine: stm32-dma: properly mask irq bits

 drivers/dma/stm32-dma.c | 287 +---
 1 file changed, 240 insertions(+), 47 deletions(-)

-- 
2.19.0.605.g01d371f741-goog


[PATCH 1/7] dmaengine: stm32-dma: threshold manages with bitfield feature

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

From now on, a DMA bitfield is used to manage the DMA FIFO threshold.

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 786fc8fcc38e..4099948b6914 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -116,6 +116,10 @@
 #define STM32_DMA_MAX_DATA_PARAM   0x03
 #define STM32_DMA_MAX_BURST   16
 
+/* DMA Features */
+#define STM32_DMA_THRESHOLD_FTR_MASK   GENMASK(1, 0)
+#define STM32_DMA_THRESHOLD_FTR_GET(n) ((n) & STM32_DMA_THRESHOLD_FTR_MASK)
+
 enum stm32_dma_width {
STM32_DMA_BYTE,
STM32_DMA_HALF_WORD,
@@ -129,11 +133,18 @@ enum stm32_dma_burst_size {
STM32_DMA_BURST_INCR16,
 };
 
+/**
+ * struct stm32_dma_cfg - STM32 DMA custom configuration
+ * @channel_id: channel ID
+ * @request_line: DMA request
+ * @stream_config: 32bit mask specifying the DMA channel configuration
+ * @features: 32bit mask specifying the DMA Feature list
+ */
 struct stm32_dma_cfg {
u32 channel_id;
u32 request_line;
u32 stream_config;
-   u32 threshold;
+   u32 features;
 };
 
 struct stm32_dma_chan_reg {
@@ -171,6 +182,7 @@ struct stm32_dma_chan {
u32 next_sg;
struct dma_slave_config dma_sconfig;
struct stm32_dma_chan_reg chan_reg;
+   u32 threshold;
 };
 
 struct stm32_dma_device {
@@ -976,7 +988,8 @@ static void stm32_dma_set_config(struct stm32_dma_chan *chan,
/* Enable Interrupts  */
chan->chan_reg.dma_scr |= STM32_DMA_SCR_TEIE | STM32_DMA_SCR_TCIE;
 
-   chan->chan_reg.dma_sfcr = cfg->threshold & STM32_DMA_SFCR_FTH_MASK;
+   chan->threshold = STM32_DMA_THRESHOLD_FTR_GET(cfg->features);
+   chan->chan_reg.dma_sfcr = STM32_DMA_SFCR_FTH(chan->threshold);
 }
 
 static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec,
@@ -996,7 +1009,7 @@ static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec,
cfg.channel_id = dma_spec->args[0];
cfg.request_line = dma_spec->args[1];
cfg.stream_config = dma_spec->args[2];
-   cfg.threshold = dma_spec->args[3];
+   cfg.features = dma_spec->args[3];
 
if ((cfg.channel_id >= STM32_DMA_MAX_CHANNELS) ||
(cfg.request_line >= STM32_DMA_MAX_REQUEST_ID)) {
-- 
2.19.0.605.g01d371f741-goog



[PATCH 3/7] dmaengine: stm32-dma: fix typo and reported checkpatch warnings

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

Fix a typo in a comment and solve the reported checkpatch warnings.

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index fae7de54f00a..b64e14a83dec 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -60,7 +60,8 @@
 #define STM32_DMA_SCR_PINC BIT(9) /* Peripheral increment mode */
 #define STM32_DMA_SCR_CIRC BIT(8) /* Circular mode */
 #define STM32_DMA_SCR_PFCTRL   BIT(5) /* Peripheral Flow Controller */
-#define STM32_DMA_SCR_TCIE BIT(4) /* Transfer Cplete Int Enable*/
+#define STM32_DMA_SCR_TCIE BIT(4) /* Transfer Complete Int Enable
+   */
 #define STM32_DMA_SCR_TEIE BIT(2) /* Transfer Error Int Enable */
 #define STM32_DMA_SCR_DMEIEBIT(1) /* Direct Mode Err Int Enable */
 #define STM32_DMA_SCR_EN   BIT(0) /* Stream Enable */
@@ -918,7 +919,7 @@ static enum dma_status stm32_dma_tx_status(struct dma_chan *c,
u32 residue = 0;
 
status = dma_cookie_status(c, cookie, state);
-   if ((status == DMA_COMPLETE) || (!state))
+   if (status == DMA_COMPLETE || !state)
return status;
 
spin_lock_irqsave(&chan->vchan.lock, flags);
@@ -982,7 +983,7 @@ static void stm32_dma_desc_free(struct virt_dma_desc *vdesc)
 }
 
 static void stm32_dma_set_config(struct stm32_dma_chan *chan,
- struct stm32_dma_cfg *cfg)
+struct stm32_dma_cfg *cfg)
 {
stm32_dma_clear_reg(&chan->chan_reg);
 
@@ -1015,8 +1016,8 @@ static struct dma_chan *stm32_dma_of_xlate(struct of_phandle_args *dma_spec,
cfg.stream_config = dma_spec->args[2];
cfg.features = dma_spec->args[3];
 
-   if ((cfg.channel_id >= STM32_DMA_MAX_CHANNELS) ||
-   (cfg.request_line >= STM32_DMA_MAX_REQUEST_ID)) {
+   if (cfg.channel_id >= STM32_DMA_MAX_CHANNELS ||
+   cfg.request_line >= STM32_DMA_MAX_REQUEST_ID) {
dev_err(dev, "Bad channel and/or request id\n");
return NULL;
}
-- 
2.19.0.605.g01d371f741-goog



[PATCH 5/7] dmaengine: stm32-dma: fix DMA IRQ status handling

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

Update the way the Transfer Complete and Half Transfer Complete statuses
are acknowledged. Even if HTI is not enabled, its status is shown when
reading the registers; the driver has to clear it gently and not raise an
error.

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 21ad359a5a59..b40486454a2c 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -34,9 +34,14 @@
 #define STM32_DMA_LIFCR   0x0008 /* DMA Low Int Flag Clear Reg */
 #define STM32_DMA_HIFCR   0x000c /* DMA High Int Flag Clear Reg */
 #define STM32_DMA_TCI  BIT(5) /* Transfer Complete Interrupt */
+#define STM32_DMA_HTI  BIT(4) /* Half Transfer Interrupt */
 #define STM32_DMA_TEI  BIT(3) /* Transfer Error Interrupt */
 #define STM32_DMA_DMEI BIT(2) /* Direct Mode Error Interrupt */
 #define STM32_DMA_FEI  BIT(0) /* FIFO Error Interrupt */
+#define STM32_DMA_MASKI   (STM32_DMA_TCI \
+   | STM32_DMA_TEI \
+   | STM32_DMA_DMEI \
+   | STM32_DMA_FEI)
 
 /* DMA Stream x Configuration Register */
 #define STM32_DMA_SCR(x)   (0x0010 + 0x18 * (x)) /* x = 0..7 */
@@ -643,13 +648,29 @@ static irqreturn_t stm32_dma_chan_irq(int irq, void *devid)
status = stm32_dma_irq_status(chan);
scr = stm32_dma_read(dmadev, STM32_DMA_SCR(chan->id));
 
-   if ((status & STM32_DMA_TCI) && (scr & STM32_DMA_SCR_TCIE)) {
+   if (status & STM32_DMA_TCI) {
stm32_dma_irq_clear(chan, STM32_DMA_TCI);
-   stm32_dma_handle_chan_done(chan);
-
-   } else {
+   if (scr & STM32_DMA_SCR_TCIE)
+   stm32_dma_handle_chan_done(chan);
+   status &= ~STM32_DMA_TCI;
+   }
+   if (status & STM32_DMA_HTI) {
+   stm32_dma_irq_clear(chan, STM32_DMA_HTI);
+   status &= ~STM32_DMA_HTI;
+   }
+   if (status & STM32_DMA_FEI) {
+   stm32_dma_irq_clear(chan, STM32_DMA_FEI);
+   status &= ~STM32_DMA_FEI;
+   if (!(scr & STM32_DMA_SCR_EN))
+   dev_err(chan2dev(chan), "FIFO Error\n");
+   else
+   dev_dbg(chan2dev(chan), "FIFO over/underrun\n");
+   }
+   if (status) {
stm32_dma_irq_clear(chan, status);
dev_err(chan2dev(chan), "DMA error: status=0x%08x\n", status);
+   if (!(scr & STM32_DMA_SCR_EN))
+   dev_err(chan2dev(chan), "chan disabled by HW\n");
}
 
spin_unlock(&chan->vchan.lock);
-- 
2.19.0.605.g01d371f741-goog



[PATCH 4/7] dmaengine: stm32-dma: Improve memory burst management

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

This patch improves memory burst capability by using the best burst size
according to the size of the buffer transferred from/to memory.

From now on, the memory burst is not necessarily the same as the
peripheral burst, and the FIFO threshold is directly managed by this
driver in order to fit the computed memory burst.

Signed-off-by: M'boumba Cedric Madianga 
Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 204 ++--
 1 file changed, 175 insertions(+), 29 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index b64e14a83dec..21ad359a5a59 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -5,6 +5,7 @@
  *
  * Copyright (C) M'boumba Cedric Madianga 2015
  * Author: M'boumba Cedric Madianga 
+ * Pierre-Yves Mordret 
  *
  * License terms:  GNU General Public License (GPL), version 2
  */
@@ -115,6 +116,8 @@
 #define STM32_DMA_MAX_CHANNELS 0x08
 #define STM32_DMA_MAX_REQUEST_ID   0x08
 #define STM32_DMA_MAX_DATA_PARAM   0x03
+#define STM32_DMA_FIFO_SIZE   16  /* FIFO is 16 bytes */
+#define STM32_DMA_MIN_BURST   4
 #define STM32_DMA_MAX_BURST   16
 
 /* DMA Features */
@@ -184,6 +187,8 @@ struct stm32_dma_chan {
struct dma_slave_config dma_sconfig;
struct stm32_dma_chan_reg chan_reg;
u32 threshold;
+   u32 mem_burst;
+   u32 mem_width;
 };
 
 struct stm32_dma_device {
@@ -248,6 +253,85 @@ static int stm32_dma_get_width(struct stm32_dma_chan *chan,
}
 }
 
+static enum dma_slave_buswidth stm32_dma_get_max_width(u32 buf_len,
+  u32 threshold)
+{
+   enum dma_slave_buswidth max_width;
+
+   if (threshold == STM32_DMA_FIFO_THRESHOLD_FULL)
+   max_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+   else
+   max_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
+
+   while ((buf_len < max_width  || buf_len % max_width) &&
+  max_width > DMA_SLAVE_BUSWIDTH_1_BYTE)
+   max_width = max_width >> 1;
+
+   return max_width;
+}
+
+static bool stm32_dma_fifo_threshold_is_allowed(u32 burst, u32 threshold,
+   enum dma_slave_buswidth width)
+{
+   u32 remaining;
+
+   if (width != DMA_SLAVE_BUSWIDTH_UNDEFINED) {
+   if (burst != 0) {
+   /*
+* If number of beats fit in several whole bursts
+* this configuration is allowed.
+*/
+   remaining = ((STM32_DMA_FIFO_SIZE / width) *
+(threshold + 1) / 4) % burst;
+
+   if (remaining == 0)
+   return true;
+   } else {
+   return true;
+   }
+   }
+
+   return false;
+}
+
+static bool stm32_dma_is_burst_possible(u32 buf_len, u32 threshold)
+{
+   switch (threshold) {
+   case STM32_DMA_FIFO_THRESHOLD_FULL:
+   if (buf_len >= STM32_DMA_MAX_BURST)
+   return true;
+   else
+   return false;
+   case STM32_DMA_FIFO_THRESHOLD_HALFFULL:
+   if (buf_len >= STM32_DMA_MAX_BURST / 2)
+   return true;
+   else
+   return false;
+   default:
+   return false;
+   }
+}
+
+static u32 stm32_dma_get_best_burst(u32 buf_len, u32 max_burst, u32 threshold,
+   enum dma_slave_buswidth width)
+{
+   u32 best_burst = max_burst;
+
+   if (best_burst == 1 || !stm32_dma_is_burst_possible(buf_len, threshold))
+   return 0;
+
+   while ((buf_len < best_burst * width && best_burst > 1) ||
+  !stm32_dma_fifo_threshold_is_allowed(best_burst, threshold,
+   width)) {
+   if (best_burst > STM32_DMA_MIN_BURST)
+   best_burst = best_burst >> 1;
+   else
+   best_burst = 0;
+   }
+
+   return best_burst;
+}
+
 static int stm32_dma_get_burst(struct stm32_dma_chan *chan, u32 maxburst)
 {
switch (maxburst) {
@@ -267,12 +351,12 @@ static int stm32_dma_get_burst(struct stm32_dma_chan 
*chan, u32 maxburst)
 }
 
 static void stm32_dma_set_fifo_config(struct stm32_dma_chan *chan,
- u32 src_maxburst, u32 dst_maxburst)
+ u32 src_burst, u32 dst_burst)
 {
chan->chan_reg.dma_sfcr &= ~STM32_DMA_SFCR_MASK;
chan->chan_reg.dma_scr &= ~STM32_DMA_SCR_DMEIE;
 
-   if ((!src_maxburst) && (!dst_maxburst)) {
+   if (!src_burst && !dst_burst) {
/* Using direct mode */
chan->chan_reg.dma_scr |= STM32_DMA_SCR_DMEIE;
} else {
@@ -589,37 

[PATCH 7/7] dmaengine: stm32-dma: properly mask irq bits

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

A single register of the controller holds the information for four DMA
channels.
The function stm32_dma_irq_status() doesn't mask the relevant bits after
the shift, thus the adjacent channel's status is also reported in the
returned value.
Fix this by masking the value before returning it.

Similarly, the function stm32_dma_irq_clear() doesn't mask the input value
before shifting it, thus an incorrect input value could disable the
interrupts of adjacent channels.
Fix this by masking the input value before using it.
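
A worked example of the leak (illustrative values; the shift and the
STM32_DMA_MASKI mask below are the ones used by the driver): the low ISR
packs six status bits per channel at bit offsets 0, 6, 16 and 22, so
without the final mask channel 1's bits show up in channel 0's status.

	u32 dma_isr = BIT(0) | BIT(11);		/* chan0 FEI, chan1 TCI pending */
	u32 flags   = dma_isr >> 0;		/* chan0 view: 0x801, TCI leaked */
	u32 fixed   = flags & STM32_DMA_MASKI;	/* 0x001: only chan0's own bits */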

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Antonio Borneo 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index 05a2974cd2c0..8c5807362a25 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -38,6 +38,10 @@
 #define STM32_DMA_TEI  BIT(3) /* Transfer Error Interrupt */
 #define STM32_DMA_DMEI BIT(2) /* Direct Mode Error Interrupt */
 #define STM32_DMA_FEI  BIT(0) /* FIFO Error Interrupt */
+#define STM32_DMA_MASKI   (STM32_DMA_TCI \
+   | STM32_DMA_TEI \
+   | STM32_DMA_DMEI \
+   | STM32_DMA_FEI)
 
 /* DMA Stream x Configuration Register */
 #define STM32_DMA_SCR(x)   (0x0010 + 0x18 * (x)) /* x = 0..7 */
@@ -405,7 +409,7 @@ static u32 stm32_dma_irq_status(struct stm32_dma_chan *chan)
 
flags = dma_isr >> (((chan->id & 2) << 3) | ((chan->id & 1) * 6));
 
-   return flags;
+   return flags & STM32_DMA_MASKI;
 }
 
 static void stm32_dma_irq_clear(struct stm32_dma_chan *chan, u32 flags)
@@ -420,6 +424,7 @@ static void stm32_dma_irq_clear(struct stm32_dma_chan 
*chan, u32 flags)
 * If (ch % 4) is 2 or 3, left shift the mask by 16 bits.
 * If (ch % 4) is 1 or 3, additionally left shift the mask by 6 bits.
 */
+   flags &= STM32_DMA_MASKI;
dma_ifcr = flags << (((chan->id & 2) << 3) | ((chan->id & 1) * 6));
 
if (chan->id & 4)
-- 
2.19.0.605.g01d371f741-goog



[PATCH 6/7] dmaengine: stm32-dma: fix max items per transfer

2018-10-08 Thread Joel Fernandes (Google)
From: Pierre Yves MORDRET 

Having 0 in the item counter register is valid and stands for a "no or
ended transfer". Therefore a valid transfer starts from @+0 to @+0xFFFE,
leading to unaligned scatter-gather at the boundary. Thus it is safer to
round this value down to the FIFO size (16 bytes).
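
Concretely, with the helper added below this amounts to (a one-line
illustration, assuming the usual 0xffff item limit):

	ALIGN_DOWN(0xffff, 16)	/* == 0xfff0, largest 16-byte-aligned count */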

Signed-off-by: Pierre-Yves MORDRET 
Signed-off-by: Vinod Koul 
---
 drivers/dma/stm32-dma.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c
index b40486454a2c..05a2974cd2c0 100644
--- a/drivers/dma/stm32-dma.c
+++ b/drivers/dma/stm32-dma.c
@@ -38,10 +38,6 @@
 #define STM32_DMA_TEI  BIT(3) /* Transfer Error Interrupt */
 #define STM32_DMA_DMEI BIT(2) /* Direct Mode Error Interrupt */
 #define STM32_DMA_FEI  BIT(0) /* FIFO Error Interrupt */
-#define STM32_DMA_MASKI   (STM32_DMA_TCI \
-   | STM32_DMA_TEI \
-   | STM32_DMA_DMEI \
-   | STM32_DMA_FEI)
 
 /* DMA Stream x Configuration Register */
 #define STM32_DMA_SCR(x)   (0x0010 + 0x18 * (x)) /* x = 0..7 */
@@ -118,6 +114,13 @@
 #define STM32_DMA_FIFO_THRESHOLD_FULL  0x03
 
 #define STM32_DMA_MAX_DATA_ITEMS   0xffff
+/*
+ * Valid transfer starts from @0 to @0xFFFE leading to unaligned scatter
+ * gather at boundary. Thus it's safer to round down this value on FIFO
+ * size (16 Bytes)
+ */
+#define STM32_DMA_ALIGNED_MAX_DATA_ITEMS   \
+   ALIGN_DOWN(STM32_DMA_MAX_DATA_ITEMS, 16)
 #define STM32_DMA_MAX_CHANNELS 0x08
 #define STM32_DMA_MAX_REQUEST_ID   0x08
 #define STM32_DMA_MAX_DATA_PARAM   0x03
@@ -869,7 +872,7 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_slave_sg(
desc->sg_req[i].len = sg_dma_len(sg);
 
nb_data_items = desc->sg_req[i].len / buswidth;
-   if (nb_data_items > STM32_DMA_MAX_DATA_ITEMS) {
+   if (nb_data_items > STM32_DMA_ALIGNED_MAX_DATA_ITEMS) {
dev_err(chan2dev(chan), "nb items not supported\n");
goto err;
}
@@ -935,7 +938,7 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_dma_cyclic(
return NULL;
 
nb_data_items = period_len / buswidth;
-   if (nb_data_items > STM32_DMA_MAX_DATA_ITEMS) {
+   if (nb_data_items > STM32_DMA_ALIGNED_MAX_DATA_ITEMS) {
dev_err(chan2dev(chan), "number of items not supported\n");
return NULL;
}
@@ -985,7 +988,7 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_dma_memcpy(
u32 num_sgs, best_burst, dma_burst, threshold;
int i;
 
-   num_sgs = DIV_ROUND_UP(len, STM32_DMA_MAX_DATA_ITEMS);
+   num_sgs = DIV_ROUND_UP(len, STM32_DMA_ALIGNED_MAX_DATA_ITEMS);
desc = stm32_dma_alloc_desc(num_sgs);
if (!desc)
return NULL;
@@ -994,7 +997,7 @@ static struct dma_async_tx_descriptor *stm32_dma_prep_dma_memcpy(
 
for (offset = 0, i = 0; offset < len; offset += xfer_count, i++) {
xfer_count = min_t(size_t, len - offset,
-  STM32_DMA_MAX_DATA_ITEMS);
+  STM32_DMA_ALIGNED_MAX_DATA_ITEMS);
 
/* Compute best burst size */
max_width = DMA_SLAVE_BUSWIDTH_1_BYTE;
-- 
2.19.0.605.g01d371f741-goog



[PATCH] mm: Speed up mremap on large regions

2018-10-09 Thread Joel Fernandes (Google)
Android needs to mremap large regions of memory during memory management
related operations. The mremap system call can be really slow if THP is
not enabled. The bottleneck is move_page_tables, which copies each pte
one at a time, and can be really slow across a large map. Turning on THP
may not be a viable option, and is not for us. This patch speeds up the
performance on non-THP systems by copying at the PMD level when possible.
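
As a back-of-the-envelope illustration (assuming 4KB pages and 2MB PMDs,
the x86-64 defaults), the per-1GB work roughly becomes:

    old path: 1GB / 4KB = 262,144 individual pte copies
    new path: 1GB / 2MB =     512 pmd entries moved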

The speedup is three orders of magnitude. On a 1GB mremap, the mremap
completion time drops from 160-250 milliseconds to 380-400 microseconds.

Before:
Total mremap time for 1GB data: 242321014 nanoseconds.
Total mremap time for 1GB data: 196842467 nanoseconds.
Total mremap time for 1GB data: 167051162 nanoseconds.

After:
Total mremap time for 1GB data: 385781 nanoseconds.
Total mremap time for 1GB data: 388959 nanoseconds.
Total mremap time for 1GB data: 402813 nanoseconds.

In case THP is enabled, the optimization is skipped. I also flush the
TLB every time we do this optimization since I couldn't find a way to
determine if the low-level PTEs are dirty. The cost of doing so is seen
to be small compared to the improvement, on both x86-64 and arm64.

Cc: minc...@google.com
Cc: hu...@google.com
Cc: lokeshgi...@google.com
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 
---
 mm/mremap.c | 62 +
 1 file changed, 62 insertions(+)

diff --git a/mm/mremap.c b/mm/mremap.c
index 5c2e18505f75..68ddc9e9dfde 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -191,6 +191,54 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
drop_rmap_locks(vma);
 }
 
+bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
+{
+   spinlock_t *old_ptl, *new_ptl;
+   struct mm_struct *mm = vma->vm_mm;
+
+   if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
+   || old_end - old_addr < PMD_SIZE)
+   return false;
+
+   /*
+* The destination pmd shouldn't be established, free_pgtables()
+* should have released it.
+*/
+   if (WARN_ON(!pmd_none(*new_pmd)))
+   return false;
+
+   /*
+* We don't have to worry about the ordering of src and dst
+* ptlocks because exclusive mmap_sem prevents deadlock.
+*/
+   old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+   if (old_ptl) {
+   pmd_t pmd;
+
+   new_ptl = pmd_lockptr(mm, new_pmd);
+   if (new_ptl != old_ptl)
+   spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+   /* Clear the pmd */
+   pmd = *old_pmd;
+   pmd_clear(old_pmd);
+
+   VM_BUG_ON(!pmd_none(*new_pmd));
+
+   /* Set the new pmd */
+   set_pmd_at(mm, new_addr, new_pmd, pmd);
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
+   spin_unlock(old_ptl);
+
+   *need_flush = true;
+   return true;
+   }
+   return false;
+}
+
 unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
@@ -239,7 +287,21 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
split_huge_pmd(vma, old_pmd, old_addr);
if (pmd_trans_unstable(old_pmd))
continue;
+   } else if (extent == PMD_SIZE) {
+   bool moved;
+
+   /* See comment in move_ptes() */
+   if (need_rmap_locks)
+   take_rmap_locks(vma);
+   moved = move_normal_pmd(vma, old_addr, new_addr,
+   old_end, old_pmd, new_pmd,
+   &need_flush);
+   if (need_rmap_locks)
+   drop_rmap_locks(vma);
+   if (moved)
+   continue;
}
+
if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
-- 
2.19.0.605.g01d371f741-goog



[PATCH v2 1/2] mm: Add an F_SEAL_FS_WRITE seal to memfd

2018-10-09 Thread Joel Fernandes (Google)
Android uses ashmem for sharing memory regions. We are looking forward
to migrating all usecases of ashmem to memfd so that we can possibly
remove the ashmem driver from staging in the future, while also
benefiting from using memfd and contributing to it. Note that staging
drivers are also not ABI and generally can be removed at any time.

One of the main usecases Android has is the ability to create a region
and mmap it as writeable, then drop its protection for "future" writes
while keeping the existing already mmap'ed writeable-region active.
This allows us to implement a usecase where receivers of the shared
memory buffer can get a read-only view, while the sender continues to
write to the buffer. See CursorWindow in Android for more details:
https://developer.android.com/reference/android/database/CursorWindow

This usecase cannot be implemented with the existing F_SEAL_WRITE seal.
To support the usecase, this patch adds a new F_SEAL_FS_WRITE seal which
prevents any future mmap and write syscalls from succeeding while
keeping the existing mmap active. The following program shows the seal
working in action:

int main() {
int ret, fd;
void *addr, *addr2, *addr3, *addr1;
ret = memfd_create_region("test_region", REGION_SIZE);
printf("ret=%d\n", ret);
fd = ret;

// Create map
addr = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr == MAP_FAILED)
printf("map 0 failed\n");
else
printf("map 0 passed\n");

if ((ret = write(fd, "test", 4)) != 4)
printf("write failed even though no fs-write seal "
   "(ret=%d errno =%d)\n", ret, errno);
else
printf("write passed\n");

addr1 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr1 == MAP_FAILED)
perror("map 1 prot-write failed even though no seal\n");
else
printf("map 1 prot-write passed as expected\n");

ret = fcntl(fd, F_ADD_SEALS, F_SEAL_FS_WRITE);
if (ret == -1)
printf("fcntl failed, errno: %d\n", errno);
else
printf("fs-write seal now active\n");

if ((ret = write(fd, "test", 4)) != 4)
printf("write failed as expected due to fs-write seal\n");
else
printf("write passed (unexpected)\n");

addr2 = mmap(0, REGION_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
if (addr2 == MAP_FAILED)
perror("map 2 prot-write failed as expected due to seal\n");
else
printf("map 2 passed\n");

addr3 = mmap(0, REGION_SIZE, PROT_READ, MAP_SHARED, fd, 0);
if (addr3 == MAP_FAILED)
perror("map 3 failed\n");
else
printf("map 3 prot-read passed as expected\n");
}

The output of running this program is as follows:
ret=3
map 0 passed
write passed
map 1 prot-write passed as expected
fs-write seal now active
write failed as expected due to fs-write seal
map 2 prot-write failed as expected due to seal
: Permission denied
map 3 prot-read passed as expected

Note: This seal will also prevent growing and shrinking of the memfd.
This is not something we do in Android so it does not affect us, however
I have mentioned this behavior of the seal in the manpage.
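
For example, continuing the test program above (and assuming <unistd.h> is
included), a resize attempt after the seal is added would be expected to
fail as well:

if (ftruncate(fd, REGION_SIZE * 2) == -1)
printf("ftruncate failed as expected due to fs-write seal\n");
else
printf("ftruncate passed (unexpected)\n");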

Cc: jr...@google.com
Cc: john.stu...@linaro.org
Cc: tk...@google.com
Cc: gre...@linuxfoundation.org
Signed-off-by: Joel Fernandes (Google) 
---
v1->v2: No change, just added selftests to the series. manpages are
ready and I'll submit them once the patches are accepted.

 include/uapi/linux/fcntl.h | 1 +
 mm/memfd.c | 6 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index c98312fa78a5..fe44a2035edf 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -41,6 +41,7 @@
 #define F_SEAL_SHRINK  0x0002  /* prevent file from shrinking */
 #define F_SEAL_GROW0x0004  /* prevent file from growing */
 #define F_SEAL_WRITE   0x0008  /* prevent writes */
+#define F_SEAL_FS_WRITE0x0010  /* prevent all write-related syscalls */
 /* (1U << 31) is reserved for signed error codes */
 
 /*
diff --git a/mm/memfd.c b/mm/memfd.c
index 27069518e3c5..9b8855b80de9 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -150,7 +150,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
 #define F_ALL_SEALS (F_SEAL_SEAL | \
 F_SEAL_SHRINK | \
 F_SEAL_GROW | \
-F_SEAL_WRITE)
+F_SEAL_WRITE | \
+F_SEAL_FS_WRITE)
 
 static int memfd_add_seals(struct file *file, unsigned int seals)
 {
@@ -219,6 +220,9 @@ static int memfd_add_seals(struct file *file, unsigned int 
seals)
}
}
 
+   if ((seals & F_SEAL_FS_WRITE) && !(*file_seals & F_SEAL_FS_WRITE))
+   file->f_mode &= ~(FMODE_WRITE | FMODE_PWRITE);
+
*file_seals |= seals;
error = 0;
 
-- 
2.19.0.605.g01d371f741-goog



[PATCH v2 2/2] selftests/memfd: Add tests for F_SEAL_FS_WRITE seal

2018-10-09 Thread Joel Fernandes (Google)
Add tests to verify that sealing memfds with the F_SEAL_FS_WRITE seal works
as expected.

Cc: dan...@google.com
Cc: minc...@google.com
Signed-off-by: Joel Fernandes (Google) 
---
 tools/testing/selftests/memfd/memfd_test.c | 51 +-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/memfd/memfd_test.c 
b/tools/testing/selftests/memfd/memfd_test.c
index 10baa1652fc2..4bd2b6c87bb4 100644
--- a/tools/testing/selftests/memfd/memfd_test.c
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -27,7 +27,7 @@
 
 #define MFD_DEF_SIZE 8192
 #define STACK_SIZE 65536
-
+#define F_SEAL_FS_WRITE 0x0010
 /*
  * Default is not to test hugetlbfs
  */
@@ -170,6 +170,24 @@ static void *mfd_assert_mmap_shared(int fd)
return p;
 }
 
+static void *mfd_fail_mmap_shared(int fd)
+{
+   void *p;
+
+   p = mmap(NULL,
+mfd_def_size,
+PROT_READ | PROT_WRITE,
+MAP_SHARED,
+fd,
+0);
+   if (p != MAP_FAILED) {
+   printf("mmap() didn't fail as expected\n");
+   abort();
+   }
+
+   return p;
+}
+
 static void *mfd_assert_mmap_private(int fd)
 {
void *p;
@@ -692,6 +710,36 @@ static void test_seal_write(void)
close(fd);
 }
 
+/*
+ * Test SEAL_FS_WRITE
+ * Test whether SEAL_FS_WRITE actually prevents modifications.
+ */
+static void test_seal_fs_write(void)
+{
+   int fd;
+   void *p;
+
+   printf("%s SEAL-FS-WRITE\n", memfd_str);
+
+   fd = mfd_assert_new("kern_memfd_seal_fs_write",
+   mfd_def_size,
+   MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+   p = mfd_assert_mmap_shared(fd);
+
+   /* FS_WRITE seal can be added even with existing
+* writeable mappings */
+   mfd_assert_has_seals(fd, 0);
+   mfd_assert_add_seals(fd, F_SEAL_FS_WRITE);
+   mfd_assert_has_seals(fd, F_SEAL_FS_WRITE);
+
+   mfd_assert_read(fd);
+   mfd_fail_write(fd);
+
+   munmap(p, mfd_def_size);
+   close(fd);
+}
+
 /*
  * Test SEAL_SHRINK
  * Test whether SEAL_SHRINK actually prevents shrinking
@@ -945,6 +993,7 @@ int main(int argc, char **argv)
test_basic();
 
test_seal_write();
+   test_seal_fs_write();
test_seal_shrink();
test_seal_grow();
test_seal_resize();
-- 
2.19.0.605.g01d371f741-goog



Re: [RFC v3 1/5] sched/core: add capacity constraints to CPU controller

2017-03-22 Thread Joel Fernandes (Google)
Hi,

On Mon, Mar 20, 2017 at 11:08 AM, Patrick Bellasi
 wrote:
> On 20-Mar 13:15, Tejun Heo wrote:
>> Hello,
>>
>> On Tue, Feb 28, 2017 at 02:38:38PM +, Patrick Bellasi wrote:
[..]
>> > These attributes:
>> > a) are tunable at all hierarchy levels, i.e. root group too
>>
>> This usually is problematic because there should be a non-cgroup way
>> of configuring the feature in case cgroup isn't configured or used,
>> and it becomes awkward to have two separate mechanisms configuring the
>> same thing.  Maybe the feature is cgroup specific enough that it makes
>> sense here but this needs more explanation / justification.
>
> In the previous proposal I used to expose global tunables under
> procfs, e.g.:
>
>  /proc/sys/kernel/sched_capacity_min
>  /proc/sys/kernel/sched_capacity_max
>

But then we would lose out on being able to attach capacity
constraints to specific tasks or groups of tasks?

> which can be used to defined tunable root constraints when CGroups are
> not available, and becomes RO when CGroups are.
>
> Can this be eventually an acceptable option?
>
> In any case I think that this feature will be mainly targeting CGroup
> based systems. Indeed, one of the main goals is to collect
> "application specific" information from "informed run-times". Being
> "application specific" means that we need a way to classify
> applications depending on the runtime context... and that capability
> in Linux is ultimately provided via the CGroup interface.

I think the concern raised is more about whether CGroups is the right
interface to use for attaching capacity constraints to tasks or groups
of tasks, or is there a better way to attach such constraints?

I am actually looking at a workload where it's desirable to attach such
constraints to only one thread or task. In this case it would be a bit
overkill to use CGroups to attach such a property just for one task with
specific constraints, and it would be beneficial if, along with the
CGroup interface, there were also an interface to attach it to individual
tasks. The other advantage of such an interface is that we don't have to
create a separate CGroup for every new constraint limit and can have
several tasks with different unique constraints.

Regards,
Joel


Re: [RFC v3 1/5] sched/core: add capacity constraints to CPU controller

2017-03-24 Thread Joel Fernandes (Google)
Hi Tejun,

>> That's also why the proposed interface has now been defined as a extension of
>> the CPU controller in such a way to keep a consistent view.
>>
>> This controller is already used by run-times like Android to "scope" apps by
>> constraining the amount of CPUs resource they are getting.
>> Is that not a legitimate usage of the cpu controller?
>>
>> What we are doing here is just extending it a bit in such a way that, while:
>>
>>   {cfs,rt}_{period,runtime}_us limits the amount of TIME we can use a CPU
>>
>> we can also use:
>>
>>   capacity_{min,max} to limit the actual COMPUTATIONAL BANDWIDTH we can use
>>  during that time.
>
> Yes, we do have bandwidth restriction as a cgroup only feature, which
> is different from how we handle nice levels and weights.  Given the
> nature of bandwidth limits, if necessary, it is straight-forward to
> expose per-task interface.
>
> capacity min/max isn't the same thing.  It isn't a limit on countable
> units of a specific resource and that's why the interface you
> suggested for .min is different.  It's restricting attribute set which
> can be picked in the subhierarchy rather than controlling distribution
> of atoms of the resource.
>
> That's also why we're gonna have problem if we later decide we need a
> thread based API for it.  Once we make cgroup the primary owner of the
> attribute, it's not straight forward to add another owner.

Sorry, I don't immediately see why it is not straightforward to have a
per-task API later, once the CGroup interface is added. Maybe if you don't
mind giving an example, that will help?

I can start with an example, say you have a single level hierarchy
(Top-app in Android terms is the set of tasks that are user facing and
we'd like to enforce some capacity minimums, background on the other
hand is the opposite):

   ROOT (min = 0, max = 1024)
   / \
  /   \
  TOP-APP (min = 200, max = 1024)  BACKGROUND (min = 0, max = 500)

If, in the future, we want to have a per-task API to individually
configure a task with these limits, it seems it will be straightforward
to implement IMO.
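
Purely as an illustration of the cgroup side (the cpu.capacity_{min,max}
attribute names and the mount paths below are assumptions based on this
RFC, not a final interface), an informed runtime could program the
hierarchy above with plain writes to the cgroup filesystem:

#include <stdio.h>

/* Illustrative helper: write one value to a cgroup attribute file. */
static void cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	cg_write("/sys/fs/cgroup/cpu/top-app/cpu.capacity_min", "200");
	cg_write("/sys/fs/cgroup/cpu/top-app/cpu.capacity_max", "1024");
	cg_write("/sys/fs/cgroup/cpu/background/cpu.capacity_min", "0");
	cg_write("/sys/fs/cgroup/cpu/background/cpu.capacity_max", "500");
	return 0;
}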

As Patrick mentioned, all of the usecases needing this right now is an
informed runtime placing a task in a group of tasks and not needing to
set attributes for each individual task. We are already placing tasks
in individual CGroups in Android based on the information the runtime
has so adding in the capacity constraints will make it fit naturally
while leaving the door open for any future per-task API additions IMO.

Thanks,

Joel


Re: [RFC v3 1/5] sched/core: add capacity constraints to CPU controller

2017-03-24 Thread Joel Fernandes (Google)
Hi Patrick,

On Thu, Mar 23, 2017 at 3:32 AM, Patrick Bellasi
 wrote:
[..]
>> > which can be used to defined tunable root constraints when CGroups are
>> > not available, and becomes RO when CGroups are.
>> >
>> > Can this be eventually an acceptable option?
>> >
>> > In any case I think that this feature will be mainly targeting CGroup
>> > based systems. Indeed, one of the main goals is to collect
>> > "application specific" information from "informed run-times". Being
>> > "application specific" means that we need a way to classify
>> > applications depending on the runtime context... and that capability
>> > in Linux is ultimately provided via the CGroup interface.
>>
>> I think the concern raised is more about whether CGroups is the right
>> interface to use for attaching capacity constraints to task or groups
>> of tasks, or is there a better way to attach such constraints?
>
> Notice that CGroups based classification allows to easily enforce
> the concept of "delegation containment". I think this feature should
> be nice to have whatever interface we choose.
>
> However, potentially we can define a proper per-task API; are you
> thinking to something specifically?
>

I was thinking: how about adding per-task constraints to the resource
limits API, if that makes sense? There's already RLIMIT_CPU and
RLIMIT_NICE. An informed runtime could then modify the limits of tasks
using prlimit.
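
To make that concrete, here is a minimal sketch of how an informed runtime
adjusts a per-task limit with prlimit(2) today, using RLIMIT_NICE as the
stand-in; a capacity constraint exposed as a new, hypothetical rlimit would
be set the same way (the pid below is just an example):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/resource.h>
#include <sys/types.h>

/* Set a limit on a single task, the way a runtime daemon would. A
 * hypothetical capacity rlimit would use the same call with a
 * different resource number. */
static int set_task_limit(pid_t pid, rlim_t val)
{
	struct rlimit lim = { .rlim_cur = val, .rlim_max = val };

	return prlimit(pid, RLIMIT_NICE, &lim, NULL);
}

int main(void)
{
	pid_t pid = 1234;	/* example pid of the target task */

	if (set_task_limit(pid, 30))
		perror("prlimit");
	return 0;
}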

>> The other advantage of such interface is we don't have to
>> create a separate CGroup for every new constraint limit and can have
>> several tasks with different unique constraints.
>
> That's still possible using CGroups and IMO it will not be the "most
> common case".
> Don't you think that in general we will need to set constraints at
> applications level, thus group of tasks?

Some applications could be a single task; also, not all tasks in an
application may need constraints, right?

> As a general rule we should probably go for an interface which makes
> easy the most common case.

I agree.

Thanks,
Joel


Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-13 Thread Joel Fernandes (Google)
Hi Patrick,

On Tue, Feb 28, 2017 at 6:38 AM, Patrick Bellasi
 wrote:
> Currently schedutil enforce a maximum OPP when RT/DL tasks are RUNNABLE.
> Such a mandatory policy can be made more tunable from userspace thus
> allowing for example to define a reasonable max capacity (i.e.
> frequency) which is required for the execution of a specific RT/DL
> workload. This will contribute to make the RT class more "friendly" for
> power/energy sensible applications.
>
> This patch extends the usage of capacity_{min,max} to the RT/DL classes.
> Whenever a task in these classes is RUNNABLE, the capacity required is
> defined by the constraints of the control group that task belongs to.
>

We briefly discussed at Linaro Connect that this works well for
sporadic RT tasks that run briefly and then sleep for long periods of
time - so certainly this patch is good, but it's only a partial
solution to the problem of frequent short-sleepers, and something is
required to keep the boost active for short non-RUNNABLE periods as well.
The behavior with many periodic RT tasks is that they will sleep for
short intervals and run for short intervals periodically. In this case
removing the clamp (or the boost as in schedtune v2) on a dequeue will
essentially mean that, during a narrow window, cpufreq can drop the
frequency, only to make it go back up again.

Currently for schedtune v2, I am working on prototyping something like
the following for Android:
- if RT task is enqueue, introduce the boost.
- When task is dequeued, start a timer for a  "minimum deboost delay
time" before taking out the boost.
- If task is enqueued again before the timer fires, then cancel the timer.

I don't think any "fix" to this particular issue should be to the
schedutil governor and should be sorted before going to cpufreq itself
(that is before making the request). What do you think about this?
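
A rough kernel-side sketch of the prototype described above (illustrative
only: struct rt_boost and rt_task_set_boost() are made-up names for this
sketch, not existing kernel code):

struct rt_boost {
	struct hrtimer		timer;
	struct task_struct	*task;
	u64			deboost_delay_ns;
};

static enum hrtimer_restart deboost_timer_fn(struct hrtimer *timer)
{
	struct rt_boost *rb = container_of(timer, struct rt_boost, timer);

	rt_task_set_boost(rb->task, false);	/* drop the boost/clamp */
	return HRTIMER_NORESTART;
}

/* On enqueue: cancel any pending deboost and (re)apply the boost. */
static void rt_boost_enqueue(struct rt_boost *rb)
{
	hrtimer_try_to_cancel(&rb->timer);
	rt_task_set_boost(rb->task, true);
}

/* On dequeue: keep the boost for a minimum deboost delay. */
static void rt_boost_dequeue(struct rt_boost *rb)
{
	hrtimer_start(&rb->timer, ns_to_ktime(rb->deboost_delay_ns),
		      HRTIMER_MODE_REL);
}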

Thanks,
Joel


Re: [RFC v3 1/5] sched/core: add capacity constraints to CPU controller

2017-03-13 Thread Joel Fernandes (Google)
On Tue, Feb 28, 2017 at 6:38 AM, Patrick Bellasi
 wrote:
> The CPU CGroup controller allows to assign a specified (maximum)
> bandwidth to tasks within a group, however it does not enforce any
> constraint on how such bandwidth can be consumed.
> With the integration of schedutil, the scheduler has now the proper
> information about a task to select  the most suitable frequency to
> satisfy tasks needs.
[..]

> +static u64 cpu_capacity_min_read_u64(struct cgroup_subsys_state *css,
> +struct cftype *cft)
> +{
> +   struct task_group *tg;
> +   u64 min_capacity;
> +
> +   rcu_read_lock();
> +   tg = css_tg(css);
> +   min_capacity = tg->cap_clamp[CAP_CLAMP_MIN];

Shouldn't the cap_clamp be accessed with READ_ONCE (and WRITE_ONCE in
the write path) to avoid load-tearing?
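
i.e. something along these lines (illustrative only; the corresponding
write-path function name is assumed from this RFC):

	min_capacity = READ_ONCE(tg->cap_clamp[CAP_CLAMP_MIN]);

	/* ... and in cpu_capacity_min_write_u64(): */
	WRITE_ONCE(tg->cap_clamp[CAP_CLAMP_MIN], value);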

Thanks,
Joel


[PATCH v8 -tip 01/26] sched: Wrap rq::lock access

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

In preparation of playing games with rq->lock, abstract the thing
using an accessor.

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Vineeth Remanan Pillai 
Signed-off-by: Julien Desfossez 
---
 kernel/sched/core.c |  46 +-
 kernel/sched/cpuacct.c  |  12 ++---
 kernel/sched/deadline.c |  18 +++
 kernel/sched/debug.c|   4 +-
 kernel/sched/fair.c |  38 +++
 kernel/sched/idle.c |   4 +-
 kernel/sched/pelt.h |   2 +-
 kernel/sched/rt.c   |   8 +--
 kernel/sched/sched.h| 105 +---
 kernel/sched/topology.c |   4 +-
 10 files changed, 122 insertions(+), 119 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab5..97181b3d12eb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -186,12 +186,12 @@ struct rq *__task_rq_lock(struct task_struct *p, struct 
rq_flags *rf)
 
for (;;) {
rq = task_rq(p);
-   raw_spin_lock(&rq->lock);
+   raw_spin_lock(rq_lockp(rq));
if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
rq_pin_lock(rq, rf);
return rq;
}
-   raw_spin_unlock(&rq->lock);
+   raw_spin_unlock(rq_lockp(rq));
 
while (unlikely(task_on_rq_migrating(p)))
cpu_relax();
@@ -210,7 +210,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct 
rq_flags *rf)
for (;;) {
raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
rq = task_rq(p);
-   raw_spin_lock(&rq->lock);
+   raw_spin_lock(rq_lockp(rq));
/*
 *  move_queued_task()  task_rq_lock()
 *
@@ -232,7 +232,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct 
rq_flags *rf)
rq_pin_lock(rq, rf);
return rq;
}
-   raw_spin_unlock(&rq->lock);
+   raw_spin_unlock(rq_lockp(rq));
raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 
while (unlikely(task_on_rq_migrating(p)))
@@ -302,7 +302,7 @@ void update_rq_clock(struct rq *rq)
 {
s64 delta;
 
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
 
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
@@ -611,7 +611,7 @@ void resched_curr(struct rq *rq)
struct task_struct *curr = rq->curr;
int cpu;
 
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
 
if (test_tsk_need_resched(curr))
return;
@@ -635,10 +635,10 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
 
-   raw_spin_lock_irqsave(&rq->lock, flags);
+   raw_spin_lock_irqsave(rq_lockp(rq), flags);
if (cpu_online(cpu) || cpu == smp_processor_id())
resched_curr(rq);
-   raw_spin_unlock_irqrestore(&rq->lock, flags);
+   raw_spin_unlock_irqrestore(rq_lockp(rq), flags);
 }
 
 #ifdef CONFIG_SMP
@@ -1137,7 +1137,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct 
task_struct *p,
struct uclamp_se *uc_se = &p->uclamp[clamp_id];
struct uclamp_bucket *bucket;
 
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
 
/* Update task effective clamp */
p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
@@ -1177,7 +1177,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct 
task_struct *p,
unsigned int bkt_clamp;
unsigned int rq_clamp;
 
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
 
/*
 * If sched_uclamp_used was enabled after task @p was enqueued,
@@ -1733,7 +1733,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, 
int cpu)
 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
   struct task_struct *p, int new_cpu)
 {
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
 
deactivate_task(rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, new_cpu);
@@ -1845,7 +1845,7 @@ void do_set_cpus_allowed(struct task_struct *p, const 
struct cpumask *new_mask)
 * Because __kthread_bind() calls this on blocked tasks without
 * holding rq->lock.
 */
-   lockdep_assert_held(&rq->lock);
+   lockdep_assert_held(rq_lockp(rq));
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
@@ -1982,7 +1982,7 @@ void set_task_cpu(struct task_struct *p, unsigned int 
new_cpu)
 * task_rq_lock().
 */
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock)));
+   

[PATCH v8 -tip 00/26] Core scheduling

2020-10-19 Thread Joel Fernandes (Google)
Eighth iteration of the Core-Scheduling feature.

Core scheduling is a feature that allows only trusted tasks to run
concurrently on cpus sharing compute resources (eg: hyperthreads on a
core). The goal is to mitigate core-level side-channel attacks
without requiring SMT to be disabled (which has a significant impact on
performance in some situations). Core scheduling (as of v7) mitigates
user-space to user-space attacks and user-to-kernel attacks when one of
the siblings enters the kernel via interrupts or system calls.

By default, the feature doesn't change any of the current scheduler
behavior. The user decides which tasks can run simultaneously on the
same core (for now by having them in the same tagged cgroup). When a tag
is enabled in a cgroup and a task from that cgroup is running on a
hardware thread, the scheduler ensures that only idle or trusted tasks
run on the other sibling(s). Besides security concerns, this feature can
also be beneficial for RT and performance applications where we want to
control how tasks make use of SMT dynamically.

This iteration focuses on the following:
- Redesigned API.
- Rework of Kernel Protection feature based on Thomas's entry work.
- Rework of hotplug fixes.
- Address review comments in v7

Joel: Both a CGroup and Per-task interface via prctl(2) are provided for
configuring core sharing. More details are provided in documentation patch.
Kselftests are provided to verify the correctness/rules of the interface.

Julien: TPCC tests showed improvements with core-scheduling. With kernel
protection enabled, it does not show any regression. Possibly ASI will improve
the performance for those who choose kernel protection (can be toggled through
sched_core_protect_kernel sysctl). Results:
v8                              average     stdev          diff
baseline (SMT on)               1197.272    44.78312824
core sched (   kernel protect)  412.9895    45.42734343    -65.51%
core sched (no kernel protect)  686.6515    71.77756931    -42.65%
nosmt                           408.667     39.39042872    -65.87%

v8 is rebased on tip/master.

Future work
===
- Load balancing/Migration fixes for core scheduling.
  With v6, Load balancing is partially coresched aware, but has some
  issues w.r.t process/taskgroup weights:
  https://lwn.net/ml/linux-kernel/20200225034438.GA617271@z...
- Core scheduling test framework: kselftests, torture tests etc

Changes in v8
=
- New interface/API implementation
  - Joel
- Revised kernel protection patch
  - Joel
- Revised Hotplug fixes
  - Joel
- Minor bug fixes and address review comments
  - Vineeth

Changes in v7
=
- Kernel protection from untrusted usermode tasks
  - Joel, Vineeth
- Fix for hotplug crashes and hangs
  - Joel, Vineeth

Changes in v6
=
- Documentation
  - Joel
- Pause siblings on entering nmi/irq/softirq
  - Joel, Vineeth
- Fix for RCU crash
  - Joel
- Fix for a crash in pick_next_task
  - Yu Chen, Vineeth
- Minor re-write of core-wide vruntime comparison
  - Aaron Lu
- Cleanup: Address Review comments
- Cleanup: Remove hotplug support (for now)
- Build fixes: 32 bit, SMT=n, AUTOGROUP=n etc
  - Joel, Vineeth

Changes in v5
=
- Fixes for cgroup/process tagging during corner cases like cgroup
  destroy, task moving across cgroups etc
  - Tim Chen
- Coresched aware task migrations
  - Aubrey Li
- Other minor stability fixes.

Changes in v4
=
- Implement a core wide min_vruntime for vruntime comparison of tasks
  across cpus in a core.
  - Aaron Lu
- Fixes a typo bug in setting the forced_idle cpu.
  - Aaron Lu

Changes in v3
=
- Fixes the issue of sibling picking up an incompatible task
  - Aaron Lu
  - Vineeth Pillai
  - Julien Desfossez
- Fixes the issue of starving threads due to forced idle
  - Peter Zijlstra
- Fixes the refcounting issue when deleting a cgroup with tag
  - Julien Desfossez
- Fixes a crash during cpu offline/online with coresched enabled
  - Vineeth Pillai
- Fixes a comparison logic issue in sched_core_find
  - Aaron Lu

Changes in v2
=
- Fixes for couple of NULL pointer dereference crashes
  - Subhra Mazumdar
  - Tim Chen
- Improves priority comparison logic for process in different cpus
  - Peter Zijlstra
  - Aaron Lu
- Fixes a hard lockup in rq locking
  - Vineeth Pillai
  - Julien Desfossez
- Fixes a performance issue seen on IO heavy workloads
  - Vineeth Pillai
  - Julien Desfossez
- Fix for 32bit build
  - Aubrey Li

Aubrey Li (1):
sched: migration changes for core scheduling

Joel Fernandes (Google) (13):
sched/fair: Snapshot the min_vruntime of CPUs on force idle
arch/x86: Add a new TIF flag for untrusted tasks
kernel/entry: Add support for core-wide protection of kernel-mode
entry/idle: Enter and exit kernel protection during idle entry and
exit
sched: Split the cookie and setup per-task cookie on fork
sched: Add a per-thread core scheduling interface
sched: Add a second

[PATCH v8 -tip 02/26] sched: Introduce sched_class::pick_task()

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Because sched_class::pick_next_task() also implies
sched_class::set_next_task() (and possibly put_prev_task() and
newidle_balance) it is not state invariant. This makes it unsuitable
for remote task selection.

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Vineeth Remanan Pillai 
Signed-off-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/deadline.c  | 16 ++--
 kernel/sched/fair.c  | 32 +++-
 kernel/sched/idle.c  |  8 
 kernel/sched/rt.c| 14 --
 kernel/sched/sched.h |  3 +++
 kernel/sched/stop_task.c | 13 +++--
 6 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 814ec49502b1..0271a7848ab3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1848,7 +1848,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct 
rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-static struct task_struct *pick_next_task_dl(struct rq *rq)
+static struct task_struct *pick_task_dl(struct rq *rq)
 {
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
@@ -1860,7 +1860,18 @@ static struct task_struct *pick_next_task_dl(struct rq 
*rq)
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
-   set_next_task_dl(rq, p, true);
+
+   return p;
+}
+
+static struct task_struct *pick_next_task_dl(struct rq *rq)
+{
+   struct task_struct *p;
+
+   p = pick_task_dl(rq);
+   if (p)
+   set_next_task_dl(rq, p, true);
+
return p;
 }
 
@@ -2517,6 +2528,7 @@ const struct sched_class dl_sched_class
 
 #ifdef CONFIG_SMP
.balance= balance_dl,
+   .pick_task  = pick_task_dl,
.select_task_rq = select_task_rq_dl,
.migrate_task_rq= migrate_task_rq_dl,
.set_cpus_allowed   = set_cpus_allowed_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbd9368a959d..bd6aed63f5e3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4450,7 +4450,7 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *curr)
 * Avoid running the skip buddy, if running something else can
 * be done without getting too unfair.
 */
-   if (cfs_rq->skip == se) {
+   if (cfs_rq->skip && cfs_rq->skip == se) {
struct sched_entity *second;
 
if (se == curr) {
@@ -6976,6 +6976,35 @@ static void check_preempt_wakeup(struct rq *rq, struct 
task_struct *p, int wake_
set_last_buddy(se);
 }
 
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_fair(struct rq *rq)
+{
+   struct cfs_rq *cfs_rq = &rq->cfs;
+   struct sched_entity *se;
+
+   if (!cfs_rq->nr_running)
+   return NULL;
+
+   do {
+   struct sched_entity *curr = cfs_rq->curr;
+
+   se = pick_next_entity(cfs_rq, NULL);
+
+   if (curr) {
+   if (se && curr->on_rq)
+   update_curr(cfs_rq);
+
+   if (!se || entity_before(curr, se))
+   se = curr;
+   }
+
+   cfs_rq = group_cfs_rq(se);
+   } while (cfs_rq);
+
+   return task_of(se);
+}
+#endif
+
 struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags 
*rf)
 {
@@ -11173,6 +11202,7 @@ const struct sched_class fair_sched_class
 
 #ifdef CONFIG_SMP
.balance= balance_fair,
+   .pick_task  = pick_task_fair,
.select_task_rq = select_task_rq_fair,
.migrate_task_rq= migrate_task_rq_fair,
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8ce6e80352cf..ce7552c6bc65 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -405,6 +405,13 @@ static void set_next_task_idle(struct rq *rq, struct 
task_struct *next, bool fir
schedstat_inc(rq->sched_goidle);
 }
 
+#ifdef CONFIG_SMP
+static struct task_struct *pick_task_idle(struct rq *rq)
+{
+   return rq->idle;
+}
+#endif
+
 struct task_struct *pick_next_task_idle(struct rq *rq)
 {
struct task_struct *next = rq->idle;
@@ -472,6 +479,7 @@ const struct sched_class idle_sched_class
 
 #ifdef CONFIG_SMP
.balance= balance_idle,
+   .pick_task  = pick_task_idle,
.select_task_rq = select_task_rq_idle,
.set_cpus_allowed   = set_cpus_allowed_common,
 #endif
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e57fca05b660..a5851c775270 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1624,7 +1624,7 @@ static struct task_struct *_pick_next_task_rt(struct rq 
*rq)
return rt_task_of(rt_se);
 }
 
-st

[PATCH v8 -tip 08/26] sched/fair: Snapshot the min_vruntime of CPUs on force idle

2020-10-19 Thread Joel Fernandes (Google)
During force-idle, we end up doing cross-cpu comparison of vruntimes
during pick_next_task. If we simply compare (vruntime-min_vruntime)
across CPUs, and if the CPUs only have 1 task each, we will always
end up comparing 0 with 0 and pick just one of the tasks all the time.
This starves the task that was not picked. To fix this, take a snapshot
of the min_vruntime when entering force idle and use it for comparison.
This min_vruntime snapshot will only be used for cross-CPU vruntime
comparison, and nothing else.

This resolves several performance issues that were seen in ChromeOS
audio usecase.
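
In other words, with the snapshot in place, the cross-CPU part of the
comparison becomes roughly the following (a simplified sketch of the
cfs_prio_less() change below; min_vruntime_fi is the snapshot field added
by this patch):

	/* Progress made since the core entered force idle. */
	s64 da = (s64)(a->se.vruntime - task_rq(a)->cfs.min_vruntime_fi);
	s64 db = (s64)(b->se.vruntime - task_rq(b)->cfs.min_vruntime_fi);

	return da > db;		/* a ran longer => a has lower priority */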

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/core.c  | 33 -
 kernel/sched/fair.c  | 40 
 kernel/sched/sched.h |  5 +
 3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 469428979182..a5404ec9e89a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -115,19 +115,8 @@ static inline bool prio_less(struct task_struct *a, struct 
task_struct *b)
if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
return !dl_time_before(a->dl.deadline, b->dl.deadline);
 
-   if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
-   u64 vruntime = b->se.vruntime;
-
-   /*
-* Normalize the vruntime if tasks are in different cpus.
-*/
-   if (task_cpu(a) != task_cpu(b)) {
-   vruntime -= task_cfs_rq(b)->min_vruntime;
-   vruntime += task_cfs_rq(a)->min_vruntime;
-   }
-
-   return !((s64)(a->se.vruntime - vruntime) <= 0);
-   }
+   if (pa == MAX_RT_PRIO + MAX_NICE)   /* fair */
+   return cfs_prio_less(a, b);
 
return false;
 }
@@ -4648,6 +4637,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
struct task_struct *next, *max = NULL;
const struct sched_class *class;
const struct cpumask *smt_mask;
+   bool fi_before = false;
bool need_sync;
int i, j, cpu;
 
@@ -4712,6 +4702,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
rq->core->core_cookie = 0UL;
if (rq->core->core_forceidle) {
need_sync = true;
+   fi_before = true;
rq->core->core_forceidle = false;
}
for_each_cpu(i, smt_mask) {
@@ -4723,6 +4714,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
update_rq_clock(rq_i);
}
 
+   /* Reset the snapshot if core is no longer in force-idle. */
+   if (!fi_before) {
+   for_each_cpu(i, smt_mask) {
+   struct rq *rq_i = cpu_rq(i);
+   rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+   }
+   }
+
/*
 * Try and select tasks for each sibling in decending sched_class
 * order.
@@ -4859,6 +4858,14 @@ next_class:;
resched_curr(rq_i);
}
 
+   /* Snapshot if core is in force-idle. */
+   if (!fi_before && rq->core->core_forceidle) {
+   for_each_cpu(i, smt_mask) {
+   struct rq *rq_i = cpu_rq(i);
+   rq_i->cfs.min_vruntime_fi = rq_i->cfs.min_vruntime;
+   }
+   }
+
 done:
set_next_task(rq, next);
return next;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56bea0decda1..9cae08c3fca1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10686,6 +10686,46 @@ static inline void task_tick_core(struct rq *rq, 
struct task_struct *curr)
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
 }
+
+bool cfs_prio_less(struct task_struct *a, struct task_struct *b)
+{
+   bool samecpu = task_cpu(a) == task_cpu(b);
+   struct sched_entity *sea = &a->se;
+   struct sched_entity *seb = &b->se;
+   struct cfs_rq *cfs_rqa;
+   struct cfs_rq *cfs_rqb;
+   s64 delta;
+
+   if (samecpu) {
+   /* vruntime is per cfs_rq */
+   while (!is_same_group(sea, seb)) {
+   int sea_depth = sea->depth;
+   int seb_depth = seb->depth;
+   if (sea_depth >= seb_depth)
+   sea = parent_entity(sea);
+   if (sea_depth <= seb_depth)
+   seb = parent_entity(seb);
+   }
+
+   delta = (s64)(sea->vruntime - seb->vruntime);
+   goto out;
+   }
+
+   /* crosscpu: compare root level se's vruntime to decide priority */
+   while (sea->parent)
+   sea 

[PATCH v8 -tip 14/26] entry/idle: Enter and exit kernel protection during idle entry and exit

2020-10-19 Thread Joel Fernandes (Google)
Add a generic_idle_{enter,exit} helper function to enter and exit kernel
protection when entering and exiting idle, respectively.

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/entry-common.h | 18 ++
 kernel/sched/idle.c  | 11 ++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 260216de357b..879562d920f2 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -402,4 +402,22 @@ void irqentry_exit_cond_resched(void);
  */
 void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state);
 
+/**
+ * generic_idle_enter - Called during entry into idle for housekeeping.
+ */
+static inline void generic_idle_enter(void)
+{
+   /* Entering idle ends the protected kernel region. */
+   sched_core_unsafe_exit();
+}
+
+/**
+ * generic_idle_exit - Called when exiting idle for housekeeping.
+ */
+static inline void generic_idle_exit(void)
+{
+   /* Exiting idle (re)starts the protected kernel region. */
+   sched_core_unsafe_enter();
+}
+
 #endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index a74926be80ac..029ba61576f2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -8,6 +8,7 @@
  */
 #include "sched.h"
 
+#include 
 #include 
 
 /* Linker adds these: start and end of __cpuidle functions */
@@ -54,6 +55,7 @@ __setup("hlt", cpu_idle_nopoll_setup);
 
 static noinline int __cpuidle cpu_idle_poll(void)
 {
+   generic_idle_enter();
trace_cpu_idle(0, smp_processor_id());
stop_critical_timings();
rcu_idle_enter();
@@ -66,6 +68,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
rcu_idle_exit();
start_critical_timings();
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
+   generic_idle_exit();
 
return 1;
 }
@@ -156,11 +159,7 @@ static void cpuidle_idle_call(void)
return;
}
 
-   /*
-* The RCU framework needs to be told that we are entering an idle
-* section, so no more rcu read side critical sections and one more
-* step to the grace period
-*/
+   generic_idle_enter();
 
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
@@ -225,6 +224,8 @@ static void cpuidle_idle_call(void)
 */
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
+
+   generic_idle_exit();
 }
 
 /*
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 12/26] arch/x86: Add a new TIF flag for untrusted tasks

2020-10-19 Thread Joel Fernandes (Google)
Add a new TIF flag to indicate whether the kernel needs to be careful
and take additional steps to mitigate micro-architectural issues during
entry into user or guest mode.

This new flag will be used by the series to determine if waiting is
needed or not, during exit to user or guest mode.

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 arch/x86/include/asm/thread_info.h | 2 ++
 kernel/sched/sched.h   | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index c448fcfa1b82..45b6dbdf116e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -99,6 +99,7 @@ struct thread_info {
 #define TIF_SPEC_FORCE_UPDATE  23  /* Force speculation MSR update in 
context switch */
 #define TIF_FORCED_TF  24  /* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP  25  /* set when we want DEBUGCTLMSR_BTF */
+#define TIF_UNSAFE_RET 26  /* On return to process/guest, perform 
safety checks. */
 #define TIF_LAZY_MMU_UPDATES   27  /* task is updating the mmu lazily */
 #define TIF_SYSCALL_TRACEPOINT 28  /* syscall tracepoint instrumentation */
 #define TIF_ADDR32 29  /* 32-bit address space on 64 bits */
@@ -129,6 +130,7 @@ struct thread_info {
 #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
 #define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP)
+#define _TIF_UNSAFE_RET(1 << TIF_UNSAFE_RET)
 #define _TIF_LAZY_MMU_UPDATES  (1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32(1 << TIF_ADDR32)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d0c7a7f87d73..f7e2d8a3be8e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2769,3 +2769,9 @@ static inline bool is_per_cpu_kthread(struct task_struct 
*p)
 
 void swake_up_all_locked(struct swait_queue_head *q);
 void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+#ifdef CONFIG_SCHED_CORE
+#ifndef TIF_UNSAFE_RET
+#define TIF_UNSAFE_RET (0)
+#endif
+#endif
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 06/26] sched: Add core wide task selection and scheduling.

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Instead of only selecting a local task, select a task for all SMT
siblings for every reschedule on the core (irrespective which logical
CPU does the reschedule).

During a CPU hotplug event, schedule would be called with the hotplugged
CPU not in the cpumask. So use for_each_cpu(_wrap)_or to include the
current cpu in the task pick loop.

There are multiple loops in pick_next_task that iterate over CPUs in
smt_mask. During a hotplug event, sibling could be removed from the
smt_mask while pick_next_task is running. So we cannot trust the mask
across the different loops. This can confuse the logic. Add retry logic
if smt_mask changes between the loops.

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Julien Desfossez 
Signed-off-by: Vineeth Remanan Pillai 
Signed-off-by: Joel Fernandes (Google) 
Signed-off-by: Aaron Lu 
Signed-off-by: Tim Chen 
Signed-off-by: Chen Yu 
---
 kernel/sched/core.c  | 301 ++-
 kernel/sched/sched.h |   6 +-
 2 files changed, 305 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a032f481c6e6..12030b77bd6d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4533,7 +4533,7 @@ static void put_prev_task_balance(struct rq *rq, struct 
task_struct *prev,
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+__pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
const struct sched_class *class;
struct task_struct *p;
@@ -4574,6 +4574,294 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 }
 
 #ifdef CONFIG_SCHED_CORE
+static inline bool is_task_rq_idle(struct task_struct *t)
+{
+   return (task_rq(t)->idle == t);
+}
+
+static inline bool cookie_equals(struct task_struct *a, unsigned long cookie)
+{
+   return is_task_rq_idle(a) || (a->core_cookie == cookie);
+}
+
+static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
+{
+   if (is_task_rq_idle(a) || is_task_rq_idle(b))
+   return true;
+
+   return a->core_cookie == b->core_cookie;
+}
+
+// XXX fairness/fwd progress conditions
+/*
+ * Returns
+ * - NULL if there is no runnable task for this class.
+ * - the highest priority task for this runqueue if it matches
+ *   rq->core->core_cookie or its priority is greater than max.
+ * - Else returns idle_task.
+ */
+static struct task_struct *
+pick_task(struct rq *rq, const struct sched_class *class, struct task_struct 
*max)
+{
+   struct task_struct *class_pick, *cookie_pick;
+   unsigned long cookie = rq->core->core_cookie;
+
+   class_pick = class->pick_task(rq);
+   if (!class_pick)
+   return NULL;
+
+   if (!cookie) {
+   /*
+* If class_pick is tagged, return it only if it has
+* higher priority than max.
+*/
+   if (max && class_pick->core_cookie &&
+   prio_less(class_pick, max))
+   return idle_sched_class.pick_task(rq);
+
+   return class_pick;
+   }
+
+   /*
+* If class_pick is idle or matches cookie, return early.
+*/
+   if (cookie_equals(class_pick, cookie))
+   return class_pick;
+
+   cookie_pick = sched_core_find(rq, cookie);
+
+   /*
+* If class > max && class > cookie, it is the highest priority task on
+* the core (so far) and it must be selected, otherwise we must go with
+* the cookie pick in order to satisfy the constraint.
+*/
+   if (prio_less(cookie_pick, class_pick) &&
+   (!max || prio_less(max, class_pick)))
+   return class_pick;
+
+   return cookie_pick;
+}
+
+static struct task_struct *
+pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+   struct task_struct *next, *max = NULL;
+   const struct sched_class *class;
+   const struct cpumask *smt_mask;
+   bool need_sync;
+   int i, j, cpu;
+
+   if (!sched_core_enabled(rq))
+   return __pick_next_task(rq, prev, rf);
+
+   cpu = cpu_of(rq);
+
+   /* Stopper task is switching into idle, no need core-wide selection. */
+   if (cpu_is_offline(cpu)) {
+   /*
+* Reset core_pick so that we don't enter the fastpath when
+* coming online. core_pick would already be migrated to
+* another cpu during offline.
+*/
+   rq->core_pick = NULL;
+   return __pick_next_task(rq, prev, rf);
+   }
+
+   /*
+* If there were no {en,de}queues since we picked (IOW, the task
+* pointers are all still valid), and we haven't scheduled the last

[PATCH v8 -tip 03/26] sched: Core-wide rq->lock

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Introduce the basic infrastructure to have a core wide rq->lock.

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Julien Desfossez 
Signed-off-by: Vineeth Remanan Pillai 
---
 kernel/Kconfig.preempt |   6 +++
 kernel/sched/core.c| 109 +
 kernel/sched/sched.h   |  31 
 3 files changed, 146 insertions(+)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..4488fbf4d3a8 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -80,3 +80,9 @@ config PREEMPT_COUNT
 config PREEMPTION
bool
select PREEMPT_COUNT
+
+config SCHED_CORE
+   bool "Core Scheduling for SMT"
+   default y
+   depends on SCHED_SMT
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97181b3d12eb..cecbf91cb477 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,70 @@ unsigned int sysctl_sched_rt_period = 100;
 
 __read_mostly int scheduler_running;
 
+#ifdef CONFIG_SCHED_CORE
+
+DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
+
+/*
+ * The static-key + stop-machine variable are needed such that:
+ *
+ * spin_lock(rq_lockp(rq));
+ * ...
+ * spin_unlock(rq_lockp(rq));
+ *
+ * ends up locking and unlocking the _same_ lock, and all CPUs
+ * always agree on what rq has what lock.
+ *
+ * XXX entirely possible to selectively enable cores, don't bother for now.
+ */
+static int __sched_core_stopper(void *data)
+{
+   bool enabled = !!(unsigned long)data;
+   int cpu;
+
+   for_each_possible_cpu(cpu)
+   cpu_rq(cpu)->core_enabled = enabled;
+
+   return 0;
+}
+
+static DEFINE_MUTEX(sched_core_mutex);
+static int sched_core_count;
+
+static void __sched_core_enable(void)
+{
+   // XXX verify there are no cookie tasks (yet)
+
+   static_branch_enable(&__sched_core_enabled);
+   stop_machine(__sched_core_stopper, (void *)true, NULL);
+}
+
+static void __sched_core_disable(void)
+{
+   // XXX verify there are no cookie tasks (left)
+
+   stop_machine(__sched_core_stopper, (void *)false, NULL);
+   static_branch_disable(&__sched_core_enabled);
+}
+
+void sched_core_get(void)
+{
+   mutex_lock(&sched_core_mutex);
+   if (!sched_core_count++)
+   __sched_core_enable();
+   mutex_unlock(&sched_core_mutex);
+}
+
+void sched_core_put(void)
+{
+   mutex_lock(&sched_core_mutex);
+   if (!--sched_core_count)
+   __sched_core_disable();
+   mutex_unlock(&sched_core_mutex);
+}
+
+#endif /* CONFIG_SCHED_CORE */
+
 /*
  * part of the period that we allow rt tasks to run in us.
  * default: 0.95s
@@ -4363,6 +4427,43 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
BUG();
 }
 
+#ifdef CONFIG_SCHED_CORE
+
+static inline void sched_core_cpu_starting(unsigned int cpu)
+{
+   const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+   struct rq *rq, *core_rq = NULL;
+   int i;
+
+   core_rq = cpu_rq(cpu)->core;
+
+   if (!core_rq) {
+   for_each_cpu(i, smt_mask) {
+   rq = cpu_rq(i);
+   if (rq->core && rq->core == rq)
+   core_rq = rq;
+   init_sched_core_irq_work(rq);
+   }
+
+   if (!core_rq)
+   core_rq = cpu_rq(cpu);
+
+   for_each_cpu(i, smt_mask) {
+   rq = cpu_rq(i);
+
+   WARN_ON_ONCE(rq->core && rq->core != core_rq);
+   rq->core = core_rq;
+   }
+   }
+
+   printk("core: %d -> %d\n", cpu, cpu_of(core_rq));
+}
+#else /* !CONFIG_SCHED_CORE */
+
+static inline void sched_core_cpu_starting(unsigned int cpu) {}
+
+#endif /* CONFIG_SCHED_CORE */
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6963,6 +7064,9 @@ static void sched_rq_cpu_starting(unsigned int cpu)
 
 int sched_cpu_starting(unsigned int cpu)
 {
+
+   sched_core_cpu_starting(cpu);
+
sched_rq_cpu_starting(cpu);
sched_tick_start(cpu);
return 0;
@@ -7193,6 +7297,11 @@ void __init sched_init(void)
 #endif /* CONFIG_SMP */
hrtick_rq_init(rq);
atomic_set(&rq->nr_iowait, 0);
+
+#ifdef CONFIG_SCHED_CORE
+   rq->core = NULL;
+   rq->core_enabled = 0;
+#endif
}
 
set_load_weight(&init_task, false);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 54bfac702805..85c8472b5d00 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1048,6 +1048,12 @@ struct rq {
/* Must be inspected within a rcu lock section */
struct cpuidle_state*idle_state;
 #endif
+
+#ifdef CONFIG_SCHED_CORE
+   /* per rq */
+   struct rq   *core;
+   unsigned int    core_enabled;
+#endif
 };
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1075,11 +1081,36 @@ static inline int cpu_of(struct rq 

[PATCH v8 -tip 07/26] sched/fair: Fix forced idle sibling starvation corner case

2020-10-19 Thread Joel Fernandes (Google)
From: Vineeth Pillai 

If there is only one long-running local task and the sibling is
forced idle, it might not get a chance to run until a schedule
event happens on any cpu in the core.

So we check for this condition during a tick to see if a sibling
is starved and then give it a chance to schedule.

Tested-by: Julien Desfossez 
Signed-off-by: Vineeth Remanan Pillai 
Signed-off-by: Julien Desfossez 
---
 kernel/sched/core.c  | 15 ---
 kernel/sched/fair.c  | 40 
 kernel/sched/sched.h |  2 +-
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12030b77bd6d..469428979182 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4710,16 +4710,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 
/* reset state */
rq->core->core_cookie = 0UL;
+   if (rq->core->core_forceidle) {
+   need_sync = true;
+   rq->core->core_forceidle = false;
+   }
for_each_cpu(i, smt_mask) {
struct rq *rq_i = cpu_rq(i);
 
rq_i->core_pick = NULL;
 
-   if (rq_i->core_forceidle) {
-   need_sync = true;
-   rq_i->core_forceidle = false;
-   }
-
if (i != cpu)
update_rq_clock(rq_i);
}
@@ -4839,8 +4838,10 @@ next_class:;
if (!rq_i->core_pick)
continue;
 
-   if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running)
-   rq_i->core_forceidle = true;
+   if (is_task_rq_idle(rq_i->core_pick) && rq_i->nr_running &&
+   !rq_i->core->core_forceidle) {
+   rq_i->core->core_forceidle = true;
+   }
 
if (i == cpu) {
rq_i->core_pick = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 58f670e5704d..56bea0decda1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10652,6 +10652,44 @@ static void rq_offline_fair(struct rq *rq)
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_SCHED_CORE
+static inline bool
+__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
+{
+   u64 slice = sched_slice(cfs_rq_of(se), se);
+   u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+
+   return (rtime * min_nr_tasks > slice);
+}
+
+#define MIN_NR_TASKS_DURING_FORCEIDLE  2
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
+{
+   if (!sched_core_enabled(rq))
+   return;
+
+   /*
+* If runqueue has only one task which used up its slice and
+* if the sibling is forced idle, then trigger schedule to
+* give forced idle task a chance.
+*
+* sched_slice() considers only this active rq and it gets the
+* whole slice. But during force idle, we have siblings acting
+* like a single runqueue and hence we need to consider runnable
+* tasks on this cpu and the forced idle cpu. Ideally, we should
+* go through the forced idle rq, but that would be a perf hit.
+* We can assume that the forced idle cpu has atleast
+* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
+* if we need to give up the cpu.
+*/
+   if (rq->core->core_forceidle && rq->cfs.nr_running == 1 &&
+   __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
+   resched_curr(rq);
+}
+#else
+static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
+#endif
+
 /*
  * scheduler tick hitting a task of our scheduling class.
  *
@@ -10675,6 +10713,8 @@ static void task_tick_fair(struct rq *rq, struct 
task_struct *curr, int queued)
 
update_misfit_status(curr, rq);
update_overutilized_status(task_rq(curr));
+
+   task_tick_core(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2b6e0bf61720..884d23d5e55d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1056,12 +1056,12 @@ struct rq {
unsigned int    core_enabled;
unsigned int    core_sched_seq;
struct rb_root  core_tree;
-   unsigned char   core_forceidle;
 
/* shared state */
unsigned int    core_task_seq;
unsigned int    core_pick_seq;
unsigned long   core_cookie;
+   unsigned char   core_forceidle;
 #endif
 };
 
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 10/26] sched: migration changes for core scheduling

2020-10-19 Thread Joel Fernandes (Google)
From: Aubrey Li 

 - Don't migrate if there is a cookie mismatch
 Load balance tries to move task from busiest CPU to the
 destination CPU. When core scheduling is enabled, if the
 task's cookie does not match with the destination CPU's
 core cookie, this task will be skipped by this CPU. This
 mitigates the forced idle time on the destination CPU.

 - Select cookie matched idle CPU
 In the fast path of task wakeup, select the first cookie matched
 idle CPU instead of the first idle CPU.

 - Find cookie matched idlest CPU
 In the slow path of task wakeup, find the idlest CPU whose core
 cookie matches with task's cookie

 - Don't migrate task if cookie not match
 For the NUMA load balance, don't migrate task to the CPU whose
 core cookie does not match with task's cookie

Tested-by: Julien Desfossez 
Signed-off-by: Aubrey Li 
Signed-off-by: Tim Chen 
Signed-off-by: Vineeth Remanan Pillai 
---
 kernel/sched/fair.c  | 64 
 kernel/sched/sched.h | 29 
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9cae08c3fca1..93a3b874077d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1912,6 +1912,15 @@ static void task_numa_find_cpu(struct task_numa_env *env,
if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
 
+#ifdef CONFIG_SCHED_CORE
+   /*
+* Skip this cpu if source task's cookie does not match
+* with CPU's core cookie.
+*/
+   if (!sched_core_cookie_match(cpu_rq(cpu), env->p))
+   continue;
+#endif
+
env->dst_cpu = cpu;
if (task_numa_compare(env, taskimp, groupimp, maymove))
break;
@@ -5846,11 +5855,17 @@ find_idlest_group_cpu(struct sched_group *group, struct 
task_struct *p, int this
 
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
+   struct rq *rq = cpu_rq(i);
+
+#ifdef CONFIG_SCHED_CORE
+   if (!sched_core_cookie_match(rq, p))
+   continue;
+#endif
+
if (sched_idle_cpu(i))
return i;
 
if (available_idle_cpu(i)) {
-   struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
/*
@@ -6108,8 +6123,18 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
for_each_cpu_wrap(cpu, cpus, target) {
if (!--nr)
return -1;
-   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
-   break;
+
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) {
+#ifdef CONFIG_SCHED_CORE
+   /*
+* If Core Scheduling is enabled, select this cpu
+* only if the process cookie matches core cookie.
+*/
+   if (sched_core_enabled(cpu_rq(cpu)) &&
+   p->core_cookie == cpu_rq(cpu)->core->core_cookie)
+#endif
+   break;
+   }
}
 
time = cpu_clock(this) - time;
@@ -7495,8 +7520,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env 
*env)
 * We do not migrate tasks that are:
 * 1) throttled_lb_pair, or
 * 2) cannot be migrated to this CPU due to cpus_ptr, or
-* 3) running (obviously), or
-* 4) are cache-hot on their current CPU.
+* 3) task's cookie does not match with this CPU's core cookie
+* 4) running (obviously), or
+* 5) are cache-hot on their current CPU.
 */
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
@@ -7531,6 +7557,15 @@ int can_migrate_task(struct task_struct *p, struct 
lb_env *env)
return 0;
}
 
+#ifdef CONFIG_SCHED_CORE
+   /*
+* Don't migrate task if the task's cookie does not match
+* with the destination CPU's core cookie.
+*/
+   if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
+   return 0;
+#endif
+
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
 
@@ -8757,6 +8792,25 @@ find_idlest_group(struct sched_domain *sd, struct 
task_struct *p, int this_cpu)
p->cpus_ptr))
continue;
 
+#ifdef CONFIG_SCHED_CORE
+   if (sched_core_enabled(cpu_rq(this_cpu))) {
+   int i = 0;
+   bool cookie_match = false;
+
+   

[PATCH v8 -tip 04/26] sched/fair: Add a few assertions

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
---
 kernel/sched/fair.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bd6aed63f5e3..b4bc82f46fe7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6228,6 +6228,11 @@ static int select_idle_sibling(struct task_struct *p, 
int prev, int target)
}
 
 symmetric:
+   /*
+* per-cpu select_idle_mask usage
+*/
+   lockdep_assert_irqs_disabled();
+
if (available_idle_cpu(target) || sched_idle_cpu(target))
return target;
 
@@ -6670,8 +6675,6 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
  * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
  *
  * Returns the target CPU number.
- *
- * preempt must be disabled.
  */
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int 
wake_flags)
@@ -6682,6 +6685,11 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, 
int sd_flag, int wake_f
int want_affine = 0;
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 
+   /*
+* required for stable ->cpus_allowed
+*/
+   lockdep_assert_held(&p->pi_lock);
+
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
 
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 13/26] kernel/entry: Add support for core-wide protection of kernel-mode

2020-10-19 Thread Joel Fernandes (Google)
Core-scheduling prevents hyperthreads in usermode from attacking each
other, but it does not do anything about one of the hyperthreads
entering the kernel for any reason. This leaves the door open for MDS
and L1TF attacks with concurrent execution sequences between
hyperthreads.

This patch therefore adds support for protecting all syscall and IRQ
kernel mode entries. Care is taken to track the outermost usermode exit
and entry using per-cpu counters. In cases where one of the hyperthreads
enters the kernel, no additional IPIs are sent. Further, IPIs are avoided
when not needed - for example, idle and non-cookie HTs do not need to be
forced into kernel mode.
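
A minimal sketch of the outermost-entry bookkeeping idea follows. It is
an illustration only with hypothetical names; the actual patch routes
this through sched_core_unsafe_enter()/sched_core_unsafe_exit() declared
in the hunks below and keeps additional core-wide state:

/* Illustrative only: per-cpu nesting count of kernel entries. */
static DEFINE_PER_CPU(unsigned int, example_entry_nest);

static inline void example_unsafe_enter(void)
{
    /* Only the outermost entry pays for the core-wide protection. */
    if (this_cpu_inc_return(example_entry_nest) == 1) {
        /* Mark the core unsafe; IPI siblings running untrusted user tasks. */
    }
}

static inline void example_unsafe_exit(void)
{
    /* Only the outermost exit can make the core safe again. */
    if (this_cpu_dec_return(example_entry_nest) == 0) {
        /* Core becomes safe once all siblings have left the kernel. */
    }
}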

More information about attacks:
For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
data to either host or guest attackers. For L1TF, it is possible to leak
to guest attackers. There is no possible mitigation involving flushing
of buffers to avoid this since the attacker and victim execute
concurrently on 2 or more HTs.

Cc: Julien Desfossez 
Cc: Tim Chen 
Cc: Aaron Lu 
Cc: Aubrey Li 
Cc: Tim Chen 
Cc: Paul E. McKenney 
Co-developed-by: Vineeth Pillai 
Tested-by: Julien Desfossez 
Signed-off-by: Vineeth Pillai 
Signed-off-by: Joel Fernandes (Google) 
---
 .../admin-guide/kernel-parameters.txt |   7 +
 include/linux/entry-common.h  |   2 +-
 include/linux/sched.h |  12 +
 kernel/entry/common.c |  25 +-
 kernel/sched/core.c   | 229 ++
 kernel/sched/sched.h  |   3 +
 6 files changed, 275 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 3236427e2215..48567110f709 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4678,6 +4678,13 @@
 
sbni=   [NET] Granch SBNI12 leased line adapter
 
+   sched_core_protect_kernel=
+   [SCHED_CORE] Pause SMT siblings of a core running in
+   user mode, if at least one of the siblings of the core
+   is running in kernel mode. This is to guarantee that
+   kernel data is not leaked to tasks which are not trusted
+   by the kernel.
+
sched_debug [KNL] Enables verbose scheduler debug messages.
 
schedstats= [KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index 474f29638d2c..260216de357b 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -69,7 +69,7 @@
 
 #define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |   \
-_TIF_NEED_RESCHED | _TIF_PATCH_PENDING |   \
+_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_UNSAFE_RET | \
 ARCH_EXIT_TO_USER_MODE_WORK)
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d38e904dd603..fe6f225bfbf9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2071,4 +2071,16 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
+#ifdef CONFIG_SCHED_CORE
+void sched_core_unsafe_enter(void);
+void sched_core_unsafe_exit(void);
+bool sched_core_wait_till_safe(unsigned long ti_check);
+bool sched_core_kernel_protected(void);
+#else
+#define sched_core_unsafe_enter(ignore) do { } while (0)
+#define sched_core_unsafe_exit(ignore) do { } while (0)
+#define sched_core_wait_till_safe(ignore) do { } while (0)
+#define sched_core_kernel_protected(ignore) do { } while (0)
+#endif
+
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 0a1e20f8d4e8..c8dc6b1b1f40 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -137,6 +137,26 @@ static __always_inline void exit_to_user_mode(void)
 /* Workaround to allow gradual conversion of architecture code */
 void __weak arch_do_signal(struct pt_regs *regs) { }
 
+unsigned long exit_to_user_get_work(void)
+{
+   unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+
+   if (IS_ENABLED(CONFIG_SCHED_CORE) && !sched_core_kernel_protected())
+   return ti_work;
+
+#ifdef CONFIG_SCHED_CORE
+   ti_work &= EXIT_TO_USER_MODE_WORK;
+   if ((ti_work & _TIF_UNSAFE_RET) == ti_work) {
+   sched_core_unsafe_exit();
+   if (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
+   sched_core_unsafe_enter(); /* not exiting to user yet. */
+   }
+   }
+
+   return READ_ONCE(current_thread_info()->flags);
+#endif
+}
+
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long

[PATCH v8 -tip 05/26] sched: Basic tracking of matching tasks

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Introduce task_struct::core_cookie as an opaque identifier for core
scheduling. When enabled, core scheduling will only allow matching
tasks to be on the core, where idle matches everything.

When task_struct::core_cookie is set (and core scheduling is enabled)
these tasks are indexed in a second RB-tree, first on cookie value
then on scheduling function, such that matching task selection always
finds the most eligible match.

NOTE: *shudder* at the overhead...

NOTE: *sigh*, a 3rd copy of the scheduling function; the alternative
is per class tracking of cookies and that just duplicates a lot of
stuff for no raisin (the 2nd copy lives in the rt-mutex PI code).
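
For reference, selecting a match out of this tree is a plain RB-tree walk
keyed on the cookie; the sched_core_find() hunk is truncated further down,
so the following is a paraphrased sketch rather than the patch's exact
body (the real helper also falls back to the idle task, which matches any
cookie):

static struct task_struct *example_core_find(struct rq *rq, unsigned long cookie)
{
    struct rb_node *node = rq->core_tree.rb_node;
    struct task_struct *node_task, *match = NULL;

    while (node) {
        node_task = container_of(node, struct task_struct, core_node);

        if (cookie < node_task->core_cookie) {
            node = node->rb_left;
        } else if (cookie > node_task->core_cookie) {
            node = node->rb_right;
        } else {
            /* Keep walking left: higher prio sorts leftmost. */
            match = node_task;
            node = node->rb_left;
        }
    }
    return match;
}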

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Vineeth Remanan Pillai 
Signed-off-by: Julien Desfossez 
---
 include/linux/sched.h |   8 ++-
 kernel/sched/core.c   | 146 ++
 kernel/sched/fair.c   |  46 -
 kernel/sched/sched.h  |  55 
 4 files changed, 208 insertions(+), 47 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 393db0690101..c3563d7cab7f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -683,10 +683,16 @@ struct task_struct {
const struct sched_class*sched_class;
struct sched_entity se;
struct sched_rt_entity  rt;
+   struct sched_dl_entity  dl;
+
+#ifdef CONFIG_SCHED_CORE
+   struct rb_node  core_node;
+   unsigned long   core_cookie;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
struct task_group   *sched_task_group;
 #endif
-   struct sched_dl_entity  dl;
 
 #ifdef CONFIG_UCLAMP_TASK
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index cecbf91cb477..a032f481c6e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -78,6 +78,141 @@ __read_mostly int scheduler_running;
 
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
 
+/* kernel prio, less is more */
+static inline int __task_prio(struct task_struct *p)
+{
+   if (p->sched_class == &stop_sched_class) /* trumps deadline */
+   return -2;
+
+   if (rt_prio(p->prio)) /* includes deadline */
+   return p->prio; /* [-1, 99] */
+
+   if (p->sched_class == &idle_sched_class)
+   return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
+
+   return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+}
+
+/*
+ * l(a,b)
+ * le(a,b) := !l(b,a)
+ * g(a,b)  := l(b,a)
+ * ge(a,b) := !l(a,b)
+ */
+
+/* real prio, less is less */
+static inline bool prio_less(struct task_struct *a, struct task_struct *b)
+{
+
+   int pa = __task_prio(a), pb = __task_prio(b);
+
+   if (-pa < -pb)
+   return true;
+
+   if (-pb < -pa)
+   return false;
+
+   if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
+   return !dl_time_before(a->dl.deadline, b->dl.deadline);
+
+   if (pa == MAX_RT_PRIO + MAX_NICE)  { /* fair */
+   u64 vruntime = b->se.vruntime;
+
+   /*
+* Normalize the vruntime if tasks are in different cpus.
+*/
+   if (task_cpu(a) != task_cpu(b)) {
+   vruntime -= task_cfs_rq(b)->min_vruntime;
+   vruntime += task_cfs_rq(a)->min_vruntime;
+   }
+
+   return !((s64)(a->se.vruntime - vruntime) <= 0);
+   }
+
+   return false;
+}
+
+static inline bool __sched_core_less(struct task_struct *a, struct task_struct 
*b)
+{
+   if (a->core_cookie < b->core_cookie)
+   return true;
+
+   if (a->core_cookie > b->core_cookie)
+   return false;
+
+   /* flip prio, so high prio is leftmost */
+   if (prio_less(b, a))
+   return true;
+
+   return false;
+}
+
+static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+{
+   struct rb_node *parent, **node;
+   struct task_struct *node_task;
+
+   rq->core->core_task_seq++;
+
+   if (!p->core_cookie)
+   return;
+
+   node = &rq->core_tree.rb_node;
+   parent = *node;
+
+   while (*node) {
+   node_task = container_of(*node, struct task_struct, core_node);
+   parent = *node;
+
+   if (__sched_core_less(p, node_task))
+   node = &parent->rb_left;
+   else
+   node = &parent->rb_right;
+   }
+
+   rb_link_node(&p->core_node, parent, node);
+   rb_insert_color(&p->core_node, &rq->core_tree);
+}
+
+static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+{
+   rq->core->core_task_seq++;
+
+   if (!p->core_cookie)
+   return;
+
+   rb_erase(&p->core_node, &rq->core_tree);
+}
+
+/*
+ * Find left-most (aka, highest priority) task matching @cookie.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned 

[PATCH v8 -tip 11/26] irq_work: Cleanup

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Get rid of the __call_single_node union and clean up the API a little
to avoid external code relying on the structure layout as much.

(Needed for irq_work_is_busy() API in core-scheduling series).
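
A minimal usage sketch of the reworked API (hypothetical caller, shown
only to illustrate the new initializer and state helpers):

#include <linux/irq_work.h>

static void example_irq_work_fn(struct irq_work *work)
{
    pr_info("example irq_work ran\n");
}

/* Static initialization now goes through IRQ_WORK_INIT(). */
static struct irq_work example_work = IRQ_WORK_INIT(example_irq_work_fn);

static void example_kick(void)
{
    /* Don't requeue while a previous run is still in flight. */
    if (irq_work_is_busy(&example_work))
        return;

    irq_work_queue(&example_work);
}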

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Joel Fernandes (Google) 
---
 drivers/gpu/drm/i915/i915_request.c |  4 ++--
 include/linux/irq_work.h| 33 ++---
 include/linux/irqflags.h|  4 ++--
 kernel/bpf/stackmap.c   |  2 +-
 kernel/irq_work.c   | 18 
 kernel/printk/printk.c  |  6 ++
 kernel/rcu/tree.c   |  3 +--
 kernel/time/tick-sched.c|  6 ++
 kernel/trace/bpf_trace.c|  2 +-
 9 files changed, 41 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c 
b/drivers/gpu/drm/i915/i915_request.c
index 0e813819b041..5385b081a376 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool 
(*fn)(struct irq_work *wrk))
 
llist_for_each_entry_safe(cb, cn,
  llist_del_all(&rq->execute_cb),
- work.llnode)
+ work.node.llist)
fn(&cb->work);
 }
 
@@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq,
 * callback first, then checking the ACTIVE bit, we serialise with
 * the completed/retired request.
 */
-   if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+   if (llist_add(&cb->work.node.llist, &signal->execute_cb)) {
if (i915_request_is_active(signal) ||
__request_in_flight(signal))
__notify_execute_cb_imm(signal);
diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 30823780c192..ec2a47a81e42 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -14,28 +14,37 @@
  */
 
 struct irq_work {
-   union {
-   struct __call_single_node node;
-   struct {
-   struct llist_node llnode;
-   atomic_t flags;
-   };
-   };
+   struct __call_single_node node;
void (*func)(struct irq_work *);
 };
 
+#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){  \
+   .node = { .u_flags = (_flags), },   \
+   .func = (_func),\
+}
+
+#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0)
+#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY)
+#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ)
+
+#define DEFINE_IRQ_WORK(name, _f)  \
+   struct irq_work name = IRQ_WORK_INIT(_f)
+
 static inline
 void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 {
-   atomic_set(&work->flags, 0);
-   work->func = func;
+   *work = IRQ_WORK_INIT(func);
 }
 
-#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \
-   .flags = ATOMIC_INIT(0),\
-   .func  = (_f)   \
+static inline bool irq_work_is_pending(struct irq_work *work)
+{
+   return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING;
 }
 
+static inline bool irq_work_is_busy(struct irq_work *work)
+{
+   return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY;
+}
 
 bool irq_work_queue(struct irq_work *work);
 bool irq_work_queue_on(struct irq_work *work, int cpu);
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 3ed4e8771b64..fef2d43a7a1d 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -109,12 +109,12 @@ do {  \
 
 # define lockdep_irq_work_enter(__work)
\
  do {  \
- if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+ if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
current->irq_config = 1;\
  } while (0)
 # define lockdep_irq_work_exit(__work) \
  do {  \
- if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\
+ if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\
current->irq_config = 0;\
  } while (0)
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 06065fa27124..599041cd0c8a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -298,7 +298,7 @@ static void stac

[PATCH v8 -tip 22/26] sched/debug: Add CGroup node for printing group cookie if SCHED_DEBUG

2020-10-19 Thread Joel Fernandes (Google)
This will be used by kselftest to verify the CGroup cookie value that is
set by the CGroup interface.
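
For context, a rough userspace sketch of what such a check could look
like (the cgroup mount point and group name below are assumptions for
illustration, not part of this patch; the file itself appears as
cpu.core_group_cookie under the cpu controller):

#include <stdio.h>

int main(void)
{
    /* Hypothetical v1 cpu-controller group named "tagged". */
    const char *path = "/sys/fs/cgroup/cpu/tagged/cpu.core_group_cookie";
    unsigned long long cookie;
    FILE *f = fopen(path, "r");

    if (!f) {
        perror("fopen");
        return 1;
    }
    if (fscanf(f, "%llu", &cookie) != 1) {
        fclose(f);
        return 1;
    }
    fclose(f);

    printf("effective group cookie: %llu\n", cookie);
    return 0;
}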

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/core.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1321c26a8385..b3afbba5abe1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9520,6 +9520,13 @@ static u64 cpu_core_tag_color_read_u64(struct 
cgroup_subsys_state *css, struct c
return tg->core_tag_color;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 cpu_core_group_cookie_read_u64(struct cgroup_subsys_state *css, 
struct cftype *cft)
+{
+   return cpu_core_get_group_cookie(css_tg(css));
+}
+#endif
+
 struct write_core_tag {
struct cgroup_subsys_state *css;
unsigned long cookie;
@@ -9695,6 +9702,14 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_core_tag_color_read_u64,
.write_u64 = cpu_core_tag_color_write_u64,
},
+#ifdef CONFIG_SCHED_DEBUG
+   /* Read the effective cookie (color+tag) of the group. */
+   {
+   .name = "core_group_cookie",
+   .flags = CFTYPE_NOT_ON_ROOT,
+   .read_u64 = cpu_core_group_cookie_read_u64,
+   },
+#endif
 #endif
 #ifdef CONFIG_UCLAMP_TASK_GROUP
{
@@ -9882,6 +9897,14 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_core_tag_color_read_u64,
.write_u64 = cpu_core_tag_color_write_u64,
},
+#ifdef CONFIG_SCHED_DEBUG
+   /* Read the effective cookie (color+tag) of the group. */
+   {
+   .name = "core_group_cookie",
+   .flags = CFTYPE_NOT_ON_ROOT,
+   .read_u64 = cpu_core_group_cookie_read_u64,
+   },
+#endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
{
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 18/26] sched: Add a per-thread core scheduling interface

2020-10-19 Thread Joel Fernandes (Google)
Add a per-thread core scheduling interface which allows a thread to share a
core with another thread, or have a core exclusively for itself.

ChromeOS uses core-scheduling to securely enable hyperthreading.  This cuts
down the keypress latency in Google docs from 150ms to 50ms while improving
the camera streaming frame rate by ~3%.
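
A hedged userspace sketch of the resulting prctl() call (assuming, as is
typical for prctl, that the target pid is passed as the second argument;
the kernel/sys.c hunk below is truncated, so that detail is an
assumption):

#include <stdio.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <sys/types.h>

#ifndef PR_SCHED_CORE_SHARE
#define PR_SCHED_CORE_SHARE 59
#endif

int main(int argc, char **argv)
{
    /* Share a core with the given pid; pid 0 resets the caller's cookie,
     * per sched_core_share_pid() in the diff below. */
    pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : 0;

    if (prctl(PR_SCHED_CORE_SHARE, (unsigned long)pid, 0, 0, 0) != 0) {
        perror("PR_SCHED_CORE_SHARE");
        return 1;
    }
    return 0;
}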

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/sched.h|  2 ++
 include/uapi/linux/prctl.h   |  3 ++
 kernel/sched/core.c  | 51 +---
 kernel/sys.c |  3 ++
 tools/include/uapi/linux/prctl.h |  3 ++
 5 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c6034c00846a..4cb76575afa8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2078,11 +2078,13 @@ void sched_core_unsafe_enter(void);
 void sched_core_unsafe_exit(void);
 bool sched_core_wait_till_safe(unsigned long ti_check);
 bool sched_core_kernel_protected(void);
+int sched_core_share_pid(pid_t pid);
 #else
 #define sched_core_unsafe_enter(ignore) do { } while (0)
 #define sched_core_unsafe_exit(ignore) do { } while (0)
 #define sched_core_wait_till_safe(ignore) do { } while (0)
 #define sched_core_kernel_protected(ignore) do { } while (0)
+#define sched_core_share_pid(pid_t pid) do { } while (0)
 #endif
 
 #endif
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index c334e6a02e5f..217b0482aea1 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -248,4 +248,7 @@ struct prctl_mm_map {
 #define PR_SET_IO_FLUSHER  57
 #define PR_GET_IO_FLUSHER  58
 
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE_SHARE59
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 30a9e4cb5ce1..a0678614a056 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -310,6 +310,7 @@ static int __sched_core_stopper(void *data)
 }
 
 static DEFINE_MUTEX(sched_core_mutex);
+static DEFINE_MUTEX(sched_core_tasks_mutex);
 static int sched_core_count;
 
 static void __sched_core_enable(void)
@@ -3588,8 +3589,9 @@ int sched_fork(unsigned long clone_flags, struct 
task_struct *p)
RB_CLEAR_NODE(&p->core_node);
 
/*
-* Tag child via per-task cookie only if parent is tagged via per-task
-* cookie. This is independent of, but can be additive to the CGroup 
tagging.
+* If parent is tagged via per-task cookie, tag the child (either with
+* the parent's cookie, or a new one). The final cookie is calculated
+* by concatenating the per-task cookie with that of the CGroup's.
 */
if (current->core_task_cookie) {
 
@@ -9301,7 +9303,7 @@ static int sched_core_share_tasks(struct task_struct *t1, 
struct task_struct *t2
unsigned long cookie;
int ret = -ENOMEM;
 
-   mutex_lock(&sched_core_mutex);
+   mutex_lock(&sched_core_tasks_mutex);
 
/*
 * NOTE: sched_core_get() is done by sched_core_alloc_task_cookie() or
@@ -9400,10 +9402,51 @@ static int sched_core_share_tasks(struct task_struct 
*t1, struct task_struct *t2
 
ret = 0;
 out_unlock:
-   mutex_unlock(&sched_core_mutex);
+   mutex_unlock(&sched_core_tasks_mutex);
return ret;
 }
 
+/* Called from prctl interface: PR_SCHED_CORE_SHARE */
+int sched_core_share_pid(pid_t pid)
+{
+   struct task_struct *task;
+   int err;
+
+   if (pid == 0) { /* Reset current task's cookie. */
+   /* Resetting a cookie requires privileges. */
+   if (current->core_task_cookie)
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+   task = NULL;
+   } else {
+   rcu_read_lock();
+   task = pid ? find_task_by_vpid(pid) : current;
+   if (!task) {
+   rcu_read_unlock();
+   return -ESRCH;
+   }
+
+   get_task_struct(task);
+
+   /*
+* Check if this process has the right to modify the specified
+* process. Use the regular "ptrace_may_access()" checks.
+*/
+   if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+   rcu_read_unlock();
+   err = -EPERM;
+   goto out_put;
+   }
+   rcu_read_unlock();
+   }
+
+   err = sched_core_share_tasks(current, task);
+out_put:
+   if (task)
+   put_task_struct(task);
+   return err;
+}
+
 /* CGroup interface */
 static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct 
cftype *cft)
 {
diff --git a/kernel/sys.c b/kernel/sys.c
index 6401880dff74..17911b8680b1 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2530,6 +2530,9 @@ SYSCALL_DEFINE5(prctl, int, op

[PATCH v8 -tip 21/26] sched: Handle task addition to CGroup

2020-10-19 Thread Joel Fernandes (Google)
Due to earlier patches, the old way of computing a task's cookie when it
is added to a CGroup is outdated. Update it by fetching the group's
cookie using the new helpers.

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/core.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 61e1dcf11000..1321c26a8385 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8505,6 +8505,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
+#define SCHED_CORE_GROUP_COOKIE_MASK ((1UL << (sizeof(unsigned long) * 4)) - 1)
+static unsigned long cpu_core_get_group_cookie(struct task_group *tg);
+
 static void sched_change_group(struct task_struct *tsk, int type)
 {
struct task_group *tg;
@@ -8519,11 +8522,13 @@ static void sched_change_group(struct task_struct *tsk, 
int type)
tg = autogroup_task_group(tsk, tg);
 
 #ifdef CONFIG_SCHED_CORE
-   if ((unsigned long)tsk->sched_task_group == tsk->core_cookie)
-   tsk->core_cookie = 0UL;
+   if (tsk->core_group_cookie) {
+   tsk->core_group_cookie = 0UL;
+   tsk->core_cookie &= ~SCHED_CORE_GROUP_COOKIE_MASK;
+   }
 
-   if (tg->core_tagged /* && !tsk->core_cookie ? */)
-   tsk->core_cookie = (unsigned long)tg;
+   tsk->core_group_cookie = cpu_core_get_group_cookie(tg);
+   tsk->core_cookie |= tsk->core_group_cookie;
 #endif
 
tsk->sched_task_group = tg;
@@ -9471,7 +9476,7 @@ static unsigned long cpu_core_get_group_cookie(struct 
task_group *tg)
 
if (tg->core_tagged) {
unsigned long cookie = ((unsigned long)tg << 8) | color;
-   cookie &= (1UL << (sizeof(unsigned long) * 4)) - 1;
+   cookie &= SCHED_CORE_GROUP_COOKIE_MASK;
return cookie;
}
}
-- 
2.29.0.rc1.297.gfa9743e501-goog



[PATCH v8 -tip 09/26] sched: Trivial forced-newidle balancer

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

When a sibling is forced-idle to match the core-cookie, search for
matching tasks to fill the core.

rcu_read_unlock() can incur an infrequent deadlock in
sched_core_balance(). Fix this by using the RCU-sched flavor instead.
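
The balance walk itself is truncated further down in this mail, so here
is a paraphrased sketch of the idea (names and structure are illustrative,
not a verbatim copy of the patch): when a CPU notices it was forced idle,
it walks its scheduling domains outward and tries to pull a cookie-matched
task via steal_cookie_task(), under an RCU-sched read-side section as
noted above:

static void example_core_balance(struct rq *rq)
{
    struct sched_domain *sd;
    int cpu = cpu_of(rq);

    rcu_read_lock_sched();  /* RCU-sched flavor, per the changelog */
    for_each_domain(cpu, sd) {
        if (need_resched())
            break;

        if (steal_cookie_task(cpu, sd))
            break;
    }
    rcu_read_unlock_sched();
}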

Tested-by: Julien Desfossez 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Joel Fernandes (Google) 
Acked-by: Paul E. McKenney 
---
 include/linux/sched.h |   1 +
 kernel/sched/core.c   | 130 +-
 kernel/sched/idle.c   |   1 +
 kernel/sched/sched.h  |   6 ++
 4 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c3563d7cab7f..d38e904dd603 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -688,6 +688,7 @@ struct task_struct {
 #ifdef CONFIG_SCHED_CORE
struct rb_node  core_node;
unsigned long   core_cookie;
+   unsigned intcore_occupation;
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a5404ec9e89a..02db5b024768 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -202,6 +202,21 @@ static struct task_struct *sched_core_find(struct rq *rq, 
unsigned long cookie)
return match;
 }
 
+static struct task_struct *sched_core_next(struct task_struct *p, unsigned 
long cookie)
+{
+   struct rb_node *node = &p->core_node;
+
+   node = rb_next(node);
+   if (!node)
+   return NULL;
+
+   p = container_of(node, struct task_struct, core_node);
+   if (p->core_cookie != cookie)
+   return NULL;
+
+   return p;
+}
+
 /*
  * The static-key + stop-machine variable are needed such that:
  *
@@ -4638,8 +4653,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
const struct sched_class *class;
const struct cpumask *smt_mask;
bool fi_before = false;
+   int i, j, cpu, occ = 0;
bool need_sync;
-   int i, j, cpu;
 
if (!sched_core_enabled(rq))
return __pick_next_task(rq, prev, rf);
@@ -4768,6 +4783,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
goto done;
}
 
+   if (!is_task_rq_idle(p))
+   occ++;
+
rq_i->core_pick = p;
 
/*
@@ -4793,6 +4811,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 
cpu_rq(j)->core_pick = NULL;
}
+   occ = 1;
goto again;
} else {
/*
@@ -4842,6 +4861,8 @@ next_class:;
rq_i->core->core_forceidle = true;
}
 
+   rq_i->core_pick->core_occupation = occ;
+
if (i == cpu) {
rq_i->core_pick = NULL;
continue;
@@ -4871,6 +4892,113 @@ next_class:;
return next;
 }
 
+static bool try_steal_cookie(int this, int that)
+{
+   struct rq *dst = cpu_rq(this), *src = cpu_rq(that);
+   struct task_struct *p;
+   unsigned long cookie;
+   bool success = false;
+
+   local_irq_disable();
+   double_rq_lock(dst, src);
+
+   cookie = dst->core->core_cookie;
+   if (!cookie)
+   goto unlock;
+
+   if (dst->curr != dst->idle)
+   goto unlock;
+
+   p = sched_core_find(src, cookie);
+   if (p == src->idle)
+   goto unlock;
+
+   do {
+   if (p == src->core_pick || p == src->curr)
+   goto next;
+
+   if (!cpumask_test_cpu(this, &p->cpus_mask))
+   goto next;
+
+   if (p->core_occupation > dst->idle->core_occupation)
+   goto next;
+
+   p->on_rq = TASK_ON_RQ_MIGRATING;
+   deactivate_task(src, p, 0);
+   set_task_cpu(p, this);
+   activate_task(dst, p, 0);
+   p->on_rq = TASK_ON_RQ_QUEUED;
+
+   resched_curr(dst);
+
+   success = true;
+   break;
+
+next:
+   p = sched_core_next(p, cookie);
+   } while (p);
+
+unlock:
+   double_rq_unlock(dst, src);
+   local_irq_enable();
+
+   return success;
+}
+
+static bool steal_cookie_task(int cpu, struct sched_domain *sd)
+{
+   int i;
+
+   for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+   if (i == cpu)
+   continue;
+
+   if (need_resched())
+   break;
+
+   if (try_steal_cookie(cpu, i))
+   return true;
+

[PATCH v8 -tip 25/26] Documentation: Add core scheduling documentation

2020-10-19 Thread Joel Fernandes (Google)
Document the usecases, design and interfaces for core scheduling.

Co-developed-by: Vineeth Pillai 
Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 .../admin-guide/hw-vuln/core-scheduling.rst   | 312 ++
 Documentation/admin-guide/hw-vuln/index.rst   |   1 +
 2 files changed, 313 insertions(+)
 create mode 100644 Documentation/admin-guide/hw-vuln/core-scheduling.rst

diff --git a/Documentation/admin-guide/hw-vuln/core-scheduling.rst 
b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
new file mode 100644
index ..eacafbb8fa3f
--- /dev/null
+++ b/Documentation/admin-guide/hw-vuln/core-scheduling.rst
@@ -0,0 +1,312 @@
+Core Scheduling
+***************
+Core scheduling support allows userspace to define groups of tasks that can
+share a core. These groups can be specified either for security usecases (one
+group of tasks doesn't trust another), or for performance usecases (some
+workloads may benefit from running on the same core as they don't need the same
+hardware resources of the shared core).
+
+Security usecase
+----------------
+A cross-HT attack involves the attacker and victim running on different
+Hyper Threads of the same core. MDS and L1TF are examples of such attacks.
+Without core scheduling, the only full mitigation of cross-HT attacks is to
+disable Hyper Threading (HT). Core scheduling allows HT to be turned on safely
+by ensuring that trusted tasks can share a core. This increase in core sharing
+can improve performance; however, it is not guaranteed that performance will
+always improve, though that is seen to be the case with a number of real world
+workloads. In theory, core scheduling aims to perform at least as well as when
+Hyper Threading is disabled. In practice, this is mostly the case though not
+always, as synchronizing scheduling decisions across 2 or more CPUs in a core
+involves additional overhead - especially when the system is lightly loaded
+(``total_threads <= N/2``).
+
+Usage
+-----
+Core scheduling support is enabled via the ``CONFIG_SCHED_CORE`` config option.
+Using this feature, userspace defines groups of tasks that trust each other.
+The core scheduler uses this information to make sure that tasks that do not
+trust each other will never run simultaneously on a core, while doing its best
+to satisfy the system's scheduling requirements.
+
+There are 2 ways to use core-scheduling:
+
+CGroup
+######
+Core scheduling adds additional files to the CPU controller CGroup:
+
+* ``cpu.tag``
+Writing ``1`` into this file results in all tasks in the group getting tagged.
+This allows all of the CGroup's tasks to run concurrently on a core's
+hyperthreads (also called siblings).
+
+The file having a value of ``0`` means the tag state of the CGroup is inherited
+from its parent hierarchy. If any ancestor of the CGroup is tagged, then the
+group is tagged.
+
+.. note:: Once a CGroup is tagged via cpu.tag, it is not possible to set this
+  for any descendant of the tagged group. For finer grained control, the
+  ``cpu.tag_color`` file described next may be used.
+
+.. note:: When a CGroup is not tagged, all the tasks within the group can share
+  a core with kernel threads and untagged system threads. For this reason,
+  if a group has ``cpu.tag`` of 0, it is considered to be trusted.
+
+* ``cpu.tag_color``
+For finer grained control over core sharing, a color can also be set in
+addition to the tag. This allows further control of core sharing between child
+CGroups within an already tagged CGroup. The color and the tag are both used to
+generate a `cookie` which is used by the scheduler to identify the group.
+
+Up to 256 different colors can be set (0-255) by writing into this file.
+
+A sample real-world usage of this file follows:
+
+Google uses DAC controls to make ``cpu.tag`` writeable only by root and the
+``cpu.tag_color`` can be changed by anyone.
+
+The hierarchy looks like this:
+::
+  Root group
+ / \
+A   B(These are created by the root daemon - borglet).
+   / \   \
+  C   D   E  (These are created by AppEngine within the container).
+
+A and B are containers for 2 different jobs or apps that are created by a root
+daemon called borglet. borglet then tags each of these groups with the
+``cpu.tag``
+file. The job itself can create additional child CGroups which are colored by
+the container's AppEngine with the ``cpu.tag_color`` file.
+
+The reason why Google uses this 2-level tagging system is that AppEngine wants
+to allow a subset of child CGroups within a tagged parent CGroup to be
+co-scheduled on a core while not being co-scheduled with other child CGroups.
+Think of these
+child CGroups as belonging to the same customer or project.  Because these
+child CGroups are created by AppEngine, they are not tracked by borglet (the
+root daemon), therefore borglet won't have a chance to set a color for them.
+That's where the cpu.tag_color file comes in. A co

[PATCH v8 -tip 19/26] sched: Add a second-level tag for nested CGroup usecase

2020-10-19 Thread Joel Fernandes (Google)
Google has a usecase where the first-level tag on a CGroup is not
sufficient. So, a patch has been carried for years in which a second tag is
added that is writeable by unprivileged users.

Google uses DAC controls to make the 'tag' settable only by root while
the second-level 'color' can be changed by anyone. The actual names that
Google uses are different, but the concept is the same.

The hierarchy looks like:

Root group
   / \
  A   B    (These are created by the root daemon - borglet).
 / \   \
C   D   E  (These are created by AppEngine within the container).

The reason why Google has two parts is that AppEngine wants to allow a subset of
subcgroups within a parent tagged cgroup to share execution. Think of these
subcgroups as belonging to the same customer or project. Because these subcgroups
are created by AppEngine, they are not tracked by borglet (the root daemon), so
borglet won't have a chance to set a color for them. That's where the 'color'
file comes in. The color can be set by AppEngine and, once set, the normal tasks
within the subcgroup are not able to overwrite it. This is enforced through the
permissions of the color file in cgroupfs.

The 'color' is an 8-bit value allowing for up to 256 unique colors. IMHO, having
more CGroups than that sounds like a scalability issue, so this suffices.
We steal the lower 8 bits of the cookie to store the color.
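
The resulting bit layout matches cpu_core_get_group_cookie() in the diff
below and can be summarized in a small sketch (names here are illustrative
only):

/*
 * Group cookie layout: the tagged task_group pointer is shifted left by 8
 * so the low byte can carry the color, and the result is masked to the
 * lower half of an unsigned long (the upper half carries the per-task
 * cookie, see sched_core_tag_requeue()).
 */
#define EXAMPLE_GROUP_COOKIE_MASK   ((1UL << (sizeof(unsigned long) * 4)) - 1)

static unsigned long example_group_cookie(struct task_group *tg, u8 color)
{
    unsigned long cookie = ((unsigned long)tg << 8) | color;

    return cookie & EXAMPLE_GROUP_COOKIE_MASK;
}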

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/core.c  | 181 +--
 kernel/sched/sched.h |   3 +-
 2 files changed, 158 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a0678614a056..42aa811eab14 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8522,7 +8522,7 @@ static void sched_change_group(struct task_struct *tsk, 
int type)
if ((unsigned long)tsk->sched_task_group == tsk->core_cookie)
tsk->core_cookie = 0UL;
 
-   if (tg->tagged /* && !tsk->core_cookie ? */)
+   if (tg->core_tagged /* && !tsk->core_cookie ? */)
tsk->core_cookie = (unsigned long)tg;
 #endif
 
@@ -8623,9 +8623,9 @@ static void cpu_cgroup_css_offline(struct 
cgroup_subsys_state *css)
 #ifdef CONFIG_SCHED_CORE
struct task_group *tg = css_tg(css);
 
-   if (tg->tagged) {
+   if (tg->core_tagged) {
sched_core_put();
-   tg->tagged = 0;
+   tg->core_tagged = 0;
}
 #endif
 }
@@ -9228,7 +9228,7 @@ void sched_core_tag_requeue(struct task_struct *p, 
unsigned long cookie, bool gr
 
if (sched_core_enqueued(p)) {
sched_core_dequeue(task_rq(p), p);
-   if (!p->core_task_cookie)
+   if (!p->core_cookie)
return;
}
 
@@ -9448,41 +9448,100 @@ int sched_core_share_pid(pid_t pid)
 }
 
 /* CGroup interface */
+
+/*
+ * Helper to get the cookie in a hierarchy.
+ * The cookie is a combination of a tag and color. Any ancestor
+ * can have a tag/color. tag is the first-level cookie setting
+ * with color being the second. Atmost one color and one tag is
+ * allowed.
+ */
+static unsigned long cpu_core_get_group_cookie(struct task_group *tg)
+{
+   unsigned long color = 0;
+
+   if (!tg)
+   return 0;
+
+   for (; tg; tg = tg->parent) {
+   if (tg->core_tag_color) {
+   WARN_ON_ONCE(color);
+   color = tg->core_tag_color;
+   }
+
+   if (tg->core_tagged) {
+   unsigned long cookie = ((unsigned long)tg << 8) | color;
+   cookie &= (1UL << (sizeof(unsigned long) * 4)) - 1;
+   return cookie;
+   }
+   }
+
+   return 0;
+}
+
+/* Determine if any group in @tg's children are tagged or colored. */
+static bool cpu_core_check_descendants(struct task_group *tg, bool check_tag,
+   bool check_color)
+{
+   struct task_group *child;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(child, &tg->children, siblings) {
+   if ((child->core_tagged && check_tag) ||
+   (child->core_tag_color && check_color)) {
+   rcu_read_unlock();
+   return true;
+   }
+
+   rcu_read_unlock();
+   return cpu_core_check_descendants(child, check_tag, 
check_color);
+   }
+
+   rcu_read_unlock();
+   return false;
+}
+
 static u64 cpu_core_tag_read_u64(struct cgroup_subsys_state *css, struct 
cftype *cft)
 {
struct task_group *tg = css_tg(css);
 
-   return !!tg->tagged;
+   return !!tg->core_tagged;
+}
+
+static u64 cpu_core_tag_color_read_u64(struct cgroup_subsys_state *css, struct 
cftype *cft)
+{
+   struct task_group *tg = cs

[PATCH v8 -tip 24/26] sched: Move core-scheduler interfacing code to a new file

2020-10-19 Thread Joel Fernandes (Google)
core.c is already huge. The core-tagging interface code is largely
independent of it. Move it to its own file to make both files easier to
maintain.

Tested-by: Julien Desfossez 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/sched/Makefile  |   1 +
 kernel/sched/core.c| 481 +
 kernel/sched/coretag.c | 468 +++
 kernel/sched/sched.h   |  56 -
 4 files changed, 523 insertions(+), 483 deletions(-)
 create mode 100644 kernel/sched/coretag.c

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5fc9c9b70862..c526c20adf9d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -36,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
 obj-$(CONFIG_PSI) += psi.o
+obj-$(CONFIG_SCHED_CORE) += coretag.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b3afbba5abe1..211e0784675f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -162,11 +162,6 @@ static bool sched_core_empty(struct rq *rq)
return RB_EMPTY_ROOT(&rq->core_tree);
 }
 
-static bool sched_core_enqueued(struct task_struct *task)
-{
-   return !RB_EMPTY_NODE(&task->core_node);
-}
-
 static struct task_struct *sched_core_first(struct rq *rq)
 {
struct task_struct *task;
@@ -188,7 +183,7 @@ static void sched_core_flush(int cpu)
rq->core->core_task_seq++;
 }
 
-static void sched_core_enqueue(struct rq *rq, struct task_struct *p)
+void sched_core_enqueue(struct rq *rq, struct task_struct *p)
 {
struct rb_node *parent, **node;
struct task_struct *node_task;
@@ -215,7 +210,7 @@ static void sched_core_enqueue(struct rq *rq, struct 
task_struct *p)
rb_insert_color(&p->core_node, &rq->core_tree);
 }
 
-static void sched_core_dequeue(struct rq *rq, struct task_struct *p)
+void sched_core_dequeue(struct rq *rq, struct task_struct *p)
 {
rq->core->core_task_seq++;
 
@@ -310,7 +305,6 @@ static int __sched_core_stopper(void *data)
 }
 
 static DEFINE_MUTEX(sched_core_mutex);
-static DEFINE_MUTEX(sched_core_tasks_mutex);
 static int sched_core_count;
 
 static void __sched_core_enable(void)
@@ -346,16 +340,6 @@ void sched_core_put(void)
__sched_core_disable();
mutex_unlock(&sched_core_mutex);
 }
-
-static int sched_core_share_tasks(struct task_struct *t1, struct task_struct 
*t2);
-
-#else /* !CONFIG_SCHED_CORE */
-
-static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
-static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p) { }
-static bool sched_core_enqueued(struct task_struct *task) { return false; }
-static int sched_core_share_tasks(struct task_struct *t1, struct task_struct 
*t2) { }
-
 #endif /* CONFIG_SCHED_CORE */
 
 /*
@@ -8505,9 +8489,6 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
 }
 
-#define SCHED_CORE_GROUP_COOKIE_MASK ((1UL << (sizeof(unsigned long) * 4)) - 1)
-static unsigned long cpu_core_get_group_cookie(struct task_group *tg);
-
 static void sched_change_group(struct task_struct *tsk, int type)
 {
struct task_group *tg;
@@ -8583,11 +8564,6 @@ void sched_move_task(struct task_struct *tsk)
task_rq_unlock(rq, tsk, &rf);
 }
 
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-{
-   return css ? container_of(css, struct task_group, css) : NULL;
-}
-
 static struct cgroup_subsys_state *
 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -9200,459 +9176,6 @@ static u64 cpu_rt_period_read_uint(struct 
cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-#ifdef CONFIG_SCHED_CORE
-/*
- * A simple wrapper around refcount. An allocated sched_core_cookie's
- * address is used to compute the cookie of the task.
- */
-struct sched_core_cookie {
-   refcount_t refcnt;
-};
-
-/*
- * sched_core_tag_requeue - Common helper for all interfaces to set a cookie.
- * @p: The task to assign a cookie to.
- * @cookie: The cookie to assign.
- * @group: is it a group interface or a per-task interface.
- *
- * This function is typically called from a stop-machine handler.
- */
-void sched_core_tag_requeue(struct task_struct *p, unsigned long cookie, bool 
group)
-{
-   if (!p)
-   return;
-
-   if (group)
-   p->core_group_cookie = cookie;
-   else
-   p->core_task_cookie = cookie;
-
-   /* Use up half of the cookie's bits for task cookie and remaining for 
group cookie. */
-   p->core_cookie = (p->core_task_cookie <<
-   (sizeof(unsigned long) * 4)) + 
p->core_group_cookie;
-
-   if (sched_core_enqueued(p)) {
-   sched_core_dequeue(task_rq(p), p);
-   if (!p->core_cookie)
-   return;
-   }
-
-   if (sched_core_enabled

[PATCH v8 -tip 26/26] sched: Debug bits...

2020-10-19 Thread Joel Fernandes (Google)
From: Peter Zijlstra 

Tested-by: Julien Desfossez 
Not-Signed-off-by: Peter Zijlstra (Intel) 
---
 kernel/sched/core.c | 37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 211e0784675f..61758b5478d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -127,6 +127,10 @@ static inline bool prio_less(struct task_struct *a, struct 
task_struct *b)
 
int pa = __task_prio(a), pb = __task_prio(b);
 
+   trace_printk("(%s/%d;%d,%Lu,%Lu) ?< (%s/%d;%d,%Lu,%Lu)\n",
+a->comm, a->pid, pa, a->se.vruntime, a->dl.deadline,
+b->comm, b->pid, pb, b->se.vruntime, b->dl.deadline);
+
if (-pa < -pb)
return true;
 
@@ -317,12 +321,16 @@ static void __sched_core_enable(void)
 
static_branch_enable(&__sched_core_enabled);
stop_machine(__sched_core_stopper, (void *)true, NULL);
+
+   printk("core sched enabled\n");
 }
 
 static void __sched_core_disable(void)
 {
stop_machine(__sched_core_stopper, (void *)false, NULL);
static_branch_disable(&__sched_core_enabled);
+
+   printk("core sched disabled\n");
 }
 
 void sched_core_get(void)
@@ -4978,6 +4986,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
set_next_task(rq, next);
}
 
+   trace_printk("pick pre selected (%u %u %u): %s/%d %lx\n",
+rq->core->core_task_seq,
+rq->core->core_pick_seq,
+rq->core_sched_seq,
+next->comm, next->pid,
+next->core_cookie);
+
rq->core_pick = NULL;
return next;
}
@@ -5066,6 +5081,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 */
if (i == cpu && !need_sync && !p->core_cookie) {
next = p;
+   trace_printk("unconstrained pick: %s/%d %lx\n",
+next->comm, next->pid, 
next->core_cookie);
goto done;
}
 
@@ -5074,6 +5091,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 
rq_i->core_pick = p;
 
+   trace_printk("cpu(%d): selected: %s/%d %lx\n",
+i, p->comm, p->pid, p->core_cookie);
+
/*
 * If this new candidate is of higher priority than the
 * previous; and they're incompatible; we need to wipe
@@ -5090,6 +5110,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
rq->core->core_cookie = p->core_cookie;
max = p;
 
+   trace_printk("max: %s/%d %lx\n", max->comm, 
max->pid, max->core_cookie);
+
if (old_max) {
for_each_cpu(j, smt_mask) {
if (j == i)
@@ -5120,6 +5142,7 @@ next_class:;
 
/* Something should have been selected for current CPU */
WARN_ON_ONCE(!next);
+   trace_printk("picked: %s/%d %lx\n", next->comm, next->pid, 
next->core_cookie);
 
/*
 * Reschedule siblings
@@ -5155,13 +5178,21 @@ next_class:;
}
 
/* Did we break L1TF mitigation requirements? */
-   WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
+   if (unlikely(!cookie_match(next, rq_i->core_pick))) {
+   trace_printk("[%d]: cookie mismatch. 
%s/%d/0x%lx/0x%lx\n",
+rq_i->cpu, rq_i->core_pick->comm,
+rq_i->core_pick->pid,
+rq_i->core_pick->core_cookie,
+rq_i->core->core_cookie);
+   WARN_ON_ONCE(1);
+   }
 
if (rq_i->curr == rq_i->core_pick) {
rq_i->core_pick = NULL;
continue;
}
 
+   trace_printk("IPI(%d)\n", i);
resched_curr(rq_i);
}
 
@@ -5209,6 +5240,10 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
 
+   trace_printk("core fill: %s/%d (%d->%d) %d %d %lx\n",
+p->comm, p->pid, that, this,
+p->core_occupation, dst->idle->core_occupation, 
cookie);
+
p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
