date:20210317

Re: [RESEND PATCH v6 1/2] procfs: Allow reading fdinfo with PTRACE_MODE_READ

2021-03-17 Thread Kees Cook

On Mon, Mar 08, 2021 at 05:06:40PM +0000, Kalesh Singh wrote:
> Android captures per-process system memory state when certain low memory
> events (e.g a foreground app kill) occur, to identify potential memory
> hoggers. In order to measure how much memory a process actually consumes,
> it is necessary to include the DMA buffer sizes for that process in the
> memory accounting. Since the handle to DMA buffers are raw FDs, it is
> important to be able to identify which processes have FD references to
> a DMA buffer.
> 
> Currently, DMA buffer FDs can be accounted using /proc/<pid>/fd/* and
> /proc/<pid>/fdinfo -- both are only readable by the process owner,
> as follows:
>   1. Do a readlink on each FD.
>   2. If the target path begins with "/dmabuf", then the FD is a dmabuf FD.
>   3. stat the file to get the dmabuf inode number.
>   4. Read /proc/<pid>/fdinfo/<fd>, to get the DMA buffer size.
> 
> Accessing other processes' fdinfo requires root privileges. This limits
> the use of the interface to debugging environments and is not suitable
> for production builds.  Granting root privileges even to a system process
> increases the attack surface and is highly undesirable.
> 
> Since fdinfo doesn't permit reading process memory and manipulating
> process state, allow accessing fdinfo under PTRACE_MODE_READ_FSCRED.
> 
> Suggested-by: Jann Horn 
> Signed-off-by: Kalesh Singh 

Reviewed-by: Kees Cook 

Who would be best to pick this up? Maybe akpm?

-- 
Kees Cook

Re: [PATCH v2 1/3] perf-stat: introduce bperf, share hardware PMCs with BPF

2021-03-17 Thread Namhyung Kim

On Wed, Mar 17, 2021 at 6:18 AM Song Liu  wrote:
> +static int bperf_check_target(struct evsel *evsel,
> + struct target *target,
> + enum bperf_filter_type *filter_type,
> + __u32 *filter_entry_cnt)
> +{
> +   if (evsel->leader->core.nr_members > 1) {
> +   pr_err("bpf managed perf events do not yet support 
> groups.\n");
> +   return -1;
> +   }
> +
> +   /* determine filter type based on target */
> +   if (target->system_wide) {
> +   *filter_type = BPERF_FILTER_GLOBAL;
> +   *filter_entry_cnt = 1;
> +   } else if (target->cpu_list) {
> +   *filter_type = BPERF_FILTER_CPU;
> +   *filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
> +   } else if (target->tid) {
> +   *filter_type = BPERF_FILTER_PID;
> +   *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
> +   } else if (target->pid || evsel->evlist->workload.pid != -1) {
> +   *filter_type = BPERF_FILTER_TGID;
> +   *filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
> +   } else {
> +   pr_err("bpf managed perf events do not yet support these 
> targets.\n");
> +   return -1;
> +   }
> +
> +   return 0;
> +}
> +
> +static struct perf_cpu_map *all_cpu_map;
> +
> +static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
> +  struct perf_event_attr_map_entry 
> *entry)
> +{
> +   struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
> +   int link_fd, diff_map_fd, err;
> +   struct bpf_link *link = NULL;
> +
> +   if (!skel) {
> +   pr_err("Failed to open leader skeleton\n");
> +   return -1;
> +   }
> +
> +   bpf_map__resize(skel->maps.events, libbpf_num_possible_cpus());
> +   err = bperf_leader_bpf__load(skel);
> +   if (err) {
> +   pr_err("Failed to load leader skeleton\n");
> +   goto out;
> +   }
> +
> +   err = -1;
> +   link = bpf_program__attach(skel->progs.on_switch);
> +   if (!link) {
> +   pr_err("Failed to attach leader program\n");
> +   goto out;
> +   }
> +
> +   link_fd = bpf_link__fd(link);
> +   diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
> +   entry->link_id = bpf_link_get_id(link_fd);
> +   entry->diff_map_id = bpf_map_get_id(diff_map_fd);
> +   err = bpf_map_update_elem(attr_map_fd, >core.attr, entry, 
> BPF_ANY);
> +   assert(err == 0);
> +
> +   evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
> +   assert(evsel->bperf_leader_link_fd >= 0);

Isn't it the same as link_fd?

> +
> +   /*
> +* save leader_skel for install_pe, which is called within
> +* following evsel__open_per_cpu call
> +*/
> +   evsel->leader_skel = skel;
> +   evsel__open_per_cpu(evsel, all_cpu_map, -1);
> +
> +out:
> +   bperf_leader_bpf__destroy(skel);
> +   bpf_link__destroy(link);

Why do we destroy it?  Is it because we get another reference?

> +   return err;
> +}
> +
> +static int bperf__load(struct evsel *evsel, struct target *target)
> +{
> +   struct perf_event_attr_map_entry entry = {0x, 0x};
> +   int attr_map_fd, diff_map_fd = -1, err;
> +   enum bperf_filter_type filter_type;
> +   __u32 filter_entry_cnt, i;
> +
> +   if (bperf_check_target(evsel, target, _type, 
> _entry_cnt))
> +   return -1;
> +
> +   if (!all_cpu_map) {
> +   all_cpu_map = perf_cpu_map__new(NULL);
> +   if (!all_cpu_map)
> +   return -1;
> +   }
> +
> +   evsel->bperf_leader_prog_fd = -1;
> +   evsel->bperf_leader_link_fd = -1;
> +
> +   /*
> +* Step 1: hold a fd on the leader program and the bpf_link, if
> +* the program is not already gone, reload the program.
> +* Use flock() to ensure exclusive access to the perf_event_attr
> +* map.
> +*/
> +   attr_map_fd = bperf_lock_attr_map(target);
> +   if (attr_map_fd < 0) {
> +   pr_err("Failed to lock perf_event_attr map\n");
> +   return -1;
> +   }
> +
> +   err = bpf_map_lookup_elem(attr_map_fd, >core.attr, );
> +   if (err) {
> +   err = bpf_map_update_elem(attr_map_fd, >core.attr, 
> , BPF_ANY);
> +   if (err)
> +   goto out;
> +   }
> +
> +   evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
> +   if (evsel->bperf_leader_link_fd < 0 &&
> +   bperf_reload_leader_program(evsel, attr_map_fd, ))
> +   goto out;
> +
> +   /*
> +* The bpf_link holds reference to the leader program, and the
> +* leader program holds reference to the maps.

Re: [PATCH 4.14 00/95] 4.14.226-rc1 review

2021-03-17 Thread Samuel Zou





On 2021/3/17 23:36, Greg KH wrote:

On Tue, Mar 16, 2021 at 02:35:36PM +0800, Samuel Zou wrote:



On 2021/3/15 21:56, gre...@linuxfoundation.org wrote:

From: Greg Kroah-Hartman 

This is the start of the stable review cycle for the 4.14.226 release.
There are 95 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Wed, 17 Mar 2021 13:57:24 +0000.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.14.226-rc1.gz
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-4.14.y
and the diffstat can be found below.

thanks,

greg k-h



Tested on x86 for 4.14.226-rc1,

Kernel repo:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Branch: linux-4.14.y
Version: 4.14.226-rc1
Commit: 57cc62fb2d2b8e81c02cb9197e303c7782dee4cd
Compiler: gcc version 7.3.0 (GCC)

x86 (No kernel failures)

Testcase Result Summary:
total_num: 4728
succeed_num: 4727
failed_num: 1


What does this "failed_num" mean?

thanks,

greg k-h


total_num: The number of total testcases
succeed_num: The number of succeed testcases
failed_num: The number of failed testcases

Maybe I can revise the description in the next email.

Re: [RESEND PATCH v6 2/2] procfs/dmabuf: Add inode number to /proc/*/fdinfo

2021-03-17 Thread Kees Cook

On Mon, Mar 08, 2021 at 05:06:41PM +0000, Kalesh Singh wrote:
> Add 'ino' field to /proc/<pid>/fdinfo/<FD> and
> /proc/<pid>/task/<tid>/fdinfo/<FD>.
> 
> The inode numbers can be used to uniquely identify DMA buffers
> in user space and avoids a dependency on /proc/<pid>/fd/* when
> accounting per-process DMA buffer sizes.
> 
> Signed-off-by: Kalesh Singh 

Reviewed-by: Kees Cook 

-- 
Kees Cook

[PATCH 4/4] Fix spelling of achieving

2021-03-17 Thread qiumibaozi_1

From: ganjisheng 

Signed-off-by: ganjisheng 
---
 drivers/mfd/ab3100-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mfd/ab3100-core.c b/drivers/mfd/ab3100-core.c
index ee71ae0..f424c2a 100644
--- a/drivers/mfd/ab3100-core.c
+++ b/drivers/mfd/ab3100-core.c
@@ -159,7 +159,7 @@ static int ab3100_get_register_interruptible(struct ab3100 
*ab3100,
 
/*
 * AB3100 require an I2C "stop" command between each message, else
-* it will not work. The only way of achieveing this with the
+* it will not work. The only way of achieving this with the
 * message transport layer is to send the read and write messages
 * separately.
 */
-- 
1.9.1

linux-next: build warning after merge of the scsi tree

2021-03-17 Thread Stephen Rothwell

Hi all,

After merging the scsi tree, today's linux-next build (htmldocs) produced
this warning:

Documentation/driver-api/target:19: 
drivers/target/target_core_transport.c:1661: WARNING: Block quote ends without 
a blank line; unexpected unindent.

Introduced by commit

  750a1d93f905 ("scsi: target: core: Break up target_submit_cmd_map_sgls()")

-- 
Cheers,
Stephen Rothwell


pgp5EB1NJ8f99.pgp
Description: OpenPGP digital signature

Re: [PATCH v2] smp: kernel/panic.c - silence warnings

2021-03-17 Thread Christophe Leroy





Le 17/03/2021 à 18:37, Peter Zijlstra a écrit :

On Wed, Mar 17, 2021 at 06:17:26PM +0100, Christophe Leroy wrote:



Le 17/03/2021 à 13:23, Peter Zijlstra a écrit :

On Wed, Mar 17, 2021 at 12:00:29PM +0100, Christophe Leroy wrote:

What do you mean ? 'extern' prototype is pointless for function prototypes
and deprecated, no new function prototypes should be added with the 'extern'
keyword.

checkpatch.pl tells you: "extern prototypes should be avoided in .h files"


I have a very strong preference for extern on function decls, to match
the extern on variable decl.


You mean you also do 'static inline' variable declarations ?


That's a func definition, not a declaration. And you _can_ do static
variable definitions in a header file just fine, although that's
typically not what you'd want. Although sometimes I've seen people do:

static const int my_var = 10;

inline is an attribute that obviously doesn't work on variables.


Using the extern keyword on function prototypes is superfluous visual
noise so suggest removing it.


I don't agree; and I think the C spec is actually wrong there (too).

The thing is that it distinguishes between a forward declaration of a
function in the same TU and an external declaration for a function in
another TU.

That is; if I see:

void ponies(int legs);

I expect that function to be defined later in the same TU. IOW it's a
forward declaration. OTOH if I see:

extern void ponies(int legs);

I know I won't find it in this TU and the linker will end up involved.


Yes I can understand that for a .c file where you want to distinguish between forward declaration of 
functions defined in the file and functions declared outside. There, it is definitely an added value.


But in .h, all functions must be defined somewhere else, otherwise you have another problem. So all 
functions would have the 'extern' keyword according to your reasoning. Therefore that's just useless 
and I fully agree with Checkpatch's commit that in that case that's "superfluous visual noise" 
impeding readability and making it more difficult to fit the prototype on a single line.





Now, the C people figured that distinction was useless and allowed
sloppiness. But I still think there's merit to that. And as mentioned
earlier, it is consistent with variable declarations.

[PATCH] sched/rt: use DIV_ROUND_UP to calculate sysctl_sched_rr_timeslice

2021-03-17 Thread Hongchen Zhang

When HZ is 300, the value of sysctl_sched_rr_timeslice is different from
the actual value. Therefore, replace with DIV_ROUND_UP to calculate
sysctl_sched_rr_timeslice.

Signed-off-by: Hongchen Zhang 
---
 kernel/sched/rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b980cc96604f..c684440eefdb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -8,7 +8,7 @@
 #include "pelt.h"
 
 int sched_rr_timeslice = RR_TIMESLICE;
-int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = DIV_ROUND_UP(RR_TIMESLICE * MSEC_PER_SEC, HZ);
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
-- 
2.27.0

Re: [PATCH RESEND] gcc-plugins: avoid errors with -std=gnu++11 on old gcc

2021-03-17 Thread Kees Cook

On Mon, Mar 08, 2021 at 03:40:21AM -0500, Valdis Klētnieks wrote:
> It turns out that older gcc (4.9 and 5.4) have gnu++11 support, but
> due to a gcc bug fixed in gcc6, throw errors during the build.
> The relevant gcc bug is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69959
> 
> Version the option based on what gcc we're using.

Is there a better way to detect this than with version checking?

-Kees

> 
> Signed-off-by: Valdis Kletnieks 
> Fixes: 27c287b41659 ("gcc-plugins: fix gcc 11 indigestion with plugins...")
> ---
> diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
> index b5487cce69e8..cc779973724a 100644
> --- a/scripts/gcc-plugins/Makefile
> +++ b/scripts/gcc-plugins/Makefile
> @@ -21,8 +21,11 @@ always-y += $(GCC_PLUGIN)
>  
>  GCC_PLUGINS_DIR = $(shell $(CC) -print-file-name=plugin)
>  
> +# need gnu++11 for gcc 11, but 4.9 and 5.4 need gnu++98
> +GCC_FLAVOR = $(call cc-ifversion, -ge, 1100, 11, 98)
> +
>  plugin_cxxflags  = -Wp,-MMD,$(depfile) $(KBUILD_HOSTCXXFLAGS) -fPIC \
> --I $(GCC_PLUGINS_DIR)/include -I $(obj) -std=gnu++11 \
> +-I $(GCC_PLUGINS_DIR)/include -I $(obj) 
> -std=gnu++$(GCC_FLAVOR) \
>  -fno-rtti -fno-exceptions -fasynchronous-unwind-tables \
>  -ggdb -Wno-narrowing -Wno-unused-variable \
>  -Wno-format-diag
> 
> 

-- 
Kees Cook

Re: [PATCH char-misc v1] misc: Add Renesas Synchronization Management Unit (SMU) support

2021-03-17 Thread Greg KH

On Wed, Mar 17, 2021 at 05:14:34PM -0400, min.li...@renesas.com wrote:
> +static int
> +rsmu_open(struct inode *iptr, struct file *fptr)
> +{
> + struct rsmu_cdev *rsmu;
> +
> + rsmu = container_of(iptr->i_cdev, struct rsmu_cdev, rsmu_cdev);
> + if (!rsmu)
> + return -EAGAIN;

This check will never happen, so why are you making it?

And what does -EAGAIN mean in this case?

> +
> + fptr->private_data = rsmu;
> + return 0;
> +}
> +
> +static int
> +rsmu_release(struct inode *iptr, struct file *fptr)
> +{
> + struct rsmu_cdev *rsmu;
> +
> + rsmu = container_of(iptr->i_cdev, struct rsmu_cdev, rsmu_cdev);
> + if (!rsmu)
> + return -EAGAIN;

Same here, this is impossible to ever have happen.  Please look up how
container_of() works.


> +
> + return 0;
> +}
> +
> +static long
> +rsmu_ioctl(struct file *fptr, unsigned int cmd, unsigned long data)
> +{
> + struct rsmu_cdev *rsmu = fptr->private_data;
> + void __user *arg = (void __user *)data;
> + int err = 0;
> +
> + if (!rsmu)
> + return -EINVAL;

How can this happen?

Why all of these impossible checks?

And the build failures that the kernel test robot reported obviously need to be
fixed as well...

thanks,

greg k-h

Re: [PATCH] mm/slub: Add slub_debug option to panic on memory corruption

2021-03-17 Thread Kees Cook

On Tue, Mar 09, 2021 at 07:18:32PM +0100, Vlastimil Babka wrote:
> On 3/9/21 7:14 PM, Georgi Djakov wrote:
> > Hi Vlastimil,
> > 
> > Thanks for the comment!
> > 
> > On 3/9/21 17:09, Vlastimil Babka wrote:
> >> On 3/9/21 2:47 PM, Georgi Djakov wrote:
> >>> Being able to stop the system immediately when a memory corruption
> >>> is detected is crucial to finding the source of it. This is very
> >>> useful when the memory can be inspected with kdump or other tools.
> >>
> >> Is this in some testing scenarios where you would also use e.g. 
> >> panic_on_warn?
> >> We could hook to that. If not, we could introduce a new
> >> panic_on_memory_corruption that would apply also for debug_pagealloc and 
> >> whatnot?
> > 
> > I would prefer that we not tie it with panic_on_warn - there might be lots 
> > of
> > new code in multiple subsystems, so hitting some WARNing while testing is 
> > not
> > something unexpected.
> > 
> > Introducing an additional panic_on_memory_corruption would work, but i 
> > noticed
> > that we already have slub_debug and thought to re-use that. But indeed, 
> > > adding
> > an option to panic in for example bad_page() sounds also useful, if that's 
> > what
> > you suggest.
> 
> Yes, that would be another example.
> Also CCing Kees for input, as besides the "kdump ASAP for debugging" case, I 
> can
> imagine security hardening folks could be interested in the "somebody might 
> have
> just failed to pwn the kernel, better panic than let them continue" angle. But
> I'm naive wrt security, so it might be a stupid idea :)

I've really wanted such things, but Linus has been pretty adamant about
not wanting to provide new "panic" paths (or even BUG usage[1]). It
seems that panic_on_warn remains the way to get this behavior,
with the understanding that WARN should only be produced on
expected-to-be-impossible situations[1].

Hitting a WARN while testing should result in either finding and fixing
a real bug, or removing the WARN in favor of pr_warn(). :)

-Kees

[1] 
https://www.kernel.org/doc/html/latest/process/deprecated.html#bug-and-bug-on

-- 
Kees Cook

Re: [v8,3/7] PCI: mediatek-gen3: Add MediaTek Gen3 driver for MT8192

2021-03-17 Thread Jianjun Wang

On Thu, 2021-03-18 at 01:02 +0100, Pali Rohár wrote:
> On Saturday 13 March 2021 15:43:14 Jianjun Wang wrote:
> > On Thu, 2021-03-11 at 13:38 +0100, Pali Rohár wrote:
> > > On Wednesday 24 February 2021 14:11:28 Jianjun Wang wrote:
> > > > +static int mtk_pcie_startup_port(struct mtk_pcie_port *port)
> > > > +{
> > > ...
> > > > +
> > > > +   /* Delay 100ms to wait the reference clocks become stable */
> > > > +   msleep(100);
> > > > +
> > > > +   /* De-assert PERST# signal */
> > > > +   val &= ~PCIE_PE_RSTB;
> > > > +   writel_relaxed(val, port->base + PCIE_RST_CTRL_REG);
> > > 
> > > Hello! This is a new driver which introduce yet another custom timeout
> > > prior PERST# signal for PCIe card is de-asserted. Timeouts for other
> > > drivers I collected in older email [2].
> > > 
> > > Please look at my email [1] about PCIe Warm Reset if you have any clue
> > > about it. Lorenzo and Rob already expressed that this timeout should not
> > > be driver specific. But nobody was able to "decode" and "understand"
> > > PCIe spec yet about these timeouts.
> > 
> > Hi Pali,
> > 
> > I think this is more like a platform specific timeout, which is used to
> > wait for the reference clocks to become stable and finish the reset flow
> > of HW blocks.
> > 
> > Here is the steps to start a link training in this HW:
> > 
> > 1. Assert all reset signals which including the transaction layer, PIPE
> > interface and internal bus interface;
> > 
> > 2. De-assert reset signals except the PERST#, this will make the
> > physical layer active and start to output the reference clock, but the
> > EP device remains in the reset state.
> >Before releasing the PERST# signal, the HW blocks needs at least 10ms
> > to finish the reset flow, and ref-clk needs about 30us to become stable.
> > 
> > 3. De-assert PERST# signal, wait LTSSM enter L0 state.
> > 
> > This 100ms timeout refers to TPVPERL in the PCIe CEM spec. Since
> > we are in the kernel stage, the power supply has already stabilized, so
> > this timeout may not need to be that long.
> 
> I think that this is not platform specific timeout or platform specific
> steps. This matches generic steps as defined in PCIe CEM spec, section
> 2.2.1. Initial Power-Up (G3 to S0).
> 
> What is platform specific is just how to achieve these steps.
> 
> Am I right?
> 
> ...
> 
> TPVPERL is one of my timeout candidates as minimal required timeout for
> Warm Reset. I wrote about it in this email:
> 
> https://lore.kernel.org/linux-pci/20200430082245.xblvb7xeamm4e336@pali/
> 
> But I'm not sure, especially as no diagram describes just the warm
> reset as defined in mPCIe CEM (3.2.4.3. PERST# Signal).
> 
> ...
> 
> Anyway, I would suggest to define constants for those timeouts. I guess
> that in future we could be able to define "generic" timeout constants
> which would not be in private driver section, but in some common header
> file.

I agree with this, but I'm not sure if we really need that long a delay in
the kernel stage, because the power supply is already stable and it
really impacts the boot time, especially when the platform has multiple
ports and no EP devices connected; we need to wait 200ms for each port
when the system boots up.

For this PCIe controller driver, I would like to change the timeout
value to 10ms to comply with the HW design, and save some boot time.

> 
> > > > +
> > > > +   /* Check if the link is up or not */
> > > > +   err = readl_poll_timeout(port->base + PCIE_LINK_STATUS_REG, val,
> > > > +!!(val & PCIE_PORT_LINKUP), 20,
> > > > +50 * USEC_PER_MSEC);
> > > 
> > > IIRC, you need to wait at least 100ms after de-asserting PERST# signal
> > > as it is required by PCIe specs and also because experiments proved that
> > > some Compex wifi cards (e.g. WLE900VX) are not detected if you do not
> > > wait this minimal time.
> > 
> > Yes, this should be 100ms, I will fix it at next version, thanks for
> > your review.
> 
> In past Bjorn suggested to use msleep(PCI_PM_D3COLD_WAIT); macro for
> this step during reviewing aardvark driver.
> 
> https://lore.kernel.org/linux-pci/20190426161050.ga189...@google.com/
> 
> And next iteration used this PCI_PM_D3COLD_WAIT macro instead of 100:
> 
> https://lore.kernel.org/linux-pci/20190522213351.21366-2-r...@triplefau.lt/

Sure, I will use PCI_PM_D3COLD_WAIT macro instead in the next version.

Thanks.

> 
> > Thanks.
> > > 
> > > > +   if (err) {
> > > > +   val = readl_relaxed(port->base + PCIE_LTSSM_STATUS_REG);
> > > > +   dev_err(port->dev, "PCIe link down, ltssm reg val: 
> > > > %#x\n", val);
> > > > +   return err;
> > > > +   }
> > > 
> > > [1] - 
> > > https://lore.kernel.org/linux-pci/20210310110535.zh4pnn4vpmvzwl5q@pali/
> > > [2] - 
> > > https://lore.kernel.org/linux-pci/20200424092546.25p3hdtkehohe3xw@pali/
> >

Re: [PATCH] scsi: ufs: Add selector to ufshcd_query_flag* APIs

2021-03-17 Thread Can Guo


On 2021-03-17 11:31, Daejun Park wrote:

Unlike other query APIs in UFS, ufshcd_query_flag has a fixed selector
as 0. This patch allows ufshcd_query_flag API to choose selector value
by parameter.

Signed-off-by: Daejun Park 


Reviewed-by: Can Guo 


---
 drivers/scsi/ufs/ufs-sysfs.c |  2 +-
 drivers/scsi/ufs/ufshcd.c| 29 +
 drivers/scsi/ufs/ufshcd.h|  2 +-
 3 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/scsi/ufs/ufs-sysfs.c 
b/drivers/scsi/ufs/ufs-sysfs.c

index acc54f530f2d..606b058a3394 100644
--- a/drivers/scsi/ufs/ufs-sysfs.c
+++ b/drivers/scsi/ufs/ufs-sysfs.c
@@ -746,7 +746,7 @@ static ssize_t _name##_show(struct device 
*dev,\

index = ufshcd_wb_get_query_index(hba); \
pm_runtime_get_sync(hba->dev);   \
ret = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_READ_FLAG,   \
-   QUERY_FLAG_IDN##_uname, index, );  \
+   QUERY_FLAG_IDN##_uname, index, , 0);   \
pm_runtime_put_sync(hba->dev);   \
if (ret) {  \
ret = -EINVAL;  \
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 8c0ff024231c..c2fd9c58d6b8 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -2940,13 +2940,15 @@ static inline void ufshcd_init_query(struct
ufs_hba *hba,
 }

 static int ufshcd_query_flag_retry(struct ufs_hba *hba,
-	enum query_opcode opcode, enum flag_idn idn, u8 index, bool 
*flag_res)
+	enum query_opcode opcode, enum flag_idn idn, u8 index, bool 
*flag_res,

+   u8 selector)
 {
int ret;
int retries;

for (retries = 0; retries < QUERY_REQ_RETRIES; retries++) {
-   ret = ufshcd_query_flag(hba, opcode, idn, index, flag_res);
+   ret = ufshcd_query_flag(hba, opcode, idn, index, flag_res,
+   selector);
if (ret)
dev_dbg(hba->dev,
"%s: failed with error %d, retries %d\n",
@@ -2969,15 +2971,17 @@ static int ufshcd_query_flag_retry(struct 
ufs_hba *hba,

  * @idn: flag idn to access
  * @index: flag index to access
  * @flag_res: the flag value after the query request completes
+ * @selector: selector field
  *
  * Returns 0 for success, non-zero in case of failure
  */
 int ufshcd_query_flag(struct ufs_hba *hba, enum query_opcode opcode,
-   enum flag_idn idn, u8 index, bool *flag_res)
+   enum flag_idn idn, u8 index, bool *flag_res,
+   u8 selector)
 {
struct ufs_query_req *request = NULL;
struct ufs_query_res *response = NULL;
-   int err, selector = 0;
+   int err;
int timeout = QUERY_REQ_TIMEOUT;

BUG_ON(!hba);
@@ -4331,7 +4335,7 @@ static int ufshcd_complete_dev_init(struct 
ufs_hba *hba)

ktime_t timeout;

err = ufshcd_query_flag_retry(hba, UPIU_QUERY_OPCODE_SET_FLAG,
-   QUERY_FLAG_IDN_FDEVICEINIT, 0, NULL);
+   QUERY_FLAG_IDN_FDEVICEINIT, 0, NULL, 0);
if (err) {
dev_err(hba->dev,
"%s setting fDeviceInit flag failed with error %d\n",
@@ -4343,7 +4347,8 @@ static int ufshcd_complete_dev_init(struct 
ufs_hba *hba)

timeout = ktime_add_ms(ktime_get(), FDEVICEINIT_COMPL_TIMEOUT);
do {
err = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_READ_FLAG,
-   QUERY_FLAG_IDN_FDEVICEINIT, 0, 
_res);
+   QUERY_FLAG_IDN_FDEVICEINIT, 0, 
_res,
+   0);
if (!flag_res)
break;
usleep_range(5000, 1);
@@ -5250,7 +5255,7 @@ static int ufshcd_enable_auto_bkops(struct 
ufs_hba *hba)

goto out;

err = ufshcd_query_flag_retry(hba, UPIU_QUERY_OPCODE_SET_FLAG,
-   QUERY_FLAG_IDN_BKOPS_EN, 0, NULL);
+   QUERY_FLAG_IDN_BKOPS_EN, 0, NULL, 0);
if (err) {
dev_err(hba->dev, "%s: failed to enable bkops %d\n",
__func__, err);
@@ -5300,7 +5305,7 @@ static int ufshcd_disable_auto_bkops(struct 
ufs_hba *hba)

}

err = ufshcd_query_flag_retry(hba, UPIU_QUERY_OPCODE_CLEAR_FLAG,
-   QUERY_FLAG_IDN_BKOPS_EN, 0, NULL);
+   QUERY_FLAG_IDN_BKOPS_EN, 0, NULL, 0);
if (err) {
dev_err(hba->dev, "%s: failed to disable bkops %d\n",
__func__, err);
@@ -5463,7 +5468,7 @@ int ufshcd_wb_ctrl(struct ufs_hba *hba, bool 
enable)


index = ufshcd_wb_get_query_index(hba);
ret = ufshcd_query_flag_retry(hba, opcode,

linux-next: build warning after merge of the net-next tree

2021-03-17 Thread Stephen Rothwell

Hi all,

After merging the net-next tree, today's linux-next build (htmldocs)
produced this warning:

net/tipc/subscr.h:73: warning: Function parameter or member 's' not described 
in 'tipc_subscription'

Introduced by commit

  429189acac53 ("tipc: add host-endian copy of user subscription to struct 
tipc_subscription")

-- 
Cheers,
Stephen Rothwell


pgpfAwnqtfk1n.pgp
Description: OpenPGP digital signature

Re: Errant readings on LM81 with T2080 SoC

2021-03-17 Thread Wolfram Sang


> Probably depends on the device implementation. I've got multiple other 
> I2C/SMBUS devices and the LM81 seems to be the one that objects.

For the record, there was just a similar case with a DA9063, but that
one luckily had a bit to switch from SMBus to I2C mode, i.e. no timeout
handling:

  [PATCH v6 1/1] mfd: da9063: Support SMBus and I2C mode



signature.asc
Description: PGP signature

Re: [PATCH][RESEND] Revert "PM: ACPI: reboot: Use S5 for reboot"

2021-03-17 Thread Kai-Heng Feng

On Thu, Mar 18, 2021 at 1:25 AM Josef Bacik  wrote:
[snipped]
> "shutdown now" works fine with and without your patch.  Thanks,

Rafael,
Please revert the patch while we are working on it.

Josef,
Can you please test the following patch:

diff --git a/kernel/reboot.c b/kernel/reboot.c
index eb1b15850761..263444a3fb38 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -233,6 +233,15 @@ void migrate_to_reboot_cpu(void)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
 }

+static void kernel_shutdown_prepare(enum system_states state)
+{
+   blocking_notifier_call_chain(_notifier_list,
+   (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
+   system_state = state;
+   usermodehelper_disable();
+   device_shutdown();
+}
+
 /**
  * kernel_restart - reboot the system
  * @cmd: pointer to buffer containing command to execute for restart
@@ -243,7 +252,7 @@ void migrate_to_reboot_cpu(void)
  */
 void kernel_restart(char *cmd)
 {
-   kernel_restart_prepare(cmd);
+   kernel_shutdown_prepare(SYSTEM_POWER_OFF);
if (pm_power_off_prepare)
pm_power_off_prepare();
migrate_to_reboot_cpu();
@@ -257,14 +266,6 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);

-static void kernel_shutdown_prepare(enum system_states state)
-{
-   blocking_notifier_call_chain(_notifier_list,
-   (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
-   system_state = state;
-   usermodehelper_disable();
-   device_shutdown();
-}
 /**
  * kernel_halt - halt the system
  *

>
> Josef

[v5,1/2] drm/mediatek: mtk_dpi: Add check for max clock rate in mode_valid

2021-03-17 Thread Rex-BC Chen

Add per-platform max clock rate check in mtk_dpi_bridge_mode_valid.

Signed-off-by: Pi-Hsun Shih 
Signed-off-by: Rex-BC Chen 
Signed-off-by: Jitao Shi 
---
 drivers/gpu/drm/mediatek/mtk_dpi.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_dpi.c 
b/drivers/gpu/drm/mediatek/mtk_dpi.c
index b05f900d9322..0b427ad0cd9b 100644
--- a/drivers/gpu/drm/mediatek/mtk_dpi.c
+++ b/drivers/gpu/drm/mediatek/mtk_dpi.c
@@ -120,6 +120,7 @@ struct mtk_dpi_yc_limit {
 struct mtk_dpi_conf {
unsigned int (*cal_factor)(int clock);
u32 reg_h_fre_con;
+   u32 max_clock_khz;
bool edge_sel_en;
 };
 
@@ -557,9 +558,23 @@ static void mtk_dpi_bridge_enable(struct drm_bridge 
*bridge)
mtk_dpi_set_display_mode(dpi, >mode);
 }
 
+static enum drm_mode_status
+mtk_dpi_bridge_mode_valid(struct drm_bridge *bridge,
+ const struct drm_display_info *info,
+ const struct drm_display_mode *mode)
+{
+   struct mtk_dpi *dpi = bridge_to_dpi(bridge);
+
+   if (dpi->conf->max_clock_khz && mode->clock > dpi->conf->max_clock_khz)
+   return MODE_CLOCK_HIGH;
+
+   return MODE_OK;
+}
+
 static const struct drm_bridge_funcs mtk_dpi_bridge_funcs = {
.attach = mtk_dpi_bridge_attach,
.mode_set = mtk_dpi_bridge_mode_set,
+   .mode_valid = mtk_dpi_bridge_mode_valid,
.disable = mtk_dpi_bridge_disable,
.enable = mtk_dpi_bridge_enable,
 };
@@ -668,17 +683,20 @@ static unsigned int mt8183_calculate_factor(int clock)
 static const struct mtk_dpi_conf mt8173_conf = {
.cal_factor = mt8173_calculate_factor,
.reg_h_fre_con = 0xe0,
+   .max_clock_khz = 30,
 };
 
 static const struct mtk_dpi_conf mt2701_conf = {
.cal_factor = mt2701_calculate_factor,
.reg_h_fre_con = 0xb0,
.edge_sel_en = true,
+   .max_clock_khz = 15,
 };
 
 static const struct mtk_dpi_conf mt8183_conf = {
.cal_factor = mt8183_calculate_factor,
.reg_h_fre_con = 0xe0,
+   .max_clock_khz = 10,
 };
 
 static int mtk_dpi_probe(struct platform_device *pdev)
-- 
2.18.0

[v5,2/2] drm/mediatek: mtk_dpi: Add dpi config for mt8192

2021-03-17 Thread Rex-BC Chen

add dpi config setting and compatible for MT8192

Signed-off-by: Rex-BC Chen 
Signed-off-by: Jitao Shi 
---
 drivers/gpu/drm/mediatek/mtk_dpi.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_dpi.c 
b/drivers/gpu/drm/mediatek/mtk_dpi.c
index 0b427ad0cd9b..88ad58ca4115 100644
--- a/drivers/gpu/drm/mediatek/mtk_dpi.c
+++ b/drivers/gpu/drm/mediatek/mtk_dpi.c
@@ -699,6 +699,12 @@ static const struct mtk_dpi_conf mt8183_conf = {
.max_clock_khz = 10,
 };
 
+static const struct mtk_dpi_conf mt8192_conf = {
+   .cal_factor = mt8183_calculate_factor,
+   .reg_h_fre_con = 0xe0,
+   .max_clock_khz = 15,
+};
+
 static int mtk_dpi_probe(struct platform_device *pdev)
 {
struct device *dev = >dev;
@@ -819,6 +825,9 @@ static const struct of_device_id mtk_dpi_of_ids[] = {
{ .compatible = "mediatek,mt8183-dpi",
  .data = _conf,
},
+   { .compatible = "mediatek,mt8192-dpi",
+ .data = _conf,
+   },
{ },
 };
 
-- 
2.18.0

[v5,0/2] Add check for max clock rate in mode_valid

2021-03-17 Thread Rex-BC Chen

Changes in v5:
  fix build error

Changes in v4:
  add Author and add patch description

Rex-BC Chen (2):
  drm/mediatek: mtk_dpi: Add check for max clock rate in mode_valid
  drm/mediatek: mtk_dpi: Add dpi config for mt8192

 drivers/gpu/drm/mediatek/mtk_dpi.c | 27 +++
 1 file changed, 27 insertions(+)

-- 
2.18.0

Re: Errant readings on LM81 with T2080 SoC

2021-03-17 Thread Wolfram Sang


> The polling code is from pre-git times. Like 2005 and earlier.
> I'd say it is about time to get rid of it. Any out-of-tree users
> had more than 15 years to upstream their code, after all.

Parts of the polling mode might be interesting for the atomic_xfer mode
maybe? Which is not implemented yet.



signature.asc
Description: PGP signature

Re: [PATCH] mm/gup: check page posion status for coredump.

2021-03-17 Thread Aili Yao

On Thu, 18 Mar 2021 04:46:00 +
Matthew Wilcox  wrote:

> On Wed, Mar 17, 2021 at 10:12:02AM +0100, David Hildenbrand wrote:
> > > + if (IS_ENABLED(CONFIG_MEMORY_FAILURE) && ret == 1) {
> > > + if (unlikely(PageHuge(page) && 
> > > PageHWPoison(compound_head(page))))
> > > + ret = 0;
> > > + else if (unlikely(PageHWPoison(page)))
> > > + ret = 0;
> > > + }  
> > 
> > I wonder if a simple
> > 
> > if (PageHWPoison(compound_head(page)))
> > ret = 0;
> > 
> > won't suffice. But I guess the "issue" is compound pages that are not huge
> > pages or transparent huge pages.  
> 
> THPs don't set the HWPoison bit on the head page.
> 
> https://lore.kernel.org/linux-mm/20210316140947.ga3...@casper.infradead.org/
> 
> (and PAGEFLAG(HWPoison, hwpoison, PF_ANY))
> 
> By the way,
> 
> #ifdef CONFIG_MEMORY_FAILURE
> PAGEFLAG(HWPoison, hwpoison, PF_ANY)
> TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
> #define __PG_HWPOISON (1UL << PG_hwpoison)
> extern bool take_page_off_buddy(struct page *page);
> #else
> PAGEFLAG_FALSE(HWPoison)
> #define __PG_HWPOISON 0
> #endif
> 
> so there's no need for this 
>   if (IS_ENABLED(CONFIG_MEMORY_FAILURE)
> check, as it simply turns into
> 
>   if (PageHuge(page) && 0)
>   else if (0)
> 
> and the compiler can optimise it all away.

Yes, you are right; I will modify this later.
Thanks for the correction.

-- 
Thanks!
Aili Yao

mmotm 2021-03-17-22-24 uploaded

2021-03-17 Thread akpm

The mm-of-the-moment snapshot 2021-03-17-22-24 has been uploaded to

   https://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

https://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (5.x
or 5.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
https://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE-yyyy-mm-dd-hh-mm-ss.  Both contain the string yyyy-mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

https://github.com/hnaz/linux-mm

The directory https://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is also available at

https://github.com/hnaz/linux-mm



This mmotm tree contains the following patches against 5.12-rc3:
(patches marked "*" will be included in linux-next)

  origin.patch
* 
hugetlb_cgroup-fix-imbalanced-css_get-and-css_put-pair-for-shared-mappings.patch
* 
hugetlb_cgroup-fix-imbalanced-css_get-and-css_put-pair-for-shared-mappings-v3.patch
* kasan-fix-per-page-tags-for-non-page_alloc-pages.patch
* mm-mmu_notifiers-esnure-range_end-is-paired-with-range_start.patch
* selftests-vm-fix-out-of-tree-build.patch
* z3fold-prevent-reclaim-free-race-for-headless-pages.patch
* squashfs-fix-inode-lookup-sanity-checks.patch
* squashfs-fix-xattr-id-and-id-lookup-sanity-checks.patch
* ia64-mca-allocate-early-mca-with-gfp_atomic.patch
* ia64-fix-format-strings-for-err_inject.patch
* gcov-fix-clang-11-support.patch
* kfence-make-compatible-with-kmemleak.patch
* memblock-fix-section-mismatch-warning-again.patch
* mm-slub-fixing-backtrace-of-objects-because-of-redzone-adjustment.patch
* mm-slub-add-support-for-free-path-information-of-an-object.patch
* arm-print-alloc-free-paths-for-address-in-registers.patch
* proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
* proc-kpageflags-do-not-use-uninitialized-struct-pages.patch
* module-remove-duplicate-include-in-arch-ia64-kernel-heads.patch
* ia64-kernel-few-typos-fixed-in-the-file-fsyss.patch
* ia64-include-asm-minor-typo-fixes-in-the-file-pgtableh.patch
* sparse-can-do-constant-folding-of-__builtin_bswap.patch
* scripts-spellingtxt-add-overlfow.patch
* scripts-spellingtxt-add-diabled-typo.patch
* scripts-spellingtxt-add-overflw.patch
* sh-remove-duplicate-include-in-tlbh.patch
* ocfs2-replace-define_simple_attribute-with-define_debugfs_attribute.patch
* ocfs2-clear-links-count-in-ocfs2_mknod-if-an-error-occurs.patch
* ocfs2-fix-ocfs2-corrupt-when-iputting-an-inode.patch
* watchdog-rename-__touch_watchdog-to-a-better-descriptive-name.patch
* watchdog-explicitly-update-timestamp-when-reporting-softlockup.patch
* watchdog-softlockup-report-the-overall-time-of-softlockups.patch
* watchdog-softlockup-remove-logic-that-tried-to-prevent-repeated-reports.patch
* watchdog-fix-barriers-when-printing-backtraces-from-all-cpus.patch
* watchdog-cleanup-handling-of-false-positives.patch
  mm.patch
* mm-slab-fix-spelling-mistake-disired-desired.patch
* 
mm-slub-enable-slub_debug-static-key-when-creating-cache-with-explicit-debug-flags.patch
* selftests-add-a-kselftest-for-slub-debugging-functionality.patch
* slub-remove-resiliency_test-function.patch
* mm-page_owner-record-the-timestamp-of-all-pages-during-free.patch
* mm-provide-filemap_range_needs_writeback-helper.patch
* mm-use-filemap_range_needs_writeback-for-o_direct-reads.patch
* iomap-use-filemap_range_needs_writeback-for-o_direct-reads.patch
* mm-filemap-use-filemap_read_page-in-filemap_fault.patch
* mm-filemap-drop-check-for-truncated-page-after-i-o.patch
* mm-page-writeback-simplify-memcg-handling-in-test_clear_page_writeback.patch
* mm-introduce-and-use-mapping_empty.patch
* mm-stop-accounting-shadow-entries.patch
* dax-account-dax-entries-as-nrpages.patch
* mm-remove-nrexceptional-from-inode.patch
* mm-move-page_mapping_file-to-pagemaph.patch
* mm-msync-exit-early-when-the-flags-is-an-ms_async-and-start-vm_start.patch
* mm-gup-add-compound-page-list-iterator.patch
* mm-gup-decrement-head-page-once-for-group-of-subpages.patch
* mm-gup-add-a-range-variant-of-unpin_user_pages_dirty_lock.patch
* rdma-umem-batch-page-unpin-in-__ib_umem_release.patch
*

[PATCH 0/5] Handle seccomp notification preemption

2021-03-17 Thread Sargun Dhillon



This patchset addresses a race condition we've dealt with recently with
seccomp. Specifically programs interrupting syscalls while they're in
progress. This was exacerbated by Golang's recent adoption of "async
preemption", in which they try to interrupt any syscall that's been
running for more than 10ms during GC. During certain syscalls, it's
non-trivial to write them in a reentrant manner in userspace (mount).

This has a couple semantic changes, and relaxes a check on seccomp_data, and
changes the semantics with ordering of how addfd and notification replies
in the supervisor are handled.

It also follows up on the original proposal from Tycho[2] to allow
for adding an FD and returning that value atomically.

Changes since v1[1]:
 * Fix some documentation
 * Add Rata's patches to allow for direct return from addfd

[1]: https://lore.kernel.org/lkml/20210220090502.7202-1-sar...@sargun.me/
[2]: https://lore.kernel.org/lkml/202012011322.26DCBC64F2@keescook/

Rodrigo Campos (1):
  seccomp: Support atomic "addfd + send reply"

Sargun Dhillon (4):
  seccomp: Refactor notification handler to prepare for new semantics
  seccomp: Add wait_killable semantic to seccomp user notifier
  selftests/seccomp: Add test for wait killable notifier
  selftests/seccomp: Add test for atomic addfd+send

 .../userspace-api/seccomp_filter.rst  |  15 +-
 include/uapi/linux/seccomp.h  |   4 +
 kernel/seccomp.c  | 129 ++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 102 ++
 4 files changed, 220 insertions(+), 30 deletions(-)

-- 
2.25.1

[PATCH 1/5] seccomp: Refactor notification handler to prepare for new semantics

2021-03-17 Thread Sargun Dhillon

This refactors the user notification code to have a do / while loop around
the completion condition. This makes a small change in semantics: previously
we ignored addfd calls upon wakeup if the notification had already been
responded to, whereas with this change we check for any outstanding
addfd calls prior to returning to userspace.

Signed-off-by: Sargun Dhillon 
---
 kernel/seccomp.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 952dc1c90229..b48fb0a29455 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1098,28 +1098,30 @@ static int seccomp_do_user_notification(int 
this_syscall,
 
up(>notif->request);
wake_up_poll(>wqh, EPOLLIN | EPOLLRDNORM);
-   mutex_unlock(>notify_lock);
 
/*
 * This is where we wait for a reply from userspace.
 */
-wait:
-   err = wait_for_completion_interruptible();
-   mutex_lock(>notify_lock);
-   if (err == 0) {
-   /* Check if we were woken up by a addfd message */
+   do {
+   mutex_unlock(>notify_lock);
+   err = wait_for_completion_interruptible();
+   mutex_lock(>notify_lock);
+   if (err != 0)
+   goto interrupted;
+
addfd = list_first_entry_or_null(,
 struct seccomp_kaddfd, list);
-   if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) {
+   /* Check if we were woken up by a addfd message */
+   if (addfd)
seccomp_handle_addfd(addfd);
-   mutex_unlock(>notify_lock);
-   goto wait;
-   }
-   ret = n.val;
-   err = n.error;
-   flags = n.flags;
-   }
 
+   }  while (n.state != SECCOMP_NOTIFY_REPLIED);
+
+   ret = n.val;
+   err = n.error;
+   flags = n.flags;
+
+interrupted:
/* If there were any pending addfd calls, clear them out */
list_for_each_entry_safe(addfd, tmp, , list) {
/* The process went away before we got a chance to handle it */
-- 
2.25.1

[PATCH 2/5] seccomp: Add wait_killable semantic to seccomp user notifier

2021-03-17 Thread Sargun Dhillon

The user notifier feature allows for filtering of seccomp notifications in
userspace. While the user notifier is handling the syscall, the notifying
process can be preempted, thus ending the notification. This has become a
growing problem, as Golang has adopted signal based async preemption[1]. In
this, it will preempt every 10ms, thus leaving the supervisor less than
10ms to respond to a given notification. If the syscall requires I/O (mount,
connect) on behalf of the process, it can easily take 10ms.

This allows the supervisor to set a flag that moves the process into a
state where it is only killable by terminating signals as opposed to all
signals. The process can still be terminated before the supervisor receives
the notification.

Signed-off-by: Sargun Dhillon 

[1]: https://github.com/golang/go/issues/24543
---
 .../userspace-api/seccomp_filter.rst  | 15 +++---
 include/uapi/linux/seccomp.h  |  3 ++
 kernel/seccomp.c  | 54 ---
 3 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/Documentation/userspace-api/seccomp_filter.rst 
b/Documentation/userspace-api/seccomp_filter.rst
index bd9165241b6c..75de9400d56a 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -251,13 +251,14 @@ seccomp notification fd to receive a ``struct 
seccomp_notif``, which contains
 five members: the input length of the structure, a unique-per-filter ``id``,
 the ``pid`` of the task which triggered this request (which may be 0 if the
 task is in a pid ns not visible from the listener's pid namespace), a ``flags``
-member which for now only has ``SECCOMP_NOTIF_FLAG_SIGNALED``, representing
-whether or not the notification is a result of a non-fatal signal, and the
-``data`` passed to seccomp. Userspace can then make a decision based on this
-information about what to do, and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a
-response, indicating what should be returned to userspace. The ``id`` member of
-``struct seccomp_notif_resp`` should be the same ``id`` as in ``struct
-seccomp_notif``.
+member and the ``data`` passed to seccomp. Upon receiving the notification,
+the ``SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE`` flag may be set, which will
+try to put the task into a state where it will only respond to fatal signals.
+
+Userspace can then make a decision based on this information about what to do,
+and ``ioctl(SECCOMP_IOCTL_NOTIF_SEND)`` a response, indicating what should be
+returned to userspace. The ``id`` member of ``struct seccomp_notif_resp`` 
should
+be the same ``id`` as in ``struct seccomp_notif``.
 
 It is worth noting that ``struct seccomp_data`` contains the values of register
 arguments to the syscall, but does not contain pointers to memory. The task's
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 6ba18b82a02e..bc7fc8b04749 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -70,6 +70,9 @@ struct seccomp_notif_sizes {
__u16 seccomp_data;
 };
 
+/* Valid flags for struct seccomp_notif */
+#define SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE  (1UL << 0) /* Prevent task from 
being interrupted */
+
 struct seccomp_notif {
__u64 id;
__u32 pid;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b48fb0a29455..1a38fb1de053 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -97,6 +97,8 @@ struct seccomp_knotif {
 
/* outstanding addfd requests */
struct list_head addfd;
+
+   bool wait_killable;
 };
 
 /**
@@ -1073,6 +1075,11 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd 
*addfd)
complete(>completion);
 }
 
+static bool notification_interruptible(struct seccomp_knotif *n)
+{
+   return !(n->state == SECCOMP_NOTIFY_SENT && n->wait_killable);
+}
+
 static int seccomp_do_user_notification(int this_syscall,
struct seccomp_filter *match,
const struct seccomp_data *sd)
@@ -1082,6 +1089,7 @@ static int seccomp_do_user_notification(int this_syscall,
long ret = 0;
struct seccomp_knotif n = {};
struct seccomp_kaddfd *addfd, *tmp;
+   bool interruptible = true;
 
mutex_lock(>notify_lock);
err = -ENOSYS;
@@ -1103,11 +,31 @@ static int seccomp_do_user_notification(int 
this_syscall,
 * This is where we wait for a reply from userspace.
 */
do {
+   interruptible = notification_interruptible();
+
mutex_unlock(>notify_lock);
-   err = wait_for_completion_interruptible();
+   if (interruptible)
+   err = wait_for_completion_interruptible();
+   else
+   err = wait_for_completion_killable();
mutex_lock(>notify_lock);
-   if (err != 0)
+
+   if (err != 0) {
+   /*
+

[PATCH 3/5] selftests/seccomp: Add test for wait killable notifier

2021-03-17 Thread Sargun Dhillon

This adds a test for the positive case of the wait killable notifier,
in testing that when the feature is activated the process acts as
expected -- in not terminating on a non-fatal signal, and instead
queueing it up. There is already a test case for normal handlers
and preemption.

Signed-off-by: Sargun Dhillon 
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 64 +++
 1 file changed, 64 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 26c72f2b61b1..48ad53030d5a 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -235,6 +235,10 @@ struct seccomp_notif_addfd {
 };
 #endif
 
+#ifndef SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE
+#define SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE  (1UL << 0) /* Prevent task from 
being interrupted */
+#endif
+
 struct seccomp_notif_addfd_small {
__u64 id;
char weird[4];
@@ -4139,6 +4143,66 @@ TEST(user_notification_addfd_rlimit)
close(memfd);
 }
 
+TEST(user_notification_signal_wait_killable)
+{
+   pid_t pid;
+   long ret;
+   int status, listener, sk_pair[2];
+   struct seccomp_notif req = {
+   .flags = SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE,
+   };
+   struct seccomp_notif_resp resp = {};
+   char c;
+
+   ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+   ASSERT_EQ(0, ret) {
+   TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+   }
+
+   ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+   ASSERT_EQ(fcntl(sk_pair[0], F_SETFL, O_NONBLOCK), 0);
+
+   listener = user_notif_syscall(__NR_gettid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+   ASSERT_GE(listener, 0);
+
+   pid = fork();
+   ASSERT_GE(pid, 0);
+
+   if (pid == 0) {
+   close(sk_pair[0]);
+   handled = sk_pair[1];
+   if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
+   perror("signal");
+   exit(1);
+   }
+
+   ret = syscall(__NR_gettid);
+   exit(!(ret == 42));
+   }
+   close(sk_pair[1]);
+
+   EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, ), 0);
+   EXPECT_EQ(kill(pid, SIGUSR1), 0);
+   /* Make sure we didn't get a signal */
+   EXPECT_EQ(read(sk_pair[0], , 1), -1);
+   /* Make sure the notification is still alive */
+   EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, ), 0);
+
+   resp.id = req.id;
+   resp.error = 0;
+   resp.val = 42;
+
+   EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, ), 0);
+
+   EXPECT_EQ(waitpid(pid, , 0), pid);
+   EXPECT_EQ(true, WIFEXITED(status));
+   EXPECT_EQ(0, WEXITSTATUS(status));
+   /* Check we eventually received the signal */
+   EXPECT_EQ(read(sk_pair[0], , 1), 1);
+}
+
+
 /*
  * TODO:
  * - expand NNP testing
-- 
2.25.1

[PATCH 4/5] seccomp: Support atomic "addfd + send reply"

2021-03-17 Thread Sargun Dhillon

From: Rodrigo Campos 

Alban Crequy reported a race condition userspace faces when we want to
add some fds and make the syscall return them[1] using seccomp notify.

The problem is that currently two different ioctl() calls are needed by
the process handling the syscalls (agent) for another userspace process
(target): SECCOMP_IOCTL_NOTIF_ADDFD to allocate the fd and
SECCOMP_IOCTL_NOTIF_SEND to return that value. Therefore, it is possible
for the agent to do the first ioctl to add a file descriptor but the
target is interrupted (EINTR) before the agent does the second ioctl()
call.

Other patches in this series add a way to block signals when a syscall
is put to wait by seccomp. However, that might be a big hammer for some
cases, as the golang runtime uses SIGURG to interrupt threads for GC
collection.  Sometimes we just don't want to interfere with the GC, for
example, and just either add the fd and return it or fail the syscall.
This must happen without inadvertently leaking fds into the target process.

This patch adds a flag to the ADDFD ioctl() so it adds the fd and
returns that value atomically to the target program, as suggested by
Kees Cook[2]. This is done by simply allowing
seccomp_do_user_notification() to add the fd and return it in this case.
Therefore, in this case the target wakes up from the wait in
seccomp_do_user_notification() either to interrupt the syscall or to add
the fd and return it.

This "allocate an fd and return" functionality is useful for syscalls
that return a file descriptor only, like connect(2). Other syscalls that
return a file descriptor but not as return value (or return more than
one fd), like socketpair(), pipe(), recvmsg with SCM_RIGHTs, will not
work with this flag. The way to go to emulate those in cases where a
signal might interrupt is to use the functionality to block signals.

The struct seccomp_notif_resp, used when doing SECCOMP_IOCTL_NOTIF_SEND
ioctl() to send a response to the target, has three more fields that we
don't allow to set when doing the addfd ioctl() to also return. The
reasons to disallow each field are:
 * val: This will be set to the new allocated fd. No point taking it
   from userspace in this case.
 * error: If this is non-zero, the value is ignored. Therefore,
   it is pointless in this case as we want to return the value.
 * flags: The only flag is to let userspace continue to execute the
   syscall. This seems pointless, as we want the syscall to return the
   allocated fd.

This is why those fields are not possible to set when using this new
flag.

[1]: 
https://lore.kernel.org/lkml/cadzs7q4sw71inhmv8eooxhukjmorpzf7thraxzyddtzsxta...@mail.gmail.com/
[2]: https://lore.kernel.org/lkml/202012011322.26DCBC64F2@keescook/

Signed-off-by: Rodrigo Campos 
Signed-off-by: Sargun Dhillon 
---
 include/uapi/linux/seccomp.h |  1 +
 kernel/seccomp.c | 49 +---
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index bc7fc8b04749..95dd9bab73c6 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -118,6 +118,7 @@ struct seccomp_notif_resp {
 
 /* valid flags for seccomp_notif_addfd */
 #define SECCOMP_ADDFD_FLAG_SETFD   (1UL << 0) /* Specify remote fd */
+#define SECCOMP_ADDFD_FLAG_SEND(1UL << 1) /* Addfd and return 
it, atomically */
 
 /**
  * struct seccomp_notif_addfd
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1a38fb1de053..66b3ff58469a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -109,6 +109,7 @@ struct seccomp_knotif {
  *  installing process should allocate the fd as normal.
  * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC
  * is allowed.
+ * @ioctl_flags: The flags used for the seccomp_addfd ioctl.
  * @ret: The return value of the installing process. It is set to the fd num
  *   upon success (>= 0).
  * @completion: Indicates that the installing process has completed fd
@@ -120,6 +121,7 @@ struct seccomp_kaddfd {
struct file *file;
int fd;
unsigned int flags;
+   __u32 ioctl_flags;
 
/* To only be set on reply */
int ret;
@@ -1064,14 +1066,35 @@ static u64 seccomp_next_notify_id(struct seccomp_filter 
*filter)
return filter->notif->next_id++;
 }
 
-static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd)
+static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct 
seccomp_knotif *n)
 {
+   int fd;
+
/*
 * Remove the notification, and reset the list pointers, indicating
 * that it has been handled.
 */
list_del_init(>list);
-   addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+   fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
+
+   addfd->ret = fd;
+
+   if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
+   /* If we fail reset and return an error to

[PATCH 5/5] selftests/seccomp: Add test for atomic addfd+send

2021-03-17 Thread Sargun Dhillon

This just adds a test to verify that when using the newly introduced flag
to ADDFD, a valid fd is added and returned as the syscall result.

Signed-off-by: Rodrigo Campos 
Signed-off-by: Sargun Dhillon 
---
 tools/testing/selftests/seccomp/seccomp_bpf.c | 38 +++
 1 file changed, 38 insertions(+)

diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 48ad53030d5a..f7242294a2d5 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -239,6 +239,10 @@ struct seccomp_notif_addfd {
 #define SECCOMP_USER_NOTIF_FLAG_WAIT_KILLABLE  (1UL << 0) /* Prevent task from 
being interrupted */
 #endif
 
+#ifndef SECCOMP_ADDFD_FLAG_SEND
+#define SECCOMP_ADDFD_FLAG_SEND(1UL << 1) /* Addfd and return it, 
atomically */
+#endif
+
 struct seccomp_notif_addfd_small {
__u64 id;
char weird[4];
@@ -3980,8 +3984,14 @@ TEST(user_notification_addfd)
ASSERT_GE(pid, 0);
 
if (pid == 0) {
+   /* fds will be added and this value is expected */
if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
exit(1);
+
+   /* Atomic addfd+send is received here. Check it is a valid fd */
+   if (fcntl(syscall(__NR_getppid), F_GETFD) == -1)
+   exit(1);
+
exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
}
 
@@ -4064,6 +4074,30 @@ TEST(user_notification_addfd)
ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, ), 0);
ASSERT_EQ(addfd.id, req.id);
 
+   /* Verify we can do an atomic addfd and send */
+   addfd.newfd = 0;
+   addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
+   fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, );
+
+   /* Child has fds 0-6 and 42 used, we expect the lower fd available: 7 */
+   EXPECT_EQ(fd, 7);
+   EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
+
+   /*
+* This sets the ID of the ADD FD to the last request plus 1. The
+* notification ID increments 1 per notification.
+*/
+   addfd.id = req.id + 1;
+
+   /* This spins until the underlying notification is generated */
+   while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, ) != -1 &&
+  errno != -EINPROGRESS)
+   nanosleep(, NULL);
+
+   memset(, 0, sizeof(req));
+   ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, ), 0);
+   ASSERT_EQ(addfd.id, req.id);
+
resp.id = req.id;
resp.error = 0;
resp.val = USER_NOTIF_MAGIC;
@@ -4124,6 +4158,10 @@ TEST(user_notification_addfd_rlimit)
EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, ), -1);
EXPECT_EQ(errno, EMFILE);
 
+   addfd.flags = SECCOMP_ADDFD_FLAG_SEND;
+   EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, ), -1);
+   EXPECT_EQ(errno, EMFILE);
+
addfd.newfd = 100;
addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, ), -1);
-- 
2.25.1

[tip:locking/urgent] BUILD SUCCESS bee645788e07eea63055d261d2884ea45c2ba857

2021-03-17 Thread kernel test robot

 allnoconfig
nds32   defconfig
nios2allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
s390 allmodconfig
parisc   allyesconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
i386   tinyconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a006-20210317
x86_64   randconfig-a001-20210317
x86_64   randconfig-a005-20210317
x86_64   randconfig-a004-20210317
x86_64   randconfig-a003-20210317
x86_64   randconfig-a002-20210317
i386 randconfig-a001-20210317
i386 randconfig-a005-20210317
i386 randconfig-a002-20210317
i386 randconfig-a003-20210317
i386 randconfig-a004-20210317
i386 randconfig-a006-20210317
i386 randconfig-a001-20210318
i386 randconfig-a005-20210318
i386 randconfig-a003-20210318
i386 randconfig-a002-20210318
i386 randconfig-a006-20210318
i386 randconfig-a004-20210318
i386 randconfig-a013-20210317
i386 randconfig-a016-20210317
i386 randconfig-a011-20210317
i386 randconfig-a012-20210317
i386 randconfig-a015-20210317
i386 randconfig-a014-20210317
riscvnommu_k210_defconfig
riscvnommu_virt_defconfig
riscv   defconfig
riscv  rv32_defconfig
x86_64rhel-7.6-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a011-20210317
x86_64   randconfig-a016-20210317
x86_64   randconfig-a013-20210317
x86_64   randconfig-a014-20210317
x86_64   randconfig-a015-20210317
x86_64   randconfig-a012-20210317

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org

[PATCH 0/5] Handle seccomp notification preemption

2021-03-17 Thread Sargun Dhillon



This patchset addresses a race condition we've dealt with recently with
seccomp. Specifically programs interrupting syscalls while they're in
progress. This was exacerbated by Golang's recent adoption of "async
preemption", in which they try to interrupt any syscall that's been
running for more than 10ms during GC. During certain syscalls, it's
non-trivial to write them in a reentrant manner in userspace (mount).

This has a couple semantic changes, and relaxes a check on seccomp_data, and
changes the semantics with ordering of how addfd and notification replies
in the supervisor are handled.

It also follows up on the original proposal from Tycho[2] to allow
for adding an FD and returning that value atomically.

Changes since v1[1]:
 * Fix some documentation
 * Add Rata's patches to allow for direct return from addfd

[1]: https://lore.kernel.org/lkml/20210220090502.7202-1-sar...@sargun.me/
[2]: https://lore.kernel.org/lkml/202012011322.26DCBC64F2@keescook/

Rodrigo Campos (1):
  seccomp: Support atomic "addfd + send reply"

Sargun Dhillon (4):
  seccomp: Refactor notification handler to prepare for new semantics
  seccomp: Add wait_killable semantic to seccomp user notifier
  selftests/seccomp: Add test for wait killable notifier
  selftests/seccomp: Add test for atomic addfd+send

 .../userspace-api/seccomp_filter.rst  |  15 +-
 include/uapi/linux/seccomp.h  |   4 +
 kernel/seccomp.c  | 129 ++
 tools/testing/selftests/seccomp/seccomp_bpf.c | 102 ++
 4 files changed, 220 insertions(+), 30 deletions(-)

-- 
2.25.1

[PATCH 2/2] drivers/clocksource/mediatek: Ack and disable interrupts on shutdown

2021-03-17 Thread Evan Benn

set_state_shutdown is called during system suspend after interrupts have
been disabled. If the timer has fired in the meantime, there will be
a pending IRQ. So we ack that now and disable the timer. Without this,
ARM Trusted Firmware will abort the suspend due to the pending
interrupt.

Now always disable the IRQ in state transitions, and re-enable in
set_periodic and next_event.

Signed-off-by: Evan Benn 
---

 drivers/clocksource/timer-mediatek-mt6577.c | 49 +
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/drivers/clocksource/timer-mediatek-mt6577.c 
b/drivers/clocksource/timer-mediatek-mt6577.c
index 9e5241d1876d..44598121585c 100644
--- a/drivers/clocksource/timer-mediatek-mt6577.c
+++ b/drivers/clocksource/timer-mediatek-mt6577.c
@@ -54,13 +54,33 @@ static u64 notrace mtk_gpt_read_sched_clock(void)
return readl_relaxed(gpt_sched_reg);
 }
 
+static void mtk_gpt_disable_ack_interrupts(struct timer_of *to, u8 timer)
+{
+   u32 val;
+
+   /* Disable interrupts */
+   val = readl(timer_of_base(to) + GPT_IRQ_EN_REG);
+   writel(val & ~GPT_IRQ_ENABLE(timer), timer_of_base(to) +
+  GPT_IRQ_EN_REG);
+
+   /* Ack interrupts */
+   writel(GPT_IRQ_ACK(timer), timer_of_base(to) + GPT_IRQ_ACK_REG);
+}
+
 static void mtk_gpt_clkevt_time_stop(struct timer_of *to, u8 timer)
 {
u32 val;
 
+   /* Disable timer */
val = readl(timer_of_base(to) + GPT_CTRL_REG(timer));
writel(val & ~GPT_CTRL_ENABLE, timer_of_base(to) +
   GPT_CTRL_REG(timer));
+
+   /* This may be called with interrupts disabled,
+* so we need to ack any interrupt that is pending
+* Or for example ATF will prevent a suspend from completing.
+*/
+   mtk_gpt_disable_ack_interrupts(to, timer);
 }
 
 static void mtk_gpt_clkevt_time_setup(struct timer_of *to,
@@ -74,8 +94,10 @@ static void mtk_gpt_clkevt_time_start(struct timer_of *to,
 {
u32 val;
 
-   /* Acknowledge interrupt */
-   writel(GPT_IRQ_ACK(timer), timer_of_base(to) + GPT_IRQ_ACK_REG);
+   /* Enable interrupts */
+   val = readl(timer_of_base(to) + GPT_IRQ_EN_REG);
+   writel(val | GPT_IRQ_ENABLE(timer),
+  timer_of_base(to) + GPT_IRQ_EN_REG);
 
val = readl(timer_of_base(to) + GPT_CTRL_REG(timer));
 
@@ -148,21 +170,6 @@ __init mtk_gpt_setup(struct timer_of *to, u8 timer, u8 
option)
   timer_of_base(to) + GPT_CTRL_REG(timer));
 }
 
-static void mtk_gpt_enable_irq(struct timer_of *to, u8 timer)
-{
-   u32 val;
-
-   /* Disable all interrupts */
-   writel(0x0, timer_of_base(to) + GPT_IRQ_EN_REG);
-
-   /* Acknowledge all spurious pending interrupts */
-   writel(0x3f, timer_of_base(to) + GPT_IRQ_ACK_REG);
-
-   val = readl(timer_of_base(to) + GPT_IRQ_EN_REG);
-   writel(val | GPT_IRQ_ENABLE(timer),
-  timer_of_base(to) + GPT_IRQ_EN_REG);
-}
-
 static struct timer_of to = {
.flags = TIMER_OF_IRQ | TIMER_OF_BASE | TIMER_OF_CLOCK,
 
@@ -193,6 +200,12 @@ static int __init mtk_gpt_init(struct device_node *node)
if (ret)
return ret;
 
+   /* In case the firmware left the interrupts enabled
+* disable and ack those now
+*/
+   mtk_gpt_disable_ack_interrupts(, TIMER_CLK_SRC);
+   mtk_gpt_disable_ack_interrupts(, TIMER_CLK_EVT);
+
/* Configure clock source */
mtk_gpt_setup(, TIMER_CLK_SRC, GPT_CTRL_OP_FREERUN);
clocksource_mmio_init(timer_of_base() + GPT_CNT_REG(TIMER_CLK_SRC),
@@ -206,8 +219,6 @@ static int __init mtk_gpt_init(struct device_node *node)
clockevents_config_and_register(, timer_of_rate(),
TIMER_SYNC_TICKS, 0x);
 
-   mtk_gpt_enable_irq(, TIMER_CLK_EVT);
-
return 0;
 }
 TIMER_OF_DECLARE(mtk_mt6577, "mediatek,mt6577-timer", mtk_gpt_init);
-- 
2.31.0.rc2.261.g7f71774620-goog

[PATCH 1/2] drivers/clocksource/mediatek: Split mediatek drivers into 2 files

2021-03-17 Thread Evan Benn

mtk_gpt and mtk_syst drivers for mt6577 and mt6765 devices were not
sharing any code. So split them into separate files.

Signed-off-by: Evan Benn 
---

 arch/arm/mach-mediatek/Kconfig|   3 +-
 arch/arm64/Kconfig.platforms  |   3 +-
 drivers/clocksource/Kconfig   |  13 +-
 drivers/clocksource/Makefile  |   3 +-
 ...mer-mediatek.c => timer-mediatek-mt6577.c} | 100 -
 drivers/clocksource/timer-mediatek-mt6765.c   | 135 ++
 6 files changed, 151 insertions(+), 106 deletions(-)
 rename drivers/clocksource/{timer-mediatek.c => timer-mediatek-mt6577.c} (69%)
 create mode 100644 drivers/clocksource/timer-mediatek-mt6765.c

diff --git a/arch/arm/mach-mediatek/Kconfig b/arch/arm/mach-mediatek/Kconfig
index 9e0f592d87d8..8686f992c4b6 100644
--- a/arch/arm/mach-mediatek/Kconfig
+++ b/arch/arm/mach-mediatek/Kconfig
@@ -4,7 +4,8 @@ menuconfig ARCH_MEDIATEK
depends on ARCH_MULTI_V7
select ARM_GIC
select PINCTRL
-   select MTK_TIMER
+   select TIMER_MEDIATEK_MT6577
+   select TIMER_MEDIATEK_MT6765
select MFD_SYSCON
help
  Support for Mediatek MT65xx & MT81xx SoCs
diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index cdfd5fed457f..d4752375ab0b 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -161,7 +161,8 @@ config ARCH_MEDIATEK
bool "MediaTek SoC Family"
select ARM_GIC
select PINCTRL
-   select MTK_TIMER
+   select TIMER_MEDIATEK_MT6577
+   select TIMER_MEDIATEK_MT6765
help
  This enables support for MediaTek MT27xx, MT65xx, MT76xx
  & MT81xx ARMv8 SoCs
diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig
index 39aa21d01e05..d697c799284e 100644
--- a/drivers/clocksource/Kconfig
+++ b/drivers/clocksource/Kconfig
@@ -438,13 +438,20 @@ config OXNAS_RPS_TIMER
 config SYS_SUPPORTS_SH_CMT
bool
 
-config MTK_TIMER
-   bool "Mediatek timer driver" if COMPILE_TEST
+config TIMER_MEDIATEK_MT6577
+   bool "Mediatek mt6577 timer driver" if COMPILE_TEST
depends on HAS_IOMEM
select TIMER_OF
select CLKSRC_MMIO
help
- Support for Mediatek timer driver.
+ Enables clocksource and clockevent driver for Mediatek mt6577 timer.
+
+config TIMER_MEDIATEK_MT6765
+   bool "Mediatek mt6765 timer driver" if COMPILE_TEST
+   depends on HAS_IOMEM
+   select TIMER_OF
+   help
+ Enables clockevent driver for Mediatek mt6765 timer.
 
 config SPRD_TIMER
bool "Spreadtrum timer driver" if EXPERT
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index c17ee32a7151..b1f06ce114f9 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -49,7 +49,8 @@ obj-$(CONFIG_CLKSRC_SAMSUNG_PWM)  += samsung_pwm_timer.o
 obj-$(CONFIG_FSL_FTM_TIMER)+= timer-fsl-ftm.o
 obj-$(CONFIG_VF_PIT_TIMER) += timer-vf-pit.o
 obj-$(CONFIG_CLKSRC_QCOM)  += timer-qcom.o
-obj-$(CONFIG_MTK_TIMER)+= timer-mediatek.o
+obj-$(CONFIG_TIMER_MEDIATEK_MT6577)+= timer-mediatek-mt6577.o
+obj-$(CONFIG_TIMER_MEDIATEK_MT6765)+= timer-mediatek-mt6765.o
 obj-$(CONFIG_CLKSRC_PISTACHIO) += timer-pistachio.o
 obj-$(CONFIG_CLKSRC_TI_32K)+= timer-ti-32k.o
 obj-$(CONFIG_OXNAS_RPS_TIMER)  += timer-oxnas-rps.o
diff --git a/drivers/clocksource/timer-mediatek.c 
b/drivers/clocksource/timer-mediatek-mt6577.c
similarity index 69%
rename from drivers/clocksource/timer-mediatek.c
rename to drivers/clocksource/timer-mediatek-mt6577.c
index 9318edcd8963..9e5241d1876d 100644
--- a/drivers/clocksource/timer-mediatek.c
+++ b/drivers/clocksource/timer-mediatek-mt6577.c
@@ -47,86 +47,8 @@
 #define GPT_CNT_REG(val)(0x08 + (0x10 * (val)))
 #define GPT_CMP_REG(val)(0x0C + (0x10 * (val)))
 
-/* system timer */
-#define SYST_BASE   (0x40)
-
-#define SYST_CON(SYST_BASE + 0x0)
-#define SYST_VAL(SYST_BASE + 0x4)
-
-#define SYST_CON_REG(to)(timer_of_base(to) + SYST_CON)
-#define SYST_VAL_REG(to)(timer_of_base(to) + SYST_VAL)
-
-/*
- * SYST_CON_EN: Clock enable. Shall be set to
- *   - Start timer countdown.
- *   - Allow timeout ticks being updated.
- *   - Allow changing interrupt functions.
- *
- * SYST_CON_IRQ_EN: Set to allow interrupt.
- *
- * SYST_CON_IRQ_CLR: Set to clear interrupt.
- */
-#define SYST_CON_EN  BIT(0)
-#define SYST_CON_IRQ_EN  BIT(1)
-#define SYST_CON_IRQ_CLR BIT(4)
-
 static void __iomem *gpt_sched_reg __read_mostly;
 
-static void mtk_syst_ack_irq(struct timer_of *to)
-{
-   /* Clear and disable interrupt */
-   writel(SYST_CON_IRQ_CLR | SYST_CON_EN, SYST_CON_REG(to));
-}
-
-static irqreturn_t mtk_syst_handler(int irq, void *dev_id)
-{
-   struct clock_event_device *clkevt = dev_id;
-   struct timer_of *to =

Re: linux-next: manual merge of the rust tree with the kbuild tree

2021-03-17 Thread Miguel Ojeda

On Thu, Mar 18, 2021 at 5:37 AM Stephen Rothwell  wrote:
>
> I fixed it up (see below - I think I got this right ...) and can carry
> the fix as necessary. This is now fixed as far as linux-next is
> concerned, but any non trivial conflicts should be mentioned to your
> upstream maintainer when your tree is submitted for merging.  You may
> also want to consider cooperating with the maintainer of the
> conflicting tree to minimise any particularly complex conflicts.

Yeah, the TENTATIVE_CLANG_FLAGS is the workaround to support GCC I
mentioned privately. It is unfortunate that particular bit was the one
that had to give you a conflict... :-)

Longer-term, Masahiro et al. may have better ideas on how to do the
whole trick in a cleaner way (kbuild folks: don't worry, our branch is
not going to be merged just yet ;-)

Cheers,
Miguel

[PATCH 09/10] m68k: use libata instead of the legacy ide driver

2021-03-17 Thread Christoph Hellwig

Switch the m68k defconfigs from the deprecated ide subsystem to use libata
instead.  The gayle and buddha and falcon drivers are enabled for libata,
while support for the q40 and macide drivers is lost.

Signed-off-by: Christoph Hellwig 
---
 arch/m68k/configs/amiga_defconfig | 10 +-
 arch/m68k/configs/atari_defconfig |  8 
 arch/m68k/configs/mac_defconfig   |  5 -
 arch/m68k/configs/multi_defconfig | 15 ++-
 arch/m68k/configs/q40_defconfig   |  4 
 5 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/arch/m68k/configs/amiga_defconfig 
b/arch/m68k/configs/amiga_defconfig
index 786656090c5029..fba7275de0fb5f 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -323,11 +323,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
-CONFIG_IDE=y
-CONFIG_IDE_GD_ATAPI=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_GAYLE=y
-CONFIG_BLK_DEV_BUDDHA=y
 CONFIG_RAID_ATTRS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
@@ -344,6 +339,11 @@ CONFIG_GVP11_SCSI=y
 CONFIG_SCSI_A4000T=y
 CONFIG_SCSI_ZORRO7XX=y
 CONFIG_SCSI_ZORRO_ESP=y
+CONFIG_ATA=y
+# CONFIG_ATA_VERBOSE_ERROR is not set
+# CONFIG_ATA_BMDMA is not set
+CONFIG_PATA_GAYLE=y
+CONFIG_PATA_BUDDHA=y
 CONFIG_MD=y
 CONFIG_MD_LINEAR=m
 CONFIG_BLK_DEV_DM=m
diff --git a/arch/m68k/configs/atari_defconfig 
b/arch/m68k/configs/atari_defconfig
index 413232626d9d57..235d038be9 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -324,10 +324,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
-CONFIG_IDE=y
-CONFIG_IDE_GD_ATAPI=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_FALCON_IDE=y
 CONFIG_RAID_ATTRS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
@@ -339,6 +335,10 @@ CONFIG_SCSI_SAS_ATTRS=m
 CONFIG_ISCSI_TCP=m
 CONFIG_ISCSI_BOOT_SYSFS=m
 CONFIG_ATARI_SCSI=y
+CONFIG_ATA=y
+# CONFIG_ATA_VERBOSE_ERROR is not set
+# CONFIG_ATA_BMDMA is not set
+CONFIG_PATA_FALCON=y
 CONFIG_MD=y
 CONFIG_MD_LINEAR=m
 CONFIG_BLK_DEV_DM=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index bf15e6c1c939bb..cc92cc4601cb1f 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -315,11 +315,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
-CONFIG_IDE=y
-CONFIG_IDE_GD_ATAPI=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_PLATFORM=y
-CONFIG_BLK_DEV_MAC_IDE=y
 CONFIG_RAID_ATTRS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
diff --git a/arch/m68k/configs/multi_defconfig 
b/arch/m68k/configs/multi_defconfig
index 5466d48fcd9d51..9be9f2ad4ddb84 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -344,15 +344,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
-CONFIG_IDE=y
-CONFIG_IDE_GD_ATAPI=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_PLATFORM=y
-CONFIG_BLK_DEV_GAYLE=y
-CONFIG_BLK_DEV_BUDDHA=y
-CONFIG_BLK_DEV_FALCON_IDE=y
-CONFIG_BLK_DEV_MAC_IDE=y
-CONFIG_BLK_DEV_Q40IDE=y
 CONFIG_RAID_ATTRS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
@@ -376,6 +367,12 @@ CONFIG_MVME147_SCSI=y
 CONFIG_MVME16x_SCSI=y
 CONFIG_BVME6000_SCSI=y
 CONFIG_SUN3X_ESP=y
+CONFIG_ATA=y
+# CONFIG_ATA_VERBOSE_ERROR is not set
+# CONFIG_ATA_BMDMA is not set
+CONFIG_PATA_FALCON=y
+CONFIG_PATA_GAYLE=y
+CONFIG_PATA_BUDDHA=y
 CONFIG_MD=y
 CONFIG_MD_LINEAR=m
 CONFIG_BLK_DEV_DM=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 3ae421cb24a439..ac35e448b1c58f 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -314,10 +314,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_DUMMY_IRQ=m
-CONFIG_IDE=y
-CONFIG_IDE_GD_ATAPI=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_Q40IDE=y
 CONFIG_RAID_ATTRS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
-- 
2.30.1

[PATCH 08/10] MIPS: disable CONFIG_IDE in malta*_defconfig

2021-03-17 Thread Christoph Hellwig

Various malta defconfigs enable CONFIG_IDE for the tc86c001 ide driver,
which is a Toshiba plug in card that does not make much sense to use on
bigsur platforms.  For all other ATA cards libata support is already
enabled.

Signed-off-by: Christoph Hellwig 
---
 arch/mips/configs/malta_kvm_defconfig   | 3 ---
 arch/mips/configs/malta_kvm_guest_defconfig | 3 ---
 arch/mips/configs/maltaup_xpa_defconfig | 3 ---
 3 files changed, 9 deletions(-)

diff --git a/arch/mips/configs/malta_kvm_defconfig 
b/arch/mips/configs/malta_kvm_defconfig
index 62b1969b4f55b9..b4f9f3d4bd5d34 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -239,9 +239,6 @@ CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_TC86C001=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
diff --git a/arch/mips/configs/malta_kvm_guest_defconfig 
b/arch/mips/configs/malta_kvm_guest_defconfig
index 9185e0a0aa4551..4d415145d1163e 100644
--- a/arch/mips/configs/malta_kvm_guest_defconfig
+++ b/arch/mips/configs/malta_kvm_guest_defconfig
@@ -237,9 +237,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
 CONFIG_VIRTIO_BLK=y
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_TC86C001=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig 
b/arch/mips/configs/maltaup_xpa_defconfig
index 636311d67a533c..cd536086dca4a4 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -237,9 +237,6 @@ CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_TC86C001=m
 CONFIG_RAID_ATTRS=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=m
-- 
2.30.1

[PATCH 07/10] MIPS: disable CONFIG_IDE in bigsur_defconfig

2021-03-17 Thread Christoph Hellwig

bigsur_defconfig enables CONFIG_IDE for the tc86c001 ide driver, which
is a Toshiba plug in card that does not make much sense to use on bigsur
platforms.  For all other ATA cards libata support is already enabled.

Signed-off-by: Christoph Hellwig 
---
 arch/mips/configs/bigsur_defconfig | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/mips/configs/bigsur_defconfig 
b/arch/mips/configs/bigsur_defconfig
index eea9b613bb7402..d83e7d600b0a56 100644
--- a/arch/mips/configs/bigsur_defconfig
+++ b/arch/mips/configs/bigsur_defconfig
@@ -105,10 +105,6 @@ CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_EEPROM_LEGACY=y
 CONFIG_EEPROM_MAX6875=y
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_IDETAPE=y
-CONFIG_BLK_DEV_TC86C001=m
 CONFIG_BLK_DEV_SD=y
 CONFIG_CHR_DEV_ST=y
 CONFIG_BLK_DEV_SR=y
-- 
2.30.1

[PATCH 06/10] MIPS: disable CONFIG_IDE in rbtx49xx_defconfig

2021-03-17 Thread Christoph Hellwig

rbtx49xx_defconfig enables CONFIG_IDE for the tx4938 and tx4939 ide
drivers, but those aren't actually used by the last known remaining user:

https://lore.kernel.org/lkml/20210107.101729.1936921832901251107.an...@mba.ocn.ne.jp/

Signed-off-by: Christoph Hellwig 
---
 arch/mips/configs/rbtx49xx_defconfig | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/mips/configs/rbtx49xx_defconfig 
b/arch/mips/configs/rbtx49xx_defconfig
index 5e389db35fa746..69f2300107f961 100644
--- a/arch/mips/configs/rbtx49xx_defconfig
+++ b/arch/mips/configs/rbtx49xx_defconfig
@@ -44,9 +44,6 @@ CONFIG_MTD_NAND_TXX9NDFMC=m
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDE_TX4938=y
-CONFIG_BLK_DEV_IDE_TX4939=y
 CONFIG_NETDEVICES=y
 CONFIG_NE2000=y
 CONFIG_SMC91X=y
-- 
2.30.1

[PATCH 05/10] MIPS: switch workpad_defconfig from legacy IDE to libata

2021-03-17 Thread Christoph Hellwig

Use libata instead of the deprecated legacy ide driver in
workpad_defconfig.

Signed-off-by: Christoph Hellwig 
---
 arch/mips/configs/workpad_defconfig | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/mips/configs/workpad_defconfig 
b/arch/mips/configs/workpad_defconfig
index 891a5f77305da1..4798dc86c9ceaf 100644
--- a/arch/mips/configs/workpad_defconfig
+++ b/arch/mips/configs/workpad_defconfig
@@ -26,9 +26,12 @@ CONFIG_IP_MULTICAST=y
 # CONFIG_IPV6 is not set
 CONFIG_NETWORK_SECMARK=y
 CONFIG_BLK_DEV_RAM=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECS=m
-CONFIG_IDE_GENERIC=y
+# CONFIG_SCSI_PROC_FS is not set
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+# CONFIG_ATA_VERBOSE_ERROR is not set
+# CONFIG_ATA_FORCE is not set
+# CONFIG_ATA_BMDMA is not set
 CONFIG_NETDEVICES=y
 CONFIG_PCMCIA_3C574=m
 CONFIG_PCMCIA_3C589=m
-- 
2.30.1

[PATCH 04/10] MIPS: disable CONFIG_IDE in sb1250_swarm_defconfig

2021-03-17 Thread Christoph Hellwig

sb1250_swarm_defconfig enables CONFIG_IDE but no actual host controller
driver, so just drop CONFIG_IDE, CONFIG_BLK_DEV_IDECD and
CONFIG_BLK_DEV_IDETAPE as they are useless.

Signed-off-by: Christoph Hellwig 
---
 arch/mips/configs/sb1250_swarm_defconfig | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/mips/configs/sb1250_swarm_defconfig 
b/arch/mips/configs/sb1250_swarm_defconfig
index bb0b1b22ebe164..96a94ebf05bf08 100644
--- a/arch/mips/configs/sb1250_swarm_defconfig
+++ b/arch/mips/configs/sb1250_swarm_defconfig
@@ -49,9 +49,6 @@ CONFIG_BLK_DEV_RAM=y
 CONFIG_BLK_DEV_RAM_SIZE=9220
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_ATA_OVER_ETH=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_BLK_DEV_IDETAPE=y
 CONFIG_RAID_ATTRS=m
 CONFIG_NETDEVICES=y
 CONFIG_MACVLAN=m
-- 
2.30.1

[PATCH 03/10] ARM: disable CONFIG_IDE in pxa_defconfig

2021-03-17 Thread Christoph Hellwig

pxa_defconfig already enables libata including the pata_pcmcia driver, so
drop the legacy ide driver and idecs host driver.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/configs/pxa_defconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index bd7dd81c9c5441..c82b8a1d6e84f6 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -215,8 +215,6 @@ CONFIG_IIO=m
 CONFIG_AD5446=m
 CONFIG_EEPROM_AT24=m
 CONFIG_SENSORS_LIS3_SPI=m
-CONFIG_IDE=m
-CONFIG_BLK_DEV_IDECS=m
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=m
 CONFIG_CHR_DEV_ST=m
-- 
2.30.1

[PATCH 02/10] ARM: disable CONFIG_IDE in footbridge_defconfig

2021-03-17 Thread Christoph Hellwig

footbridge_defconfig enables CONFIG_IDE but no actual host controller
driver, so just drop it.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/configs/footbridge_defconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/configs/footbridge_defconfig 
b/arch/arm/configs/footbridge_defconfig
index 3a7938f244e566..1fe60e0fcf2790 100644
--- a/arch/arm/configs/footbridge_defconfig
+++ b/arch/arm/configs/footbridge_defconfig
@@ -65,7 +65,6 @@ CONFIG_PARIDE_ON26=m
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_RAM=y
-CONFIG_IDE=y
 CONFIG_NETDEVICES=y
 CONFIG_NET_ETHERNET=y
 CONFIG_NET_VENDOR_3COM=y
-- 
2.30.1

[PATCH 01/10] alpha: use libata instead of the legacy ide driver

2021-03-17 Thread Christoph Hellwig

Switch the alpha defconfig from the legacy ide driver to libata.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/configs/defconfig | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/alpha/configs/defconfig b/arch/alpha/configs/defconfig
index 724c4075df408e..dd2dd9f0861f18 100644
--- a/arch/alpha/configs/defconfig
+++ b/arch/alpha/configs/defconfig
@@ -25,19 +25,18 @@ CONFIG_PNP=y
 CONFIG_ISAPNP=y
 CONFIG_BLK_DEV_FD=y
 CONFIG_BLK_DEV_LOOP=m
-CONFIG_IDE=y
-CONFIG_BLK_DEV_IDECD=y
-CONFIG_IDE_GENERIC=y
-CONFIG_BLK_DEV_GENERIC=y
-CONFIG_BLK_DEV_ALI15X3=y
-CONFIG_BLK_DEV_CMD64X=y
-CONFIG_BLK_DEV_CY82C693=y
 CONFIG_SCSI=y
 CONFIG_BLK_DEV_SD=y
 CONFIG_BLK_DEV_SR=y
 CONFIG_SCSI_AIC7XXX=m
 CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
 # CONFIG_AIC7XXX_DEBUG_ENABLE is not set
+CONFIG_ATA=y
+# CONFIG_SATA_PMP is not set
+CONFIG_PATA_ALI=y
+CONFIG_PATA_CMD64X=y
+CONFIG_PATA_CYPRESS=y
+CONFIG_ATA_GENERIC=y
 CONFIG_NETDEVICES=y
 CONFIG_DUMMY=m
 CONFIG_NET_ETHERNET=y
-- 
2.30.1

remove the legacy ide driver

2021-03-17 Thread Christoph Hellwig

Hi all,

we've been trying to get rid of the legacy ide driver for a while now,
and finally scheduled a removal for 2021, which is three months old now.

In general distros and most defconfigs have switched to libata long ago,
but there are a few exceptions.  This series first switches over all
remaining defconfigs to use libata and then removes the legacy ide
driver.

libata mostly covers all hardware supported by the legacy ide driver.
There are three mips drivers that are not supported, but the linux-mips
list could not identify any users of those.  There also are two m68k
drivers that do not have libata equivalents, which might or might not
have users, so we'll need some input and possibly help from the m68k
community here.

Re: [PATCH 2/3] mm, dax, pmem: Introduce dev_pagemap_failure()

2021-03-17 Thread Dave Chinner

On Wed, Mar 17, 2021 at 09:08:23PM -0700, Dan Williams wrote:
> Jason wondered why the get_user_pages_fast() path takes references on a
> @pgmap object. The rationale was to protect against accessing a 'struct
> page' that might be in the process of being removed by the driver, but
> he rightly points out that should be solved the same way all gup-fast
> synchronization is solved which is invalidate the mapping and let the
> gup slow path do @pgmap synchronization [1].
> 
> To achieve that it means that new user mappings need to stop being
> created and all existing user mappings need to be invalidated.
> 
> For device-dax this is already the case as kill_dax() prevents future
> faults from installing a pte, and the single device-dax inode
> address_space can be trivially unmapped.
> 
> The situation is different for filesystem-dax where device pages could
> be mapped by any number of inode address_space instances. An initial
> thought was to treat the device removal event like a drop_pagecache_sb()
> event that walks superblocks and unmaps all inodes. However, Dave points
> out that it is not just the filesystem user-mappings that need to react
> to global DAX page-unmap events, it is also filesystem metadata
> (proposed DAX metadata access), and other drivers (upstream
> DM-writecache) that need to react to this event [2].
> 
> The only kernel facility that is meant to globally broadcast the loss of
> a page (via corruption or surprise remove) is memory_failure(). The
> downside of memory_failure() is that it is a pfn-at-a-time interface.
> However, the events that would trigger the need to call memory_failure()
> over a full PMEM device should be rare.

This is a highly suboptimal design. Filesystems only need a single
callout to trigger a shutdown that unmaps every active mapping in
the filesystem - we do not need a page-by-page error notification
which results in 250 million hwpoison callouts per TB of pmem to do
this.

Indeed, the moment we get the first hwpoison from this patch, we'll
map it to the primary XFS superblock and we'd almost certainly
consider losing the storage behind that block to be a shut down
trigger. During the shutdown, the filesystem should unmap all the
active mappings (we already need to add this to shutdown on DAX
regardless of this device remove issue) and so we really don't need
a page-by-page notification of badness.

AFAICT, it's going to take minutes, maybe hours to do the page-by-page
iteration to hwpoison every page. It's going to take a few seconds
for the filesystem shutdown to run a device wide invalidation.

So, yeah, I think this should simply be a single ranged call to the
filesystem like:

->memory_failure(dev, 0, -1ULL)

to tell the filesystem that the entire backing device has gone away,
and leave the filesystem to handle failure entirely at the
filesystem level.

-Dave.
-- 
Dave Chinner
da...@fromorbit.com

Re: [PATCH v4 1/3] mm/slub: Introduce two counters for partial objects

2021-03-17 Thread Xunlei Pang

On 3/18/21 2:45 AM, Vlastimil Babka wrote:
> On 3/17/21 8:54 AM, Xunlei Pang wrote:
>> The node list_lock in count_partial() spends long time iterating
>> in case of large amount of partial page lists, which can cause
>> thundering herd effect to the list_lock contention.
>>
>> We have HSF RT(High-speed Service Framework Response-Time) monitors,
>> the RT figures fluctuated randomly, then we deployed a tool detecting
>> "irq off" and "preempt off" to dump the culprit's calltrace, capturing
>> the list_lock cost nearly 100ms with irq off issued by "ss", this also
>> caused network timeouts.
>>
>> This patch introduces two counters to maintain the actual number
>> of partial objects dynamically instead of iterating the partial
>> page lists with list_lock held.
>>
>> New counters of kmem_cache_node: partial_free_objs, partial_total_objs.
>> The main operations are under list_lock in slow path, its performance
>> impact is expected to be minimal except the __slab_free() path.
>>
>> The only concern of introducing partial counter is that partial_free_objs
>> may cause cacheline contention and false sharing issues in case of same
>> SLUB concurrent __slab_free(), so define it to be a percpu counter and
>> places it carefully.
> 
> Hm I wonder, is it possible that this will eventually overflow/underflow the
> counter on some CPU? (I guess practically only on 32bit). Maybe the operations
> that are already done under n->list_lock should flush the percpu counter to a
> shared counter?

You are right, thanks a lot for noticing this.

> 
> ...
> 
>> @@ -3039,6 +3066,13 @@ static void __slab_free(struct kmem_cache *s, struct 
>> page *page,
>>  head, new.counters,
>>  "__slab_free"));
>>  
>> +if (!was_frozen && prior) {
>> +if (n)
>> +__update_partial_free(n, cnt);
>> +else
>> +__update_partial_free(get_node(s, page_to_nid(page)), 
>> cnt);
>> +}
> 
> I would guess this is the part that makes your measurements notice that
> (although tiny) difference. We didn't need to obtain the node pointer before 
> and
> now we do. And that is really done just for the per-node breakdown in 
> "objects"
> and "objects_partial" files under /sys/kernel/slab - distinguishing nodes is 
> not
> needed for /proc/slabinfo. So that kinda justifies putting this under a new
> CONFIG as you did. Although perhaps somebody interested in these kind of stats
> would enable CONFIG_SLUB_STATS anyway, so that's still an option to use 
> instead
> of introducing a new oddly specific CONFIG? At least until somebody comes up 
> and
> presents an use case where they want the per-node breakdowns in /sys but 
> cannot
> afford CONFIG_SLUB_STATS.
> 
> But I'm also still thinking about simply counting all free objects (for the
> purposes of accurate  in /proc/slabinfo) as a percpu variable in
> struct kmem_cache itself. That would basically put this_cpu_add() in all the
> fast paths, but AFAICS thanks to the segment register it doesn't mean 
> disabling
> interrupts nor a LOCK operation, so maybe it wouldn't be that bad? And it
> shouldn't need to deal with these node pointers. So maybe that would be
> acceptable for CONFIG_SLUB_DEBUG? Guess I'll have to try...
> 

The percpu operation itself should be fine; it looks to be a cacheline
ping-pong issue due to the extra percpu counter access, so making it
cacheline aligned improves things a little according to my tests.

Re: [PATCH v29 4/4] scsi: ufs: Add HPB 2.0 support

2021-03-17 Thread Can Guo


On 2021-03-18 10:02, Daejun Park wrote:

On 2021-03-17 09:42, Daejun Park wrote:

On 2021-03-15 15:23, Can Guo wrote:

On 2021-03-15 15:07, Daejun Park wrote:

This patch supports the HPB 2.0.

The HPB 2.0 supports read of varying sizes from 4KB to 512KB.
In the case of Read (<= 32KB) is supported as single HPB read.
In the case of Read (36KB ~ 512KB) is supported by as a
combination
of
write buffer command and HPB read command to deliver more PPN.
The write buffer commands may not be issued immediately due to
busy
tags.
To use HPB read more aggressively, the driver can requeue the
write
buffer
command. The requeue threshold is implemented as timeout and can
be
modified with requeue_timeout_ms entry in sysfs.

Signed-off-by: Daejun Park 
---
+static struct attribute *hpb_dev_param_attrs[] = {
+_attr_requeue_timeout_ms.attr,
+NULL,
+};
+
+struct attribute_group ufs_sysfs_hpb_param_group = {
+.name = "hpb_param_sysfs",
+.attrs = hpb_dev_param_attrs,
+};
+
+static int ufshpb_pre_req_mempool_init(struct ufshpb_lu *hpb)
+{
+struct ufshpb_req *pre_req = NULL;
+int qd = hpb->sdev_ufs_lu->queue_depth / 2;
+int i, j;
+
+INIT_LIST_HEAD(>lh_pre_req_free);
+
+hpb->pre_req = kcalloc(qd, sizeof(struct ufshpb_req),
GFP_KERNEL);
+hpb->throttle_pre_req = qd;
+hpb->num_inflight_pre_req = 0;
+
+if (!hpb->pre_req)
+goto release_mem;
+
+for (i = 0; i < qd; i++) {
+pre_req = hpb->pre_req + i;
+INIT_LIST_HEAD(_req->list_req);
+pre_req->req = NULL;
+pre_req->bio = NULL;


Why don't prepare bio as same as wb.m_page? Won't that save more
time
for ufshpb_issue_pre_req()?


It is pre_req pool. So although we prepare bio at this time, it 
just

only for first pre_req.


I meant removing the bio_alloc() in ufshpb_issue_pre_req() and
bio_put()
in ufshpb_pre_req_compl_fn(). bios, in pre_req's case, just hold a
page.
So, prepare 16 (if queue depth is 32) bios here, just use them 
along

with
wb.m_page and call bio_reset() in ufshpb_pre_req_compl_fn(). Shall 
it

work?



If it works, you can even have the bio_add_pc_page() called here.
Later
in
ufshpb_execute_pre_req(), you don't need to call
ufshpb_pre_req_add_bio_page(),
just call ufshpb_prep_entry() once instead - it save many repeated
steps
for a
pre_req, and you don't even need to call bio_reset() in this case,
since
for a
bio, nothing changes after it is binded with a specific page...


Hi, Can Guo

I tried the idea that you suggested, but it doesn't work properly.
This optimization should be done next time for enhancement.


Can you elaborate please? Any error seen?

Per my understanding, in the case for pre_reqs, a bio is no different
from a page. Here it can reserve 16 pages for later use, which can be
done the same for bios.


I found a problem with re-using pre-allocated bios.

The following kernel message is related to the problem.
[2.750530] [ cut here ]
[2.751404] WARNING: CPU: 4 PID: 170 at
drivers/scsi/scsi_lib.c:1020 scsi_alloc_sgtables+0x253/0x2b0
[2.753054] Modules linked in:
[2.753651] CPU: 4 PID: 170 Comm: mount Not tainted 5.12.0-rc1+ #331
[2.754752] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[2.756813] RIP: 0010:scsi_alloc_sgtables+0x253/0x2b0
[2.757699] Code: 85 c0 74 19 41 0f b6 44 24 18 8d 50 e0 83 fa 03
76 30 41 bd 01 00 00 00 e9 1f fe ff ff be 01 00 00 00 45 31 ed e9 19
fe ff ff <0f> 0b b8 0a f
[2.761021] RSP: 0018:b06e0027f538 EFLAGS: 00010246
[2.761902] RAX:  RBX: 9c3a42d424d0 RCX: 
b06e0027f5e0
[2.763184] RDX: 9c3a42d426a8 RSI:  RDI: 
9c3a42d424d0
[2.764446] RBP: b06e0027f570 R08:  R09: 

[2.765704] R10: 8eb0dda0 R11: fffb7675 R12: 
9c3a42d423c0
[2.766976] R13:  R14: 9c3a41bed000 R15: 
9c3a420f4000

[2.768225] FS:  7f42d1eab100() GS:9c3b77c0()
knlGS:
[2.769666] CS:  0010 DS:  ES:  CR0: 80050033
[2.770719] CR2: 7f42d1ac1000 CR3: 000104bee006 CR4: 
00370ee0
[2.771997] DR0:  DR1:  DR2: 

[2.773288] DR3:  DR6: fffe0ff0 DR7: 
0400

[2.774543] Call Trace:
[2.775092]  scsi_queue_rq+0x9b6/0xb20
[2.775754]  __blk_mq_try_issue_directly+0x150/0x1f0
[2.776636]  blk_mq_request_issue_directly+0x49/0x80
[2.777616]  blk_insert_cloned_request+0x85/0xd0
[2.778470]  ufshpb_prep.cold+0x793/0x7be
[2.779179]  ufshcd_queuecommand+0x114/0x690
[2.779986]  scsi_queue_rq+0x38a/0xb20
[2.780755]  blk_mq_dispatch_rq_list+0x13d/0x760
[2.781605]  ? dd_dispatch_request+0x67/0x1c0
[2.782337]

Re: [PATCHv2 3/3] media: uvcvideo: add UVC 1.5 ROI control

2021-03-17 Thread Sergey Senozhatsky

On (21/03/17 08:58), Ricardo Ribalda Delgado wrote:
[..]
> >
> > GET_CUR?
> yep
> 
> >
> > > https://www.kernel.org/doc/html/v4.13/media/uapi/v4l/vidioc-g-selection.html?highlight=vidioc_s_selection
> > > On success the struct v4l2_rect r field contains the adjusted
> > > rectangle.
> >
> > What is the adjusted rectangle here? Does this mean that firmware can
> > successfully apply SET_CUR and return 0, but in reality it was not happy
> > with the rectangle dimensions so it modified it behind the scenes?
> 
> I can imagine that some hw might have spooky requirements for the roi
> rectangle (multiple of 4, not crossing the bayer filter, odd/even
> line...) so they might be able to go the closest valid config.

Hmm. Honestly, I'm very unsure about it. ROI::SET_CUR can be a very
hot path, depending on what user-space considers to be of interest
and how frequently that object of interest changes its position/shape/etc.
Doing GET_CUR after every SET_CUR doubles the number of firmware calls
we issue, that's for sure; is it worth it - that's something that I'm
not sure of.

May I please ask for more opinions on this?

-ss

Re: [PATCH] mm/gup: check page posion status for coredump.

2021-03-17 Thread Matthew Wilcox

On Wed, Mar 17, 2021 at 10:12:02AM +0100, David Hildenbrand wrote:
> > +   if (IS_ENABLED(CONFIG_MEMORY_FAILURE) && ret == 1) {
> > +   if (unlikely(PageHuge(page) && 
> > PageHWPoison(compound_head(page
> > +   ret = 0;
> > +   else if (unlikely(PageHWPoison(page)))
> > +   ret = 0;
> > +   }
> 
> I wonder if a simple
> 
> if (PageHWPoison(compound_head(page)))
>   ret = 0;
> 
> won't suffice. But I guess the "issue" is compound pages that are not huge
> pages or transparent huge pages.

THPs don't set the HWPoison bit on the head page.

https://lore.kernel.org/linux-mm/20210316140947.ga3...@casper.infradead.org/

(and PAGEFLAG(HWPoison, hwpoison, PF_ANY))

By the way,

#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
extern bool take_page_off_buddy(struct page *page);
#else
PAGEFLAG_FALSE(HWPoison)
#define __PG_HWPOISON 0
#endif

so there's no need for this 
if (IS_ENABLED(CONFIG_MEMORY_FAILURE)
check, as it simply turns into

if (PageHuge(page) && 0)
else if (0)

and the compiler can optimise it all away.

RE: A problem of Intel IOMMU hardware ？

2021-03-17 Thread Longpeng (Mike, Cloud Infrastructure Service Product Dept.)

Hi guys,

I provide more information, please see below

> -Original Message-
> From: Lu Baolu [mailto:baolu...@linux.intel.com]
> Sent: Thursday, March 18, 2021 10:59 AM
> To: Alex Williamson 
> Cc: baolu...@linux.intel.com; Longpeng (Mike, Cloud Infrastructure Service 
> Product
> Dept.) ; dw...@infradead.org; j...@8bytes.org;
> w...@kernel.org; io...@lists.linux-foundation.org; LKML
> ; Gonglei (Arei) ;
> chenjiashang 
> Subject: Re: A problem of Intel IOMMU hardware ？
> 
> Hi Alex,
> 
> On 3/17/21 11:18 PM, Alex Williamson wrote:
> >>>   {MAP,   0x0, 0xc000}, - (b)
> >>>   use GDB to pause at here, and then DMA read
> >>> IOVA=0,
> >> IOVA 0 seems to be a special one. Have you verified with other
> >> addresses than IOVA 0?
> > It is???  That would be a problem.
> >
> 
> No problem from hardware point of view as far as I can see. Just thought about
> software might handle it specially.
> 

We simplified the reproducer; the following map/unmap sequence can also 
reproduce the problem.

1. use 2M hugetlbfs to mmap 4G memory

2. run the while loop:
While (1) {
DMA MAP (0, 0xa) - - - - - - - - - - - - - -(a)
DMA UNMAP (0, 0xa) - - - - - - - - - - - (b)
  Operation-1 : dump DMAR table
DMA MAP (0, 0xc000) - - - - - - - - - - -(c)
  Operation-2 :
 use GDB to pause at here, then DMA read IOVA=0,
 sometimes DMA success (as expected),
 but sometimes DMA error (report not-present).
  Operation-3 : dump DMAR table
  Operation-4 (when DMA error) : please see below
DMA UNMAP (0, 0xc000) - - - - - - - - -(d)
}

The DMAR table of Operation-1 is (only show the entries about IOVA 0):

PML4: 0x  1a34fbb003
  PDPE: 0x  1a34fbb003
   PDE: 0x  1a34fbf003
PTE: 0x   0

And the table of Operation-3 is:

PML4: 0x  1a34fbb003
  PDPE: 0x  1a34fbb003
   PDE: 0x   15ec00883 < - - 2M superpage

So we can see the IOVA 0 is mapped, but the DMA read is error:

dmar_fault: 131757 callbacks suppressed
DRHD: handling fault status reg 402
[DMA Read] Request device [86:05.6] fault addr 0 [fault reason 06] PTE Read 
access is not set
[DMA Read] Request device [86:05.6] fault addr 0 [fault reason 06] PTE Read 
access is not set
DRHD: handling fault status reg 600
DRHD: handling fault status reg 602
[DMA Read] Request device [86:05.6] fault addr 0 [fault reason 06] PTE Read 
access is not set
[DMA Read] Request device [86:05.6] fault addr 0 [fault reason 06] PTE Read 
access is not set
[DMA Read] Request device [86:05.6] fault addr 0 [fault reason 06] PTE Read 
access is not set

NOTE: then the magical thing happens... (*Operation-4*) we change the PTE
of Operation-1 from 0 to 0x3, which means Read/Write is allowed, and then
we trigger the DMA read again; it succeeds and returns the data of HPA 0 !!

Why would modifying the older page table make any difference? As we
have discussed previously, the cache flush part of the driver is correct:
it calls flush_iotlb after (b) and there is no need to flush after (c). But the
result of the experiment shows that the older page table or older caches are
actually still in effect.

Any ideas ?

> Best regards,
> baolu

Re: [PATCH 2/3] mm, dax, pmem: Introduce dev_pagemap_failure()

2021-03-17 Thread Darrick J. Wong

On Wed, Mar 17, 2021 at 09:08:23PM -0700, Dan Williams wrote:
> Jason wondered why the get_user_pages_fast() path takes references on a
> @pgmap object. The rationale was to protect against accessing a 'struct
> page' that might be in the process of being removed by the driver, but
> he rightly points out that should be solved the same way all gup-fast
> synchronization is solved which is invalidate the mapping and let the
> gup slow path do @pgmap synchronization [1].
> 
> To achieve that it means that new user mappings need to stop being
> created and all existing user mappings need to be invalidated.
> 
> For device-dax this is already the case as kill_dax() prevents future
> faults from installing a pte, and the single device-dax inode
> address_space can be trivially unmapped.
> 
> The situation is different for filesystem-dax where device pages could
> be mapped by any number of inode address_space instances. An initial
> thought was to treat the device removal event like a drop_pagecache_sb()
> event that walks superblocks and unmaps all inodes. However, Dave points
> out that it is not just the filesystem user-mappings that need to react
> to global DAX page-unmap events, it is also filesystem metadata
> (proposed DAX metadata access), and other drivers (upstream
> DM-writecache) that need to react to this event [2].
> 
> The only kernel facility that is meant to globally broadcast the loss of
> a page (via corruption or surprise remove) is memory_failure(). The
> downside of memory_failure() is that it is a pfn-at-a-time interface.
> However, the events that would trigger the need to call memory_failure()
> over a full PMEM device should be rare. Remove should always be
> coordinated by the administrator with the filesystem. If someone force
> removes a device from underneath a mounted filesystem the driver assumes
> they have a good reason, or otherwise get to keep the pieces. Since
> ->remove() callbacks can not fail the only option is to trigger the mass
> memory_failure().
> 
> The mechanism to determine whether memory_failure() triggers at
> pmem->remove() time is whether the associated dax_device has an elevated
> reference at @pgmap ->kill() time.
> 
> With this in place the get_user_pages_fast() path can drop its
> half-measure synchronization with an @pgmap reference.
> 
> Link: http://lore.kernel.org/r/20210224010017.gq2643...@ziepe.ca [1]
> Link: http://lore.kernel.org/r/20210302075736.gj4...@dread.disaster.area [2]
> Reported-by: Jason Gunthorpe 
> Cc: Dave Chinner 
> Cc: Christoph Hellwig 
> Cc: Shiyang Ruan 
> Cc: Vishal Verma 
> Cc: Dave Jiang 
> Cc: Ira Weiny 
> Cc: Matthew Wilcox 
> Cc: Jan Kara 
> Cc: Andrew Morton 
> Cc: Naoya Horiguchi 
> Cc: "Darrick J. Wong" 
> Signed-off-by: Dan Williams 
> ---
>  drivers/dax/super.c  |   15 +++
>  drivers/nvdimm/pmem.c|   10 +-
>  drivers/nvdimm/pmem.h|1 +
>  include/linux/dax.h  |5 +
>  include/linux/memremap.h |5 +
>  include/linux/mm.h   |3 +++
>  mm/memory-failure.c  |   11 +--
>  mm/memremap.c|   11 +++
>  8 files changed, 58 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 5fa6ae9dbc8b..5ebcedf4a68c 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -624,6 +624,21 @@ void put_dax(struct dax_device *dax_dev)
>  }
>  EXPORT_SYMBOL_GPL(put_dax);
>  
> +bool dax_is_idle(struct dax_device *dax_dev)
> +{
> + struct inode *inode;
> +
> + if (!dax_dev)
> + return true;
> +
> + WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
> +   "dax idle check on live device.\n");
> +
> + inode = &dax_dev->inode;
> + return atomic_read(&inode->i_count) < 2;
> +}
> +EXPORT_SYMBOL_GPL(dax_is_idle);
> +
>  /**
>   * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
>   * @host: alternate name for the device registered by a dax driver
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index b8a85bfb2e95..e8822c9262ee 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -348,15 +348,21 @@ static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
>  {
>   struct request_queue *q =
>   container_of(pgmap->ref, struct request_queue, q_usage_counter);
> + struct pmem_device *pmem = q->queuedata;
>  
>   blk_freeze_queue_start(q);
> + kill_dax(pmem->dax_dev);
> + if (!dax_is_idle(pmem->dax_dev)) {
> + dev_warn(pmem->dev,
> +  "DAX active at remove, trigger mass memory failure\n");
> + dev_pagemap_failure(pgmap);
> + }
>  }
>  
>  static void pmem_release_disk(void *__pmem)
>  {
>   struct pmem_device *pmem = __pmem;
>  
> - kill_dax(pmem->dax_dev);
>   put_dax(pmem->dax_dev);
>   del_gendisk(pmem->disk);
>   put_disk(pmem->disk);
> @@ -406,6 +412,7 @@ static int pmem_attach_disk(struct device *dev,
>

Re: [PATCH] objtool,static_call: Don't emit static_call_site for .exit.text

2021-03-17 Thread Sumit Garg

On Thu, 18 Mar 2021 at 03:26, Jarkko Sakkinen  wrote:
>
> On Wed, Mar 17, 2021 at 07:07:07PM +0530, Sumit Garg wrote:
> > On Wed, 17 Mar 2021 at 18:16, Peter Zijlstra  wrote:
> > >
> > > On Wed, Mar 17, 2021 at 05:25:48PM +0530, Sumit Garg wrote:
> > > > Thanks Peter for this fix. It does work for me on qemu for x86. Can
> > > > you turn this into a proper fix patch? BTW, feel free to add:
> > >
> > > Per the below, the original patch ought to be fixed as well, to not use
> > > static_call() in __exit.
> >
> > Okay, fair enough.
> >
> > Jarkko,
> >
> > Can you please incorporate the following change to the original patch as 
> > well?
>
> Can you roll-out a proper patch of this?

Okay, I will post a separate patch for this.

-Sumit

>
> /Jarkko

[PATCH v3 2/2] arm64: dts: mt8183: Add kukui-jacuzzi-damu board

2021-03-17 Thread Hsin-Yi Wang

Damu is known as ASUS Chromebook Flip CM3.

Signed-off-by: Hsin-Yi Wang 
---
v2->v3: remove unused nodes
v1->v2: fix pp3300_panel regulator property
---
 arch/arm64/boot/dts/mediatek/Makefile |   1 +
 .../mediatek/mt8183-kukui-jacuzzi-damu.dts|  31 ++
 .../dts/mediatek/mt8183-kukui-jacuzzi.dtsi| 474 ++
 3 files changed, 506 insertions(+)
 create mode 100644 arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-damu.dts
 create mode 100644 arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi

diff --git a/arch/arm64/boot/dts/mediatek/Makefile 
b/arch/arm64/boot/dts/mediatek/Makefile
index deba27ab7657..554105d2c389 100644
--- a/arch/arm64/boot/dts/mediatek/Makefile
+++ b/arch/arm64/boot/dts/mediatek/Makefile
@@ -13,6 +13,7 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += mt8173-elm-hana.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8173-elm-hana-rev7.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8173-evb.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-evb.dtb
+dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-kukui-jacuzzi-damu.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-kukui-krane-sku0.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-kukui-krane-sku176.dtb
 dtb-$(CONFIG_ARCH_MEDIATEK) += mt8192-evb.dtb
diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-damu.dts 
b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-damu.dts
new file mode 100644
index 000000000000..42ba9c00866c
--- /dev/null
+++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi-damu.dts
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Copyright 2021 Google LLC
+ */
+
+/dts-v1/;
+#include "mt8183-kukui-jacuzzi.dtsi"
+
+/ {
+   model = "Google damu board";
+   compatible = "google,damu", "mediatek,mt8183";
+};
+
+&touchscreen {
+   status = "okay";
+
+   compatible = "hid-over-i2c";
+   reg = <0x10>;
+   interrupt-parent = <&pio>;
+   interrupts = <155 IRQ_TYPE_LEVEL_LOW>;
+   pinctrl-names = "default";
+   pinctrl-0 = <&touchscreen_pins>;
+
+   post-power-on-delay-ms = <10>;
+   hid-descr-addr = <0x0001>;
+};
+
+&qca_wifi {
+   qcom,ath10k-calibration-variant = "GO_DAMU";
+};
+
diff --git a/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi 
b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi
new file mode 100644
index ..4049dff8464b
--- /dev/null
+++ b/arch/arm64/boot/dts/mediatek/mt8183-kukui-jacuzzi.dtsi
@@ -0,0 +1,474 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * Copyright 2021 Google LLC
+ */
+
+#include "mt8183-kukui.dtsi"
+
+/ {
+   panel: panel {
+   compatible = "auo,b116xw03";
+   power-supply = <_panel>;
+   ddc-i2c-bus = <>;
+   backlight = <_lcd0>;
+
+   port {
+   panel_in: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
+   };
+
+   pp1200_mipibrdg: pp1200-mipibrdg {
+   compatible = "regulator-fixed";
+   regulator-name = "pp1200_mipibrdg";
+   pinctrl-names = "default";
+   pinctrl-0 = <_mipibrdg_en>;
+
+   enable-active-high;
+   regulator-boot-on;
+
+   gpio = < 54 GPIO_ACTIVE_HIGH>;
+   };
+
+   pp1800_mipibrdg: pp1800-mipibrdg {
+   compatible = "regulator-fixed";
+   regulator-name = "pp1800_mipibrdg";
+   pinctrl-names = "default";
+   pinctrl-0 = <_lcd_en>;
+
+   enable-active-high;
+   regulator-boot-on;
+
+   gpio = < 36 GPIO_ACTIVE_HIGH>;
+   };
+
+   pp3300_panel: pp3300-panel {
+   compatible = "regulator-fixed";
+   regulator-name = "pp3300_panel";
+   regulator-min-microvolt = <3300000>;
+   regulator-max-microvolt = <3300000>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_panel_pins>;
+
+   enable-active-high;
+   regulator-boot-on;
+
+   gpio = < 35 GPIO_ACTIVE_HIGH>;
+   };
+
+   vddio_mipibrdg: vddio-mipibrdg {
+   compatible = "regulator-fixed";
+   regulator-name = "vddio_mipibrdg";
+   pinctrl-names = "default";
+   pinctrl-0 = <_mipibrdg_en>;
+
+   enable-active-high;
+   regulator-boot-on;
+
+   gpio = < 37 GPIO_ACTIVE_HIGH>;
+   };
+
+   volume_buttons: volume-buttons {
+   compatible = "gpio-keys";
+   pinctrl-names = "default";
+   pinctrl-0 = <_button_pins>;
+
+   volume_down {
+   label = "Volume Down";
+   linux,code = <KEY_VOLUMEDOWN>;
+   debounce-interval = <100>;
+
+   gpios = < 6 GPIO_ACTIVE_LOW>;
+   };
+
+   volume_up {
+   label = "Volume Up";
+   linux,code = <KEY_VOLUMEUP>;
+   debounce-interval = <100>;
+

[PATCH v3 1/2] dt-bindings: arm64: dts: mediatek: Add mt8183-kukui-jacuzzi-damu

2021-03-17 Thread Hsin-Yi Wang

mt8183-kukui-jacuzzi-damu board also known as ASUS Chromebook Flip CM3,
using mediatek mt8183 SoC.

Signed-off-by: Hsin-Yi Wang 
Reviewed-by: Enric Balletbo i Serra 
---
 Documentation/devicetree/bindings/arm/mediatek.yaml | 4 
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/arm/mediatek.yaml 
b/Documentation/devicetree/bindings/arm/mediatek.yaml
index 93b3bdf6eaeb..a86716cdd408 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.yaml
+++ b/Documentation/devicetree/bindings/arm/mediatek.yaml
@@ -125,6 +125,10 @@ properties:
   - google,krane-sku176
   - const: google,krane
   - const: mediatek,mt8183
+  - description: Google Damu (ASUS Chromebook Flip CM3)
+items:
+  - const: google,damu
+  - const: mediatek,mt8183
 
 additionalProperties: true
 
-- 
2.31.0.rc2.261.g7f71774620-goog

linux-next: manual merge of the rust tree with the kbuild tree

2021-03-17 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the rust tree got a conflict in:

  Makefile

between commits:

  a14efe0d73eb ("kbuild: include Makefile.compiler only when compiler is 
needed")
  9fc2872b700a ("Makefile: Remove '--gcc-toolchain' flag")
  71eb5c859a59 ("Makefile: Only specify '--prefix=' when building with clang + 
GNU as")
  884a7fa1b090 ("kbuild: replace sed with $(subst ) or $(patsubst )")

from the kbuild tree and commit:

  c77c8025525c ("Rust support")

from the rust tree.

I fixed it up (see below - I think I got this right ...) and can carry
the fix as necessary. This is now fixed as far as linux-next is
concerned, but any non trivial conflicts should be mentioned to your
upstream maintainer when your tree is submitted for merging.  You may
also want to consider cooperating with the maintainer of the
conflicting tree to minimise any particularly complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc Makefile
index cc5b7e39fde4,6e6562591a91..
--- a/Makefile
+++ b/Makefile
@@@ -263,11 -270,7 +270,11 @@@ no-dot-config-targets := $(clean-target
 cscope gtags TAGS tags help% %docs check% coccicheck \
 $(version_h) headers headers_% archheaders archscripts 
\
 %asm-generic kernelversion %src-pkg dt_binding_check \
-outputmakefile
+outputmakefile rustfmt rustfmtcheck
 +# Installation targets should not require compiler. Unfortunately, 
vdso_install
 +# is an exception where build artifacts may be updated. This must be fixed.
 +no-compiler-targets := $(no-dot-config-targets) install dtbs_install \
 +  headers_install modules_install kernelrelease image_name
  no-sync-config-targets := $(no-dot-config-targets) %install kernelrelease \
  image_name
  single-targets := %.a %.i %.ko %.lds %.ll %.lst %.mod %.o %.s %.symtypes %/
@@@ -570,22 -588,31 +599,27 @@@ ifdef building_out_of_srctre
{ echo "# this is build directory, ignore it"; echo "*"; } > .gitignore
  endif
  
+ TENTATIVE_CLANG_FLAGS := -Werror=unknown-warning-option
+ 
+ ifneq ($(CROSS_COMPILE),)
+ TENTATIVE_CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
 -GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
 -TENTATIVE_CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir 
$(CROSS_COMPILE))
 -GCC_TOOLCHAIN := $(realpath $(GCC_TOOLCHAIN_DIR)/..)
 -endif
 -ifneq ($(GCC_TOOLCHAIN),)
 -TENTATIVE_CLANG_FLAGS += --gcc-toolchain=$(GCC_TOOLCHAIN)
+ endif
+ ifneq ($(LLVM_IAS),1)
+ TENTATIVE_CLANG_FLAGS += -no-integrated-as
++GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
++TENTATIVE_CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir 
$(CROSS_COMPILE))
+ endif
+ 
+ export TENTATIVE_CLANG_FLAGS
+ 
  # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included.
  # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile.
  # CC_VERSION_TEXT is referenced from Kconfig (so it needs export),
  # and from include/config/auto.conf.cmd to detect the compiler upgrade.
 -CC_VERSION_TEXT = $(shell $(CC) --version 2>/dev/null | head -n 1 | sed 
's/\#//g')
 +CC_VERSION_TEXT = $(subst $(pound),,$(shell $(CC) --version 2>/dev/null | 
head -n 1))
  
  ifneq ($(findstring clang,$(CC_VERSION_TEXT)),)
- ifneq ($(CROSS_COMPILE),)
- CLANG_FLAGS   += --target=$(notdir $(CROSS_COMPILE:%-=%))
- endif
- ifneq ($(LLVM_IAS),1)
- CLANG_FLAGS   += -no-integrated-as
- GCC_TOOLCHAIN_DIR := $(dir $(shell which $(CROSS_COMPILE)elfedit))
- CLANG_FLAGS   += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
- endif
- CLANG_FLAGS   += -Werror=unknown-warning-option
+ CLANG_FLAGS   += $(TENTATIVE_CLANG_FLAGS)
  KBUILD_CFLAGS += $(CLANG_FLAGS)
  KBUILD_AFLAGS += $(CLANG_FLAGS)
  export CLANG_FLAGS


pgphzn_wsJnwk.pgp
Description: OpenPGP digital signature

Re: kmap_local semantics

2021-03-17 Thread Christoph Hellwig

On Wed, Mar 17, 2021 at 07:44:20PM -0700, Ira Weiny wrote:
> Agreed.  I'm Sorry, I did not word the above clearly enough.  Let me rephrase
> that.
> 
> Christoph, do you anticipate additional need to call kmap and hand the 
> mappings
> to other threads?  If not then kmap_local is what you should use.  If so, I'd
> like to know why.

No.  Just looking into callers that want a mapping and then synchronously
wait for I/O.

Re: [PATCH v2 0/3] perf-stat: share hardware PMCs with BPF

2021-03-17 Thread Namhyung Kim

On Thu, Mar 18, 2021 at 12:52 PM Song Liu  wrote:
>
>
>
> > On Mar 17, 2021, at 6:11 AM, Arnaldo Carvalho de Melo  
> > wrote:
> >
> > Em Wed, Mar 17, 2021 at 02:29:28PM +0900, Namhyung Kim escreveu:
> >> Hi Song,
> >>
> >> On Wed, Mar 17, 2021 at 6:18 AM Song Liu  wrote:
> >>>
> >>> perf uses performance monitoring counters (PMCs) to monitor system
> >>> performance. The PMCs are limited hardware resources. For example,
> >>> Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
> >>>
> >>> Modern data center systems use these PMCs in many different ways:
> >>> system level monitoring, (maybe nested) container level monitoring, per
> >>> process monitoring, profiling (in sample mode), etc. In some cases,
> >>> there are more active perf_events than available hardware PMCs. To allow
> >>> all perf_events to have a chance to run, it is necessary to do expensive
> >>> time multiplexing of events.
> >>>
> >>> On the other hand, many monitoring tools count the common metrics (cycles,
> >>> instructions). It is a waste to have multiple tools create multiple
> >>> perf_events of "cycles" and occupy multiple PMCs.
> >>
> >> Right, it'd be really helpful when the PMCs are frequently or mostly 
> >> shared.
> >> But it'd also increase the overhead for uncontended cases as BPF programs
> >> need to run on every context switch.  Depending on the workload, it may
> >> cause a non-negligible performance impact.  So users should be aware of it.
> >
> > Would be interesting to, humm, measure both cases to have a firm number
> > of the impact, how many instructions are added when sharing using
> > --bpf-counters?
> >
> > I.e. compare the "expensive time multiplexing of events" with its
> > avoidance by using --bpf-counters.
> >
> > Song, have you performed such measurements?
>
> I have got some measurements with perf-bench-sched-messaging:
>
> The system: x86_64 with 23 cores (46 HT)
>
> The perf-stat command:
> perf stat -e cycles,cycles,instructions,instructions,ref-cycles,ref-cycles 
> 
>
> The benchmark command and output:
> ./perf bench sched messaging -g 40 -l 5 -t
> # Running 'sched/messaging' benchmark:
> # 20 sender and receiver threads per group
> # 40 groups == 1600 threads run
>  Total time: 10X.XXX [sec]
>
>
> I use the "Total time" as measurement, so smaller number is better.
>
> For each condition, I run the command 5 times, and took the median of
> "Total time".
>
> Baseline (no perf-stat) 104.873 [sec]
> # global
> perf stat -a107.887 [sec]
> perf stat -a --bpf-counters 106.071 [sec]
> # per task
> perf stat   106.314 [sec]
> perf stat --bpf-counters105.965 [sec]
> # per cpu
> perf stat -C 1,3,5  107.063 [sec]
> perf stat -C 1,3,5 --bpf-counters   106.406 [sec]
>
> From the data, --bpf-counters is slightly better than the regular event
> for all targets. I noticed that the results are not very stable. There
> are a couple 108.xx runs in some of the conditions (w/ and w/o
> --bpf-counters).

Hmm.. so this result is when multiplexing happened, right?
I wondered how/why the regular perf stat is slower..

Thanks,
Namhyung

>
>
> I also measured the average runtime of the BPF programs, with
>
> sysctl kernel.bpf_stats_enabled=1
>
> For each event, if we have one leader and two followers, the total run
> time is about 340ns. IOW, 340ns for two perf-stat reading instructions,
> 340ns for two perf-stat reading cycles, etc.
>
> Thanks,
> Song

[tip:sched/core] BUILD SUCCESS 90f093fa8ea48e5d991332cee160b761423d55c1

2021-03-17 Thread kernel test robot

  defconfig
s390 allyesconfig
s390 allmodconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
i386   tinyconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a006-20210317
x86_64   randconfig-a001-20210317
x86_64   randconfig-a005-20210317
x86_64   randconfig-a004-20210317
x86_64   randconfig-a003-20210317
x86_64   randconfig-a002-20210317
i386 randconfig-a001-20210317
i386 randconfig-a005-20210317
i386 randconfig-a002-20210317
i386 randconfig-a003-20210317
i386 randconfig-a004-20210317
i386 randconfig-a006-20210317
i386 randconfig-a013-20210317
i386 randconfig-a016-20210317
i386 randconfig-a011-20210317
i386 randconfig-a012-20210317
i386 randconfig-a015-20210317
i386 randconfig-a014-20210317
riscvnommu_k210_defconfig
riscvnommu_virt_defconfig
riscv   defconfig
riscv  rv32_defconfig
x86_64rhel-7.6-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a011-20210317
x86_64   randconfig-a016-20210317
x86_64   randconfig-a013-20210317
x86_64   randconfig-a014-20210317
x86_64   randconfig-a015-20210317
x86_64   randconfig-a012-20210317

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org

Re: [PATCH v4 2/2] drm/bridge: anx7625: disable regulators when power off

2021-03-17 Thread Hsin-Yi Wang

On Wed, Feb 24, 2021 at 2:14 PM Hsin-Yi Wang  wrote:
>
> When suspending the driver, anx7625_power_standby() will be called to
> turn off reset-gpios and enable-gpios. However, power supplies are not
> disabled. To save power, the driver can get the power supply regulators
> and turn off them in anx7625_power_standby().
>
> Signed-off-by: Hsin-Yi Wang 
> Reviewed-by: Robert Foss 
> ---

Ping on the thread, thanks.

>  drivers/gpu/drm/bridge/analogix/anx7625.c | 34 +++
>  drivers/gpu/drm/bridge/analogix/anx7625.h |  1 +
>  2 files changed, 35 insertions(+)
>
> diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c 
> b/drivers/gpu/drm/bridge/analogix/anx7625.c
> index 65cc05982f826..23283ba0c4f93 100644
> --- a/drivers/gpu/drm/bridge/analogix/anx7625.c
> +++ b/drivers/gpu/drm/bridge/analogix/anx7625.c
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/regulator/consumer.h>
>  #include 
>  #include 
>  #include 
> @@ -875,12 +876,25 @@ static int sp_tx_edid_read(struct anx7625_data *ctx,
>  static void anx7625_power_on(struct anx7625_data *ctx)
>  {
> struct device *dev = &ctx->client->dev;
> +   int ret, i;
>
> if (!ctx->pdata.low_power_mode) {
> DRM_DEV_DEBUG_DRIVER(dev, "not low power mode!\n");
> return;
> }
>
> +   for (i = 0; i < ARRAY_SIZE(ctx->pdata.supplies); i++) {
> +   ret = regulator_enable(ctx->pdata.supplies[i].consumer);
> +   if (ret < 0) {
> +   DRM_DEV_DEBUG_DRIVER(dev, "cannot enable supply %d: 
> %d\n",
> +i, ret);
> +   goto reg_err;
> +   }
> +   usleep_range(2000, 2100);
> +   }
> +
> +   usleep_range(4000, 4100);
> +
> /* Power on pin enable */
> gpiod_set_value(ctx->pdata.gpio_p_on, 1);
> usleep_range(10000, 11000);
> @@ -889,11 +903,16 @@ static void anx7625_power_on(struct anx7625_data *ctx)
> usleep_range(10000, 11000);
>
> DRM_DEV_DEBUG_DRIVER(dev, "power on !\n");
> +   return;
> +reg_err:
> +   for (--i; i >= 0; i--)
> +   regulator_disable(ctx->pdata.supplies[i].consumer);
>  }
>
>  static void anx7625_power_standby(struct anx7625_data *ctx)
>  {
> struct device *dev = &ctx->client->dev;
> +   int ret;
>
> if (!ctx->pdata.low_power_mode) {
> DRM_DEV_DEBUG_DRIVER(dev, "not low power mode!\n");
> @@ -904,6 +923,12 @@ static void anx7625_power_standby(struct anx7625_data 
> *ctx)
> usleep_range(1000, 1100);
> gpiod_set_value(ctx->pdata.gpio_p_on, 0);
> usleep_range(1000, 1100);
> +
> +   ret = regulator_bulk_disable(ARRAY_SIZE(ctx->pdata.supplies),
> +ctx->pdata.supplies);
> +   if (ret < 0)
> +   DRM_DEV_DEBUG_DRIVER(dev, "cannot disable supplies %d\n", 
> ret);
> +
> DRM_DEV_DEBUG_DRIVER(dev, "power down\n");
>  }
>
> @@ -1742,6 +1767,15 @@ static int anx7625_i2c_probe(struct i2c_client *client,
> platform->client = client;
> i2c_set_clientdata(client, platform);
>
> +   pdata->supplies[0].supply = "vdd10";
> +   pdata->supplies[1].supply = "vdd18";
> +   pdata->supplies[2].supply = "vdd33";
> +   ret = devm_regulator_bulk_get(dev, ARRAY_SIZE(pdata->supplies),
> + pdata->supplies);
> +   if (ret) {
> +   DRM_DEV_ERROR(dev, "fail to get power supplies: %d\n", ret);
> +   return ret;
> +   }
> anx7625_init_gpio(platform);
>
> atomic_set(&platform->power_status, 0);
> diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.h 
> b/drivers/gpu/drm/bridge/analogix/anx7625.h
> index 193ad86c54503..e4a086b3a3d7b 100644
> --- a/drivers/gpu/drm/bridge/analogix/anx7625.h
> +++ b/drivers/gpu/drm/bridge/analogix/anx7625.h
> @@ -350,6 +350,7 @@ struct s_edid_data {
>  struct anx7625_platform_data {
> struct gpio_desc *gpio_p_on;
> struct gpio_desc *gpio_reset;
> +   struct regulator_bulk_data supplies[3];
> struct drm_bridge *panel_bridge;
> int intp_irq;
> u32 low_power_mode;
> --
> 2.30.1.766.gb4fecdf3b7-goog
>

Re: [PATCH v6 7/8] Documentation: Add documentation for the Brute LSM

2021-03-17 Thread Kees Cook

On Sun, Mar 07, 2021 at 12:30:30PM +0100, John Wood wrote:
> Add some info detailing what is the Brute LSM, its motivation, weak
> points of existing implementations, proposed solutions, enabling,
> disabling and self-tests.
> 
> Signed-off-by: John Wood 
> ---
>  Documentation/admin-guide/LSM/Brute.rst | 278 
>  Documentation/admin-guide/LSM/index.rst |   1 +
>  security/brute/Kconfig  |   3 +-
>  3 files changed, 281 insertions(+), 1 deletion(-)
>  create mode 100644 Documentation/admin-guide/LSM/Brute.rst
> 
> diff --git a/Documentation/admin-guide/LSM/Brute.rst 
> b/Documentation/admin-guide/LSM/Brute.rst
> new file mode 100644
> index 000000000000..ca80aef9aa67
> --- /dev/null
> +++ b/Documentation/admin-guide/LSM/Brute.rst
> @@ -0,0 +1,278 @@
> +.. SPDX-License-Identifier: GPL-2.0
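> +===========================================================
> +Brute: Fork brute force attack detection and mitigation LSM
> +===========================================================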
> +
> +Attacks against vulnerable userspace applications with the purpose to break 
> ASLR
> +or bypass canaries traditionally use some level of brute force with the help 
> of
> +the fork system call. This is possible since when creating a new process 
> using
> +fork its memory contents are the same as those of the parent process (the
> +process that called the fork system call). So, the attacker can test the 
> memory
> +infinite times to find the correct memory values or the correct memory 
> addresses
> +without worrying about crashing the application.
> +
> +Based on the above scenario it would be nice to have this detected and
> +mitigated, and this is the goal of this implementation. Specifically the
> +following attacks are expected to be detected:
> +
> +1.- Launching (fork()/exec()) a setuid/setgid process repeatedly until a
> +desirable memory layout is got (e.g. Stack Clash).
> +2.- Connecting to an exec()ing network daemon (e.g. xinetd) repeatedly until 
> a
> +desirable memory layout is got (e.g. what CTFs do for simple network
> +service).
> +3.- Launching processes without exec() (e.g. Android Zygote) and exposing 
> state
> +to attack a sibling.
> +4.- Connecting to a fork()ing network daemon (e.g. apache) repeatedly until 
> the
> +previously shared memory layout of all the other children is exposed 
> (e.g.
> +kind of related to HeartBleed).
> +
> +In each case, a privilege boundary has been crossed:
> +
> +Case 1: setuid/setgid process
> +Case 2: network to local
> +Case 3: privilege changes
> +Case 4: network to local
> +
> +So, what really needs to be detected are fork/exec brute force attacks that
> +cross any of the commented bounds.
> +
> +
> +Other implementations
> +=====================
> +
> +The public version of grsecurity, as a summary, is based on the idea of 
> delaying
> +the fork system call if a child died due to some fatal signal (SIGSEGV, 
> SIGBUS,
> +SIGKILL or SIGILL). This has some issues:
> +
> +Bad practices
> +-------------
> +
> +Adding delays to the kernel is, in general, a bad idea.
> +
> +Scenarios not detected (false negatives)
> +----------------------------------------
> +
> +This protection acts only when the fork system call is called after a child 
> has
> +crashed. So, it would still be possible for an attacker to fork a big amount 
> of
> +children (in the order of thousands), then probe all of them, and finally 
> wait
> +the protection time before repeating the steps.
> +
> +Moreover, this method is based on the idea that the protection doesn't act if
> +the parent crashes. So, it would still be possible for an attacker to fork a
> +process and probe itself. Then, fork the child process and probe itself 
> again.
> +This way, these steps can be repeated infinite times without any mitigation.
> +
> +Scenarios detected (false positives)
> +
> +------------------------------------
> +Scenarios where an application rarely fails for reasons unrelated to a real
> +attack.
> +
> +
> +This implementation
> +===================
> +
> +The main idea behind this implementation is to improve the existing ones
> +focusing on the weak points annotated before. Basically, the adopted 
> solution is
> +to detect a fast crash rate instead of only one simple crash and to detect 
> both
> +the crash of parent and child processes. Also, fine tune the detection 
> focusing
> +on privilege boundary crossing. And finally, as a mitigation method, kill all
> +the offending tasks involved in the attack instead of using delays.
> +
> +To achieve this goal, and going into more details, this implementation is 
> based
> +on the use of some statistical data shared across all the processes that can
> +have the same memory contents. Or in other words, a statistical data shared
> +between all the fork hierarchy processes after an execve system call.
> +
> +The purpose of these statistics is, basically, collect all the necessary info
> +to compute

Re: [PATCH v6 6/8] selftests/brute: Add tests for the Brute LSM

2021-03-17 Thread Kees Cook

On Sun, Mar 07, 2021 at 12:30:29PM +0100, John Wood wrote:
> Add tests to check the brute LSM functionality and cover fork/exec brute
> force attacks crossing the following privilege boundaries:
> 
> 1.- setuid process
> 2.- privilege changes
> 3.- network to local
> 
> Also, as a first step check that fork/exec brute force attacks without
> crossing any privilege boundary already commented doesn't trigger the
> detection and mitigation stage.
> 
> All the fork brute force attacks are carried out via the "exec" app to
> avoid the triggering of the "brute" LSM over the shell script running
> the tests.
> 
> Signed-off-by: John Wood 

Yay tests!

> ---
>  tools/testing/selftests/Makefile |   1 +
>  tools/testing/selftests/brute/.gitignore |   2 +
>  tools/testing/selftests/brute/Makefile   |   5 +
>  tools/testing/selftests/brute/config |   1 +
>  tools/testing/selftests/brute/exec.c |  44 ++
>  tools/testing/selftests/brute/test.c | 507 +++
>  tools/testing/selftests/brute/test.sh| 226 ++
>  7 files changed, 786 insertions(+)
>  create mode 100644 tools/testing/selftests/brute/.gitignore
>  create mode 100644 tools/testing/selftests/brute/Makefile
>  create mode 100644 tools/testing/selftests/brute/config
>  create mode 100644 tools/testing/selftests/brute/exec.c
>  create mode 100644 tools/testing/selftests/brute/test.c
>  create mode 100755 tools/testing/selftests/brute/test.sh
> 
> diff --git a/tools/testing/selftests/Makefile 
> b/tools/testing/selftests/Makefile
> index 6c575cf34a71..d4cf9e1c0a6d 100644
> --- a/tools/testing/selftests/Makefile
> +++ b/tools/testing/selftests/Makefile
> @@ -2,6 +2,7 @@
>  TARGETS = arm64
>  TARGETS += bpf
>  TARGETS += breakpoints
> +TARGETS += brute
>  TARGETS += capabilities
>  TARGETS += cgroup
>  TARGETS += clone3
> diff --git a/tools/testing/selftests/brute/.gitignore 
> b/tools/testing/selftests/brute/.gitignore
> new file mode 100644
> index ..1ccc45251a1b
> --- /dev/null
> +++ b/tools/testing/selftests/brute/.gitignore
> @@ -0,0 +1,2 @@
> +exec
> +test
> diff --git a/tools/testing/selftests/brute/Makefile 
> b/tools/testing/selftests/brute/Makefile
> new file mode 100644
> index ..52662d0b484c
> --- /dev/null
> +++ b/tools/testing/selftests/brute/Makefile
> @@ -0,0 +1,5 @@
> +# SPDX-License-Identifier: GPL-2.0
> +CFLAGS += -Wall -O2
> +TEST_PROGS := test.sh
> +TEST_GEN_FILES := exec test
> +include ../lib.mk
> diff --git a/tools/testing/selftests/brute/config 
> b/tools/testing/selftests/brute/config
> new file mode 100644
> index ..3587b7bf6c23
> --- /dev/null
> +++ b/tools/testing/selftests/brute/config
> @@ -0,0 +1 @@
> +CONFIG_SECURITY_FORK_BRUTE=y
> diff --git a/tools/testing/selftests/brute/exec.c 
> b/tools/testing/selftests/brute/exec.c
> new file mode 100644
> index ..1bbe72f6e4bd
> --- /dev/null
> +++ b/tools/testing/selftests/brute/exec.c
> @@ -0,0 +1,44 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static __attribute__((noreturn)) void error_failure(const char *message)
> +{
> + perror(message);
> + exit(EXIT_FAILURE);
> +}
> +
> +#define PROG_NAME basename(argv[0])
> +
> +int main(int argc, char **argv)
> +{
> + pid_t pid;
> + int status;
> +
> + if (argc < 2) {
> + printf("Usage: %s \n", PROG_NAME);
> + exit(EXIT_FAILURE);
> + }
> +
> + pid = fork();
> + if (pid < 0)
> + error_failure("fork");
> +
> + /* Child process */
> + if (!pid) {
> + execve(argv[1], &argv[1], NULL);
> + error_failure("execve");
> + }
> +
> + /* Parent process */
> + pid = waitpid(pid, &status, 0);
> + if (pid < 0)
> + error_failure("waitpid");
> +
> + return EXIT_SUCCESS;
> +}
> diff --git a/tools/testing/selftests/brute/test.c 
> b/tools/testing/selftests/brute/test.c
> new file mode 100644
> index ..44c32f446dca
> --- /dev/null
> +++ b/tools/testing/selftests/brute/test.c
> @@ -0,0 +1,507 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static const char *message = "message";
> +
> +enum mode {
> + MODE_NONE,
> + MODE_CRASH,
> + MODE_SERVER_CRASH,
> + MODE_CLIENT,
> +};
> +
> +enum crash_after {
> + CRASH_AFTER_NONE,
> + CRASH_AFTER_FORK,
> + CRASH_AFTER_EXEC,
> +};
> +
> +enum signal_from {
> + SIGNAL_FROM_NONE,
> + SIGNAL_FROM_USER,
> + SIGNAL_FROM_KERNEL,
> +};
> +
> +struct args {
> + uint32_t ip;
> + uint16_t port;
> + int counter;
> + long timeout;
> + enum mode mode;
> + enum crash_after crash_after;
> + enum signal_from signal_from;
> + unsigned char has_counter : 1;
> + unsigned

[PATCH 2/3] mm, dax, pmem: Introduce dev_pagemap_failure()

2021-03-17 Thread Dan Williams

Jason wondered why the get_user_pages_fast() path takes references on a
@pgmap object. The rationale was to protect against accessing a 'struct
page' that might be in the process of being removed by the driver, but
he rightly points out that should be solved the same way all gup-fast
synchronization is solved which is invalidate the mapping and let the
gup slow path do @pgmap synchronization [1].

To achieve that it means that new user mappings need to stop being
created and all existing user mappings need to be invalidated.

For device-dax this is already the case as kill_dax() prevents future
faults from installing a pte, and the single device-dax inode
address_space can be trivially unmapped.

The situation is different for filesystem-dax where device pages could
be mapped by any number of inode address_space instances. An initial
thought was to treat the device removal event like a drop_pagecache_sb()
event that walks superblocks and unmaps all inodes. However, Dave points
out that it is not just the filesystem user-mappings that need to react
to global DAX page-unmap events, it is also filesystem metadata
(proposed DAX metadata access), and other drivers (upstream
DM-writecache) that need to react to this event [2].

The only kernel facility that is meant to globally broadcast the loss of
a page (via corruption or surprise remove) is memory_failure(). The
downside of memory_failure() is that it is a pfn-at-a-time interface.
However, the events that would trigger the need to call memory_failure()
over a full PMEM device should be rare. Remove should always be
coordinated by the administrator with the filesystem. If someone force
removes a device from underneath a mounted filesystem the driver assumes
they have a good reason, or otherwise get to keep the pieces. Since
->remove() callbacks can not fail the only option is to trigger the mass
memory_failure().

The mechanism to determine whether memory_failure() triggers at
pmem->remove() time is whether the associated dax_device has an elevated
reference at @pgmap ->kill() time.

With this in place the get_user_pages_fast() path can drop its
half-measure synchronization with an @pgmap reference.

Link: http://lore.kernel.org/r/20210224010017.gq2643...@ziepe.ca [1]
Link: http://lore.kernel.org/r/20210302075736.gj4...@dread.disaster.area [2]
Reported-by: Jason Gunthorpe 
Cc: Dave Chinner 
Cc: Christoph Hellwig 
Cc: Shiyang Ruan 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Cc: Matthew Wilcox 
Cc: Jan Kara 
Cc: Andrew Morton 
Cc: Naoya Horiguchi 
Cc: "Darrick J. Wong" 
Signed-off-by: Dan Williams 
---
 drivers/dax/super.c  |   15 +++
 drivers/nvdimm/pmem.c|   10 +-
 drivers/nvdimm/pmem.h|1 +
 include/linux/dax.h  |5 +
 include/linux/memremap.h |5 +
 include/linux/mm.h   |3 +++
 mm/memory-failure.c  |   11 +--
 mm/memremap.c|   11 +++
 8 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 5fa6ae9dbc8b..5ebcedf4a68c 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -624,6 +624,21 @@ void put_dax(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(put_dax);
 
+bool dax_is_idle(struct dax_device *dax_dev)
+{
+   struct inode *inode;
+
+   if (!dax_dev)
+   return true;
+
+   WARN_ONCE(test_bit(DAXDEV_ALIVE, &dax_dev->flags),
+ "dax idle check on live device.\n");
+
+   inode = &dax_dev->inode;
+   return atomic_read(&inode->i_count) < 2;
+}
+EXPORT_SYMBOL_GPL(dax_is_idle);
+
 /**
  * dax_get_by_host() - temporary lookup mechanism for filesystem-dax
  * @host: alternate name for the device registered by a dax driver
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b8a85bfb2e95..e8822c9262ee 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -348,15 +348,21 @@ static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
 {
struct request_queue *q =
container_of(pgmap->ref, struct request_queue, q_usage_counter);
+   struct pmem_device *pmem = q->queuedata;
 
blk_freeze_queue_start(q);
+   kill_dax(pmem->dax_dev);
+   if (!dax_is_idle(pmem->dax_dev)) {
+   dev_warn(pmem->dev,
+"DAX active at remove, trigger mass memory failure\n");
+   dev_pagemap_failure(pgmap);
+   }
 }
 
 static void pmem_release_disk(void *__pmem)
 {
struct pmem_device *pmem = __pmem;
 
-   kill_dax(pmem->dax_dev);
put_dax(pmem->dax_dev);
del_gendisk(pmem->disk);
put_disk(pmem->disk);
@@ -406,6 +412,7 @@ static int pmem_attach_disk(struct device *dev,
devm_namespace_disable(dev, ndns);
 
dev_set_drvdata(dev, pmem);
+   pmem->dev = dev;
pmem->phys_addr = res->start;
pmem->size = resource_size(res);
fua = nvdimm_has_flush(nd_region);
@@ -467,6 +474,7 @@ static int pmem_attach_disk(struct device

[PATCH 3/3] mm/devmap: Remove pgmap accounting in the get_user_pages_fast() path

2021-03-17 Thread Dan Williams

Now that device-dax and filesystem-dax are guaranteed to unmap all user
mappings of devmap / DAX pages before tearing down the 'struct page'
array, get_user_pages_fast() can rely on its traditional synchronization
method "validate_pte(); get_page(); revalidate_pte()" to catch races with
device shutdown. Specifically the unmap guarantee ensures that gup-fast
either succeeds in taking a page reference (lock-less), or it detects a
need to fall back to the slow path where the device presence can be
revalidated with locks held.

Reported-by: Jason Gunthorpe 
Cc: Christoph Hellwig 
Cc: Shiyang Ruan 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: Ira Weiny 
Cc: Matthew Wilcox 
Cc: Jan Kara 
Cc: Andrew Morton 
Signed-off-by: Dan Williams 
---
 mm/gup.c |   38 --
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index e40579624f10..dfeb47e4e8d4 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1996,9 +1996,8 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int 
nr_start,
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 unsigned int flags, struct page **pages, int *nr)
 {
-   struct dev_pagemap *pgmap = NULL;
-   int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
+   int ret = 0;
 
ptem = ptep = pte_offset_map(&pmd, addr);
do {
@@ -2015,16 +2014,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
 
-   if (pte_devmap(pte)) {
-   if (unlikely(flags & FOLL_LONGTERM))
-   goto pte_unmap;
+   if (pte_devmap(pte) && (flags & FOLL_LONGTERM))
+   goto pte_unmap;
 
-   pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
-   if (unlikely(!pgmap)) {
-   undo_dev_pagemap(nr, nr_start, flags, pages);
-   goto pte_unmap;
-   }
-   } else if (pte_special(pte))
+   if (pte_special(pte))
goto pte_unmap;
 
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -2063,8 +2056,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
ret = 1;
 
 pte_unmap:
-   if (pgmap)
-   put_dev_pagemap(pgmap);
pte_unmap(ptem);
return ret;
 }
@@ -2087,21 +2078,26 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
 
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
 unsigned long end, unsigned int flags,
 struct page **pages, int *nr)
 {
int nr_start = *nr;
-   struct dev_pagemap *pgmap = NULL;
 
do {
-   struct page *page = pfn_to_page(pfn);
+   struct page *page;
+
+   /*
+* Typically pfn_to_page() on a devmap pfn is not safe
+* without holding a live reference on the hosting
+* pgmap. In the gup-fast path it is safe because any
+* races will be resolved by either gup-fast taking a
+* reference or the shutdown path unmapping the pte to
+* trigger gup-fast to fall back to the slow path.
+*/
+   page = pfn_to_page(pfn);
 
-   pgmap = get_dev_pagemap(pfn, pgmap);
-   if (unlikely(!pgmap)) {
-   undo_dev_pagemap(nr, nr_start, flags, pages);
-   return 0;
-   }
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
@@ -2112,8 +2108,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned 
long addr,
pfn++;
} while (addr += PAGE_SIZE, addr != end);
 
-   if (pgmap)
-   put_dev_pagemap(pgmap);
return 1;
 }

[PATCH 1/3] mm/memory-failure: Prepare for mass memory_failure()

2021-03-17 Thread Dan Williams

Currently memory_failure() assumes an infrequent report on a handful of
pages. A new use case for surprise removal of a persistent memory device
needs to trigger memory_failure() on a large range. Rate limit
memory_failure() error logging, and allow the
memory_failure_dev_pagemap() helper to be called directly.

Cc: Naoya Horiguchi 
Cc: Andrew Morton 
Signed-off-by: Dan Williams 
---
 mm/memory-failure.c |   25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 24210c9bd843..43ba4307c526 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -395,8 +395,9 @@ static void kill_procs(struct list_head *to_kill, int 
forcekill, bool fail,
 * signal and then access the memory. Just kill it.
 */
if (fail || tk->addr == -EFAULT) {
-   pr_err("Memory failure: %#lx: forcibly killing 
%s:%d because of failure to unmap corrupted page\n",
-  pfn, tk->tsk->comm, tk->tsk->pid);
+   pr_err_ratelimited(
+   "Memory failure: %#lx: forcibly killing 
%s:%d because of failure to unmap corrupted page\n",
+   pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
 tk->tsk, PIDTYPE_PID);
}
@@ -408,8 +409,9 @@ static void kill_procs(struct list_head *to_kill, int 
forcekill, bool fail,
 * process anyways.
 */
else if (kill_proc(tk, pfn, flags) < 0)
-   pr_err("Memory failure: %#lx: Cannot send 
advisory machine check signal to %s:%d\n",
-  pfn, tk->tsk->comm, tk->tsk->pid);
+   pr_err_ratelimited(
+   "Memory failure: %#lx: Cannot send 
advisory machine check signal to %s:%d\n",
+   pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
kfree(tk);
@@ -919,8 +921,8 @@ static void action_result(unsigned long pfn, enum 
mf_action_page_type type,
 {
trace_memory_failure_event(pfn, type, result);
 
-   pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
-   pfn, action_page_types[type], action_name[result]);
+   pr_err_ratelimited("Memory failure: %#lx: recovery action for %s: %s\n",
+  pfn, action_page_types[type], action_name[result]);
 }
 
 static int page_action(struct page_state *ps, struct page *p,
@@ -1375,8 +1377,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, 
int flags,
 unlock:
dax_unlock_page(page, cookie);
 out:
-   /* drop pgmap ref acquired in caller */
-   put_dev_pagemap(pgmap);
action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED);
return rc;
 }
@@ -1415,9 +1415,12 @@ int memory_failure(unsigned long pfn, int flags)
if (!p) {
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
-   if (pgmap)
-   return memory_failure_dev_pagemap(pfn, flags,
- pgmap);
+   if (pgmap) {
+   res = memory_failure_dev_pagemap(pfn, flags,
+pgmap);
+   put_dev_pagemap(pgmap);
+   return res;
+   }
}
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);

Re: [PATCH v2] ARM: dts: imx6ull: fix ubi filesystem mount failed

2021-03-17 Thread Shawn Guo

On Wed, Mar 17, 2021 at 11:45:09PM +0800, dillon.min...@gmail.com wrote:
> From: dillon min 
> 
> For NAND Ecc layout, there is a dependency from old kernel's nand driver
> setting and current. if old kernel use 4 bit ecc , we should use 4 bit
> in new kernel either. else will run into following error at filesystem
> mounting.
> 
> So, enable fsl,use-minimum-ecc from device tree, to fix this mismatch
> 
> [9.449265] ubi0: scanning is finished
> [9.463968] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading
> 22528 bytes from PEB 513:4096, read only 22528 bytes, retry
> [9.486940] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading
> 22528 bytes from PEB 513:4096, read only 22528 bytes, retry
> [9.509906] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading
> 22528 bytes from PEB 513:4096, read only 22528 bytes, retry
> [9.532845] ubi0 error: ubi_io_read: error -74 (ECC error) while reading
> 22528 bytes from PEB 513:4096, read 22528 bytes
> 
> Fixes: f9ecf10cb88c ("ARM: dts: imx6ull: add MYiR MYS-6ULX SBC")
> Signed-off-by: dillon min 
> Reviewed-by: Fabio Estevam 
> Signed-off-by: Shawn Guo 

Replaced with this version.

Shawn

[PATCH 0/3] mm, pmem: Force unmap pmem on surprise remove

2021-03-17 Thread Dan Williams

Summary:

A dax_dev can be unbound from its driver at any time. Unbind can not
fail. The driver-core will always trigger ->remove() and the result from
->remove() is ignored. After ->remove() the driver-core proceeds to tear
down context. The filesystem-dax implementation can leave pfns mapped
after ->remove() if it is triggered while the filesystem is mounted.
Security and data-integrity is forfeit if the dax_dev is repurposed for
another security domain (new filesystem or change device modes), or if
the dax_dev is physically replaced. CXL is a hotplug bus that makes
dax_dev physical replace a real world prospect. 

All dax_dev pfns must be unmapped at remove. Detect the "remove while
mounted" case and trigger memory_failure() over the entire dax_dev
range.

Details:

The get_user_pages_fast() path expects all synchronization to be handled
by the pattern of checking for pte presence, taking a page reference,
and then validating that the pte was stable over that event. The
gup-fast path for devmap / DAX pages additionally attempts to take/hold
a live reference against the hosting pgmap over the page pin. The
rationale for the pgmap reference is to synchronize against a dax-device
unbind / ->remove() event, but that is unnecessary if pte invalidation
is guaranteed in the ->remove() path.

Global dax-device pte invalidation *does* happen when the device is in
raw "device-dax" mode where there is a single shared inode to unmap at
remove, but the filesystem-dax path has a large number of actively
mapped inodes unknown to the driver at ->remove() time. So, that unmap
does not happen today for filesystem-dax. However, as Jason points out,
that unmap / invalidation *needs* to happen not only to cleanup
get_user_pages_fast() semantics, but in a future (see CXL) where dax_dev
->remove() is correlated with actual physical removal / replacement the
implications of allowing a physical pfn to be exchanged without tearing
down old mappings are severe (security and data-integrity).

What is not in this patch set is coordination with the dax_kmem driver
to trigger memory_failure() when the dax_dev is onlined as "System
RAM". The remove_memory() API was built with the assumption that
platform firmware negotiates all removal requests and the OS has a
chance to say "no". This is why dax_kmem today simply leaks
request_region() to burn that physical address space for any other
usage until the next reboot on a manual unbind event if the memory can't
be offlined. However, a future where remove_memory() is guaranteed to
succeed after memory_failure() of the same range seems a better semantic
than permanently burning physical address space.

The topic of remove_memory() failures gets to the question of what
happens to active page references when the inopportune ->remove() event
happens. For transient pins the ->remove() event will wait for all
pins to be dropped before allowing ->remove() to complete. Since
filesystem-dax forbids longterm pins all those pins are transient.
Device-dax, on the other hand, does allow longterm pins which means that
->remove() will hang unless / until the longterm pin is dropped.
Hopefully an unmap_mapping_range() event is sufficient to get the pin
dropped, but I suspect device-dax might need to trigger memory_failure()
as well to get the longterm pin holder to wake up and get out of the
way (TBD).

Lest we repeat the "longterm-pin-revoke" debate, which highlighted that
RDMA devices do not respond well to having context torn down, keep in
mind that this proposal is to do a best effort recovery of an event that
should not happen (surprise removal) under nominal operation.

---

Dan Williams (3):
  mm/memory-failure: Prepare for mass memory_failure()
  mm, dax, pmem: Introduce dev_pagemap_failure()
  mm/devmap: Remove pgmap accounting in the get_user_pages_fast() path


 drivers/dax/super.c  |   15 +++
 drivers/nvdimm/pmem.c|   10 +-
 drivers/nvdimm/pmem.h|1 +
 include/linux/dax.h  |5 +
 include/linux/memremap.h |5 +
 include/linux/mm.h   |3 +++
 mm/gup.c |   38 --
 mm/memory-failure.c  |   36 +++-
 mm/memremap.c|   11 +++
 9 files changed, 88 insertions(+), 36 deletions(-)

Re: [PATCH v6 5/8] security/brute: Mitigate a brute force attack

2021-03-17 Thread Kees Cook

On Sun, Mar 07, 2021 at 12:30:28PM +0100, John Wood wrote:
> In order to mitigate a brute force attack all the offending tasks involved
> in the attack must be killed. In other words, it is necessary to kill all
> the tasks that share the fork and/or exec statistical data related to the
> attack. Moreover, if the attack happens through the fork system call, the
> processes that have the same group_leader that the current task (the task
> that has crashed) must be avoided since they are in the path to be killed.
> 
> When the SIGKILL signal is sent to the offending tasks, the function
> "brute_kill_offending_tasks" will be called in a recursive way from the
> task_fatal_signal LSM hook due to a small crash period. So, to avoid kill
> again the same tasks due to a recursive call of this function, it is
> necessary to disable the attack detection for the involved hierarchies.
> 
> To disable the attack detection, set to zero the last crash timestamp and
> avoid to compute the application crash period in this case.
> 
> Signed-off-by: John Wood 
> ---
>  security/brute/brute.c | 141 ++---
>  1 file changed, 132 insertions(+), 9 deletions(-)
> 
> diff --git a/security/brute/brute.c b/security/brute/brute.c
> index 38e5e050964a..36a3286a02dd 100644
> --- a/security/brute/brute.c
> +++ b/security/brute/brute.c
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -64,7 +65,7 @@ struct brute_cred {
>   * @lock: Lock to protect the brute_stats structure.
>   * @refc: Reference counter.
>   * @faults: Number of crashes.
> - * @jiffies: Last crash timestamp.
> + * @jiffies: Last crash timestamp. If zero, the attack detection is disabled.
>   * @period: Crash period's moving average.
>   * @saved_cred: Saved credentials.
>   * @network: Network activity flag.
> @@ -571,6 +572,125 @@ static inline void print_fork_attack_running(void)
>   pr_warn("Fork brute force attack detected [%s]\n", current->comm);
>  }
> 
> +/**
> + * brute_disabled() - Test if the brute force attack detection is disabled.
> + * @stats: Statistical data shared by all the fork hierarchy processes.
> + *
> + * The brute force attack detection enabling/disabling is based on the last
> + * crash timestamp. A zero timestamp indicates that this feature is 
> disabled. A
> + * timestamp greater than zero indicates that the attack detection is 
> enabled.
> + *
> + * The statistical data shared by all the fork hierarchy processes cannot be
> + * NULL.
> + *
> + * It's mandatory to disable interrupts before acquiring the 
> brute_stats::lock
> + * since the task_free hook can be called from an IRQ context during the
> + * execution of the task_fatal_signal hook.
> + *
> + * Context: Must be called with interrupts disabled and brute_stats_ptr_lock
> + *  held.
> + * Return: True if the brute force attack detection is disabled. False
> + * otherwise.
> + */
> +static bool brute_disabled(struct brute_stats *stats)
> +{
> + bool disabled;
> +
> + spin_lock(&stats->lock);
> + disabled = !stats->jiffies;
> + spin_unlock(&stats->lock);
> +
> + return disabled;
> +}
> +
> +/**
> + * brute_disable() - Disable the brute force attack detection.
> + * @stats: Statistical data shared by all the fork hierarchy processes.
> + *
> + * To disable the brute force attack detection it is only necessary to set 
> the
> + * last crash timestamp to zero. A zero timestamp indicates that this 
> feature is
> + * disabled. A timestamp greater than zero indicates that the attack 
> detection
> + * is enabled.
> + *
> + * The statistical data shared by all the fork hierarchy processes cannot be
> + * NULL.
> + *
> + * Context: Must be called with interrupts disabled and brute_stats_ptr_lock
> + *  and brute_stats::lock held.
> + */
> +static inline void brute_disable(struct brute_stats *stats)
> +{
> + stats->jiffies = 0;
> +}
> +
> +/**
> + * enum brute_attack_type - Brute force attack type.
> + * @BRUTE_ATTACK_TYPE_FORK: Attack that happens through the fork system call.
> + * @BRUTE_ATTACK_TYPE_EXEC: Attack that happens through the execve system 
> call.
> + */
> +enum brute_attack_type {
> + BRUTE_ATTACK_TYPE_FORK,
> + BRUTE_ATTACK_TYPE_EXEC,
> +};
> +
> +/**
> + * brute_kill_offending_tasks() - Kill the offending tasks.
> + * @attack_type: Brute force attack type.
> + * @stats: Statistical data shared by all the fork hierarchy processes.
> + *
> + * When a brute force attack is detected all the offending tasks involved in 
> the
> + * attack must be killed. In other words, it is necessary to kill all the 
> tasks
> + * that share the same statistical data. Moreover, if the attack happens 
> through
> + * the fork system call, the processes that have the same group_leader that 
> the
> + * current task must be avoided since they are in the path to be killed.
> + *
> + * When the SIGKILL signal is sent to the offending tasks, this

Re: [PATCH v3] arm64: configs: Enable PCIe support for imx8mq boards

2021-03-17 Thread Shawn Guo

On Wed, Mar 17, 2021 at 01:11:37PM +0100, Heiko Thiery wrote:
> Enable PCI_IMX6 to get PCI support for imx8mq boards like imx8mq-evk,
> imx8mq-kontron-pitx-imx8m and imx8mq-zii-ultra.
> 
> The driver only has built-in support and cannot be compiled as module.
> 
> Signed-off-by: Heiko Thiery 

Applied, thanks.

Re: Errant readings on LM81 with T2080 SoC

2021-03-17 Thread Guenter Roeck

On 3/17/21 8:46 PM, Chris Packham wrote:
> 
> On 12/03/21 10:34 am, Guenter Roeck wrote:
>> On 3/11/21 1:17 PM, Chris Packham wrote:
>>> On 11/03/21 9:18 pm, Wolfram Sang wrote:
> Bummer. What is really weird is that you see clock stretching under
> CPU load. Normally clock stretching is triggered by the device, not
> by the host.
 One example: Some hosts need an interrupt per byte to know if they
 should send ACK or NACK. If that interrupt is delayed, they stretch the
 clock.

>>> It feels like something like that is happening. Looking at the T2080
>>> Reference manual there is an interesting timing diagram (Figure 14-2 if
>>> someone feels like looking it up). It shows SCL low between the ACK for
>>> the address and the data byte. I think if we're delayed in sending the
>>> next byte we could violate Ttimeout or Tlow:mext from the SMBUS spec.
>>>
>> I think that really leaves you only two options that I can see:
>> Rework the driver to handle critical actions (such as setting TXAK,
>> and everything else that might result in clock stretching) in the
>> interrupt handler, or rework the driver to handle everything in
>> a high priority kernel thread.
> I've made some reasonable progress on making i2c-mpc more interrupt 
> driven. Assuming it works out for my use-case is there an opinion on 
> making interrupt support mandatory? Looking at all the in-tree dts files 
> that use one of the compatible strings from i2c-mpc.c they all have 
> interrupt properties so in theory nothing is using the polling mode. But 
> there may be some out-of-tree boards or boards using an old dtb that 
> would be affected?
> 

The polling code is from pre-git times. Like 2005 and earlier.
I'd say it is about time to get rid of it. Any out-of-tree users
had more than 15 years to upstream their code, after all.

Guenter

Re: [PATCH v6 4/8] security/brute: Fine tuning the attack detection

2021-03-17 Thread Kees Cook

On Sun, Mar 07, 2021 at 12:30:27PM +0100, John Wood wrote:
> To avoid false positives during the attack detection it is necessary to
> narrow the possible cases. Only the following scenarios are taken into
> account:
> 
> 1.- Launching (fork()/exec()) a setuid/setgid process repeatedly until a
> desirable memory layout is got (e.g. Stack Clash).
> 2.- Connecting to an exec()ing network daemon (e.g. xinetd) repeatedly
> until a desirable memory layout is got (e.g. what CTFs do for simple
> network service).
> 3.- Launching processes without exec() (e.g. Android Zygote) and exposing
> state to attack a sibling.
> 4.- Connecting to a fork()ing network daemon (e.g. apache) repeatedly until
> the previously shared memory layout of all the other children is
> exposed (e.g. kind of related to HeartBleed).
> 
> In each case, a privilege boundary has been crossed:
> 
> Case 1: setuid/setgid process
> Case 2: network to local
> Case 3: privilege changes
> Case 4: network to local
> 
> So, this patch checks if any of these privilege boundaries have been
> crossed before to compute the application crash period.
> 
> Also, in every fatal crash only the signals delivered by the kernel are
> taken into account with the exception of the SIGABRT signal since the
> latter is used by glibc for stack canary, malloc, etc failures, which may
> indicate that a mitigation has been triggered.
> 
> Signed-off-by: John Wood 
> ---
>  security/brute/brute.c | 293 +++--
>  1 file changed, 280 insertions(+), 13 deletions(-)
> 
> diff --git a/security/brute/brute.c b/security/brute/brute.c
> index 870db55332d4..38e5e050964a 100644
> --- a/security/brute/brute.c
> +++ b/security/brute/brute.c
> @@ -3,15 +3,25 @@
>  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> 
>  #include 
> +#include 
> +#include 
> +#include 
> +#include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
> +#include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -19,9 +29,35 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
> +#include 
>  #include 
> +#include 

This is really a LOT of includes. Are you sure all of these are
explicitly needed?

> +
> +/**
> + * struct brute_cred - Saved credentials.
> + * @uid: Real UID of the task.
> + * @gid: Real GID of the task.
> + * @suid: Saved UID of the task.
> + * @sgid: Saved GID of the task.
> + * @euid: Effective UID of the task.
> + * @egid: Effective GID of the task.
> + * @fsuid: UID for VFS ops.
> + * @fsgid: GID for VFS ops.
> + */
> +struct brute_cred {
> + kuid_t uid;
> + kgid_t gid;
> + kuid_t suid;
> + kgid_t sgid;
> + kuid_t euid;
> + kgid_t egid;
> + kuid_t fsuid;
> + kgid_t fsgid;
> +};
> 
>  /**
>   * struct brute_stats - Fork brute force attack statistics.
> @@ -30,6 +66,9 @@
>   * @faults: Number of crashes.
>   * @jiffies: Last crash timestamp.
>   * @period: Crash period's moving average.
> + * @saved_cred: Saved credentials.
> + * @network: Network activity flag.
> + * @bounds_crossed: Privilege bounds crossed flag.
>   *
>   * This structure holds the statistical data shared by all the fork hierarchy
>   * processes.
> @@ -40,6 +79,9 @@ struct brute_stats {
>   unsigned char faults;
>   u64 jiffies;
>   u64 period;
> + struct brute_cred saved_cred;
> + unsigned char network : 1;
> + unsigned char bounds_crossed : 1;

If you really want to keep faults a "char", I would move these bools
after "faults" to avoid adding more padding.

>  };
> 
>  /*
> @@ -71,18 +113,25 @@ static inline struct brute_stats 
> **brute_stats_ptr(struct task_struct *task)
> 
>  /**
>   * brute_new_stats() - Allocate a new statistics structure.
> + * @network_to_local: Network activity followed by a fork or execve system 
> call.
> + * @is_setid: The executable file has the setid flags set.
>   *
>   * If the allocation is successful the reference counter is set to one to
>   * indicate that there will be one task that points to this structure. Also, 
> the
>   * last crash timestamp is set to now. This way, it is possible to compute 
> the
>   * application crash period at the first fault.
>   *
> + * Moreover, the credentials of the current task are saved. Also, the network
> + * and bounds_crossed flags are set based on the network_to_local and 
> is_setid
> + * parameters.
> + *
>   * Return: NULL if the allocation fails. A pointer to the new allocated
>   * statistics structure if it success.
>   */
> -static struct brute_stats *brute_new_stats(void)
> +static struct brute_stats *brute_new_stats(bool network_to_local, bool 
> is_setid)
>  {
>   struct brute_stats *stats;
> + const struct cred *cred = current_cred();
> 
>   stats = kmalloc(sizeof(struct brute_stats), GFP_ATOMIC);
>   if (!stats)
> @@ -93,6 +142,16 @@

Re: [PATCH v2 0/3] perf-stat: share hardware PMCs with BPF

2021-03-17 Thread Song Liu




> On Mar 17, 2021, at 6:11 AM, Arnaldo Carvalho de Melo  wrote:
> 
> Em Wed, Mar 17, 2021 at 02:29:28PM +0900, Namhyung Kim escreveu:
>> Hi Song,
>> 
>> On Wed, Mar 17, 2021 at 6:18 AM Song Liu  wrote:
>>> 
>>> perf uses performance monitoring counters (PMCs) to monitor system
>>> performance. The PMCs are limited hardware resources. For example,
>>> Intel CPUs have 3x fixed PMCs and 4x programmable PMCs per cpu.
>>> 
>>> Modern data center systems use these PMCs in many different ways:
>>> system level monitoring, (maybe nested) container level monitoring, per
>>> process monitoring, profiling (in sample mode), etc. In some cases,
>>> there are more active perf_events than available hardware PMCs. To allow
>>> all perf_events to have a chance to run, it is necessary to do expensive
>>> time multiplexing of events.
>>> 
>>> On the other hand, many monitoring tools count the common metrics (cycles,
>>> instructions). It is a waste to have multiple tools create multiple
>>> perf_events of "cycles" and occupy multiple PMCs.
>> 
>> Right, it'd be really helpful when the PMCs are frequently or mostly shared.
>> But it'd also increase the overhead for uncontended cases as BPF programs
>> need to run on every context switch.  Depending on the workload, it may
>> cause a non-negligible performance impact.  So users should be aware of it.
> 
> Would be interesting to, humm, measure both cases to have a firm number
> of the impact, how many instructions are added when sharing using
> --bpf-counters?
> 
> I.e. compare the "expensive time multiplexing of events" with its
> avoidance by using --bpf-counters.
> 
> Song, have you performed such measurements?

I have got some measurements with perf-bench-sched-messaging:

The system: x86_64 with 23 cores (46 HT)

The perf-stat command:
perf stat -e cycles,cycles,instructions,instructions,ref-cycles,ref-cycles 


The benchmark command and output:
./perf bench sched messaging -g 40 -l 5 -t
# Running 'sched/messaging' benchmark:
# 20 sender and receiver threads per group
# 40 groups == 1600 threads run
 Total time: 10X.XXX [sec]


I use the "Total time" as measurement, so smaller number is better. 

For each condition, I run the command 5 times, and took the median of 
"Total time". 

Baseline (no perf-stat) 104.873 [sec]
# global
perf stat -a107.887 [sec]
perf stat -a --bpf-counters 106.071 [sec]
# per task
perf stat   106.314 [sec]
perf stat --bpf-counters105.965 [sec]
# per cpu
perf stat -C 1,3,5  107.063 [sec]
perf stat -C 1,3,5 --bpf-counters   106.406 [sec]

From the data, --bpf-counters is slightly better than the regular event
for all targets. I noticed that the results are not very stable. There 
are a couple 108.xx runs in some of the conditions (w/ and w/o 
--bpf-counters).


I also measured the average runtime of the BPF programs, with 

sysctl kernel.bpf_stats_enabled=1

For each event, if we have one leader and two followers, the total run 
time is about 340ns. IOW, 340ns for two perf-stat reading instructions, 
340ns for two perf-stat reading cycles, etc. 

Thanks,
Song

Re: [PATCH v13 00/14] huge vmalloc mappings

2021-03-17 Thread Nicholas Piggin

Excerpts from Andrew Morton's message of March 18, 2021 8:58 am:
> On Wed, 17 Mar 2021 16:23:48 +1000 Nicholas Piggin  wrote:
> 
>> 
>> *** BLURB HERE ***
>> 
> 
> That's really not what it means ;)

Sigh, wasn't having a good yesterday.

> Could we please get a nice description for the [0/n]?  What's it all
> about, what's the benefit, what are potential downsides.
>
> And performance testing results!  Because if it ain't faster, there's
> no point in merging it?
> 

It's supposed to have a bit of description in patch 13, and has some
performance results in patch 14. Is it better to put a bigger writeup
in 0? I thought that tends to get lost.

I'll write something here to discuss for now, and can fit it into the 
appropriate place in the series after that.

The kernel virtual mapping layer grew support for mapping memory with > 
PAGE_SIZE ptes with 0ddab1d2ed664 ("lib/ioremap.c: add huge I/O map 
capability interfaces"), and implemented support for using those huge
page mappings with ioremap.

According to the submission, the use-case is mapping very large 
non-volatile memory devices, which could be GB or TB.
https://lore.kernel.org/lkml/1425404664-19675-1-git-send-email-toshi.k...@hp.com/
The benefit is said to be in the overhead of maintaining the mapping,
perhaps both in memory overhead and setup / teardown time. Memory
overhead for the mapping with a 4kB page and 8 byte page table is 2GB
per TB of mapping, down to 4MB / TB with 2MB pages.

The same huge page vmap infrastructure can be quite easily adapted and
used for mapping vmalloc memory pages without more complexity for arch
or core vmap code. However unlike ioremap, vmalloc page table overhead 
is not a real problem, so the advantage to justify this is performance.

Several of the most heavily used structures in the kernel (e.g., vfs and network hash 
tables) are allocated with vmalloc on NUMA machines, in order to 
distribute access bandwidth over the machine. Mapping these with larger
pages can improve TLB usage significantly, for example this reduces TLB 
misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 
-> 2,100) and reduces CPU cycles by 0.54%, due to vfs hashes being 
allocated with 2MB pages.

[ Other numbers?
  - The difference is even larger in a guest due to more costly TLB 
misses.
  - Eric Dumazet was keen on the network hash performance possibilities.
  - Other archs? Ding was doing x86 testing. ]

The kernel module allocator also uses vmalloc to map module images even 
on non-NUMA, which can result in high iTLB pressure on highly modular 
distro type of kernels. This series does not implement huge mappings for 
modules yet, but it's a step along the way. Rick Edgecombe was looking 
at that IIRC.

The per-cpu allocator similarly might be able to take advantage of this.
Also on the todo list.

The disadvantages of this I can see are:
* Memory fragmentation can waste some physical memory because it will 
  attempt to allocate larger pages to fit the required size, rounding up 
  (once the requested size is >= 2MB).
  - I don't see it being a big problem in practice unless some user 
crops up that allocates thousands of 2.5MB ranges. We can tweak 
heuristics a bit there if needed to reduce peak waste.
* Less granular mappings can make the NUMA distribution less balanced.
  - Similar to the above.
  - Could also allocate all major system hashes with one allocation
up-front and spread them all across the one block, which should help
overall NUMA distribution and reduce fragmentation waste.
* Callers might expect something about the underlying allocated pages.
  - Tried to keep the appearance of base PAGE_SIZE pages throughout the 
APIs and exposed data structures.
  - Added a VM_NO_HUGE_VMAP flag to hammer troublesome cases with.

- Finally, added a nohugevmalloc boot option to turn it off (independent
  of nohugeiomap).

Is that helpful?

Thanks,
Nick

Re: Errant readings on LM81 with T2080 SoC

2021-03-17 Thread Chris Packham



On 12/03/21 10:34 am, Guenter Roeck wrote:
> On 3/11/21 1:17 PM, Chris Packham wrote:
>> On 11/03/21 9:18 pm, Wolfram Sang wrote:
>>>> Bummer. What is really weird is that you see clock stretching under
>>>> CPU load. Normally clock stretching is triggered by the device, not
>>>> by the host.
>>> One example: Some hosts need an interrupt per byte to know if they
>>> should send ACK or NACK. If that interrupt is delayed, they stretch the
>>> clock.
>>>
>> It feels like something like that is happening. Looking at the T2080
>> Reference manual there is an interesting timing diagram (Figure 14-2 if
>> someone feels like looking it up). It shows SCL low between the ACK for
>> the address and the data byte. I think if we're delayed in sending the
>> next byte we could violate Ttimeout or Tlow:mext from the SMBUS spec.
>>
> I think that really leaves you only two options that I can see:
> Rework the driver to handle critical actions (such as setting TXAK,
> and everything else that might result in clock stretching) in the
> interrupt handler, or rework the driver to handle everything in
> a high priority kernel thread.
I've made some reasonable progress on making i2c-mpc more interrupt 
driven. Assuming it works out for my use-case is there an opinion on 
making interrupt support mandatory? Looking at all the in-tree dts files 
that use one of the compatible strings from i2c-mpc.c they all have 
interrupt properties so in theory nothing is using the polling mode. But 
there may be some out-of-tree boards or boards using an old dtb that 
would be affected?

Re: [PATCH v4 14/14] vdpa_sim_blk: add support for vdpa management tool

2021-03-17 Thread Jason Wang




在 2021/3/16 上午12:34, Stefano Garzarella 写道:

Enable the user to create vDPA block simulator devices using the
vdpa management tool:

 # Show vDPA supported devices
 $ vdpa mgmtdev show
 vdpasim_blk:
   supported_classes block

 # Create a vDPA block device named as 'blk0' from the management
 # device vdpasim:
 $ vdpa dev add mgmtdev vdpasim_blk name blk0

 # Show the info of the 'blk0' device just created
 $ vdpa dev show blk0 -jp
 {
 "dev": {
 "blk0": {
 "type": "block",
 "mgmtdev": "vdpasim_blk",
 "vendor_id": 0,
 "max_vqs": 1,
 "max_vq_size": 256
 }
 }
 }

 # Delete the vDPA device after its use
 $ vdpa dev del blk0

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 76 +++-
  1 file changed, 63 insertions(+), 13 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c 
b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
index 643ae3bc62c0..5bfe1c281645 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c
@@ -37,7 +37,6 @@
  #define VDPASIM_BLK_SEG_MAX   32
  #define VDPASIM_BLK_VQ_NUM1
  
-static struct vdpasim *vdpasim_blk_dev;

  static char vdpasim_blk_id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim";
  
  static bool vdpasim_blk_check_range(u64 start_sector, size_t range_size)

@@ -241,11 +240,23 @@ static void vdpasim_blk_get_config(struct vdpasim 
*vdpasim, void *config)
blk_config->blk_size = cpu_to_vdpasim32(vdpasim, SECTOR_SIZE);
  }
  
-static int __init vdpasim_blk_init(void)

+static void vdpasim_blk_mgmtdev_release(struct device *dev)
+{
+}
+
+static struct device vdpasim_blk_mgmtdev = {
+   .init_name = "vdpasim_blk",
+   .release = vdpasim_blk_mgmtdev_release,
+};
+
+static int vdpasim_blk_dev_add(struct vdpa_mgmt_dev *mdev, const char *name)
  {
struct vdpasim_dev_attr dev_attr = {};
+   struct vdpasim *simdev;
int ret;
  
+	dev_attr.mgmt_dev = mdev;

+   dev_attr.name = name;
dev_attr.id = VIRTIO_ID_BLOCK;
dev_attr.supported_features = VDPASIM_BLK_FEATURES;
dev_attr.nvqs = VDPASIM_BLK_VQ_NUM;
@@ -254,29 +265,68 @@ static int __init vdpasim_blk_init(void)
dev_attr.work_fn = vdpasim_blk_work;
dev_attr.buffer_size = VDPASIM_BLK_CAPACITY << SECTOR_SHIFT;
  
-	vdpasim_blk_dev = vdpasim_create(_attr);

-   if (IS_ERR(vdpasim_blk_dev)) {
-   ret = PTR_ERR(vdpasim_blk_dev);
-   goto out;
-   }
+   simdev = vdpasim_create(_attr);
+   if (IS_ERR(simdev))
+   return PTR_ERR(simdev);
  
-	ret = vdpa_register_device(_blk_dev->vdpa, VDPASIM_BLK_VQ_NUM);

+   ret = _vdpa_register_device(>vdpa, VDPASIM_BLK_VQ_NUM);
if (ret)
goto put_dev;
  
  	return 0;
  
  put_dev:

-   put_device(_blk_dev->vdpa.dev);
-out:
+   put_device(>vdpa.dev);
return ret;
  }
  
-static void __exit vdpasim_blk_exit(void)

+static void vdpasim_blk_dev_del(struct vdpa_mgmt_dev *mdev,
+   struct vdpa_device *dev)
  {
-   struct vdpa_device *vdpa = _blk_dev->vdpa;
+   struct vdpasim *simdev = container_of(dev, struct vdpasim, vdpa);
+
+   _vdpa_unregister_device(>vdpa);
+}
+
+static const struct vdpa_mgmtdev_ops vdpasim_blk_mgmtdev_ops = {
+   .dev_add = vdpasim_blk_dev_add,
+   .dev_del = vdpasim_blk_dev_del
+};
  
-	vdpa_unregister_device(vdpa);

+static struct virtio_device_id id_table[] = {
+   { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+   { 0 },
+};
+
+static struct vdpa_mgmt_dev mgmt_dev = {
+   .device = _blk_mgmtdev,
+   .id_table = id_table,
+   .ops = _blk_mgmtdev_ops,
+};
+
+static int __init vdpasim_blk_init(void)
+{
+   int ret;
+
+   ret = device_register(_blk_mgmtdev);
+   if (ret)
+   return ret;
+
+   ret = vdpa_mgmtdev_register(_dev);
+   if (ret)
+   goto parent_err;
+
+   return 0;
+
+parent_err:
+   device_unregister(_blk_mgmtdev);
+   return ret;
+}
+
+static void __exit vdpasim_blk_exit(void)
+{
+   vdpa_mgmtdev_unregister(_dev);
+   device_unregister(_blk_mgmtdev);
  }
  
  module_init(vdpasim_blk_init)

[PATCH] drm/msm: Remove unneeded variable: "rc"

2021-03-17 Thread zuoqilin1

From: zuoqilin 

Remove unneeded variable: "rc".

Signed-off-by: zuoqilin 
---
 drivers/gpu/drm/msm/dp/dp_panel.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/dp/dp_panel.c 
b/drivers/gpu/drm/msm/dp/dp_panel.c
index 9cc8166..8cb3d01 100644
--- a/drivers/gpu/drm/msm/dp/dp_panel.c
+++ b/drivers/gpu/drm/msm/dp/dp_panel.c
@@ -351,7 +351,6 @@ void dp_panel_dump_regs(struct dp_panel *dp_panel)
 
 int dp_panel_timing_cfg(struct dp_panel *dp_panel)
 {
-   int rc = 0;
u32 data, total_ver, total_hor;
struct dp_catalog *catalog;
struct dp_panel_private *panel;
@@ -404,7 +403,7 @@ int dp_panel_timing_cfg(struct dp_panel *dp_panel)
dp_catalog_panel_timing_cfg(catalog);
panel->panel_on = true;
 
-   return rc;
+   return 0;
 }
 
 int dp_panel_init_panel_info(struct dp_panel *dp_panel)
-- 
1.9.1

Re: [PATCH v4 10/14] vhost/vdpa: Remove the restriction that only supports virtio-net devices

2021-03-17 Thread Jason Wang




在 2021/3/16 上午12:34, Stefano Garzarella 写道:

From: Xie Yongji 

Since the config checks are done by the vDPA drivers, we can remove the
virtio-net restriction and we should be able to support all kinds of
virtio devices.

<linux/virtio_net.h> is not needed anymore, but we need to include
<linux/slab.h> to avoid compilation failures.

Signed-off-by: Xie Yongji 
Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vhost/vdpa.c | 6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 7ae4080e57d8..850ed4b62942 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -16,12 +16,12 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
  #include 
  #include 
-#include 
  
  #include "vhost.h"
  
@@ -1018,10 +1018,6 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)

int minor;
int r;
  
-	/* Currently, we only accept the network devices. */

-   if (ops->get_device_id(vdpa) != VIRTIO_ID_NET)
-   return -ENOTSUPP;
-
v = kzalloc(sizeof(*v), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!v)
return -ENOMEM;

Re: [PATCH 2/2] arm64: stacktrace: Add skip when task == current

2021-03-17 Thread chenjun (AM)

在 2021/3/18 3:34, Mark Rutland 写道:
> On Wed, Mar 17, 2021 at 06:36:36PM +, Catalin Marinas wrote:
>> On Wed, Mar 17, 2021 at 02:20:50PM +, Chen Jun wrote:
>>> On ARM64, cat /sys/kernel/debug/page_owner, all pages return the same
>>> stack:
>>>   stack_trace_save+0x4c/0x78
>>>   register_early_stack+0x34/0x70
>>>   init_page_owner+0x34/0x230
>>>   page_ext_init+0x1bc/0x1dc
>>>
>>> The reason is that:
>>> check_recursive_alloc always return 1 because that
>>> entries[0] is always equal to ip (__set_page_owner+0x3c/0x60).
>>>
>>> The root cause is that:
>>> commit 5fc57df2f6fd ("arm64: stacktrace: Convert to ARCH_STACKWALK")
>>> make the save_trace save 2 more entries.
>>>
>>> Add skip in arch_stack_walk when task == current.
>>>
>>> Fixes: 5fc57df2f6fd ("arm64: stacktrace: Convert to ARCH_STACKWALK")
>>> Signed-off-by: Chen Jun 
>>> ---
>>>   arch/arm64/kernel/stacktrace.c | 5 +++--
>>>   1 file changed, 3 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
>>> index ad20981..c26b0ac 100644
>>> --- a/arch/arm64/kernel/stacktrace.c
>>> +++ b/arch/arm64/kernel/stacktrace.c
>>> @@ -201,11 +201,12 @@ void arch_stack_walk(stack_trace_consume_fn 
>>> consume_entry, void *cookie,
>>>   
>>> if (regs)
>>> start_backtrace(, regs->regs[29], regs->pc);
>>> -   else if (task == current)
>>> +   else if (task == current) {
>>> +   ((struct stacktrace_cookie *)cookie)->skip += 2;
>>> start_backtrace(,
>>> (unsigned long)__builtin_frame_address(0),
>>> (unsigned long)arch_stack_walk);
>>> -   else
>>> +   } else
>>> start_backtrace(, thread_saved_fp(task),
>>> thread_saved_pc(task));
>>
>> I don't like abusing the cookie here. It's void * as it's meant to be an
>> opaque type. I'd rather skip the first two frames in walk_stackframe()
>> instead before invoking fn().
> 
> I agree that we shouldn't touch cookie here.
> 
> I don't think that it's right to bodge this inside walk_stackframe(),
> since that'll add bogus skipping for the case starting with regs in the
> current task. If we need a bodge, it has to live in arch_stack_walk()
> where we set up the initial unwinding state.
> 
> In another thread, we came to the conclusion that arch_stack_walk()
> should start at its parent, and its parent should add any skipping it
> requires.
> 
> Currently, arch_stack_walk() is off-by-one, and we can bodge that by
> using __builtin_frame_address(1), though I'm waiting for some compiler
> folk to confirm that's sound. Otherwise we need to add an assembly
> trampoline to snapshot the FP, which is unfortunately convoluted.
> 
> This report suggests that a caller of arch_stack_walk() is off-by-one
> too, which suggests a larger cross-architecture semantic issue. I'll try
> to take a look tomorrow.
> 
> Thanks,
> Mark.
> 
>>
>> Prior to the conversion to ARCH_STACKWALK, we were indeed skipping two
>> more entries in __save_stack_trace() if tsk == current. Something like
>> below, completely untested:
>>
>> diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
>> index ad20981dfda4..2a9f759aa41a 100644
>> --- a/arch/arm64/kernel/stacktrace.c
>> +++ b/arch/arm64/kernel/stacktrace.c
>> @@ -115,10 +115,15 @@ NOKPROBE_SYMBOL(unwind_frame);
>>   void notrace walk_stackframe(struct task_struct *tsk, struct stackframe 
>> *frame,
>>   bool (*fn)(void *, unsigned long), void *data)
>>   {
>> +/* for the current task, we don't want this function nor its caller */
>> +int skip = tsk == current ? 2 : 0;
>> +
>>  while (1) {
>>  int ret;
>>   
>> -if (!fn(data, frame->pc))
>> +if (skip)
>> +skip--;
>> +else if (!fn(data, frame->pc))
>>  break;
>>  ret = unwind_frame(tsk, frame);
>>  if (ret < 0)
>>
>>
>> -- 
>> Catalin
> 

This change will make kmemleak broken.
Maybe the reason is what Mark pointed out. I will try to check out.

-- 
Regards
Chen Jun

Re: [PATCH v4 09/14] vhost/vdpa: use get_config_size callback in vhost_vdpa_config_validate()

2021-03-17 Thread Jason Wang




在 2021/3/16 上午12:34, Stefano Garzarella 写道:

Let's use the new 'get_config_size()' callback available instead of
using the 'virtio_id' to get the size of the device config space.

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  drivers/vhost/vdpa.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index e0a27e336293..7ae4080e57d8 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -188,13 +188,8 @@ static long vhost_vdpa_set_status(struct vhost_vdpa *v, u8 
__user *statusp)
  static int vhost_vdpa_config_validate(struct vhost_vdpa *v,
  struct vhost_vdpa_config *c)
  {
-   long size = 0;
-
-   switch (v->virtio_id) {
-   case VIRTIO_ID_NET:
-   size = sizeof(struct virtio_net_config);
-   break;
-   }
+   struct vdpa_device *vdpa = v->vdpa;
+   long size = vdpa->config->get_config_size(vdpa);
  
  	if (c->len == 0)

return -EINVAL;

Re: [PATCH v4 08/14] vdpa: add get_config_size callback in vdpa_config_ops

2021-03-17 Thread Jason Wang




在 2021/3/16 上午12:34, Stefano Garzarella 写道:

This new callback is used to get the size of the configuration space
of vDPA devices.

Signed-off-by: Stefano Garzarella 



Acked-by: Jason Wang 



---
  include/linux/vdpa.h  | 4 
  drivers/vdpa/ifcvf/ifcvf_main.c   | 6 ++
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 6 ++
  drivers/vdpa/vdpa_sim/vdpa_sim.c  | 9 +
  drivers/vdpa/virtio_pci/vp_vdpa.c | 8 
  5 files changed, 33 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index 15fa085fab05..1b094c1531f2 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -150,6 +150,9 @@ struct vdpa_iova_range {
   * @set_status:   Set the device status
   *@vdev: vdpa device
   *@status: virtio device status
+ * @get_config_size:   Get the size of the configuration space
+ * @vdev: vdpa device
+ * Returns size_t: configuration size
   * @get_config:   Read from device specific configuration 
space
   *@vdev: vdpa device
   *@offset: offset from the beginning of
@@ -231,6 +234,7 @@ struct vdpa_config_ops {
u32 (*get_vendor_id)(struct vdpa_device *vdev);
u8 (*get_status)(struct vdpa_device *vdev);
void (*set_status)(struct vdpa_device *vdev, u8 status);
+   size_t (*get_config_size)(struct vdpa_device *vdev);
void (*get_config)(struct vdpa_device *vdev, unsigned int offset,
   void *buf, unsigned int len);
void (*set_config)(struct vdpa_device *vdev, unsigned int offset,
diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index d555a6a5d1ba..017ab07040c7 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -332,6 +332,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device 
*vdpa_dev)
return IFCVF_QUEUE_ALIGNMENT;
  }
  
+static size_t ifcvf_vdpa_get_config_size(struct vdpa_device *vdpa_dev)

+{
+   return sizeof(struct virtio_net_config);
+}
+
  static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev,
  unsigned int offset,
  void *buf, unsigned int len)
@@ -392,6 +397,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = {
.get_device_id  = ifcvf_vdpa_get_device_id,
.get_vendor_id  = ifcvf_vdpa_get_vendor_id,
.get_vq_align   = ifcvf_vdpa_get_vq_align,
+   .get_config_size= ifcvf_vdpa_get_config_size,
.get_config = ifcvf_vdpa_get_config,
.set_config = ifcvf_vdpa_set_config,
.set_config_cb  = ifcvf_vdpa_set_config_cb,
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 71397fdafa6a..f6e03bf49e3e 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1814,6 +1814,11 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
  }
  
+static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)

+{
+   return sizeof(struct virtio_net_config);
+}
+
  static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int 
offset, void *buf,
 unsigned int len)
  {
@@ -1900,6 +1905,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = {
.get_vendor_id = mlx5_vdpa_get_vendor_id,
.get_status = mlx5_vdpa_get_status,
.set_status = mlx5_vdpa_set_status,
+   .get_config_size = mlx5_vdpa_get_config_size,
.get_config = mlx5_vdpa_get_config,
.set_config = mlx5_vdpa_set_config,
.get_generation = mlx5_vdpa_get_generation,
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 14dc2d3d983e..98f793bc9376 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -462,6 +462,13 @@ static void vdpasim_set_status(struct vdpa_device *vdpa, 
u8 status)
spin_unlock(>lock);
  }
  
+static size_t vdpasim_get_config_size(struct vdpa_device *vdpa)

+{
+   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+   return vdpasim->dev_attr.config_size;
+}
+
  static void vdpasim_get_config(struct vdpa_device *vdpa, unsigned int offset,
 void *buf, unsigned int len)
  {
@@ -598,6 +605,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = {
.get_vendor_id  = vdpasim_get_vendor_id,
.get_status = vdpasim_get_status,
.set_status = vdpasim_set_status,
+   .get_config_size= vdpasim_get_config_size,
.get_config = vdpasim_get_config,
.set_config = vdpasim_set_config,
.get_generation = vdpasim_get_generation,
@@ -625,6 +633,7 @@ static const

Re: [mm, net-next v2] mm: net: memcg accounting for TCP rx zerocopy

2021-03-17 Thread Andrew Morton

On Mon, 15 Mar 2021 18:30:03 -0700 Arjun Roy  wrote:

> From: Arjun Roy 
> 
> TCP zerocopy receive is used by high performance network applications
> to further scale. For RX zerocopy, the memory containing the network
> data filled by the network driver is directly mapped into the address
> space of high performance applications. To keep the TLB cost low,
> these applications unmap the network memory in big batches. So, this
> memory can remain mapped for long time. This can cause a memory
> isolation issue as this memory becomes unaccounted after getting
> mapped into the application address space. This patch adds the memcg
> accounting for such memory.
> 
> Accounting the network memory comes with its own unique challenges.
> The high performance NIC drivers use page pooling to reuse the pages
> to eliminate/reduce expensive setup steps like IOMMU. These drivers
> keep an extra reference on the pages and thus we can not depend on the
> page reference for the uncharging. The page in the pool may keep a
> memcg pinned for arbitrary long time or may get used by other memcg.
> 
> This patch decouples the uncharging of the page from the refcnt and
> associates it with the map count i.e. the page gets uncharged when the
> last address space unmaps it. Now the question is, what if the driver
> drops its reference while the page is still mapped? That is fine as
> the address space also holds a reference to the page i.e. the
> reference count can not drop to zero before the map count.

What tree were you hoping to get this merged through?  I'd suggest net
- it's more likely to get tested over there.

>
> ...
>
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c

These changes could be inside #ifdef CONFIG_NET.  Although I expect
MEMCG=y&&NET=n is pretty damn rare.

S390: all HAS_IOMEM build failures in one fell swoop

2021-03-17 Thread Randy Dunlap



On ARCH=s390:

By disabling CONFIG_PCI and hence also disabling CONFIG_HAS_IOMEM
(after having done 'make ARCH=s390 allmodconfig'),
we can see all of the drivers that use IOMEM-related interfaces
without mentioning that they do so (in their respective Kconfig files).

This should catch all of them, instead of various randconfig builds
catching a few of them at a time.
(I'm not trying to pick on arch/s390/ here -- more on the piecemeal
randconfig approach of some 'bot'. :)


I have grouped them by subsystem (more or less).
(This was done on linux-next of 2021-03-15.)

~

make[1]: Entering directory 'linux-next-20210315/S390'

kernel/dma:

gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: kernel/dma/coherent.o: in 
function `dma_init_coherent_memory':
coherent.c:(.text+0x39c): undefined reference to `memremap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: coherent.c:(.text+0x4e0): 
undefined reference to `memunmap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: kernel/dma/coherent.o: in 
function `dma_declare_coherent_memory':
coherent.c:(.text+0xac6): undefined reference to `memunmap'

irqchip:

gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: drivers/irqchip/irq-al-fic.o: in 
function `al_fic_init_dt':
irq-al-fic.c:(.init.text+0x6c): undefined reference to `of_iomap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: irq-al-fic.c:(.init.text+0x49c): 
undefined reference to `iounmap'

clk / clocksource:

gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: drivers/clk/clk-fixed-mmio.o: in 
function `fixed_mmio_clk_setup':
clk-fixed-mmio.c:(.text+0x9a): undefined reference to `of_iomap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: clk-fixed-mmio.c:(.text+0xe6): 
undefined reference to `iounmap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: drivers/clocksource/timer-of.o: 
in function `timer_of_init':
timer-of.c:(.init.text+0x8e): undefined reference to `of_iomap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: timer-of.c:(.init.text+0x6ec): 
undefined reference to `iounmap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: drivers/clocksource/timer-of.o: 
in function `timer_of_cleanup':
timer-of.c:(.init.text+0x8f2): undefined reference to `iounmap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: 
drivers/clocksource/timer-microchip-pit64b.o: in function 
`mchp_pit64b_dt_init_timer':
timer-microchip-pit64b.c:(.init.text+0xf2): undefined reference to `of_iomap'
gcc-9.3.0-nolibc/s390-linux/bin/s390-linux-ld: 
timer-microchip-pit64b.c:(.init.text+0xa18): undefined reference to `iounmap'

iio:

ERROR: modpost: "devm_platform_ioremap_resource" 
[drivers/iio/adc/adi-axi-adc.ko] undefined!

pcmcia:

ERROR: modpost: "ioremap" [drivers/pcmcia/pcmcia.ko] undefined!
ERROR: modpost: "iounmap" [drivers/pcmcia/pcmcia.ko] undefined!

mtd:

ERROR: modpost: "devm_ioremap_resource" [drivers/mtd/nand/raw/denali_dt.ko] 
undefined!

nvmem:

ERROR: modpost: "memunmap" [drivers/nvmem/nvmem-rmem.ko] undefined!
ERROR: modpost: "memremap" [drivers/nvmem/nvmem-rmem.ko] undefined!

crypto:

ERROR: modpost: "devm_ioremap_resource" [drivers/crypto/ccree/ccree.ko] 
undefined!
ERROR: modpost: "debugfs_create_regset32" [drivers/crypto/ccree/ccree.ko] 
undefined!

media:

ERROR: modpost: "devm_ioremap_resource" [drivers/media/rc/ir-hix5hd2.ko] 
undefined!

input:

ERROR: modpost: "devm_ioremap" [drivers/input/keyboard/samsung-keypad.ko] 
undefined!

net:

ERROR: modpost: "devm_platform_ioremap_resource" [drivers/net/can/grcan.ko] 
undefined!
ERROR: modpost: "iounmap" [drivers/net/arcnet/arc-rimi.ko] undefined!
ERROR: modpost: "ioremap" [drivers/net/arcnet/arc-rimi.ko] undefined!
ERROR: modpost: "iounmap" [drivers/net/arcnet/com90xx.ko] undefined!
ERROR: modpost: "ioremap" [drivers/net/arcnet/com90xx.ko] undefined!
ERROR: modpost: "devm_ioremap" [drivers/net/ethernet/altera/altera_tse.ko] 
undefined!
ERROR: modpost: "ioremap" [drivers/net/ethernet/xircom/xirc2ps_cs.ko] undefined!
ERROR: modpost: "iounmap" [drivers/net/ethernet/xircom/xirc2ps_cs.ko] undefined!
ERROR: modpost: "devm_ioremap_resource" 
[drivers/net/ethernet/xilinx/xilinx_emac.ko] undefined!
ERROR: modpost: "of_address_to_resource" 
[drivers/net/ethernet/xilinx/xilinx_emac.ko] undefined!
ERROR: modpost: "of_address_to_resource" 
[drivers/net/ethernet/xilinx/xilinx_emaclite.ko] undefined!
ERROR: modpost: "devm_ioremap_resource" 
[drivers/net/ethernet/xilinx/xilinx_emaclite.ko] undefined!
ERROR: modpost: "devm_platform_ioremap_resource_byname" 
[drivers/net/ethernet/xilinx/ll_temac.ko] undefined!
ERROR: modpost: "of_address_to_resource" 
[drivers/net/ethernet/xilinx/ll_temac.ko] undefined!
ERROR: modpost: "devm_platform_ioremap_resource" 
[drivers/net/ethernet/xilinx/ll_temac.ko] undefined!
ERROR: modpost: "devm_of_iomap" [drivers/net/ethernet/xilinx/ll_temac.ko] 
undefined!
ERROR: modpost: "ioremap" [drivers/net/ethernet/smsc/smc91c92_cs.ko] undefined!
ERROR: modpost: "iounmap" [drivers/net/ethernet/smsc/smc91c92_cs.ko] undefined!
ERROR: modpost:

[PATCH v2] mm/gup: check page posion status for coredump.

2021-03-17 Thread Aili Yao

When we dump core for a process killed by a signal, the signal may be a
SIGBUS with a BUS_MCEERR_AR or BUS_MCEERR_AO code, which means it resulted
from an ECC memory failure such as SRAR or SRAO. We expect the memory
recovery work to have finished correctly, in which case get_dump_page()
will not return the error page because the process's pte has been set
invalid by memory_failure().

But memory_failure() may fail, and the process's related pte may not be
correctly set invalid. With the current code, we will then return the
poisoned page, dump it, and panic the system because the access happens
in kernel code.

So check the poison status in get_dump_page(), and if TRUE, return NULL.

There may be other scenarios where it is also better to check the poison
status and not panic, so make a wrapper for this check, as suggested by
David Hildenbrand 

Signed-off-by: Aili Yao 
---
 mm/gup.c  |  4 
 mm/internal.h | 21 +
 2 files changed, 25 insertions(+)

diff --git a/mm/gup.c b/mm/gup.c
index e4c224c..3b4703a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1536,6 +1536,10 @@ struct page *get_dump_page(unsigned long addr)
  FOLL_FORCE | FOLL_DUMP | FOLL_GET);
if (locked)
mmap_read_unlock(mm);
+
+   if (ret == 1 && check_user_page_poison(page))
+   return NULL;
+
return (ret == 1) ? page : NULL;
 }
 #endif /* CONFIG_ELF_CORE */
diff --git a/mm/internal.h b/mm/internal.h
index 25d2b2439..777b3e5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -97,6 +97,27 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
 }
 
+/*
+ * When the kernel touches a user page, that page may already have been
+ * marked poisoned while still being mapped in user space. If the kernel can
+ * guarantee data integrity and a successful operation without this page, it
+ * is better to check the poison status and avoid touching the page rather
+ * than panic; the coredump for a fatal process signal is one case matching
+ * this scenario. If the kernel cannot guarantee data integrity, it is better
+ * not to call this function and instead let the kernel touch the poisoned
+ * page and panic.
+ */
+static inline int check_user_page_poison(struct page *page)
+{
+   if (IS_ENABLED(CONFIG_MEMORY_FAILURE) && page != NULL) {
+   if (unlikely(PageHuge(page) && 
PageHWPoison(compound_head(page))))
+   return true;
+   else if (unlikely(PageHWPoison(page)))
+   return true;
+   }
+   return 0;
+}
+
 extern unsigned long highest_memmap_pfn;
 
 /*
-- 
1.8.3.1

Re: [PATCH] drivers/video/fbdev:modify 0 to NULL

2021-03-17 Thread Gustavo A. R. Silva




On 3/17/21 21:47, Chunyou Tang wrote:

> I think "if (info == NULL)" is more intuitive,and there have many
> compare likes "if (info == NULL)" in this file.

In that case, all those instances should be changed to if (!foo), instead.

--
Gustavo

[PATCH v2] drivers/video/fbdev:modify 'if (addrp == NULL)' to 'if (!addr)

2021-03-17 Thread ChunyouTang

From: tangchunyou 

Modify 'if (addrp == NULL)' to 'if (!addrp)'.

Signed-off-by: tangchunyou 
---
 drivers/video/fbdev/offb.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/video/fbdev/offb.c b/drivers/video/fbdev/offb.c
index cd1042f..52d86e3 100644
--- a/drivers/video/fbdev/offb.c
+++ b/drivers/video/fbdev/offb.c
@@ -305,9 +305,9 @@ static void __iomem *offb_map_reg(struct device_node *np, 
int index,
unsigned int flags;
 
addrp = of_get_pci_address(np, index, , );
-   if (addrp == NULL)
+   if (!addrp)
addrp = of_get_address(np, index, , );
-   if (addrp == NULL)
+   if (!addrp)
return NULL;
if ((flags & (IORESOURCE_IO | IORESOURCE_MEM)) == 0)
return NULL;
@@ -412,7 +412,7 @@ static void __init offb_init_fb(const char *name,
 
info = framebuffer_alloc(sizeof(u32) * 16, NULL);
 
-   if (info == NULL) {
+   if (!info) {
release_mem_region(res_start, res_size);
return;
}
@@ -555,25 +555,25 @@ static void __init offb_init_nodriver(struct device_node 
*dp, int no_real_node)
 #endif
 
pp = of_get_property(dp, "linux,bootx-depth", );
-   if (pp == NULL)
+   if (!pp)
pp = of_get_property(dp, "depth", );
if (pp && len == sizeof(u32))
depth = be32_to_cpup(pp);
 
pp = of_get_property(dp, "linux,bootx-width", );
-   if (pp == NULL)
+   if (!pp)
pp = of_get_property(dp, "width", );
if (pp && len == sizeof(u32))
width = be32_to_cpup(pp);
 
pp = of_get_property(dp, "linux,bootx-height", );
-   if (pp == NULL)
+   if (!pp)
pp = of_get_property(dp, "height", );
if (pp && len == sizeof(u32))
height = be32_to_cpup(pp);
 
pp = of_get_property(dp, "linux,bootx-linebytes", );
-   if (pp == NULL)
+   if (!pp)
pp = of_get_property(dp, "linebytes", );
if (pp && len == sizeof(u32) && (*pp != 0xu))
pitch = be32_to_cpup(pp);
@@ -593,7 +593,7 @@ static void __init offb_init_nodriver(struct device_node 
*dp, int no_real_node)
 * the "address" property. If none match, we pick the biggest
 */
up = of_get_property(dp, "linux,bootx-addr", );
-   if (up == NULL)
+   if (!up)
up = of_get_property(dp, "address", );
if (up && len == sizeof(u32))
addr_prop = *up;
-- 
1.9.1

Re: [PATCH] mm/gup: check page posion status for coredump.

2021-03-17 Thread Aili Yao

On Wed, 17 Mar 2021 10:12:02 +0100
David Hildenbrand  wrote:

> 
> I wonder if a simple
> 
> if (PageHWPoison(compound_head(page)))
>   ret = 0;
> 
> won't suffice. But I guess the "issue" is compound pages that are not 
> huge pages or transparent huge pages.

Yes, the simple check won't suffice, as we mark a hugetlb page as poisoned in
its head page, and in other cases in the specific single page struct.

> If not, we certainly want a wrapper for that magic, otherwise we have to 
> replicate the same logic all over the place.
> 
> > +
> > return (ret == 1) ? page : NULL;
> >   }
> >   #endif /* CONFIG_ELF_CORE */
> >   
> 
> 

Yes, other places may have the same requirements as the coredump case, so it's
better to make a wrapper for this. But I am not familiar with those specific
scenarios, so this patch only covers the coredump case.

I will post a v2 patch for this.

-- 
Thanks!
Aili Yao

Re: A problem of Intel IOMMU hardware ？

2021-03-17 Thread Lu Baolu


Hi Nadav,

On 3/18/21 2:12 AM, Nadav Amit wrote:




On Mar 17, 2021, at 2:35 AM, Longpeng (Mike, Cloud Infrastructure Service Product 
Dept.)  wrote:

Hi Nadav,


-Original Message-
From: Nadav Amit [mailto:nadav.a...@gmail.com]

  reproduce the problem with high probability (~50%).


I saw Lu replied, and he is much more knowledgeable than I am (I was just
intrigued by your email).

However, if I were you I would try also to remove some “optimizations” to look 
for
the root-cause (e.g., use domain specific invalidations instead of 
page-specific).



Good suggestion! But we have already tried that recently; we tried to use
global invalidations as follows:
iommu->flush.flush_iotlb(iommu, did, 0, 0,
DMA_TLB_DSI_FLUSH);
But can not resolve the problem.


The first thing that comes to my mind is the invalidation hint (ih) in
iommu_flush_iotlb_psi(). I would remove it to see whether you get the failure
without it.


We also notice the IH, but the IH is always ZERO in our case, as the spec says:
'''
Paging-structure-cache entries caching second-level mappings associated with 
the specified
domain-id and the second-level-input-address range are invalidated, if the 
Invalidation Hint
(IH) field is Clear.
'''

It seems everything is fine on the software side, so we've no choice but to
suspect the hardware.


Ok, I am pretty much out of ideas. I have two more suggestions, but
they are much less likely to help. Yet, they can further help to rule
out software bugs:

1. dma_clear_pte() seems to be wrong IMHO. It should have used WRITE_ONCE()
to prevent split-write, which might potentially cause “invalid” (partially
cleared) PTE to be stored in the TLB. Having said that, the subsequent
IOTLB flush should have prevented the problem.


Agreed. The pte read/write should use READ/WRITE_ONCE() instead.



2. Consider ensuring that the problem is not somehow related to queued
invalidations. Try to use __iommu_flush_iotlb() instead of
qi_flush_iotlb().

Regards,
Nadav



Best regards,
baolu

Re: [PATCH] drm/amd/display: Remove unnecessary conversion to bool

2021-03-17 Thread Alex Deucher

On Wed, Mar 17, 2021 at 10:37 PM Jiapeng Chong
 wrote:
>
> Fix the following coccicheck warnings:
>
> ./drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c:220:65-70:
> WARNING: conversion to bool not needed here.
>
> Reported-by: Abaci Robot 
> Signed-off-by: Jiapeng Chong 

Applied.  Thanks.  In general, you can just roll up most these bool
conversion patches into larger patches; no need to fix them all one at
a time.

Alex

> ---
>  drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c 
> b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c
> index 8593145..3fe9e41 100644
> --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c
> +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dwb_cm.c
> @@ -217,7 +217,7 @@ static bool dwb3_program_ogam_lut(
> else
> next_mode = LUT_RAM_A;
>
> -   dwb3_configure_ogam_lut(dwbc30, next_mode == LUT_RAM_A ? true : 
> false);
> +   dwb3_configure_ogam_lut(dwbc30, next_mode == LUT_RAM_A);
>
> if (next_mode == LUT_RAM_A)
> dwb3_program_ogam_luta_settings(dwbc30, params);
> --
> 1.8.3.1
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH] drm/amd/display: Remove unnecessary conversion to bool

2021-03-17 Thread Alex Deucher

On Tue, Mar 16, 2021 at 4:09 AM Jiapeng Chong
 wrote:
>
> Fix the following coccicheck warnings:
>
> ./drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c:721:65-70: WARNING:
> conversion to bool not needed here.
>
> ./drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c:1139:67-72: WARNING:
> conversion to bool not needed here.
>
> Reported-by: Abaci Robot 
> Signed-off-by: Jiapeng Chong 

Applied.  Thanks!

Alex

> ---
>  drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c 
> b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c
> index 6e864b1..434d3c4 100644
> --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c
> +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c
> @@ -718,7 +718,7 @@ bool dpp3_program_blnd_lut(
> next_mode = LUT_RAM_B;
>
> dpp3_power_on_blnd_lut(dpp_base, true);
> -   dpp3_configure_blnd_lut(dpp_base, next_mode == LUT_RAM_A ? 
> true:false);
> +   dpp3_configure_blnd_lut(dpp_base, next_mode == LUT_RAM_A);
>
> if (next_mode == LUT_RAM_A)
> dpp3_program_blnd_luta_settings(dpp_base, params);
> @@ -1136,7 +1136,7 @@ bool dpp3_program_shaper(
> else
> next_mode = LUT_RAM_A;
>
> -   dpp3_configure_shaper_lut(dpp_base, next_mode == LUT_RAM_A ? 
> true:false);
> +   dpp3_configure_shaper_lut(dpp_base, next_mode == LUT_RAM_A);
>
> if (next_mode == LUT_RAM_A)
> dpp3_program_shaper_luta_settings(dpp_base, params);
> --
> 1.8.3.1
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: A problem of Intel IOMMU hardware ？

2021-03-17 Thread Lu Baolu


Hi Alex,

On 3/17/21 11:18 PM, Alex Williamson wrote:

  {MAP,   0x0, 0xc000}, - (b)
  use GDB to pause at here, and then DMA read IOVA=0,

IOVA 0 seems to be a special one. Have you verified with other addresses
than IOVA 0?

It is???  That would be a problem.



No problem from hardware point of view as far as I can see. Just
thought about software might handle it specially.

Best regards,
baolu

[PATCH] jffs2: fix kasan slab-out-of-bounds problem

2021-03-17 Thread Zhe Li

From: lizhe 

KASAN reports a slab-out-of-bounds problem. The logs are listed below.
It is because in function jffs2_scan_dirent_node, we alloc "checkedlen+1"
bytes for fd->name and we check crc with length rd->nsize. If checkedlen
is less than rd->nsize, it will cause the slab-out-of-bounds problem.

jffs2: Dirent at *** has zeroes in name. Truncating to %d char
==
BUG: KASAN: slab-out-of-bounds in crc32_le+0x1ce/0x260 at addr 8800842cf2d1
Read of size 1 by task test_JFFS2/915
=
BUG kmalloc-64 (Tainted: GB  O   ): kasan: bad access detected
-
INFO: Allocated in jffs2_alloc_full_dirent+0x2a/0x40 age=0 cpu=1 pid=915
___slab_alloc+0x580/0x5f0
__slab_alloc.isra.24+0x4e/0x64
__kmalloc+0x170/0x300
jffs2_alloc_full_dirent+0x2a/0x40
jffs2_scan_eraseblock+0x1ca4/0x3b64
jffs2_scan_medium+0x285/0xfe0
jffs2_do_mount_fs+0x5fb/0x1bbc
jffs2_do_fill_super+0x245/0x6f0
jffs2_fill_super+0x287/0x2e0
mount_mtd_aux.isra.0+0x9a/0x144
mount_mtd+0x222/0x2f0
jffs2_mount+0x41/0x60
mount_fs+0x63/0x230
vfs_kern_mount.part.6+0x6c/0x1f4
do_mount+0xae8/0x1940
SyS_mount+0x105/0x1d0
INFO: Freed in jffs2_free_full_dirent+0x22/0x40 age=27 cpu=1 pid=915
__slab_free+0x372/0x4e4
kfree+0x1d4/0x20c
jffs2_free_full_dirent+0x22/0x40
jffs2_build_remove_unlinked_inode+0x17a/0x1e4
jffs2_do_mount_fs+0x1646/0x1bbc
jffs2_do_fill_super+0x245/0x6f0
jffs2_fill_super+0x287/0x2e0
mount_mtd_aux.isra.0+0x9a/0x144
mount_mtd+0x222/0x2f0
jffs2_mount+0x41/0x60
mount_fs+0x63/0x230
vfs_kern_mount.part.6+0x6c/0x1f4
do_mount+0xae8/0x1940
SyS_mount+0x105/0x1d0
entry_SYSCALL_64_fastpath+0x1e/0x97
Call Trace:
 [] dump_stack+0x59/0x7e
 [] print_trailer+0x125/0x1b0
 [] object_err+0x34/0x40
 [] kasan_report.part.1+0x21f/0x534
 [] ? vprintk+0x2d/0x40
 [] ? crc32_le+0x1ce/0x260
 [] kasan_report+0x26/0x30
 [] __asan_load1+0x3d/0x50
 [] crc32_le+0x1ce/0x260
 [] ? jffs2_alloc_full_dirent+0x2a/0x40
 [] jffs2_scan_eraseblock+0x1d0c/0x3b64
 [] ? jffs2_scan_medium+0xccf/0xfe0
 [] ? jffs2_scan_make_ino_cache+0x14c/0x14c
 [] ? kasan_unpoison_shadow+0x35/0x50
 [] ? kasan_unpoison_shadow+0x35/0x50
 [] ? kasan_kmalloc+0x5e/0x70
 [] ? kmem_cache_alloc_trace+0x10c/0x2cc
 [] ? mtd_point+0xf7/0x130
 [] jffs2_scan_medium+0x285/0xfe0
 [] ? jffs2_scan_eraseblock+0x3b64/0x3b64
 [] ? kasan_unpoison_shadow+0x35/0x50
 [] ? kasan_unpoison_shadow+0x35/0x50
 [] ? kasan_kmalloc+0x5e/0x70
 [] ? __kmalloc+0x12b/0x300
 [] ? kasan_kmalloc+0x5e/0x70
 [] ? jffs2_sum_init+0x9f/0x240
 [] jffs2_do_mount_fs+0x5fb/0x1bbc
 [] ? jffs2_del_noinode_dirent+0x640/0x640
 [] ? kasan_kmalloc+0x5e/0x70
 [] ? __init_rwsem+0x97/0xac
 [] jffs2_do_fill_super+0x245/0x6f0
 [] jffs2_fill_super+0x287/0x2e0
 [] ? jffs2_parse_options+0x594/0x594
 [] mount_mtd_aux.isra.0+0x9a/0x144
 [] mount_mtd+0x222/0x2f0
 [] ? jffs2_parse_options+0x594/0x594
 [] ? mount_mtd_aux.isra.0+0x144/0x144
 [] ? free_pages+0x13/0x1c
 [] ? selinux_sb_copy_data+0x278/0x2e0
 [] jffs2_mount+0x41/0x60
 [] mount_fs+0x63/0x230
 [] ? alloc_vfsmnt+0x32f/0x3b0
 [] vfs_kern_mount.part.6+0x6c/0x1f4
 [] do_mount+0xae8/0x1940
 [] ? audit_filter_rules.constprop.6+0x1d10/0x1d10
 [] ? copy_mount_string+0x40/0x40
 [] ? alloc_pages_current+0xa4/0x1bc
 [] ? __get_free_pages+0x25/0x50
 [] ? copy_mount_options.part.17+0x183/0x264
 [] SyS_mount+0x105/0x1d0
 [] ? copy_mnt_ns+0x560/0x560
 [] ? msa_space_switch_handler+0x13d/0x190
 [] entry_SYSCALL_64_fastpath+0x1e/0x97
 [] ? msa_space_switch+0xb0/0xe0
Memory state around the buggy address:
 8800842cf180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 8800842cf200: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>8800842cf280: fc fc fc fc fc fc 00 00 00 00 01 fc fc fc fc fc
 ^
 8800842cf300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 8800842cf380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
==

Reported-by: Kunkun Xu 
Signed-off-by: lizhe 
---
 fs/jffs2/scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index db72a9d..b676056 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1079,7 +1079,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info 
*c, struct jffs2_eraseblo
memcpy(>name, rd->name, checkedlen);
fd->name[checkedlen] = 0;
 
-   crc = crc32(0, fd->name, rd->nsize);
+   crc = crc32(0, fd->name, checkedlen);
if (crc != je32_to_cpu(rd->name_crc)) {
pr_notice("%s(): Name CRC failed on node at 0x%08x: Read

Re: Re: [PATCH] sched: swait: use wake_up_process() instead of wake_up_state()

2021-03-17 Thread Mike Galbraith

On Thu, 2021-03-18 at 10:14 +0800, 王擎 wrote:
> >>
> >> * Mike Galbraith  wrote:
> >>
> >> > On Tue, 2021-03-16 at 19:20 +0800, Wang Qing wrote:
> >> > > Why not just use wake_up_process().
> >> >
> >> > IMO this is not an improvement.  There are other places where explicit
> >> > TASK_NORMAL is used as well, and they're all perfectly clear as is.
> >>
> >> Arguably those could all be converted to wake_up_process() as well.
> >> It's a very small kernel code size optimization. There's about 3 such
> >> places, could be converted in a single patch.
> >
> >It's still pointless churn IMO.
>
> Using wake_up_process() is simpler and friendlier for beginners,
> and it is more convenient for analysis and statistics.

If that's your argument, that should have been in the change log. That
said, it's IMO still pretty darn weak. When presenting a patch, do what
Ingo did, show the technical merit, that's what will determine whether
it flies or dies.

-Mike

Re: [PATCH 10/10] arm64: dts: Add Mediatek SoC MT8195 and evaluation board dts and Makefile

2021-03-17 Thread Chunfeng Yun

On Tue, 2021-03-16 at 19:14 +0800, Seiya Wang wrote:
> Add basic chip support for Mediatek MT8195
> 
> Signed-off-by: Seiya Wang 
> ---
>  arch/arm64/boot/dts/mediatek/Makefile   |   1 +
>  arch/arm64/boot/dts/mediatek/mt8195-evb.dts |  29 ++
>  arch/arm64/boot/dts/mediatek/mt8195.dtsi| 477 
> 
>  3 files changed, 507 insertions(+)
>  create mode 100644 arch/arm64/boot/dts/mediatek/mt8195-evb.dts
>  create mode 100644 arch/arm64/boot/dts/mediatek/mt8195.dtsi
> 
> diff --git a/arch/arm64/boot/dts/mediatek/Makefile 
> b/arch/arm64/boot/dts/mediatek/Makefile
> index deba27ab7657..aee4b9715d2f 100644
> --- a/arch/arm64/boot/dts/mediatek/Makefile
> +++ b/arch/arm64/boot/dts/mediatek/Makefile
> @@ -16,4 +16,5 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-evb.dtb
>  dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-kukui-krane-sku0.dtb
>  dtb-$(CONFIG_ARCH_MEDIATEK) += mt8183-kukui-krane-sku176.dtb
>  dtb-$(CONFIG_ARCH_MEDIATEK) += mt8192-evb.dtb
> +dtb-$(CONFIG_ARCH_MEDIATEK) += mt8195-evb.dtb
>  dtb-$(CONFIG_ARCH_MEDIATEK) += mt8516-pumpkin.dtb
> diff --git a/arch/arm64/boot/dts/mediatek/mt8195-evb.dts 
> b/arch/arm64/boot/dts/mediatek/mt8195-evb.dts
> new file mode 100644
> index ..82bb10e9a531
> --- /dev/null
> +++ b/arch/arm64/boot/dts/mediatek/mt8195-evb.dts
> @@ -0,0 +1,29 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/*
> + * Copyright (C) 2021 MediaTek Inc.
> + * Author: Seiya Wang 
> + */
> +/dts-v1/;
> +#include "mt8195.dtsi"
> +
> +/ {
> + model = "MediaTek MT8195 evaluation board";
> + compatible = "mediatek,mt8195-evb", "mediatek,mt8195";
> +
> + aliases {
> + serial0 = 
> + };
> +
> + chosen {
> + stdout-path = "serial0:921600n8";
> + };
> +
> + memory@4000 {
> + device_type = "memory";
> + reg = <0 0x4000 0 0x8000>;
> + };
> +};
> +
> + {
> + status = "okay";
> +};
> diff --git a/arch/arm64/boot/dts/mediatek/mt8195.dtsi 
> b/arch/arm64/boot/dts/mediatek/mt8195.dtsi
> new file mode 100644
> index ..356583fe4f03
> --- /dev/null
> +++ b/arch/arm64/boot/dts/mediatek/mt8195.dtsi
> @@ -0,0 +1,477 @@
> +// SPDX-License-Identifier: (GPL-2.0 OR MIT)
> +/*
> + * Copyright (c) 2021 MediaTek Inc.
> + * Author: Seiya Wang 
> + */
> +
> +/dts-v1/;
> +
> +#include 
> +#include 
> +
> +/ {
> + compatible = "mediatek,mt8195";
> + interrupt-parent = <>;
> + #address-cells = <2>;
> + #size-cells = <2>;
> +
> + clocks {
> + clk26m: oscillator0 {
> + compatible = "fixed-clock";
> + #clock-cells = <0>;
> + clock-frequency = <2600>;
> + clock-output-names = "clk26m";
> + };
> +
> + clk32k: oscillator1 {
> + compatible = "fixed-clock";
> + #clock-cells = <0>;
> + clock-frequency = <32768>;
> + clock-output-names = "clk32k";
> + };
> + };
[...]
> +
> + nor_flash: nor@1132c000 {
> + compatible = "mediatek,mt8195-nor", 
> "mediatek,mt8173-nor";
> + reg = <0 0x1132c000 0 0x1000>;
> + interrupts = ;
> + clocks = <>, <>;
> + clock-names = "spi", "sf";
> + #address-cells = <1>;
> + #size-cells = <0>;
> + status = "disabled";
> + };
> +
> + u3phy2: usb-phy2@11c4 {
use t-phy instead of usb-phy2

It's better to run dtbs_check for this patch

> + compatible = "mediatek,mt8195-tphy", 
> "mediatek,generic-tphy-v2";
> + clocks = <>;
> + clock-names = "u3phya_ref";
No need clocks for v2
> + #address-cells = <1>;
> + #size-cells = <1>;
> + ranges = <0 0 0x11c4 0x700>;
> + status = "disabled";
> +
> + u2port2: usb2-phy2@0 {
use usb-phy instead of usb2-phy2

> + reg = <0x0 0x700>;
> + clocks = <>;
> + clock-names = "ref";
> + #phy-cells = <1>;
> + status = "disabled";
I think no need disable it
it's parent node is already disabled. if enable parent node,
we also want to enable all children at the same time.

> + };
> + };
> +
> + u3phy3: usb-phy3@11c5 {
t-phy@...
> + compatible = "mediatek,mt8195-tphy", 
> "mediatek,generic-tphy-v2";
> + clocks = <>;
> + clock-names = "u3phya_ref";
No need clocks
> + #address-cells = <1>;
> + #size-cells = <1>;
> + ranges = <0 0 0x11c5 0x700>;
> +

Re: [PATCH] drivers/video/fbdev:modify 0 to NULL

2021-03-17 Thread Chunyou Tang

Hi,Gustavo

On Wed, 17 Mar 2021 20:54:41 -0500
"Gustavo A. R. Silva"  wrote:

> On 3/17/21 21:47, Chunyou Tang wrote:
> 
> > I think "if (info == NULL)" is more intuitive,and there have many
> > compare likes "if (info == NULL)" in this file.  
> 
> In that case, all those instances should be changed to if (!foo),
> instead.
> 
> --
> Gustavo

OK,I change it.

--
ChunyouTang

Re: [PATCH v6 3/8] securtiy/brute: Detect a brute force attack

2021-03-17 Thread Kees Cook

On Sun, Mar 07, 2021 at 12:30:26PM +0100, John Wood wrote:
> To detect a brute force attack it is necessary that the statistics
> shared by all the fork hierarchy processes be updated in every fatal
> crash and the most important data to update is the application crash
> period. To do so, use the new "task_fatal_signal" LSM hook added in a
> previous step.
> 
> The application crash period must be a value that is not prone to change
> due to spurious data and follows the real crash period. So, to compute
> it, the exponential moving average (EMA) is used.
> 
> There are two types of brute force attacks that need to be detected. The
> first one is an attack that happens through the fork system call and the
> second one is an attack that happens through the execve system call. The
> first type uses the statistics shared by all the fork hierarchy
> processes, but the second type cannot use this statistical data due to
> these statistics disappear when the involved tasks finished. In this
> last scenario the attack info should be tracked by the statistics of a
> higher fork hierarchy (the hierarchy that contains the process that
> forks before the execve system call).
> 
> Moreover, these two attack types have two variants. A slow brute force
> attack that is detected if the maximum number of faults per fork
> hierarchy is reached and a fast brute force attack that is detected if
> the application crash period falls below a certain threshold.
> 
> Also, this patch adds locking to protect the statistics pointer hold by
> every process.
> 
> Signed-off-by: John Wood 
> ---
>  security/brute/brute.c | 498 +++--
>  1 file changed, 479 insertions(+), 19 deletions(-)
> 
> diff --git a/security/brute/brute.c b/security/brute/brute.c
> index 99d099e45112..870db55332d4 100644
> --- a/security/brute/brute.c
> +++ b/security/brute/brute.c
> @@ -11,9 +11,14 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -37,6 +42,11 @@ struct brute_stats {
>   u64 period;
>  };
> 
> +/*
> + * brute_stats_ptr_lock - Lock to protect the brute_stats structure pointer.
> + */
> +static DEFINE_RWLOCK(brute_stats_ptr_lock);

Yeow, you've switched from an (unneeded in prior patch) per-stats lock
to a global lock? I think this isn't needed...

> +
>  /*
>   * brute_blob_sizes - LSM blob sizes.
>   *
> @@ -74,7 +84,7 @@ static struct brute_stats *brute_new_stats(void)
>  {
>   struct brute_stats *stats;
> 
> - stats = kmalloc(sizeof(struct brute_stats), GFP_KERNEL);
> + stats = kmalloc(sizeof(struct brute_stats), GFP_ATOMIC);

Why change this here? I'd just start with this in the patch that
introduces it.

>   if (!stats)
>   return NULL;
> 
> @@ -99,16 +109,17 @@ static struct brute_stats *brute_new_stats(void)
>   * It's mandatory to disable interrupts before acquiring the 
> brute_stats::lock
>   * since the task_free hook can be called from an IRQ context during the
>   * execution of the task_alloc hook.
> + *
> + * Context: Must be called with interrupts disabled and brute_stats_ptr_lock
> + *  held.
>   */
>  static void brute_share_stats(struct brute_stats *src,
> struct brute_stats **dst)
>  {
> - unsigned long flags;
> -
> - spin_lock_irqsave(>lock, flags);
> + spin_lock(>lock);
>   refcount_inc(>refc);
>   *dst = src;
> - spin_unlock_irqrestore(>lock, flags);
> + spin_unlock(>lock);
>  }

I still don't think any locking is needed here; the whole function can
go away, IMO.

> 
>  /**
> @@ -126,26 +137,36 @@ static void brute_share_stats(struct brute_stats *src,
>   * this task and the new one being allocated. Otherwise, share the statistics
>   * that the current task already has.
>   *
> + * It's mandatory to disable interrupts before acquiring brute_stats_ptr_lock
> + * and brute_stats::lock since the task_free hook can be called from an IRQ
> + * context during the execution of the task_alloc hook.
> + *
>   * Return: -ENOMEM if the allocation of the new statistics structure fails. 
> Zero
>   * otherwise.
>   */
>  static int brute_task_alloc(struct task_struct *task, unsigned long 
> clone_flags)
>  {
>   struct brute_stats **stats, **p_stats;
> + unsigned long flags;
> 
>   stats = brute_stats_ptr(task);
>   p_stats = brute_stats_ptr(current);
> + write_lock_irqsave(_stats_ptr_lock, flags);
> 
>   if (likely(*p_stats)) {
>   brute_share_stats(*p_stats, stats);
> + write_unlock_irqrestore(_stats_ptr_lock, flags);
>   return 0;
>   }
> 
>   *stats = brute_new_stats();
> - if (!*stats)
> + if (!*stats) {
> + write_unlock_irqrestore(_stats_ptr_lock, flags);
>   return -ENOMEM;
> + }
> 
>   brute_share_stats(*stats, p_stats);
> +

Re: [PATCH v2] smp: kernel/panic.c - silence warnings

2021-03-17 Thread heying (H)




在 2021/3/18 4:09, Ingo Molnar 写道:

* Peter Zijlstra  wrote:


Now, the C people figured that distinction was useless and allowed
sloppiness. But I still think there's merit to that. And as
mentioned earlier, it is consistent with variable declarations.

Fully agreed, and my other point was that it's also consistent with
the other existing externs were used *in the same header file*
already.

I.e. there's nothing more sloppy than mixing different styles within
the same header. Checkpatch needs to be fixed or ignored here.


Thank you all for the reply!

There are already mixed styles within linux/smp.h: both 'extern' and
non-'extern' function declarations exist in this header. Since two of
you three think that 'extern' is needed, I'll add it and resend my patch.


Thanks again.

Re: [PATCH 3/5] dt-bindings: remoteproc: Add the documentation for Meson AO ARC rproc

2021-03-17 Thread Bjorn Andersson

On Tue 29 Dec 19:27 CST 2020, Martin Blumenstingl wrote:

> Amlogic Meson6, Meson8, Meson8b and Meson8m2 SoCs embed an ARC EM4
> controller for always-on operations, typically used for managing system
> suspend.
> 
> Signed-off-by: Martin Blumenstingl 
> ---
>  .../remoteproc/amlogic,meson-mx-ao-arc.yaml   | 87 +++
>  1 file changed, 87 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/remoteproc/amlogic,meson-mx-ao-arc.yaml
> 
> diff --git 
> a/Documentation/devicetree/bindings/remoteproc/amlogic,meson-mx-ao-arc.yaml 
> b/Documentation/devicetree/bindings/remoteproc/amlogic,meson-mx-ao-arc.yaml
> new file mode 100644
> index ..ba5deebaf7dc
> --- /dev/null
> +++ 
> b/Documentation/devicetree/bindings/remoteproc/amlogic,meson-mx-ao-arc.yaml
> @@ -0,0 +1,87 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: "http://devicetree.org/schemas/remoteproc/amlogic,meson-mx-ao-arc.yaml#;
> +$schema: "http://devicetree.org/meta-schemas/core.yaml#;
> +
> +title: Amlogic Meson AO ARC Remote Processor bindings
> +
> +description:
> +  Amlogic Meson6, Meson8, Meson8b and Meson8m2 SoCs embed an ARC core
> +  controller for always-on operations, typically used for managing
> +  system suspend. Meson6 and older use a ARC core based on the ARCv1
> +  ISA, while Meson8, Meson8b and Meson8m2 use an ARC EM4 (ARCv2 ISA)
> +  core.
> +
> +maintainers:
> +  - Martin Blumenstingl 
> +
> +properties:
> +  compatible:
> +items:
> +  - enum:
> +- amlogic,meson8-ao-arc
> +- amlogic,meson8b-ao-arc
> +  - const: amlogic,meson-mx-ao-arc
> +
> +  firmware-name:
> +$ref: /schemas/types.yaml#/definitions/string
> +description:
> +  The name of the firmware which should be loaded for this remote
> +  processor.
> +
> +  reg:
> +description:
> +  Address ranges of the remap and CPU control addresses for the
> +  remote processor.
> +minItems: 2
> +
> +  reg-names:
> +items:
> +  - const: remap
> +  - const: cpu
> +
> +  resets:
> + minItems: 1
> +
> +  clocks:
> +minItems: 1
> +
> +  sram:
> +$ref: /schemas/types.yaml#/definitions/phandle
> +description:
> +  phandles to a reserved SRAM region which is used as the memory of
> +  the ARC core. The region should be defined as child nodes of the
> +  AHB SRAM node as per the generic bindings in
> +  Documentation/devicetree/bindings/sram/sram.yaml
> +
> +  amlogic,secbus2:
> +$ref: /schemas/types.yaml#/definitions/phandle
> +description:
> +  A phandle to the SECBUS2 region which contains some configuration
> +  bits of this remote processor
> +
> +required:
> +  - compatible
> +  - reg
> +  - reg-names
> +  - resets
> +  - clocks
> +  - sram
> +  - amlogic,secbus2
> +
> +additionalProperties: false
> +
> +examples:
> +  - |
> +remoteproc@1c {
> +  compatible= "amlogic,meson8-ao-arc", "amlogic,meson-mx-ao-arc";
> +  reg = <0x1c 0x8>, <0x38 0x8>;

I'm generally not in favor of mapping "individual" registers, do you
know what hardware block this is part of? Can you express the whole
block as an single entity in your DT?

Regards,
Bjorn

> +  reg-names = "remap", "cpu";
> +  resets = <_cpu_reset>;
> +  clocks = <_cpu_clock>;
> +  sram = <_sram_ao_arc>;
> +  amlogic,secbus2 = <>;
> +};
> +
> +...
> -- 
> 2.30.0
>

Re: [PATCH] scripts: Fix incremental build header re-generation

2021-03-17 Thread Jeevan Shriram




On 3/1/2021 7:36 PM, Masahiro Yamada wrote:

On Mon, Mar 1, 2021 at 11:23 PM Jeevan Shriram  wrote:

compile.h and autoconf.h are ignored when checking headers sha as they
are always re-generated for every kernel compilation. However,
these two headers are packaged into kheaders tar. During incremental
compilation of kernel, kheaders tar file is always generated and re-packaged
irrespective of the changes in headers.


I do not see this problem.
Could you describe the steps to reproduce it, please?


Without making any changes in the kernel or its headers, re-compile the
kernel, i.e., do an incremental kernel build without any changes.
I have added the following log to the gen_kheaders.sh script to confirm the
hash differences.

diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index b7425a0..ee542a0 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -40,6 +40,10 @@ obj_files_md5="$(find $dir_list -name "*.h"  
   |
 # Any changes to this script will also cause a rebuild of the archive.
 this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
 if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
+
+echo "Old Tar file $tarfile_md5"
+echo "New Tar file hash $this_file_md5"
+
 if [ -f kernel/kheaders.md5 ] &&
[ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
[ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&

log output :
89306 19:29:02.109961   CHK kernel/kheaders_data.tar.xz
89307 19:29:02.109971 Old Tar file 2aa6990e4183c31a862951f4bcac037e
89308 19:29:02.109982 New Tar file hash ecf84e700c7cacfe8b35a0905859582d


Change-Id: I7a64faebb81df44c32230b0fea1d6df09d7ce66f
Signed-off-by: Jeevan Shriram 
---
  kernel/gen_kheaders.sh | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index c1510f0..5499f72 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -51,8 +51,7 @@ this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)"
  if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
  if [ -f kernel/kheaders.md5 ] &&
 [ "$(head -n 1 kernel/kheaders.md5)" = "$headers_md5" ] &&
-   [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ] &&
-   [ "$(tail -n 1 kernel/kheaders.md5)" = "$tarfile_md5" ]; then
+   [ "$(head -n 2 kernel/kheaders.md5 | tail -n 1)" = "$this_file_md5" ]; 
then
 exit
  fi

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project

Re: [PATCH 3/4] locking/ww_mutex: Treat ww_mutex_lock() like a trylock

2021-03-17 Thread Waiman Long


On 3/17/21 10:24 PM, Boqun Feng wrote:

Hi Waiman,

Just a question out of curiosity: how did this problem stay hidden for so long?
;-) Because IIUC, both locktorture and ww_mutex_lock have been there for
a while, so why didn't we spot this earlier?

I ask just to make sure we don't introduce the problem because of some
subtle problems in lock(dep).

You have to explicitly specify ww_mutex in the locktorture module
parameter to run the test. ww_mutex is usually not the intended target
of testing as there aren't that many places that use it. Even if someone
runs it, it is probably not on a debug kernel.


Our QA people tried to run locktorture on ww_mutex and discovered that.

Cheers,
Longman

[PATCH 2/2] Documentation: filesystems api-summary: add namespace.c

2021-03-17 Thread Randy Dunlap

Add fs/namespace.c to the filesystems api-summary docbook.

Signed-off-by: Randy Dunlap 
Cc: Alexander Viro 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
---
 Documentation/filesystems/api-summary.rst |3 +++
 1 file changed, 3 insertions(+)

--- linux-next-20210317.orig/Documentation/filesystems/api-summary.rst
+++ linux-next-20210317/Documentation/filesystems/api-summary.rst
@@ -101,6 +101,9 @@ Other Functions
 .. kernel-doc:: fs/xattr.c
:export:
 
+.. kernel-doc:: fs/namespace.c
+   :export:
+
 The proc filesystem
 ===

[PATCH 1/2] fs/namespace: correct/improve kernel-doc notation

2021-03-17 Thread Randy Dunlap

Fix kernel-doc warnings in fs/namespace.c:

./fs/namespace.c:1379: warning: Function parameter or member 'm' not described 
in 'may_umount_tree'
./fs/namespace.c:1379: warning: Excess function parameter 'mnt' description in 
'may_umount_tree'
./fs/namespace.c:1950: warning: Function parameter or member 'path' not 
described in 'clone_private_mount'

Also convert path_is_mountpoint() comments to kernel-doc.

Signed-off-by: Randy Dunlap 
Cc: Al Viro 
Cc: Jonathan Corbet 
Cc: linux-...@vger.kernel.org
---
Jon, Al has OK-ed you to merge this patch (and the next one, please).

 fs/namespace.c |   14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

--- linux-next-20210317.orig/fs/namespace.c
+++ linux-next-20210317/fs/namespace.c
@@ -1242,8 +1242,9 @@ struct vfsmount *mntget(struct vfsmount
 }
 EXPORT_SYMBOL(mntget);
 
-/* path_is_mountpoint() - Check if path is a mount in the current
- *  namespace.
+/**
+ * path_is_mountpoint() - Check if path is a mount in the current namespace.
+ * @path: path to check
  *
  *  d_mountpoint() can only be used reliably to establish if a dentry is
  *  not mounted in any namespace and that common case is handled inline.
@@ -1369,7 +1370,7 @@ void mnt_cursor_del(struct mnt_namespace
 
 /**
  * may_umount_tree - check if a mount tree is busy
- * @mnt: root of mount tree
+ * @m: root of mount tree
  *
  * This is called to check if a tree of mounts has any
  * open files, pwds, chroots or sub mounts that are
@@ -1939,10 +1940,11 @@ void drop_collected_mounts(struct vfsmou
 
 /**
  * clone_private_mount - create a private clone of a path
+ * @path: path to clone
  *
- * This creates a new vfsmount, which will be the clone of @path.  The new will
- * not be attached anywhere in the namespace and will be private (i.e. changes
- * to the originating mount won't be propagated into this).
+ * This creates a new vfsmount, which will be the clone of @path.  The new 
mount
+ * will not be attached anywhere in the namespace and will be private (i.e.
+ * changes to the originating mount won't be propagated into this).
  *
  * Release with mntput().
  */

Re: [PATCH 4/5] remoteproc: meson-mx-ao-arc: Add a driver for the AO ARC remote processor

2021-03-17 Thread Bjorn Andersson

On Tue 29 Dec 19:27 CST 2020, Martin Blumenstingl wrote:

> Amlogic Meson6, Meson8, Meson8b and Meson8m2 embed an ARC core in the
> Always-On (AO) power-domain. This is typically used for waking up the
> ARM cores after system suspend.
> 
> The configuration is spread across three different registers:
> - AO_REMAP_REG0 which must be programmed to zero, its actual purpose
>   is unknown. There is a second remap register which is not used in the
>   vendor kernel (which served as reference for this driver).
> - AO_CPU_CNTL is used to start and stop the ARC core.
> - AO_SECURE_REG0 in the SECBUS2 register area with unknown purpose.
> 
> To boot the ARC core we also need to enable its gate clock and trigger
> a reset.
> 
> The actual code for this ARC core can come from an ELF binary, for
> example by building the Zephyr RTOS for an ARC EM4 core and then taking
> "zephyr.elf" as firmware. This executable does not have any "rsc table"
> so we are skipping rproc_elf_load_rsc_table (rproc_ops.parse_fw) and
> rproc_elf_find_loaded_rsc_table (rproc_ops.find_loaded_rsc_table).
> 

Thanks for the patch Martin, it looks really good. Just some minor
things as I expect a respin of the DT binding as well.

> Signed-off-by: Martin Blumenstingl 
> ---
>  drivers/remoteproc/Kconfig   |  11 ++
>  drivers/remoteproc/Makefile  |   1 +
>  drivers/remoteproc/meson_mx_ao_arc.c | 240 +++
>  3 files changed, 252 insertions(+)
>  create mode 100644 drivers/remoteproc/meson_mx_ao_arc.c
> 
> diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
> index 9e7efe542f69..0e7fb91635fe 100644
> --- a/drivers/remoteproc/Kconfig
> +++ b/drivers/remoteproc/Kconfig
> @@ -125,6 +125,17 @@ config KEYSTONE_REMOTEPROC
> It's safe to say N here if you're not interested in the Keystone
> DSPs or just want to use a bare minimum kernel.
>  
> +config MESON_MX_AO_ARC_REMOTEPROC
> + tristate "Amlogic Meson6/8/8b/8m2 AO ARC remote processor support"
> + depends on HAS_IOMEM
> + depends on (ARM && ARCH_MESON) || COMPILE_TEST
> + select GENERIC_ALLOCATOR
> + help
> +   Say m or y here to have support for the AO ARC remote processor
> +   on Amlogic Meson6/Meson8/Meson8b/Meson8m2 SoCs. This is
> +   typically used for system suspend.
> +   If unsure say N.
> +
>  config PRU_REMOTEPROC
>   tristate "TI PRU remoteproc support"
>   depends on TI_PRUSS
> diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile
> index bb26c9e4ef9c..ce1abeb30907 100644
> --- a/drivers/remoteproc/Makefile
> +++ b/drivers/remoteproc/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_OMAP_REMOTEPROC)   += 
> omap_remoteproc.o
>  obj-$(CONFIG_WKUP_M3_RPROC)  += wkup_m3_rproc.o
>  obj-$(CONFIG_DA8XX_REMOTEPROC)   += da8xx_remoteproc.o
>  obj-$(CONFIG_KEYSTONE_REMOTEPROC)+= keystone_remoteproc.o
> +obj-$(CONFIG_MESON_MX_AO_ARC_REMOTEPROC)+= meson_mx_ao_arc.o
>  obj-$(CONFIG_PRU_REMOTEPROC) += pru_rproc.o
>  obj-$(CONFIG_QCOM_PIL_INFO)  += qcom_pil_info.o
>  obj-$(CONFIG_QCOM_RPROC_COMMON)  += qcom_common.o
> diff --git a/drivers/remoteproc/meson_mx_ao_arc.c 
> b/drivers/remoteproc/meson_mx_ao_arc.c
> new file mode 100644
> index ..1deb03ca30f4
> --- /dev/null
> +++ b/drivers/remoteproc/meson_mx_ao_arc.c
> @@ -0,0 +1,240 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (C) 2020 Martin Blumenstingl 
> 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "remoteproc_internal.h"
> +
> +#define AO_REMAP_REG0  0x0
> +#define AO_REMAP_REG1  0x4
> +
> +#define AO_CPU_CNTL  0x0
> + #define AO_CPU_CNTL_MEM_ADDR_UPPER  GENMASK(28, 16)
> + #define AO_CPU_CNTL_HALT  BIT(9)
> + #define AO_CPU_CNTL_UNKNONWN  BIT(8)
> + #define AO_CPU_CNTL_RUN BIT(0)
> +
> +#define AO_CPU_STAT  0x4
> +
> +#define AO_SECURE_REG0   0x0
> + #define AO_SECURE_REG0_UNKNOWN  GENMASK(23, 8)
> +
> +#define MESON_AO_RPROC_SRAM_USABLE_BITS  GENMASK(31, 20)
> +#define MESON_AO_RPROC_MEMORY_OFFSET 0x1000
> +
> +struct meson_mx_ao_arc_rproc_priv {
> + void __iomem*remap_base;
> + void __iomem*cpu_base;
> + unsigned long   sram_va;
> + phys_addr_t sram_pa;
> + size_t  sram_size;
> + struct gen_pool *sram_pool;
> + struct reset_control*arc_reset;
> + struct clk  *arc_pclk;
> + struct regmap

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1440 matches

Mail list logo