[PATCH] mm: trivial mark_page_accessed() cleanup
This avoids duplicated PageReferenced() calls. No behavior change.

Signed-off-by: Fengguang Wu
---
 mm/swap.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 38a52b9..c55720c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -373,9 +373,15 @@ void mark_page_accessed(struct page *page)
 	page = compound_head(page);
 	inc_node_page_state(page, NR_ACCESSED);
 
-	if (!PageActive(page) && !PageUnevictable(page) &&
-			PageReferenced(page)) {
-
+	if (!PageReferenced(page)) {
+		SetPageReferenced(page);
+	} else if (PageUnevictable(page)) {
+		/*
+		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
+		 * this list is never rotated or maintained, so marking an
+		 * unevictable page accessed has no effect.
+		 */
+	} else if (!PageActive(page)) {
 		/*
 		 * If the page is on the LRU, queue it for activation via
 		 * activate_page_pvecs. Otherwise, assume the page is on a
@@ -389,8 +395,6 @@ void mark_page_accessed(struct page *page)
 		ClearPageReferenced(page);
 		if (page_is_file_cache(page))
 			workingset_activation(page);
-	} else if (!PageReferenced(page)) {
-		SetPageReferenced(page);
 	}
 	if (page_is_idle(page))
 		clear_page_idle(page);
-- 
2.7.4
Re: [PATCH for vm-scalability] usemem: Add new option -Z|--read-again
Applied, thanks Teawater!

On Sat, Sep 14, 2019 at 11:07:18AM +0800, Hui Zhu wrote:
>With this option, usemem will read the memory again after accessing
>it. It helps test the speed of loading pages from swap back to memory.
>
>Signed-off-by: Hui Zhu
>---
> usemem.c | 46 ++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 40 insertions(+), 6 deletions(-)
>
>diff --git a/usemem.c b/usemem.c
>index 264d52a..2d31946 100644
>--- a/usemem.c
>+++ b/usemem.c
>@@ -94,6 +94,7 @@ int opt_sync_rw = 0;
> int opt_sync_free = 0;
> int opt_bind_interval = 0;
> unsigned long opt_delay = 0;
>+int opt_read_again = 0;
> int nr_task;
> int nr_thread;
> int nr_cpu;
>@@ -151,6 +152,7 @@ void usage(int ok)
> 	"-e|--delay	delay for each page in ns\n"
> 	"-O|--anonymous	mmap with MAP_ANONYMOUS\n"
> 	"-U|--hugetlb	allocate hugetlbfs page\n"
>+	"-Z|--read-again	read memory again after access the memory\n"
> 	"-h|--help	show this message\n"
> 	, ourname);
>@@ -188,6 +190,7 @@ static const struct option opts[] = {
> 	{ "sync-rw"	, 0, NULL, 'y' },
> 	{ "delay"	, 1, NULL, 'e' },
> 	{ "hugetlb"	, 0, NULL, 'U' },
>+	{ "read-again"	, 0, NULL, 'Z' },
> 	{ "help"	, 0, NULL, 'h' },
> 	{ NULL		, 0, NULL, 0 }
> };
>@@ -616,7 +619,7 @@ unsigned long do_unit(unsigned long bytes, struct drand48_data *rand_data,
> 	return rw_bytes;
> }
> 
>-static void output_statistics(unsigned long unit_bytes)
>+static void output_statistics(unsigned long unit_bytes, const char *intro)
> {
> 	struct timeval stop;
> 	char buf[1024];
>@@ -629,8 +632,8 @@ static void output_statistics(unsigned long unit_bytes)
> 		   (stop.tv_usec - start_time.tv_usec);
> 	throughput = ((unit_bytes * 1000000ULL) >> 10) / delta_us;
> 	len = snprintf(buf, sizeof(buf),
>-			"%lu bytes / %lu usecs = %lu KB/s\n",
>-			unit_bytes, delta_us, throughput);
>+			"%s%lu bytes / %lu usecs = %lu KB/s\n",
>+			intro, unit_bytes, delta_us, throughput);
> 	fflush(stdout);
> 	write(1, buf, len);
> }
>@@ -690,7 +693,34 @@ long do_units(void)
> 	} while (bytes);
> 
> 	if (!opt_write_signal_read && unit_bytes)
>-		output_statistics(unit_bytes);
>+		output_statistics(unit_bytes, "");
>+
>+	if (opt_read_again && unit_bytes) {
>+		unsigned long rw_bytes = 0;
>+
>+		gettimeofday(&start_time, NULL);
>+		for (i = 0; i < nptr; i++) {
>+			int rep;
>+
>+			for (rep = 0; rep < reps; rep++) {
>+				if (rep > 0 && !quiet) {
>+					printf(".");
>+					fflush(stdout);
>+				}
>+
>+				rw_bytes += do_rw_once(ptrs[i], lens[i], &rand_data, 1, &rep, reps);
>+
>+				if (msync_mode) {
>+					if ((msync(ptrs[i], lens[i], msync_mode)) == -1) {
>+						fprintf(stderr, "msync failed with error %s \n", strerror(errno));
>+						exit(1);
>+					}
>+				}
>+			}
>+		}
>+
>+		output_statistics(rw_bytes, "read again ");
>+	}
> 
> 	if (opt_write_signal_read) {
> 		struct sigaction act;
>@@ -731,7 +761,7 @@ long do_units(void)
> 		sigsuspend(&set);
> 		gettimeofday(&start_time, NULL);
> 		unit_bytes = do_rw_once(buffer, opt_bytes, &rand_data, 1, NULL, 0);
>-		output_statistics(unit_bytes);
>+		output_statistics(unit_bytes, "");
> 	}
> 
> 	if (opt_sync_free)
>@@ -879,7 +909,7 @@ int main(int argc, char *argv[])
> 	pagesize = getpagesize();
> 
> 	while ((c = getopt_long(argc, argv,
>-				"aAB:f:FPp:gqowRMm:n:t:b:ds:T:Sr:u:j:e:EHDNLWyxOUh", opts, NULL)) != -1)
>+				"aAB:f:FPp:gqowRMm:n:t:b:ds:T:Sr:u:j:e:EHDNLWyxOUZh", opts, NULL)) != -1)
> 	{
> 		switch (c) {
> 		case 'a':
>@@ -1005,6 +1035,10 @@ int main(int argc, char *argv[])
> 			map_hugetlb = MAP_HUGETLB | MAP_HUGE_2MB;
> 			break;
> 
>+		case 'Z':
>+			opt_read_again = 1;
>+			break;
>+
> 		default:
> 			usage(1);
> 		}
>-- 
>2.7.4
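For reference, a hypothetical invocation of the new option (the -n flag and the positional size argument follow usemem's existing conventions; the sizes are illustrative only):

	# 4 processes allocate and touch 4GB each -- sized to push part of
	# it into swap -- then -Z re-reads everything; the "read again ...
	# KB/s" line from output_statistics() reports the swap-in pass.
	./usemem -n 4 --read-again $((4 << 30))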
Re: [v2 RFC PATCH 0/9] Another Approach to Use PMEM as NUMA Node
On Wed, Apr 17, 2019 at 11:17:48AM +0200, Michal Hocko wrote:
>On Tue 16-04-19 12:19:21, Yang Shi wrote:
>>On 4/16/19 12:47 AM, Michal Hocko wrote:
>>[...]
>>>Why cannot we simply demote in the proximity order? Why do you make
>>>cpuless nodes so special? If other close nodes are vacant then just
>>>use them.
>>
>>We could. But this raises another question: would we prefer to just
>>demote to the next fallback node (just try once), and if it is
>>contended, then just swap (i.e. DRAM0 -> PMEM0 -> Swap); or would we
>>prefer to try all the nodes in the fallback order to find the first
>>less contended one (i.e. DRAM0 -> PMEM0 -> DRAM1 -> PMEM1 -> Swap)?
>
>I would go with the latter. Why? Because it is more natural. Because
>that is the natural allocation path, so I do not see why this
>shouldn't be the natural demotion path.

"Demotion" should be done performance wise, i.e. by "demoting to the
next-level (cheaper/slower) memory". Otherwise something like this may
happen:

	DRAM0 pressured => demote cold pages to DRAM1
	DRAM1 pressured => demote cold pages to DRAM0

In effect DRAM0/DRAM1 exchange a fraction of their demoted cold pages,
which does not help overall system performance. Over time, it's even
possible for some cold pages to get "demoted" along the path
DRAM0=>DRAM1=>DRAM0=>DRAM1=>...

Thanks,
Fengguang
Re: [v2 RFC PATCH 0/9] Another Approach to Use PMEM as NUMA Node
On Thu, Apr 18, 2019 at 11:02:27AM +0200, Michal Hocko wrote:
>On Wed 17-04-19 13:43:44, Yang Shi wrote:
>[...]
>>And, I'm wondering whether this optimization is also suitable to
>>general NUMA balancing or not.
>
>If there are convincing numbers then this should be a preferable way
>to deal with it. Please note that the number of promotions is not the
>only metric to watch. The overall performance/access latency would be
>another one.

Good question. Shi and I aligned today. Also talked with Mel (but
sorry, I must have missed some points due to poor English listening).
It becomes clear that

1) PMEM/DRAM page promotion/demotion is a hard problem to attack.
   There will and should be multiple approaches for open discussion
   before settling down. The criteria might be balanced complexity,
   overheads, performance, etc.

2) We need a lot more data to lay a solid foundation for effective
   discussions. Testing will be a rather time consuming part for
   contributors. We'll need to work together to create a number of
   benchmarks that can well exercise the kernel promotion/demotion
   paths and gather the necessary numbers. By collaborating on a
   common set of tests, we can not only amortize efforts, but also
   compare different approaches or compare v1/v2/... of the same
   approach conveniently.

   Ying has already created several LKP test cases for that purpose.
   Shi and I plan to join the efforts, too.

Thanks,
Fengguang
Re: [RFC PATCH] mm: readahead: add readahead_shift into backing device
On Mon, Mar 25, 2019 at 09:59:31AM -0700, Mark Salyzyn wrote:
>On 03/25/2019 05:16 AM, Fengguang Wu wrote:
>>Martin,
>>
>>On Fri, Mar 22, 2019 at 11:46:11PM +0800, Martin Liu wrote:
>>>As the discussion https://lore.kernel.org/patchwork/patch/334982/
>>>We know an open file's ra_pages might run out of sync from
>>>bdi.ra_pages since sequential, random or error read. Current design
>>>is we have to ask users to reopen the file or use the fadvise system
>>>call to get it back in sync. However, we might have some cases to
>>>change system wide file ra_pages to enhance system performance such
>>>as enhance the boot time by increasing the ra_pages or decrease it to
>>
>>Do you have examples that some distro is making use of larger
>>ra_pages for boot time optimization?
>
>Android (if you are willing to squint and look at android-common AOSP
>kernels as a Distro).

OK. I wonder how exactly Android makes use of it. Since phones are not
using hard disks, they should benefit less from large ra_pages. Would
you kindly point me to the code?

>>Suppose N read streams with equal read speed. The thrash-free memory
>>requirement would be (N * 2 * ra_pages).
>>
>>If N=1000 and ra_pages=1MB, it'd require 2GB memory. Which looks
>>affordable in mainstream servers.
>
>That is 50% of the memory on a high end Android device ...

Yeah, but I'm obviously not talking about Android devices here. Will a
phone serve 1000 concurrent read streams?

>>Sorry but it sounds like introducing an unnecessarily twisted new
>>interface. I'm afraid it fixes the pain for 0.001% users while
>>bringing more puzzle to the majority others.
>
>>2B Android devices on the planet is 0.001%? Nope.

Sorry, I didn't know about the Android usage. Actually nobody
mentioned it in the past discussions.

>I am not defending the proposed interface though, if there is
>something better that can be used, then looking into:
>
>>Then let fadvise() and shrink_readahead_size_eio() adjust that
>>per-file ra_pages_shift.
>
>Sounds like this would require a lot from init to globally audit and
>reduce the read-ahead for all open files?

It depends. In theory it should be possible to create a standalone
kernel module to dump the page cache and get the current snapshot of
all cached file pages. It'd be a one-shot action and doesn't require
continuous auditing.

	[RFC] kernel facilities for cache prefetching
	https://lwn.net/Articles/182128

This tool may also work. It's quick to get the list of opened files by
walking /proc/*/fd/, however not as easy to get the list of cached
file names.

	https://github.com/tobert/pcstat

Perhaps we can do a simplified /proc/filecache that only dumps the
list of cached file names. Then let mincore() based tools take care of
the rest of the work.

Regards,
Fengguang
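As a concrete illustration of the mincore() based approach suggested above, a minimal cached-page counter for one file could look like the sketch below. This is a standalone example, not part of any posted patch:

	/* Report how many pages of a file are resident in the page cache. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(int argc, char *argv[])
	{
		struct stat st;
		long psize = sysconf(_SC_PAGESIZE);
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (fstat(fd, &st) < 0 || st.st_size == 0)
			return 1;

		size_t pages = (st.st_size + psize - 1) / psize;
		void *map = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
		unsigned char *vec = malloc(pages);

		if (map == MAP_FAILED || !vec || mincore(map, st.st_size, vec))
			return 1;

		size_t cached = 0;
		for (size_t i = 0; i < pages; i++)
			cached += vec[i] & 1;	/* bit 0 = page resident */
		printf("%s: %zu/%zu pages cached\n", argv[1], cached, pages);
		return 0;
	}

A /proc/filecache style name dump plus a loop of such mincore() checks would cover the auditing job discussed above without any continuous kernel involvement.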
Re: [LSF/MM ATTEND ] memory reclaim with NUMA rebalancing
On Sat, Feb 23, 2019 at 09:27:48PM +0800, Fengguang Wu wrote:
>On Thu, Jan 31, 2019 at 12:19:47PM +0530, Aneesh Kumar K.V wrote:
>>Michal Hocko writes:
>>>Hi,
>>>I would like to propose the following topic for the MM track.
>>>Different groups of people would like to use NVDIMMs as a low cost &
>>>slower memory which is presented to the system as a NUMA node. We do
>>>have a NUMA API but it doesn't really fit the "balance the memory
>>>between nodes" needs. People would like to have hot pages in the
>>>regular RAM while cold pages might be at lower speed NUMA nodes. We
>>>do have NUMA balancing for the promotion path but there is nothing
>>>for the other direction. Can we start considering memory reclaim to
>>>move pages to more distant and idle NUMA nodes rather than reclaim
>>>them? There are certainly details that will get quite complicated
>>>but I guess it is time to start discussing this at least.
>>
>>I would be interested in this topic too. I would like to understand
>
>So am I. I'd be glad to join the discussions if I can attend the slot.
>
>>the API and how it can help exploit the different types of devices we
>>have on OpenCAPI.
>>
>>IMHO there are a few proposals related to this which we could discuss
>>together
>>
>>1. HMAT series which want to expose these devices as NUMA nodes
>>2. The patch series from Dave Hansen which just uses PMEM as a NUMA node.
>>3. The patch series from Fengguang Wu which does prevent default
>>   allocation from these numa nodes by excluding them from the zone list.
>>4. The patch series from Jerome Glisse which doesn't expose these as
>>   numa nodes.
>>
>>IMHO (3) is suggesting that we really don't want them as numa nodes.
>>But since NUMA is the only interface we currently have to present
>>them as memory and control the allocation and migration, we are
>>forcing ourselves to NUMA nodes and then excluding them from default
>>allocation.
>
>Regarding (3), we actually made a default policy choice of
>"separating fallback zonelists for PMEM/DRAM nodes" for the typical
>use scenarios. In the long term, it's better not to build such an
>assumption into the kernel. There may well be workloads that are cost
>sensitive rather than performance sensitive. Suppose people buy a
>machine with tiny DRAM and large PMEM. In which case the suitable
>policy may be to
>
>1) prefer (but not bind) slab etc. kernel pages in DRAM
>2) allocate LRU etc. pages from either DRAM or PMEM node

The point is not separating fallback zonelists for DRAM and PMEM in
this case.

>In summary, the kernel may offer flexibility for different policies
>for use by different users. PMEM has different characteristics
>compared to DRAM; users may or may not want it treated differently
>than DRAM through policies.

Thanks,
Fengguang
Re: [LSF/MM ATTEND ] memory reclaim with NUMA rebalancing
On Thu, Jan 31, 2019 at 12:19:47PM +0530, Aneesh Kumar K.V wrote:
>Michal Hocko writes:
>>Hi,
>>I would like to propose the following topic for the MM track.
>>Different groups of people would like to use NVDIMMs as a low cost &
>>slower memory which is presented to the system as a NUMA node. We do
>>have a NUMA API but it doesn't really fit the "balance the memory
>>between nodes" needs. People would like to have hot pages in the
>>regular RAM while cold pages might be at lower speed NUMA nodes. We
>>do have NUMA balancing for the promotion path but there is nothing
>>for the other direction. Can we start considering memory reclaim to
>>move pages to more distant and idle NUMA nodes rather than reclaim
>>them? There are certainly details that will get quite complicated but
>>I guess it is time to start discussing this at least.
>
>I would be interested in this topic too. I would like to understand

So am I. I'd be glad to join the discussions if I can attend the slot.

>the API and how it can help exploit the different types of devices we
>have on OpenCAPI.
>
>IMHO there are a few proposals related to this which we could discuss
>together
>
>1. HMAT series which want to expose these devices as NUMA nodes
>2. The patch series from Dave Hansen which just uses PMEM as a NUMA node.
>3. The patch series from Fengguang Wu which does prevent default
>   allocation from these numa nodes by excluding them from the zone list.
>4. The patch series from Jerome Glisse which doesn't expose these as
>   numa nodes.
>
>IMHO (3) is suggesting that we really don't want them as numa nodes.
>But since NUMA is the only interface we currently have to present them
>as memory and control the allocation and migration, we are forcing
>ourselves to NUMA nodes and then excluding them from default
>allocation.

Regarding (3), we actually made a default policy choice of "separating
fallback zonelists for PMEM/DRAM nodes" for the typical use scenarios.
In the long term, it's better not to build such an assumption into the
kernel. There may well be workloads that are cost sensitive rather
than performance sensitive. Suppose people buy a machine with tiny
DRAM and large PMEM. In which case the suitable policy may be to

1) prefer (but not bind) slab etc. kernel pages in DRAM
2) allocate LRU etc. pages from either DRAM or PMEM node

In summary, the kernel may offer flexibility for different policies
for use by different users. PMEM has different characteristics
compared to DRAM; users may or may not want it treated differently
than DRAM through policies.

Thanks,
Fengguang
Re: [RFC][PATCH v2 14/21] kvm: register in mm_struct
Hi Peter,

On Sat, Feb 02, 2019 at 02:57:41PM +0800, Peter Xu wrote:
>On Wed, Dec 26, 2018 at 09:15:00PM +0800, Fengguang Wu wrote:
>>VM is associated with an address space and not a specific thread.
>>
>>From Documentation/virtual/kvm/api.txt:
>>   Only run VM ioctls from the same process (address space) that was
>>   used to create the VM.
>
>Hi, Fengguang,
>
>AFAIU the commit message only explains why a kvm object needs to bind
>to a single mm object (say, the reason why there is kvm->mm) however
>not the reverse (say, the reason why there is mm->kvm), while the
>latter is what this patch really needs?

Yeah, good point. The addition of mm->kvm keeps the code in this
patchset simple. However, if that field is considered not generally
useful for other possible users, and the added space overhead is a
concern, we can instead do with a flag (saying the mm is referenced by
some KVM), and add extra lookup code to find out the exact kvm
instance.

>I'm thinking whether it's legal for multiple VMs to run on a single mm
>address space. I don't see a limitation so far but it's very possible
>I am just missing something there (if there is, IMHO they might be
>something nice to put into the commit message?). Thanks,

So far one QEMU only starts one KVM. I cannot think of any strong
benefit to start multiple KVMs in one single QEMU, so it may well
remain so in the future. Anyway, it's an internal data structure
instead of an API, which can adapt to possible future changes.

Thanks,
Fengguang

>>CC: Nikita Leshenko
>>CC: Christian Borntraeger
>>Signed-off-by: Fengguang Wu
>>---
>> include/linux/mm_types.h | 11 +++++++++++
>> virt/kvm/kvm_main.c      |  3 +++
>> 2 files changed, 14 insertions(+)
>>
>>--- linux.orig/include/linux/mm_types.h	2018-12-23 19:58:06.993417137 +0800
>>+++ linux/include/linux/mm_types.h	2018-12-23 19:58:06.993417137 +0800
>>@@ -27,6 +27,7 @@ typedef int vm_fault_t;
>> struct address_space;
>> struct mem_cgroup;
>> struct hmm;
>>+struct kvm;
>> 
>> /*
>>  * Each physical page in the system has a struct page associated with
>>@@ -496,6 +497,10 @@ struct mm_struct {
>> 	/* HMM needs to track a few things per mm */
>> 	struct hmm *hmm;
>> #endif
>>+
>>+#if IS_ENABLED(CONFIG_KVM)
>>+	struct kvm *kvm;
>>+#endif
>> } __randomize_layout;
>> 
>> /*
>>@@ -507,6 +512,12 @@ struct mm_struct {
>> 
>> extern struct mm_struct init_mm;
>> 
>>+#if IS_ENABLED(CONFIG_KVM)
>>+static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; }
>>+#else
>>+static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; }
>>+#endif
>>+
>> /* Pointer magic because the dynamic array size confuses some compilers. */
>> static inline void mm_init_cpumask(struct mm_struct *mm)
>> {
>>--- linux.orig/virt/kvm/kvm_main.c	2018-12-23 19:58:06.993417137 +0800
>>+++ linux/virt/kvm/kvm_main.c	2018-12-23 19:58:06.993417137 +0800
>>@@ -727,6 +727,7 @@ static void kvm_destroy_vm(struct kvm *k
>> 	struct mm_struct *mm = kvm->mm;
>> 
>> 	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
>>+	mm->kvm = NULL;
>> 	kvm_destroy_vm_debugfs(kvm);
>> 	kvm_arch_sync_events(kvm);
>> 	spin_lock(&kvm_lock);
>>@@ -3224,6 +3225,8 @@ static int kvm_dev_ioctl_create_vm(unsig
>> 		fput(file);
>> 		return -ENOMEM;
>> 	}
>>+
>>+	kvm->mm->kvm = kvm;
>> 	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
>> 
>> 	fd_install(r, file);
>
>Regards,
>
>-- 
>Peter Xu
Re: [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
Hi Jonathan,

Thanks for showing the gap on tracking hot accesses from devices.

On Mon, Jan 28, 2019 at 05:42:39PM +0000, Jonathan Cameron wrote:
>On Wed, 2 Jan 2019 12:21:10 +0000 Jonathan Cameron wrote:
>>On Fri, 28 Dec 2018 20:52:24 +0100 Michal Hocko wrote:
>>>[Ccing Mel and Andrea]
>
>Hi,
>
>I just wanted to highlight this section as I didn't feel we really
>addressed this in the earlier conversation.
>
>>* Hot pages may not be hot just because the host is using them a lot.
>>  It would be very useful to have a means of adding information
>>  available from accelerators beyond simple accessed bits (dreaming ;)
>>  One problem here is translation caches (ATCs) as they won't normally
>>  result in any updates to the page accessed bits. The arm SMMU v3
>>  spec for example makes it clear (though it's kind of obvious) that
>>  the ATS request is the only opportunity to update the accessed bit.
>>  The nasty option here would be to periodically flush the ATC to
>>  force the access bit updates via repeats of the ATS request (ouch).
>>  That option only works if the iommu supports updating the accessed
>>  flag (optional on SMMU v3 for example).

If ATS based updates are supported, we may trigger them when closing
the /proc/pid/idle_pages file. We already do TLB flushes at that time.
For example,

[PATCH 15/21] ept-idle: EPT walk for virtual machine

	ept_idle_release():
	  kvm_flush_remote_tlbs(kvm);

[PATCH 17/21] proc: introduce /proc/PID/idle_pages

	mm_idle_release():
	  flush_tlb_mm(mm);

The flush cost is kind of "minimal necessary" in our current use
model, where the user space scan+migration daemon will run a loop like
this:

	loop:
		walk page table N times:
			open,read,close /proc/PID/idle_pages
			(flushes TLB on file close)
			sleep for a short interval
		sort and migrate hot pages
		sleep for a while

>If we ignore the IOMMU hardware update issue, which will simply need
>to be addressed by future hardware if these techniques become common,
>how do we address the Address Translation Cache issue without
>potentially causing big performance problems by flushing the cache
>just to force an accessed bit update?
>
>These devices are frequently used with PRI and Shared Virtual
>Addressing and can be accessing most of your memory without you
>having any visibility of it in the page tables (as they aren't walked
>if your ATC is well matched in size to your usecase). A classic
>example would be accelerated DB walkers like the CCIX demo Xilinx has
>shown at a few conferences. The whole point of those is that most of
>the time only your large set of database walkers is using your memory
>and they have translations cached for a good part of what they are
>accessing. Flushing that cache could hurt a lot. Pinning pages hurts
>for all the normal flexibility reasons. The last thing we want is to
>be migrating these pages that can be very hot but in an invisible
>fashion.

If there is some other way to get hotness for special device memory,
the user space daemon may be extended to cover that, perhaps by
querying another new kernel interface. By driving hotness accounting
and migration in user space, we harvest this kind of flexibility. In
the daemon's POV, /proc/PID/idle_pages provides one common way to get
"accessed" bits and hence hotness, though the daemon does not need to
depend solely on it.

Thanks,
Fengguang
Re: [PATCH 0/4] Allow persistent memory to be used like normal RAM
>With this patch set, an unmodified application would either use:
>
>1) whatever memory it happened to get
>2) only the faster dram (via numactl --membind=)
>3) only the slower pmem (again, via numactl --membind=)
>4) preferentially one or the other (numactl --preferred=)

Yet another option:

	MemoryOptimizer -- hot page accounting and migration daemon
	https://github.com/intel/memory-optimizer

Once PMEM NUMA nodes are available, we may run a user space daemon to
walk the page tables of virtual machines (EPT) or processes, collect
the "accessed" bits to find out hot pages, and finally migrate hot
pages to DRAM and cold pages to PMEM.

In that scenario, only the kernel and the migration daemon need to be
aware of the PMEM nodes. Unmodified virtual machines and processes can
enjoy the added memory space w/o knowing whether they are using DRAM
or PMEM.

Thanks,
Fengguang
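For the migration half of such a daemon, a minimal sketch around move_pages(2) might look as follows. The DRAM target node number and the origin of the `hot` address array (the accounting step) are assumptions for illustration, not something this patch set fixes:

	/* Promote a batch of hot pages of process `pid` to a DRAM node.
	 * Sketch only: assumes node 0 is DRAM and that `hot` holds
	 * page-aligned user addresses found by the accounting step. */
	#include <numaif.h>		/* move_pages(); link with -lnuma */
	#include <stdio.h>

	#define DRAM_NODE 0

	static void promote_pages(int pid, void **hot, unsigned long count)
	{
		int nodes[count], status[count];
		unsigned long i;

		for (i = 0; i < count; i++)
			nodes[i] = DRAM_NODE;

		if (move_pages(pid, count, hot, nodes, status, MPOL_MF_MOVE) < 0) {
			perror("move_pages");
			return;
		}
		for (i = 0; i < count; i++)
			if (status[i] < 0)	/* negative errno per page */
				fprintf(stderr, "page %lu: error %d\n", i, status[i]);
	}

Demoting cold pages to a PMEM node is the same call with the peer PMEM node as the target.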
Re: [RFC][PATCH v2 11/21] kvm: allocate page table pages from DRAM
On Wed, Jan 02, 2019 at 08:47:25AM -0800, Dave Hansen wrote:
>On 12/26/18 5:14 AM, Fengguang Wu wrote:
>>+static unsigned long __get_dram_free_pages(gfp_t gfp_mask)
>>+{
>>+	struct page *page;
>>+
>>+	page = __alloc_pages(GFP_KERNEL_ACCOUNT, 0, numa_node_id());
>>+	if (!page)
>>+		return 0;
>>+	return (unsigned long) page_address(page);
>>+}
>
>There seems to be a ton of *policy* baked into these patches. For
>instance: thou shalt not allocate page tables pages from PMEM. That's
>surely not a policy we want to inflict on every Linux user until the
>end of time.

Right. It's a straightforward policy for users that care about
performance. The project is planned in 3 steps, and at this moment we
are in phase (1):

1) core functionalities, easy to backport
2) upstream-able total solution
3) upstream when the API stabilizes

The dumb kernel interface /proc/PID/idle_pages enables doing the
majority of policies in user space. However, for the other smaller
parts, it looks easier to just implement an obvious policy first, then
to consider more possibilities.

>I think the more important question is how we can have the specific
>policy that this patch implements, but also leave open room for other
>policies, such as: "I don't care how slow this VM runs, minimize the
>amount of fast memory it eats."

Agreed. I'm open to more ways. We can treat these patches as the
soliciting version. If anyone sends reasonable improvements or even a
totally different way of doing it, I'd be happy to incorporate them.

Thanks,
Fengguang
Re: [RFC][PATCH v2 10/21] mm: build separate zonelist for PMEM and DRAM node
On Tue, Jan 01, 2019 at 02:44:41PM +0530, Aneesh Kumar K.V wrote:
>Fengguang Wu writes:
>>From: Fan Du
>>
>>When allocating a page, DRAM and PMEM nodes should better not fall
>>back to each other. This allows migration code to explicitly control
>>which type of node to allocate pages from.
>>
>>With this patch, a PMEM NUMA node can only be used in 2 ways:
>>- migrate in and out
>>- numactl
>
>Can we achieve this using nodemask? That way we don't tag nodes with
>different properties such as DRAM/PMEM. We can then give the
>flexibility to the device init code to add the new memory nodes to
>the right nodemask.

Aneesh, in patch 2 we did create the nodemasks numa_nodes_pmem and
numa_nodes_dram. What's your supposed way of "using nodemask"?

Thanks,
Fengguang
Re: [PATCH] printk: Add caller information to printk() output.
On Thu, Jan 03, 2019 at 07:27:41PM +0100, Dmitry Vyukov wrote:
>On Wed, Jan 2, 2019 at 5:09 PM Dmitry Vyukov wrote:
>>On Tue, Dec 18, 2018 at 9:58 AM Sergey Senozhatsky wrote:
>>>On (12/18/18 09:39), Petr Mladek wrote:
>>>>Sergey, are you okay with this squashed patch, please?
>>>
>>>Yeah. There are several minor nitpicks, but here is my
>>>Acked-by: Sergey Senozhatsky
>>>
>>>One final question - can syzbot folks confirm that the patch helps?
>>>Just curious.
>>
>>This slipped through the cracks. Tetsuo pinged me and I am testing
>>now. Need to create a set of tests and update parsing code to handle
>>this.
>
>I've pushed support for CONFIG_PRINTK_CALLER to syzkaller/syzbot:
>https://github.com/google/syzkaller/commit/7da2392541a49c3f17b2e7d24e04b84d72b965fb
>Let's see what happens.
>
>Limited local testing shows that it's working as intended and
>significantly improves quality of reports and ability to make sense
>out of kernel output.
>
>Tetsuo, thanks for your great persistence with this change!
>Sergey, Petr, thanks for the reviews!
>
>+Fengguang, Kevin, maybe you will find this useful for 0-day/kernel-ci.

Thanks! We'll try out CONFIG_PRINTK_CALLER.

Regards,
Fengguang
Re: [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
>>>I haven't looked at the implementation yet but if you are proposing a
>>>special cased zone lists then this is something CDM (Coherent Device
>>>Memory) was trying to do two years ago and there was quite some
>>>skepticism in the approach.
>>
>>It looks like we are pretty different from CDM. :) We are creating
>>new NUMA nodes rather than CDM's new ZONE. The zonelists modification
>>is just to make PMEM nodes more separated.
>
>Yes, this is exactly what CDM was after. Have a zone which is not
>reachable without explicit request AFAIR. So no, I do not think you
>are too different, you just use a different terminology ;)

Got it. OK. The fallback zonelists patch does need more thought.

From a long term POV, Linux should be prepared for multi-level memory.
Then there will arise the need to "allocate from this level of
memory", so it looks good to have separate zonelists for each level of
memory.

On the other hand, there will also be page allocations that don't care
about the exact memory level. So it looks reasonable to expect a
different kind of fallback zonelist that can be selected by NUMA
policy.

Thanks,
Fengguang
Re: [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
On Fri, Dec 28, 2018 at 01:15:15PM +0100, Michal Hocko wrote:
>On Fri 28-12-18 17:42:08, Wu Fengguang wrote:
>[...]
>>Those look unnecessary complexities for this post. This v2 patchset
>>mainly fulfills our first milestone goal: a minimal viable solution
>>that's relatively clean to backport. Even when preparing for new
>>upstreamable versions, it may be good to keep it simple for the
>>initial upstream inclusion.
>
>On the other hand this is creating a new NUMA semantic and I would
>like to have something long term rather than let's throw something in
>now and care about long term later. So I would really prefer to talk
>about long term plans first and only care about implementation
>details later.

That makes good sense. FYI here are several in-house patches that try
to leverage (but not yet integrate with) NUMA balancing. The last one
is brute force hacking. They obviously break the original NUMA
balancing logic.

Thanks,
Fengguang

From ef41a542568913c8c62251021c3bc38b7a549440 Mon Sep 17 00:00:00 2001
From: Liu Jingqi
Date: Sat, 29 Sep 2018 23:29:56 +0800
Subject: [PATCH 074/166] migrate: set PROT_NONE on the PTEs and let NUMA
 balancing

Need to enable CONFIG_NUMA_BALANCING first. Set PROT_NONE on the PTEs
that map to the page, and do the actual migration in the context of
the process which initiated the migration.

Signed-off-by: Liu Jingqi
Signed-off-by: Fengguang Wu
---
 mm/migrate.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/mm/migrate.c b/mm/migrate.c
index b27a287081c2..d933f6966601 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1530,6 +1530,21 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 	if (page_mapcount(page) > 1 && !migrate_all)
 		goto out_putpage;
 
+	if (flags & MPOL_MF_SW_YOUNG) {
+		unsigned long start, end;
+		unsigned long nr_pte_updates = 0;
+
+		start = max(addr, vma->vm_start);
+
+		/* TODO: if huge page */
+		end = ALIGN(addr + (1 << PAGE_SHIFT), PAGE_SIZE);
+		end = min(end, vma->vm_end);
+		nr_pte_updates = change_prot_numa(vma, start, end);
+
+		err = 0;
+		goto out_putpage;
+	}
+
 	if (PageHuge(page)) {
 		if (PageHead(page)) {
 			/* Check if the page is software young. */
-- 
2.15.0

From e617e8c2034387cbed50bafa786cf83528dbe3df Mon Sep 17 00:00:00 2001
From: Fengguang Wu
Date: Sun, 30 Sep 2018 10:50:58 +0800
Subject: [PATCH 075/166] migrate: consolidate MPOL_MF_SW_YOUNG behaviors

- if page already in target node: SetPageReferenced
- otherwise: change_prot_numa

Signed-off-by: Fengguang Wu
---
 arch/x86/kvm/Kconfig |  1 +
 mm/migrate.c         | 65 +++++++++++++++++++++++++-----------------
 2 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4c6dec47fac6..c103373536fc 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -100,6 +100,7 @@ config KVM_EPT_IDLE
 	tristate "KVM EPT idle page tracking"
 	depends on KVM_INTEL
 	depends on PROC_PAGE_MONITOR
+	depends on NUMA_BALANCING
 	---help---
 	  Provides support for walking EPT to get the A bits on Intel
 	  processors equipped with the VT extensions.
diff --git a/mm/migrate.c b/mm/migrate.c
index d933f6966601..d944f031c9ea 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1500,6 +1500,8 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *vma;
 	struct page *page;
+	unsigned long end;
+	unsigned int page_nid;
 	unsigned int follflags;
 	int err;
 	bool migrate_all = flags & MPOL_MF_MOVE_ALL;
@@ -1522,49 +1524,60 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 	if (!page)
 		goto out;
 
-	err = 0;
-	if (page_to_nid(page) == node)
-		goto out_putpage;
+	page_nid = page_to_nid(page);
 
 	err = -EACCES;
 	if (page_mapcount(page) > 1 && !migrate_all)
 		goto out_putpage;
 
-	if (flags & MPOL_MF_SW_YOUNG) {
-		unsigned long start, end;
-		unsigned long nr_pte_updates = 0;
-
-		start = max(addr, vma->vm_start);
-
-		/* TODO: if huge page */
-		end = ALIGN(addr + (1 << PAGE_SHIFT), PAGE_SIZE);
-		end = min(end, vma->vm_end);
-		nr_pte_updates = change_prot_numa(vma, start, end);
-
-		err = 0;
-		goto out_putpage;
-	}
-
+	err = 0;
 	if (PageHuge(page)) {
-		if (PageHead(page)) {
-			/* Check if the page is software young. */
-			if (flags & MPOL_MF_SW_YOUNG)
+		if (!PageHead(page)) {
+			err = -EACCES;
+			goto out_putpage;
+		}
+		if (flags & MPOL_MF_SW_YOUNG) {
+			if (page_nid == node)
 				SetPageReferenced(page);
-			isolate_huge_page(page, pagelist);
-			err = 0;
+			else if (PageAnon(page)) {
+				end = addr + (hpage_nr_pages(page) << PAGE_SHIFT);
+				if (end <= vma->vm_end)
+					change_prot_numa(vma, addr, end);
+			}
+			goto out_putpage;
 		}
+		if (page_nid == node)
+			goto out_putpage;
+		isolate_huge_page(page, pagelist);
 	} else {
 		struct page *head;
 
 		head = compound_head(page);
+
+		if (flag
Re: [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
On Fri, Dec 28, 2018 at 09:41:05AM +0100, Michal Hocko wrote:
>On Fri 28-12-18 13:08:06, Wu Fengguang wrote:
>[...]
>>Optimization: do hot/cold page tracking and migration
>>=====================================================
>>
>>Since PMEM is slower than DRAM, we need to make sure hot pages go to
>>DRAM and cold pages stay in PMEM, to get the best out of PMEM and
>>DRAM.
>>
>>- DRAM=>PMEM cold page migration
>>
>>It can be done in the kernel page reclaim path, near the anonymous
>>page swap out point. Instead of swapping out, we now have the option
>>to migrate cold pages to PMEM NUMA nodes.
>
>OK, this makes sense to me except I am not sure this is something that
>should be pmem specific. Is there any reason why we shouldn't migrate
>pages on memory pressure to other nodes in general? In other words,
>rather than paging out we would migrate over to the next node that is
>not under memory pressure. Swapout would be the next level when the
>memory is (almost) fully utilized. That wouldn't be pmem specific.

In the future there could be multiple memory levels with different
performance/size/cost metrics. There are ongoing HMAT works to
describe that. When ready, we can switch to the HMAT based general
infrastructure. Then the code will no longer be PMEM specific, but
will do general promotion/demotion migrations between high/low memory
levels. Swapout could be from the lowest level memory.

Migration between peer nodes is the obvious simple way and a good
choice for the initial implementation. But yeah, it's possible to
migrate to other nodes. For example, it can be combined with NUMA
balancing: if we know the page is mostly accessed by the other socket,
then it'd be best to migrate hot/cold pages directly to that socket.

>>User space may also do it, however it cannot act on-demand, when
>>there is memory pressure in DRAM nodes.
>>
>>- PMEM=>DRAM hot page migration
>>
>>While LRU can be good enough for identifying cold pages, frequency
>>based accounting can be more suitable for identifying hot pages.
>>
>>Our design choice is to create a flexible user space daemon to drive
>>the accounting and migration, with necessary kernel supports by this
>>patchset.
>
>We do have numa balancing, why cannot we rely on it? This along with
>the above would allow to have pmem numa nodes (cpuless nodes in fact)
>without any special casing and a natural part of the MM. It would be
>only the matter of the configuration to set the appropriate distance
>to allow reasonable allocation fallback strategy.

Good question. We actually tried reusing the NUMA balancing mechanism
to do page-fault triggered migration. move_pages() only calls
change_prot_numa(). It turns out the 2 migration types have different
purposes (one for hotness, another for home node) and hence different
implementation details. We ended up modifying a fair amount of NUMA
balancing logic -- removing rate limiting, changing target node
logic, etc.

Those look like unnecessary complexities for this post. This v2
patchset mainly fulfills our first milestone goal: a minimal viable
solution that's relatively clean to backport. Even when preparing for
new upstreamable versions, it may be good to keep it simple for the
initial upstream inclusion.

>I haven't looked at the implementation yet but if you are proposing a
>special cased zone lists then this is something CDM (Coherent Device
>Memory) was trying to do two years ago and there was quite some
>skepticism in the approach.

It looks like we are pretty different from CDM. :) We are creating new
NUMA nodes rather than CDM's new ZONE. The zonelists modification is
just to make PMEM nodes more separated.

Thanks,
Fengguang
Re: [RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
On Thu, Dec 27, 2018 at 09:31:58PM +0100, Michal Hocko wrote:
>On Wed 26-12-18 21:14:46, Wu Fengguang wrote:
>>This is an attempt to use NVDIMM/PMEM as volatile NUMA memory that's
>>transparent to normal applications and virtual machines.
>>
>>The code is still in active development. It's provided for early
>>design review.
>
>So can we get a high level description of the design and expected
>usecases please?

Good question.

Use cases
=========

The general use case is to use PMEM as slower but cheaper "DRAM". The
suitable ones can be

- workloads that care about memory size more than bandwidth/latency
- workloads with a set of warm/cold pages that don't change rapidly
  over time
- low cost VMs/containers

Foundation: create PMEM NUMA nodes
==================================

To create PMEM nodes in the native kernel, Dave Hansen and Dan
Williams have working patches for the kernel and ndctl. According to
Ying, it'll work like this:

	ndctl destroy-namespace -f namespace0.0
	ndctl destroy-namespace -f namespace1.0
	ipmctl create -goal MemoryMode=100
	reboot

To create PMEM nodes in QEMU VMs, current Debian/Fedora etc. distros
already support this:

	qemu-system-x86_64
	-machine pc,nvdimm
	-enable-kvm
	-smp 64
	-m 256G
	# DRAM node 0
	-object memory-backend-file,size=128G,share=on,mem-path=/dev/shm/qemu_node0,id=tmpfs-node0
	-numa node,cpus=0-31,nodeid=0,memdev=tmpfs-node0
	# PMEM node 1
	-object memory-backend-file,size=128G,share=on,mem-path=/dev/dax1.0,align=128M,id=dax-node1
	-numa node,cpus=32-63,nodeid=1,memdev=dax-node1

Optimization: do hot/cold page tracking and migration
=====================================================

Since PMEM is slower than DRAM, we need to make sure hot pages go to
DRAM and cold pages stay in PMEM, to get the best out of PMEM and DRAM.

- DRAM=>PMEM cold page migration

It can be done in the kernel page reclaim path, near the anonymous
page swap out point. Instead of swapping out, we now have the option
to migrate cold pages to PMEM NUMA nodes. User space may also do it,
however it cannot act on-demand, when there is memory pressure in DRAM
nodes.

- PMEM=>DRAM hot page migration

While LRU can be good enough for identifying cold pages, frequency
based accounting can be more suitable for identifying hot pages.

Our design choice is to create a flexible user space daemon to drive
the accounting and migration, with the necessary kernel supports by
this patchset. The Linux kernel already offers move_pages(2) for user
space to migrate pages to specified NUMA nodes. The major gap lies in
hotness accounting.

User space driven hotness accounting
====================================

One way to find out hot/cold pages is to scan the page table multiple
times and collect the "accessed" bits.

We created the kvm-ept-idle kernel module to provide the "accessed"
bits via the interface /proc/PID/idle_pages. User space can open it
and read the "accessed" bits for a range of virtual addresses.

Inside the kernel module, it implements 2 independent sets of page
table scan code, seamlessly providing the same interface:

- for QEMU, scan the HVA range of the VM's EPT (Extended Page Table)
- for others, scan the VA range of the process page table

With /proc/PID/idle_pages and move_pages(2), the user space daemon can
work like this:

	One round of scan+migration:

	loop N=(3-10) times:
		sleep 0.01-10s (typical values)
		scan page tables and read/accumulate accessed bits into arrays
	treat pages with accessed_count == N as hot pages
	treat pages with accessed_count == 0 as cold pages
	migrate hot pages to DRAM nodes
	migrate cold pages to PMEM nodes
	(optional, may do it once in multiple scan rounds, to make sure
	they are really cold)

That just describes the bare minimal working model.
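A sketch of one such accounting round follows. It assumes /proc/PID/idle_pages returns a simple one-bit-per-page "accessed" bitmap for the scanned range; the real record format is defined by the patchset (patches 15-17) and may differ:

	/* Accumulate per-page accessed counts over one scan. Sketch only:
	 * treats the idle_pages buffer as one "accessed" bit per page, an
	 * assumption about the interface rather than its exact format. */
	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static void scan_once(const char *idle_path, uint8_t *counts, size_t npages)
	{
		uint8_t buf[4096];
		size_t page = 0;
		ssize_t n, i;
		int b;
		int fd = open(idle_path, O_RDONLY);

		if (fd < 0)
			return;
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			for (i = 0; i < n; i++)
				for (b = 0; b < 8 && page < npages; b++, page++)
					counts[page] += (buf[i] >> b) & 1;
		/* per the patchset, closing the file flushes stale TLB entries */
		close(fd);
	}

After N such rounds, pages with counts[i] == N are the hot candidates for move_pages(2), and counts[i] == 0 marks the cold ones.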
A real world daemon should consider lots more to be useful and robust.
The notable one is to avoid thrashing. Hotness accounting can be rough
and workloads can be unstable. We need to avoid promoting a warm page
to DRAM and then demoting it soon after.

The basic scheme is to auto control scan interval and count, so that
each round of scan will find hot pages totalling less than 1/2 the
DRAM size. We may also do multiple rounds of scans before migration,
to filter out unstable/burst accesses.

In the long run, most of the accounted hot pages will already be in
DRAM, so we only need to migrate the new ones to DRAM. When doing so,
we should consider QoS and rate limiting to reduce impact on user
workloads.

When user space drives hot page migration, the DRAM nodes may well be
pressured, which will in turn trigger in-kernel cold page migration.
The above 1/2-DRAM-size hot pages target can help the kernel easily
find cold pages on LRU scan.

To avoid thrashing, it's also important to maintain a persistent
kernel and user-space view of hot/cold pages, since they will do
migrations in 2 different directions:

- the regular page table scans will clear PMD/PTE young
- user spac
Re: [RFC][PATCH v2 01/21] e820: cheat PMEM as DRAM
On Thu, Dec 27, 2018 at 11:32:06AM -0800, Yang Shi wrote:
>On Wed, Dec 26, 2018 at 9:13 PM Dan Williams wrote:
>>On Wed, Dec 26, 2018 at 8:11 PM Fengguang Wu wrote:
>>>On Wed, Dec 26, 2018 at 07:41:41PM -0800, Matthew Wilcox wrote:
>>>>On Wed, Dec 26, 2018 at 09:14:47PM +0800, Fengguang Wu wrote:
>>>>>From: Fan Du
>>>>>
>>>>>This is a hack to enumerate PMEM as NUMA nodes.
>>>>>It's necessary for current BIOS that don't yet fill the ACPI HMAT
>>>>>table.
>>>>>
>>>>>WARNING: take care to backup. It is mutually exclusive with the
>>>>>libnvdimm subsystem and can destroy ndctl managed namespaces.
>>>>
>>>>Why depend on firmware to present this "correctly"? It seems to me
>>>>like less effort all around to have ndctl label some namespaces as
>>>>being for this kind of use.
>>>
>>>Dave Hansen may be more suitable to answer your question. He posted
>>>patches to make PMEM NUMA node coexist with libnvdimm and ndctl:
>>>
>>>[PATCH 0/9] Allow persistent memory to be used like normal RAM
>>>https://lkml.org/lkml/2018/10/23/9
>>>
>>>That depends on future BIOS. So we did this quick hack to test out
>>>PMEM NUMA node for the existing BIOS.
>>
>>No, it does not depend on a future BIOS.
>
>It is correct. We already have Dave's patches + Dan's patch (added
>target_node field) working on our machine which has SRAT.

Thanks for the correction. It looks like my perception was out of
date. So we can follow Dave + Dan's patches to create the PMEM NUMA
nodes.

Thanks,
Fengguang

>>Willy, have a look here [1], here [2], and here [3] for the
>>work-in-progress ndctl takeover approach (actually 'daxctl' in this
>>case).
>>
>>[1]: https://lkml.org/lkml/2018/10/23/9
>>[2]: https://lkml.org/lkml/2018/10/31/243
>>[3]: https://lists.01.org/pipermail/linux-nvdimm/2018-November/018677.html
Re: [RFC][PATCH v2 08/21] mm: introduce and export pgdat peer_node
On Thu, Dec 27, 2018 at 08:07:26PM +0000, Christopher Lameter wrote:
>On Wed, 26 Dec 2018, Fengguang Wu wrote:
>>Each CPU socket can have 1 DRAM and 1 PMEM node, we call them "peer
>>nodes". Migration between DRAM and PMEM will by default happen
>>between peer nodes.
>
>Which one does numa_node_id() point to? I guess that is the DRAM node
>and

Yes. In our test machine, PMEM nodes show up as memory-only nodes, so
numa_node_id() points to the DRAM node.

Here is the numactl --hardware output on a 2S test machine:

	available: 4 nodes (0-3)
	node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
	  20 21 22 23 24 25 52 53 54 55 56 57 58 59 60 61 62 63 64 65
	  66 67 68 69 70 71 72 73 74 75 76 77
	node 0 size: 257712 MB
	node 0 free: 178251 MB
	node 1 cpus: 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
	  43 44 45 46 47 48 49 50 51 78 79 80 81 82 83 84 85 86 87 88
	  89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
	node 1 size: 258038 MB
	node 1 free: 174796 MB
	node 2 cpus:
	node 2 size: 503999 MB
	node 2 free: 438349 MB
	node 3 cpus:
	node 3 size: 503999 MB
	node 3 free: 438349 MB
	node distances:
	node   0   1   2   3
	  0:  10  21  20  20
	  1:  21  10  20  20
	  2:  20  20  10  20
	  3:  20  20  20  10

>then we fall back to the PMEM node?

Fallback is possible but not in the scope of this patchset. We
modified the fallback zonelists in patch 10 to simplify PMEM usage.
With that patch, page allocations on DRAM nodes won't fall back to
PMEM nodes. Instead, PMEM nodes will mainly be used by explicit
numactl placement and as migration targets. When there is memory
pressure in a DRAM node, LRU cold pages there will be demote-migrated
to its peer PMEM node on the same socket by patch 20.

Thanks,
Fengguang
Re: [RFC][PATCH v2 01/21] e820: cheat PMEM as DRAM
On Wed, Dec 26, 2018 at 07:41:41PM -0800, Matthew Wilcox wrote:
>On Wed, Dec 26, 2018 at 09:14:47PM +0800, Fengguang Wu wrote:
>>From: Fan Du
>>
>>This is a hack to enumerate PMEM as NUMA nodes.
>>It's necessary for current BIOS that don't yet fill the ACPI HMAT
>>table.
>>
>>WARNING: take care to backup. It is mutually exclusive with the
>>libnvdimm subsystem and can destroy ndctl managed namespaces.
>
>Why depend on firmware to present this "correctly"? It seems to me
>like less effort all around to have ndctl label some namespaces as
>being for this kind of use.

Dave Hansen may be more suitable to answer your question. He posted
patches to make PMEM NUMA node coexist with libnvdimm and ndctl:

	[PATCH 0/9] Allow persistent memory to be used like normal RAM
	https://lkml.org/lkml/2018/10/23/9

That depends on future BIOS. So we did this quick hack to test out
PMEM NUMA node for the existing BIOS.

Thanks,
Fengguang
[RFC][PATCH v2 12/21] x86/pgtable: allocate page table pages from DRAM
On random reads/writes over large data sets, we find nearly half of
the memory accesses are caused by TLB misses, hence hitting the page
table pages. So it's better to keep page table pages in the faster
DRAM nodes.

Signed-off-by: Fengguang Wu
---
 arch/x86/include/asm/pgalloc.h | 10 +++++++---
 arch/x86/mm/pgtable.c          | 22 ++++++++++++++++++----
 2 files changed, 25 insertions(+), 7 deletions(-)

--- linux.orig/arch/x86/mm/pgtable.c	2018-12-26 19:41:57.494900885 +0800
+++ linux/arch/x86/mm/pgtable.c	2018-12-26 19:42:35.531621035 +0800
@@ -22,17 +22,30 @@ EXPORT_SYMBOL(physical_mask);
 #endif
 
 gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+nodemask_t all_node_mask = NODE_MASK_ALL;
+
+unsigned long __get_free_pgtable_pages(gfp_t gfp_mask,
+				       unsigned int order)
+{
+	struct page *page;
+
+	page = __alloc_pages_nodemask(gfp_mask, order, numa_node_id(), &all_node_mask);
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+EXPORT_SYMBOL(__get_free_pgtable_pages);
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
+	return (pte_t *)__get_free_pgtable_pages(PGALLOC_GFP & ~__GFP_ACCOUNT, 0);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 	struct page *pte;
 
-	pte = alloc_pages(__userpte_alloc_gfp, 0);
+	pte = __alloc_pages_nodemask(__userpte_alloc_gfp, 0, numa_node_id(), &all_node_mask);
 	if (!pte)
 		return NULL;
 	if (!pgtable_page_ctor(pte)) {
@@ -241,7 +254,7 @@ static int preallocate_pmds(struct mm_st
 		gfp &= ~__GFP_ACCOUNT;
 
 	for (i = 0; i < count; i++) {
-		pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
+		pmd_t *pmd = (pmd_t *)__get_free_pgtable_pages(gfp, 0);
 		if (!pmd)
 			failed = true;
 		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
@@ -422,7 +435,8 @@ static inline void _pgd_free(pgd_t *pgd)
 
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+	return (pgd_t *)__get_free_pgtable_pages(PGALLOC_GFP,
+						 PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
--- linux.orig/arch/x86/include/asm/pgalloc.h	2018-12-26 19:40:12.992251270 +0800
+++ linux/arch/x86/include/asm/pgalloc.h	2018-12-26 19:42:35.531621035 +0800
@@ -96,10 +96,11 @@ static inline pmd_t *pmd_alloc_one(struc
 {
 	struct page *page;
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
+	nodemask_t all_node_mask = NODE_MASK_ALL;
 
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
-	page = alloc_pages(gfp, 0);
+	page = __alloc_pages_nodemask(gfp, 0, numa_node_id(), &all_node_mask);
 	if (!page)
 		return NULL;
 	if (!pgtable_pmd_page_ctor(page)) {
@@ -141,13 +142,16 @@ static inline void p4d_populate(struct m
 	set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
 }
 
+extern unsigned long __get_free_pgtable_pages(gfp_t gfp_mask,
+					      unsigned int order);
+
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT;
 
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
-	return (pud_t *)get_zeroed_page(gfp);
+	return (pud_t *)__get_free_pgtable_pages(gfp | __GFP_ZERO, 0);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
@@ -179,7 +183,7 @@ static inline p4d_t *p4d_alloc_one(struc
 	if (mm == &init_mm)
 		gfp &= ~__GFP_ACCOUNT;
 
-	return (p4d_t *)get_zeroed_page(gfp);
+	return (p4d_t *)__get_free_pgtable_pages(gfp | __GFP_ZERO, 0);
 }
 
 static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
[RFC][PATCH v2 07/21] mm: export node type {pmem|dram} under /sys/bus/node
From: Fan Du

A user space migration daemon can check
/sys/bus/node/devices/nodeX/type for the node type.

Software can interrogate the node type, along with node memory type
and distance, to pick a desirable target node for migration.

	grep -r . /sys/devices/system/node/*/type
	/sys/devices/system/node/node0/type:dram
	/sys/devices/system/node/node1/type:dram
	/sys/devices/system/node/node2/type:pmem
	/sys/devices/system/node/node3/type:pmem

Along with the next patch which exports `peer_node`, the migration
daemon can easily find the memory type of the current node, and the
target node in case of migration.

	grep -r . /sys/devices/system/node/*/peer_node
	/sys/devices/system/node/node0/peer_node:2
	/sys/devices/system/node/node1/peer_node:3
	/sys/devices/system/node/node2/peer_node:0
	/sys/devices/system/node/node3/peer_node:1

Signed-off-by: Fan Du
Signed-off-by: Fengguang Wu
---
 drivers/base/node.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

--- linux.orig/drivers/base/node.c	2018-12-23 19:39:04.763414931 +0800
+++ linux/drivers/base/node.c	2018-12-23 19:39:04.763414931 +0800
@@ -233,6 +233,15 @@ static ssize_t node_read_distance(struct
 }
 static DEVICE_ATTR(distance, S_IRUGO, node_read_distance, NULL);
 
+static ssize_t type_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	int nid = dev->id;
+
+	return sprintf(buf, is_node_pmem(nid) ? "pmem\n" : "dram\n");
+}
+static DEVICE_ATTR(type, S_IRUGO, type_show, NULL);
+
 static struct attribute *node_dev_attrs[] = {
 	&dev_attr_cpumap.attr,
 	&dev_attr_cpulist.attr,
@@ -240,6 +249,7 @@ static struct attribute *node_dev_attrs[
 	&dev_attr_numastat.attr,
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
+	&dev_attr_type.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(node_dev);
[RFC][PATCH v2 03/21] x86/numa_emulation: fix fake NUMA in uniform case
From: Fan Du

The index into numa_meminfo.blk[] is expected to be the same as the
node id, and numa_remove_memblk_from() breaks that expectation.

A 2S system does not break, because before numa_remove_memblk_from:

	index	nid
	0	0
	1	1

after numa_remove_memblk_from:

	index	nid
	0	1
	1	1

If you try to configure uniform fake nodes on a 4S system:

	index	nid
	0	0
	1	1
	2	2
	3	3

node 3 will be removed by numa_remove_memblk_from when iterating
index 2. So we only create fake nodes for 3 physical nodes, and a
portion of memory is wasted, as much as hits the lost pages check in
numa_meminfo_cover_memory().

Signed-off-by: Fan Du
---
 arch/x86/mm/numa_emulation.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

--- linux.orig/arch/x86/mm/numa_emulation.c	2018-12-23 19:20:51.570664269 +0800
+++ linux/arch/x86/mm/numa_emulation.c	2018-12-23 19:20:51.54364 +0800
@@ -381,7 +381,21 @@ void __init numa_emulation(struct numa_m
 		goto no_emu;
 
 	memset(&ei, 0, sizeof(ei));
-	pi = *numa_meminfo;
+
+	{
+		/* Make sure the index is identical with nid */
+		struct numa_meminfo *mi = numa_meminfo;
+		int nid;
+
+		for (i = 0; i < mi->nr_blks; i++) {
+			nid = mi->blk[i].nid;
+			pi.blk[nid].nid = nid;
+			pi.blk[nid].start = mi->blk[i].start;
+			pi.blk[nid].end = mi->blk[i].end;
+		}
+		pi.nr_blks = mi->nr_blks;
+
+	}
 
 	for (i = 0; i < MAX_NUMNODES; i++)
 		emu_nid_to_phys[i] = NUMA_NO_NODE;
[RFC][PATCH v2 11/21] kvm: allocate page table pages from DRAM
From: Yao Yuan

Signed-off-by: Yao Yuan
Signed-off-by: Fengguang Wu
---
 arch/x86/kvm/mmu.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

--- linux.orig/arch/x86/kvm/mmu.c	2018-12-26 20:54:48.846720344 +0800
+++ linux/arch/x86/kvm/mmu.c	2018-12-26 20:54:48.842719614 +0800
@@ -950,6 +950,16 @@ static void mmu_free_memory_cache(struct
 		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 }
 
+static unsigned long __get_dram_free_pages(gfp_t gfp_mask)
+{
+	struct page *page;
+
+	page = __alloc_pages(GFP_KERNEL_ACCOUNT, 0, numa_node_id());
+	if (!page)
+		return 0;
+	return (unsigned long) page_address(page);
+}
+
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 				       int min)
 {
@@ -958,7 +968,7 @@ static int mmu_topup_memory_cache_page(s
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
+		page = (void *)__get_dram_free_pages(GFP_KERNEL_ACCOUNT);
 		if (!page)
 			return cache->nobjs >= min ? 0 : -ENOMEM;
 		cache->objects[cache->nobjs++] = page;
[RFC][PATCH v2 05/21] mmzone: new pgdat flags for DRAM and PMEM
From: Fan Du

On a system with both DRAM and PMEM, we need new flags to tag whether
a pgdat is made of DRAM or persistent memory.

This patch serves as preparation for the follow up patches.

Signed-off-by: Fan Du
Signed-off-by: Fengguang Wu
---
 include/linux/mmzone.h | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

--- linux.orig/include/linux/mmzone.h	2018-12-23 19:29:42.430602202 +0800
+++ linux/include/linux/mmzone.h	2018-12-23 19:29:42.430602202 +0800
@@ -522,6 +522,8 @@ enum pgdat_flags {
 					 * many pages under writeback
 					 */
 	PGDAT_RECLAIM_LOCKED,		/* prevents concurrent reclaim */
+	PGDAT_DRAM,			/* Volatile DRAM memory node */
+	PGDAT_PMEM,			/* Persistent memory node */
 };
 
 static inline unsigned long zone_end_pfn(const struct zone *zone)
@@ -919,6 +921,30 @@ extern struct pglist_data contig_page_da
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+static inline int is_node_pmem(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	return test_bit(PGDAT_PMEM, &pgdat->flags);
+}
+
+static inline int is_node_dram(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	return test_bit(PGDAT_DRAM, &pgdat->flags);
+}
+
+static inline void set_node_type(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	if (node_isset(nid, numa_nodes_pmem))
+		set_bit(PGDAT_PMEM, &pgdat->flags);
+	else
+		set_bit(PGDAT_DRAM, &pgdat->flags);
+}
+
 extern struct pglist_data *first_online_pgdat(void);
 extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
 extern struct zone *next_zone(struct zone *zone);
[RFC][PATCH v2 08/21] mm: introduce and export pgdat peer_node
From: Fan Du

Each CPU socket can have 1 DRAM and 1 PMEM node, we call them "peer
nodes". Migration between DRAM and PMEM will by default happen between
peer nodes.

It's a temporary solution. With multiple memory levels, a node can
have both promotion and demotion targets instead of a single peer
node. User space may also be able to infer promotion/demotion targets
based on future HMAT info.

Signed-off-by: Fan Du
Signed-off-by: Fengguang Wu
---
 drivers/base/node.c    | 11 +++++++++++
 include/linux/mmzone.h | 12 ++++++++++++
 mm/page_alloc.c        | 29 +++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+)

--- linux.orig/drivers/base/node.c	2018-12-23 19:39:51.647261099 +0800
+++ linux/drivers/base/node.c	2018-12-23 19:39:51.643261112 +0800
@@ -242,6 +242,16 @@ static ssize_t type_show(struct device *
 }
 static DEVICE_ATTR(type, S_IRUGO, type_show, NULL);
 
+static ssize_t peer_node_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	int nid = dev->id;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	return sprintf(buf, "%d\n", pgdat->peer_node);
+}
+static DEVICE_ATTR(peer_node, S_IRUGO, peer_node_show, NULL);
+
 static struct attribute *node_dev_attrs[] = {
 	&dev_attr_cpumap.attr,
 	&dev_attr_cpulist.attr,
@@ -250,6 +260,7 @@ static struct attribute *node_dev_attrs[
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
 	&dev_attr_type.attr,
+	&dev_attr_peer_node.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(node_dev);
--- linux.orig/include/linux/mmzone.h	2018-12-23 19:39:51.647261099 +0800
+++ linux/include/linux/mmzone.h	2018-12-23 19:39:51.643261112 +0800
@@ -713,6 +713,18 @@ typedef struct pglist_data {
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
+
+	/*
+	 * Points to the nearest node in terms of latency
+	 * E.g. peer of node 0 is node 2 per SLIT
+	 * node distances:
+	 * node   0   1   2   3
+	 *   0:  10  21  17  28
+	 *   1:  21  10  28  17
+	 *   2:  17  28  10  28
+	 *   3:  28  17  28  10
+	 */
+	int peer_node;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
--- linux.orig/mm/page_alloc.c	2018-12-23 19:39:51.647261099 +0800
+++ linux/mm/page_alloc.c	2018-12-23 19:39:51.643261112 +0800
@@ -6926,6 +6926,34 @@ static void check_for_memory(pg_data_t *
 	}
 }
 
+/*
+ * Return the nearest peer node in terms of *locality*
+ * E.g. peer of node 0 is node 2 per SLIT
+ * node distances:
+ * node   0   1   2   3
+ *   0:  10  21  17  28
+ *   1:  21  10  28  17
+ *   2:  17  28  10  28
+ *   3:  28  17  28  10
+ */
+static int find_best_peer_node(int nid)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int peer = NUMA_NO_NODE;
+
+	for_each_online_node(n) {
+		if (n == nid)
+			continue;
+		val = node_distance(nid, n);
+		if (val < min_val) {
+			min_val = val;
+			peer = n;
+		}
+	}
+	return peer;
+}
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -7012,6 +7040,7 @@ void __init free_area_init_nodes(unsigne
 		if (pgdat->node_present_pages)
 			node_set_state(nid, N_MEMORY);
 		check_for_memory(pgdat, nid);
+		pgdat->peer_node = find_best_peer_node(nid);
 	}
 }
[RFC][PATCH v2 18/21] kvm-ept-idle: enable module
Signed-off-by: Fengguang Wu --- arch/x86/kvm/Kconfig | 11 +++ arch/x86/kvm/Makefile |4 2 files changed, 15 insertions(+) --- linux.orig/arch/x86/kvm/Kconfig 2018-12-23 20:09:04.628882396 +0800 +++ linux/arch/x86/kvm/Kconfig 2018-12-23 20:09:04.628882396 +0800 @@ -96,6 +96,17 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. +config KVM_EPT_IDLE + tristate "KVM EPT idle page tracking" + depends on KVM_INTEL + depends on PROC_PAGE_MONITOR + ---help--- + Provides support for walking EPT to get the A bits on Intel + processors equipped with the VT extensions. + + To compile this as a module, choose M here: the module + will be called kvm-ept-idle. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig --- linux.orig/arch/x86/kvm/Makefile2018-12-23 20:09:04.628882396 +0800 +++ linux/arch/x86/kvm/Makefile 2018-12-23 20:09:04.628882396 +0800 @@ -19,6 +19,10 @@ kvm-y+= x86.o mmu.o emulate.o i8259.o kvm-intel-y+= vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o +kvm-ept-idle-y += ept_idle.o + obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL)+= kvm-intel.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o + +obj-$(CONFIG_KVM_EPT_IDLE) += kvm-ept-idle.o
[RFC][PATCH v2 01/21] e820: cheat PMEM as DRAM
From: Fan Du This is a hack to enumerate PMEM as NUMA nodes. It's necessary for current BIOSes that don't yet fill the ACPI HMAT table. WARNING: take care to back up. It is mutually exclusive with the libnvdimm subsystem and can destroy ndctl-managed namespaces. Signed-off-by: Fan Du Signed-off-by: Fengguang Wu --- arch/x86/kernel/e820.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) --- linux.orig/arch/x86/kernel/e820.c 2018-12-23 19:20:34.587078783 +0800 +++ linux/arch/x86/kernel/e820.c2018-12-23 19:20:34.587078783 +0800 @@ -403,7 +403,8 @@ static int __init __append_e820_table(st /* Ignore the entry on 64-bit overflow: */ if (start > end && likely(size)) return -1; - + if (type == E820_TYPE_PMEM) + type = E820_TYPE_RAM; e820__range_add(start, size, type); entry++;
[RFC][PATCH v2 09/21] mm: avoid duplicate peer target node
To ensure 1:1 peer node mapping on broken BIOS node distances: node 0 1 2 3 0: 10 21 20 20 1: 21 10 20 20 2: 20 20 10 20 3: 20 20 20 10 or with numa=fake=4U node distances: node 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0: 10 10 10 10 21 21 21 21 17 17 17 17 28 28 28 28 1: 10 10 10 10 21 21 21 21 17 17 17 17 28 28 28 28 2: 10 10 10 10 21 21 21 21 17 17 17 17 28 28 28 28 3: 10 10 10 10 21 21 21 21 17 17 17 17 28 28 28 28 4: 21 21 21 21 10 10 10 10 28 28 28 28 17 17 17 17 5: 21 21 21 21 10 10 10 10 28 28 28 28 17 17 17 17 6: 21 21 21 21 10 10 10 10 28 28 28 28 17 17 17 17 7: 21 21 21 21 10 10 10 10 28 28 28 28 17 17 17 17 8: 17 17 17 17 28 28 28 28 10 10 10 10 28 28 28 28 9: 17 17 17 17 28 28 28 28 10 10 10 10 28 28 28 28 10: 17 17 17 17 28 28 28 28 10 10 10 10 28 28 28 28 11: 17 17 17 17 28 28 28 28 10 10 10 10 28 28 28 28 12: 28 28 28 28 17 17 17 17 28 28 28 28 10 10 10 10 13: 28 28 28 28 17 17 17 17 28 28 28 28 10 10 10 10 14: 28 28 28 28 17 17 17 17 28 28 28 28 10 10 10 10 15: 28 28 28 28 17 17 17 17 28 28 28 28 10 10 10 10 Signed-off-by: Fengguang Wu --- mm/page_alloc.c |6 ++ 1 file changed, 6 insertions(+) --- linux.orig/mm/page_alloc.c 2018-12-23 19:48:27.366110325 +0800 +++ linux/mm/page_alloc.c 2018-12-23 19:48:27.362110332 +0800 @@ -6941,16 +6941,22 @@ static int find_best_peer_node(int nid) int n, val; int min_val = INT_MAX; int peer = NUMA_NO_NODE; + static nodemask_t target_nodes = NODE_MASK_NONE; for_each_online_node(n) { if (n == nid) continue; val = node_distance(nid, n); + if (val == LOCAL_DISTANCE) + continue; + if (node_isset(n, target_nodes)) + continue; if (val < min_val) { min_val = val; peer = n; } } + node_set(peer, target_nodes); return peer; }
[RFC][PATCH v2 04/21] x86/numa_emulation: pass numa node type to fake nodes
From: Fan Du Signed-off-by: Fan Du --- arch/x86/mm/numa_emulation.c | 14 ++ 1 file changed, 14 insertions(+) --- linux.orig/arch/x86/mm/numa_emulation.c 2018-12-23 19:21:11.002206144 +0800 +++ linux/arch/x86/mm/numa_emulation.c 2018-12-23 19:21:10.998206236 +0800 @@ -12,6 +12,8 @@ static int emu_nid_to_phys[MAX_NUMNODES]; static char *emu_cmdline __initdata; +static nodemask_t emu_numa_nodes_pmem; +static nodemask_t emu_numa_nodes_dram; void __init numa_emu_cmdline(char *str) { @@ -311,6 +313,12 @@ static int __init split_nodes_size_inter min(end, limit) - start); if (ret < 0) return ret; + + /* Update numa node type for fake numa node */ + if (node_isset(i, emu_numa_nodes_pmem)) + node_set(nid - 1, numa_nodes_pmem); + else + node_set(nid - 1, numa_nodes_dram); } } return nid; @@ -410,6 +418,12 @@ void __init numa_emulation(struct numa_m unsigned long n; int nid = 0; + emu_numa_nodes_pmem = numa_nodes_pmem; + emu_numa_nodes_dram = numa_nodes_dram; + + nodes_clear(numa_nodes_pmem); + nodes_clear(numa_nodes_dram); + n = simple_strtoul(emu_cmdline, &emu_cmdline, 0); ret = -1; for_each_node_mask(i, physnode_mask) {
[RFC][PATCH v2 06/21] x86,numa: update numa node type
From: Fan Du Signed-off-by: Fan Du Signed-off-by: Fengguang Wu --- arch/x86/mm/numa.c |1 + 1 file changed, 1 insertion(+) --- linux.orig/arch/x86/mm/numa.c 2018-12-23 19:38:17.363582512 +0800 +++ linux/arch/x86/mm/numa.c2018-12-23 19:38:17.363582512 +0800 @@ -594,6 +594,7 @@ static int __init numa_register_memblks( continue; alloc_node_data(nid); + set_node_type(nid); } /* Dump memblock with node info and return. */
[RFC][PATCH v2 02/21] acpi/numa: memorize NUMA node type from SRAT table
From: Fan Du Mark each NUMA node as DRAM or PMEM. This can happen at boot (see the e820 PMEM type override patch), or on the fly when binding a devdax device with the kmem driver. It depends on the BIOS supplying PMEM NUMA proximity in the SRAT table, which current production BIOSes do. Signed-off-by: Fan Du Signed-off-by: Fengguang Wu --- arch/x86/include/asm/numa.h |2 ++ arch/x86/mm/numa.c |2 ++ drivers/acpi/numa.c |5 + 3 files changed, 9 insertions(+) --- linux.orig/arch/x86/include/asm/numa.h 2018-12-23 19:20:39.890947888 +0800 +++ linux/arch/x86/include/asm/numa.h 2018-12-23 19:20:39.890947888 +0800 @@ -30,6 +30,8 @@ extern int numa_off; */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern nodemask_t numa_nodes_pmem; +extern nodemask_t numa_nodes_dram; extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); --- linux.orig/arch/x86/mm/numa.c 2018-12-23 19:20:39.890947888 +0800 +++ linux/arch/x86/mm/numa.c2018-12-23 19:20:39.890947888 +0800 @@ -20,6 +20,8 @@ int numa_off; nodemask_t numa_nodes_parsed __initdata; +nodemask_t numa_nodes_pmem; +nodemask_t numa_nodes_dram; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); --- linux.orig/drivers/acpi/numa.c 2018-12-23 19:20:39.890947888 +0800 +++ linux/drivers/acpi/numa.c 2018-12-23 19:20:39.890947888 +0800 @@ -297,6 +297,11 @@ acpi_numa_memory_affinity_init(struct ac node_set(node, numa_nodes_parsed); + if (ma->flags & ACPI_SRAT_MEM_NON_VOLATILE) + node_set(node, numa_nodes_pmem); + else + node_set(node, numa_nodes_dram); + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1,
[RFC][PATCH v2 00/21] PMEM NUMA node and hotness accounting/migration
This is an attempt to use NVDIMM/PMEM as volatile NUMA memory that's transparent to normal applications and virtual machines. The code is still in active development. It's provided for early design review. Key functionalities: 1) create and describe PMEM NUMA node for NVDIMM memory 2) dumb /proc/PID/idle_pages interface, for user space driven hot page accounting 3) passive kernel cold page migration in page reclaim path 4) improved move_pages() for active user space hot/cold page migration (1) is the foundation for transparent usage of NVDIMM by normal apps and virtual machines. (2-4) enable automatically placing hot pages in DRAM for better performance. A user space migration daemon is being built on top of this kernel patchset to complete the full vertical solution. The base kernel is v4.20. The patches are not suitable for upstreaming in the near future -- some are quick hacks, others need more work. However, they are complete enough to demonstrate the necessary kernel changes for the proposed app&VM transparent NVDIMM volatile use model. The interfaces are far from finalized. They illustrate what would be necessary for creating a user space driven solution. The exact forms will need more thought and input. We may adopt an HMAT-based solution for the NUMA node related interfaces when it is ready. The /proc/PID/idle_pages interface is standalone but non-trivial. Before upstreaming some day, it's expected to take a long time to collect various real use cases and feedback, so as to refine and stabilize the format. Create PMEM numa node [PATCH 01/21] e820: cheat PMEM as DRAM Mark numa node as DRAM/PMEM [PATCH 02/21] acpi/numa: memorize NUMA node type from SRAT table [PATCH 03/21] x86/numa_emulation: fix fake NUMA in uniform case [PATCH 04/21] x86/numa_emulation: pass numa node type to fake nodes [PATCH 05/21] mmzone: new pgdat flags for DRAM and PMEM [PATCH 06/21] x86,numa: update numa node type [PATCH 07/21] mm: export node type {pmem|dram} under /sys/bus/node Point neighbor DRAM/PMEM to each other [PATCH 08/21] mm: introduce and export pgdat peer_node [PATCH 09/21] mm: avoid duplicate peer target node Standalone zonelist for DRAM and PMEM nodes [PATCH 10/21] mm: build separate zonelist for PMEM and DRAM node Keep page table pages in DRAM [PATCH 11/21] kvm: allocate page table pages from DRAM [PATCH 12/21] x86/pgtable: allocate page table pages from DRAM /proc/PID/idle_pages interface for virtual machine and normal tasks [PATCH 13/21] x86/pgtable: dont check PMD accessed bit [PATCH 14/21] kvm: register in mm_struct [PATCH 15/21] ept-idle: EPT walk for virtual machine [PATCH 16/21] mm-idle: mm_walk for normal task [PATCH 17/21] proc: introduce /proc/PID/idle_pages [PATCH 18/21] kvm-ept-idle: enable module Mark hot pages [PATCH 19/21] mm/migrate.c: add move_pages(MPOL_MF_SW_YOUNG) flag Kernel DRAM=>PMEM migration [PATCH 20/21] mm/vmscan.c: migrate anon DRAM pages to PMEM node [PATCH 21/21] mm/vmscan.c: shrink anon list if can migrate to PMEM arch/x86/include/asm/numa.h|2 arch/x86/include/asm/pgalloc.h | 10 arch/x86/include/asm/pgtable.h |3 arch/x86/kernel/e820.c |3 arch/x86/kvm/Kconfig | 11 arch/x86/kvm/Makefile |4 arch/x86/kvm/ept_idle.c| 841 +++ arch/x86/kvm/ept_idle.h| 116 arch/x86/kvm/mmu.c | 12 arch/x86/mm/numa.c |3 arch/x86/mm/numa_emulation.c | 30 + arch/x86/mm/pgtable.c | 22 drivers/acpi/numa.c|5 drivers/base/node.c| 21 fs/proc/base.c |2 fs/proc/internal.h |1 fs/proc/task_mmu.c | 54 + include/linux/mm_types.h | 11 include/linux/mmzone.h | 38 + mm/mempolicy.c | 14 mm/migrate.c | 13 mm/page_alloc.c| 77 
++ mm/pagewalk.c |1 mm/vmscan.c| 38 + virt/kvm/kvm_main.c|3 25 files changed, 1306 insertions(+), 29 deletions(-) V1 patches: https://lkml.org/lkml/2018/9/2/13 Regards, Fengguang
[RFC][PATCH v2 20/21] mm/vmscan.c: migrate anon DRAM pages to PMEM node
From: Jingqi Liu With PMEM nodes, the demotion path could be 1) DRAM pages: migrate to PMEM node 2) PMEM pages: swap out This patch does (1) for anonymous pages only, since we cannot detect hotness of (unmapped) page cache pages for now. The user space daemon can do migration in both directions: - PMEM=>DRAM hot page migration - DRAM=>PMEM cold page migration However it's more natural for user space to do hot page migration and the kernel to do cold page migration. In particular, only the kernel can guarantee on-demand migration when there is memory pressure. So the big picture will look like this: the user space daemon does regular hot page migration to DRAM, creating memory pressure on DRAM nodes, which triggers kernel cold page migration to PMEM nodes. Du Fan: - Support multiple NUMA nodes. - Don't migrate clean MADV_FREE pages to the PMEM node. After the madvise(MADV_FREE) syscall, both the vma structure and its corresponding page table entries still live, but the pages become MADV_FREE pages: anonymous but WITHOUT SwapBacked. On page reclaim, clean MADV_FREE pages will be freed and returned to the buddy system; the dirty ones turn back into canonical anonymous pages with PageSwapBacked(page) set, are put on the LRU_INACTIVE_FILE list and fall into the standard aging routine. The point is that clean MADV_FREE pages should not be migrated: they hold stale (useless) user data once madvise(MADV_FREE) has been called, so we guard against such scenarios. P.S. MADV_FREE is heavily used by the jemalloc engine and workloads like redis; refer to [1] for detailed background, use case, and benchmark results. [1] https://lore.kernel.org/patchwork/patch/622179/ Fengguang: - detect migrate thp and hugetlb - avoid moving pages to a non-existent node Signed-off-by: Fan Du Signed-off-by: Jingqi Liu Signed-off-by: Fengguang Wu --- mm/vmscan.c | 33 + 1 file changed, 33 insertions(+) --- linux.orig/mm/vmscan.c 2018-12-23 20:37:58.305551976 +0800 +++ linux/mm/vmscan.c 2018-12-23 20:37:58.305551976 +0800 @@ -1112,6 +1112,7 @@ static unsigned long shrink_page_list(st { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); + LIST_HEAD(move_pages); int pgactivate = 0; unsigned nr_unqueued_dirty = 0; unsigned nr_dirty = 0; @@ -1121,6 +1122,7 @@ static unsigned long shrink_page_list(st unsigned nr_immediate = 0; unsigned nr_ref_keep = 0; unsigned nr_unmap_fail = 0; + int page_on_dram = is_node_dram(pgdat->node_id); cond_resched(); @@ -1275,6 +1277,21 @@ static unsigned long shrink_page_list(st } /* +* Check if the page is in DRAM numa node. +* Skip MADV_FREE pages as it might be freed +* immediately to buddy system if it's clean. +*/ + if (node_online(pgdat->peer_node) && + PageAnon(page) && (PageSwapBacked(page) || PageTransHuge(page))) { + if (page_on_dram) { + /* Add to the page list which will be moved to pmem numa node. */ + list_add(&page->lru, &move_pages); + unlock_page(page); + continue; + } + } + + /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. * Lazyfree page could be freed directly @@ -1496,6 +1513,22 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + /* Move the anonymous pages to PMEM numa node. */ + if (!list_empty(&move_pages)) { + int err; + + /* Could not block. */ + err = migrate_pages(&move_pages, alloc_new_node_page, NULL, + pgdat->peer_node, + MIGRATE_ASYNC, MR_NUMA_MISPLACED); + if (err) { + putback_movable_pages(&move_pages); + + /* Join the pages which were not migrated. 
*/ + list_splice(&ret_pages, &move_pages); + } + } + mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); free_unref_page_list(&free_pages);
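To make the MADV_FREE semantics above concrete, here is a minimal user space sketch (not part of the patch) of the lifecycle being discussed; it assumes a kernel and libc new enough to define MADV_FREE:

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 << 20;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);		/* pages are now dirty anon */
	madvise(p, len, MADV_FREE);	/* now clean MADV_FREE pages */
	/*
	 * Until written again, reclaim may free these pages at any
	 * time, so their contents are stale -- migrating them to PMEM
	 * would only preserve useless data.
	 */
	p[0] = 1;			/* re-dirtied: canonical anon again */
	munmap(p, len);
	return 0;
}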
[RFC][PATCH v2 19/21] mm/migrate.c: add move_pages(MPOL_MF_SW_YOUNG) flag
From: Liu Jingqi Introduce the MPOL_MF_SW_YOUNG flag to move_pages(). When on, the already-in-DRAM pages will have PG_referenced set. Background: The user space migration daemon will frequently scan page tables and read-clear accessed bits to detect hot/cold pages, then migrate hot pages from PMEM to DRAM nodes. When doing so, it also tells the kernel that these are the hot page set. This maintains a consistent view of hot/cold pages between the kernel and the user space daemon. The more concrete steps are 1) do multiple scans of the page table, counting accessed bits 2) highest accessed count => hot pages 3) call move_pages(hot pages, DRAM nodes, MPOL_MF_SW_YOUNG) (1) regularly clears PTE young, which makes the kernel lose access to PTE young information (2) for anonymous pages, the user space daemon defines which is hot and which is cold (3) conveys the user space view of hot/cold pages to the kernel through PG_referenced In the long run, most hot pages could already be in DRAM. move_pages(MPOL_MF_SW_YOUNG) sets PG_referenced for those already-in-DRAM hot pages, but not for newly migrated hot pages. Since the latter are expected to be put at the end of the LRU, they have long enough time on the LRU to gather the accessed/PG_referenced bit and prove to the kernel that they are really hot. The daemon may only select DRAM/2 pages as hot, for 2 purposes: - avoid thrashing, e.g. some warm pages getting promoted then demoted soon - make sure enough DRAM LRU pages look "cold" to the kernel, so that vmscan won't run into trouble busily scanning LRU lists Signed-off-by: Liu Jingqi Signed-off-by: Fengguang Wu --- mm/migrate.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) --- linux.orig/mm/migrate.c 2018-12-23 20:37:12.604621319 +0800 +++ linux/mm/migrate.c 2018-12-23 20:37:12.604621319 +0800 @@ -55,6 +55,8 @@ #include "internal.h" +#define MPOL_MF_SW_YOUNG (1<<7) + /* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is @@ -1484,12 +1486,13 @@ static int do_move_pages_to_node(struct * the target node */ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, - int node, struct list_head *pagelist, bool migrate_all) + int node, struct list_head *pagelist, int flags) { struct vm_area_struct *vma; struct page *page; unsigned int follflags; int err; + bool migrate_all = flags & MPOL_MF_MOVE_ALL; down_read(&mm->mmap_sem); err = -EFAULT; @@ -1519,6 +1522,8 @@ static int add_page_for_migration(struct if (PageHuge(page)) { if (PageHead(page)) { + if (flags & MPOL_MF_SW_YOUNG) + SetPageReferenced(page); isolate_huge_page(page, pagelist); err = 0; } @@ -1531,6 +1536,8 @@ static int add_page_for_migration(struct goto out_putpage; err = 0; + if (flags & MPOL_MF_SW_YOUNG) + SetPageReferenced(head); list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_cache(head), @@ -1606,7 +1613,7 @@ static int do_pages_move(struct mm_struc * report them via status */ err = add_page_for_migration(mm, addr, current_node, - &pagelist, flags & MPOL_MF_MOVE_ALL); + &pagelist, flags); if (!err) continue; @@ -1725,7 +1732,7 @@ static int kernel_move_pages(pid_t pid, nodemask_t task_nodes; /* Check flags */ - if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL|MPOL_MF_SW_YOUNG)) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
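For illustration, step 3) might look like the sketch below from the daemon side (not part of the patch). MPOL_MF_SW_YOUNG is re-defined locally to mirror the in-kernel value above, since this patch does not export it in the uapi headers, and the move_pages() wrapper comes from libnuma (-lnuma):

#include <numaif.h>

#ifndef MPOL_MF_SW_YOUNG
#define MPOL_MF_SW_YOUNG (1 << 7)	/* matches mm/migrate.c above */
#endif

/*
 * Move @count hot pages of @pid to the nodes in @nodes, telling the
 * kernel they are hot so already-in-DRAM ones get PG_referenced.
 */
static long move_hot_pages(int pid, unsigned long count, void **pages,
			   const int *nodes, int *status)
{
	return move_pages(pid, count, pages, nodes, status,
			  MPOL_MF_MOVE | MPOL_MF_SW_YOUNG);
}

On return, status[] holds the resulting node (or a negative errno) for each page, which the daemon can feed back into its view of the hot set.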
[RFC][PATCH v2 13/21] x86/pgtable: dont check PMD accessed bit
From: Jingqi Liu ept-idle will clear the PMD accessed bit to speed up the PTE scan -- if the bit remains unset in the next scan, all 512 PTEs can be skipped. So don't complain on !_PAGE_ACCESSED in pmd_bad(). Note that clearing the PMD accessed bit has its own cost, so the optimization may only be worthwhile for - large idle areas - sparsely populated areas Signed-off-by: Jingqi Liu Signed-off-by: Fengguang Wu --- arch/x86/include/asm/pgtable.h |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) --- linux.orig/arch/x86/include/asm/pgtable.h 2018-12-23 19:50:50.917902600 +0800 +++ linux/arch/x86/include/asm/pgtable.h2018-12-23 19:50:50.913902605 +0800 @@ -821,7 +821,8 @@ static inline pte_t *pte_offset_kernel(p static inline int pmd_bad(pmd_t pmd) { - return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; + return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) != + (_KERNPG_TABLE & ~_PAGE_ACCESSED); } static inline unsigned long pages_to_mb(unsigned long npg)
[RFC][PATCH v2 10/21] mm: build separate zonelist for PMEM and DRAM node
From: Fan Du When allocating pages, DRAM and PMEM nodes should better not fall back to each other. This allows the migration code to explicitly control which type of node to allocate pages from. With this patch, a PMEM NUMA node can only be used in 2 ways: - migrate in and out - numactl That guarantees a PMEM NUMA node will only hold anon pages. We don't detect hotness for other types of pages for now, so we need to prevent a PMEM page from going hot while we are unable to detect it and move it to DRAM. Another implication is that new page allocations will by default go to DRAM nodes. That is normally a good choice -- since DRAM writes are cheaper than PMEM, it's often beneficial to watch new pages in DRAM for some time and only move the likely cold pages to PMEM. However there can be exceptions. For example, if the PMEM:DRAM ratio is very high, some page allocations may better go to PMEM nodes directly. In the long term, we may create more kinds of fallback zonelists and make them configurable by NUMA policy. Signed-off-by: Fan Du Signed-off-by: Fengguang Wu --- mm/mempolicy.c | 14 ++ mm/page_alloc.c | 42 +- 2 files changed, 43 insertions(+), 13 deletions(-) --- linux.orig/mm/mempolicy.c 2018-12-26 20:03:49.821417489 +0800 +++ linux/mm/mempolicy.c2018-12-26 20:29:24.597884301 +0800 @@ -1745,6 +1745,20 @@ static int policy_node(gfp_t gfp, struct WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if (policy->mode == MPOL_BIND) { + nodemask_t nodes = policy->v.nodes; + + /* +* The rule is if we run on DRAM node and mbind to PMEM node, +* preferred node id is the peer node, vice versa. +* if we run on DRAM node and mbind to DRAM node, #PF node is +* the preferred node, vice versa, so just fall back. +*/ + if ((is_node_dram(nd) && nodes_subset(nodes, numa_nodes_pmem)) || + (is_node_pmem(nd) && nodes_subset(nodes, numa_nodes_dram))) + nd = NODE_DATA(nd)->peer_node; + } + return nd; } --- linux.orig/mm/page_alloc.c 2018-12-26 20:03:49.821417489 +0800 +++ linux/mm/page_alloc.c 2018-12-26 20:03:49.817417321 +0800 @@ -5153,6 +5153,10 @@ static int find_next_best_node(int node, if (node_isset(n, *used_node_mask)) continue; + /* DRAM node doesn't fallback to pmem node */ + if (is_node_pmem(n)) + continue; + /* Use the distance array to find the distance */ val = node_distance(node, n); @@ -5242,19 +5246,31 @@ static void build_zonelists(pg_data_t *p nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { - /* -* We don't want to pressure a particular node. -* So adding penalty to the first node in same -* distance group to make it round-robin. -*/ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) - node_load[node] = load; - - node_order[nr_nodes++] = node; - prev_node = node; - load--; + /* Pmem node doesn't fallback to DRAM node */ + if (is_node_pmem(local_node)) { + int n; + + /* Pmem nodes should fallback to each other */ + node_order[nr_nodes++] = local_node; + for_each_node_state(n, N_MEMORY) { + if ((n != local_node) && is_node_pmem(n)) + node_order[nr_nodes++] = n; + } + } else { + while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + /* +* We don't want to pressure a particular node. +* So adding penalty to the first node in same +* distance group to make it round-robin. 
+*/ + if (node_distance(local_node, node) != + node_distance(local_node, prev_node)) + node_load[node] = load; + + node_order[nr_nodes++] = node; + prev_node = node; + load--; + } } build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
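As a sketch of the numactl-style usage mentioned above (not part of the patch), user space can still place anonymous memory directly on a PMEM node with MPOL_BIND; treating node 2 as the PMEM node is an assumption of this example. Requires libnuma headers (-lnuma):

#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;
	unsigned long nodemask = 1UL << 2;	/* assumed PMEM node 2 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* Bind before first touch so the pages land on the PMEM node. */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0))
		perror("mbind");
	memset(p, 0, len);	/* first touch allocates on node 2 */
	return 0;
}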
[RFC][PATCH v2 15/21] ept-idle: EPT walk for virtual machine
For virtual machines, "accessed" bits will be set in guest page tables and EPT/NPT. So for the qemu-kvm process, we convert HVA to GFN to GPA, then do EPT/NPT walks. This borrows the host page table walk macros/functions to do the EPT/NPT walk, so it depends on them using the same levels. As proposed by Dave Hansen, we invalidate the TLB after finishing one round of scanning, in order to ensure HW will set the accessed bit for super-hot pages. V2: convert idle_bitmap to idle_pages to be more efficient on - huge pages - sparse page tables - ranges of similar pages The new idle_pages file contains a series of records of different sizes, reporting ranges of different page sizes to user space. That interface has a major downside: it breaks the read() assumption that range_to_read == read_buffer_size. For now we work around this problem by deducing range_to_read from read_buffer_size, and letting read() return when either read_buffer_size is filled or range_to_read is fully scanned. To make a more precise interface, we may need to further switch to ioctl(). CC: Dave Hansen Signed-off-by: Peng Dong Signed-off-by: Liu Jingqi Signed-off-by: Fengguang Wu --- arch/x86/kvm/ept_idle.c | 637 ++ arch/x86/kvm/ept_idle.h | 116 ++ 2 files changed, 753 insertions(+) create mode 100644 arch/x86/kvm/ept_idle.c create mode 100644 arch/x86/kvm/ept_idle.h --- /dev/null 1970-01-01 00:00:00.0 + +++ linux/arch/x86/kvm/ept_idle.c 2018-12-26 20:38:07.298994533 +0800 @@ -0,0 +1,637 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ept_idle.h" + +/* #define DEBUG 1 */ + +#ifdef DEBUG + +#define debug_printk trace_printk + +#define set_restart_gpa(val, note) ({ \ + unsigned long old_val = eic->restart_gpa; \ + eic->restart_gpa = (val); \ + trace_printk("restart_gpa=%lx %luK %s %s %d\n", \ +(val), (eic->restart_gpa - old_val) >> 10, \ +note, __func__, __LINE__); \ +}) + +#define set_next_hva(val, note)({ \ + unsigned long old_val = eic->next_hva; \ + eic->next_hva = (val); \ + trace_printk(" next_hva=%lx %luK %s %s %d\n", \ +(val), (eic->next_hva - old_val) >> 10,\ +note, __func__, __LINE__); \ +}) + +#else + +#define debug_printk(...) 
+ +#define set_restart_gpa(val, note) ({ \ + eic->restart_gpa = (val); \ +}) + +#define set_next_hva(val, note)({ \ + eic->next_hva = (val); \ +}) + +#endif + +static unsigned long pagetype_size[16] = { + [PTE_ACCESSED] = PAGE_SIZE,/* 4k page */ + [PMD_ACCESSED] = PMD_SIZE, /* 2M page */ + [PUD_PRESENT] = PUD_SIZE, /* 1G page */ + + [PTE_DIRTY] = PAGE_SIZE, + [PMD_DIRTY] = PMD_SIZE, + + [PTE_IDLE] = PAGE_SIZE, + [PMD_IDLE] = PMD_SIZE, + [PMD_IDLE_PTES] = PMD_SIZE, + + [PTE_HOLE] = PAGE_SIZE, + [PMD_HOLE] = PMD_SIZE, +}; + +static void u64_to_u8(uint64_t n, uint8_t *p) +{ + p += sizeof(uint64_t) - 1; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p-- = n; n >>= 8; + *p = n; +} + +static void dump_eic(struct ept_idle_ctrl *eic) +{ + debug_printk("ept_idle_ctrl: pie_read=%d pie_read_max=%d buf_size=%d " +"bytes_copied=%d next_hva=%lx restart_gpa=%lx " +"gpa_to_hva=%lx\n", +eic->pie_read, +eic->pie_read_max, +eic->buf_size, +eic->bytes_copied, +eic->next_hva, +eic->restart_gpa, +eic->gpa_to_hva); +} + +static void eic_report_addr(struct ept_idle_ctrl *eic, unsigned long addr) +{ + unsigned long hva; + eic->kpie[eic->pie_read++] = PIP_CMD_SET_HVA; + hva = addr; + u64_to_u8(hva, &eic->kpie[eic->pie_read]); + eic->pie_read += sizeof(uint64_t); + debug_printk("eic_report_addr %lx\n", addr); + dump_eic(eic); +} + +static int eic_add_page(struct ept_idle_ctrl *eic, + unsigned long addr, + unsigned long next, + enum ProcIdlePageType page_type) +{ + int page_size = pagety
[RFC][PATCH v2 21/21] mm/vmscan.c: shrink anon list if can migrate to PMEM
Fix OOM by making in-kernel DRAM=>PMEM migration reachable. Here we assume these 2 possible demotion paths: - DRAM pages migrate to PMEM - PMEM pages swap out to the swap device Signed-off-by: Fengguang Wu --- mm/vmscan.c |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- linux.orig/mm/vmscan.c 2018-12-23 20:38:44.310446223 +0800 +++ linux/mm/vmscan.c 2018-12-23 20:38:44.306446146 +0800 @@ -2259,7 +2259,7 @@ static bool inactive_list_is_low(struct * If we don't have swap space, anonymous page deactivation * is pointless. */ - if (!file && !total_swap_pages) + if (!file && (is_node_pmem(pgdat->node_id) && !total_swap_pages)) return false; inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); @@ -2340,7 +2340,8 @@ static void get_scan_count(struct lruvec enum lru_list lru; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { + if (is_node_pmem(pgdat->node_id) && + (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0)) { scan_balance = SCAN_FILE; goto out; }
[RFC][PATCH v2 16/21] mm-idle: mm_walk for normal task
From: Zhang Yi File pages are skipped for now. They are in general not guaranteed to be mapped, which means that when they become hot, there is no guarantee we can find and move them to DRAM nodes. Signed-off-by: Zhang Yi Signed-off-by: Fengguang Wu --- arch/x86/kvm/ept_idle.c | 204 ++ mm/pagewalk.c |1 2 files changed, 205 insertions(+) --- linux.orig/arch/x86/kvm/ept_idle.c 2018-12-26 19:58:30.576894801 +0800 +++ linux/arch/x86/kvm/ept_idle.c 2018-12-26 19:58:39.840936072 +0800 @@ -510,6 +510,9 @@ static int ept_idle_walk_hva_range(struc return ret; } +static ssize_t mm_idle_read(struct file *file, char *buf, + size_t count, loff_t *ppos); + static ssize_t ept_idle_read(struct file *file, char *buf, size_t count, loff_t *ppos) { @@ -615,6 +618,207 @@ out: return ret; } +static int mm_idle_pte_range(struct ept_idle_ctrl *eic, pmd_t *pmd, +unsigned long addr, unsigned long next) +{ + enum ProcIdlePageType page_type; + pte_t *pte; + int err = 0; + + pte = pte_offset_kernel(pmd, addr); + do { + if (!pte_present(*pte)) + page_type = PTE_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, +(unsigned long *) &pte->pte)) + page_type = PTE_IDLE; + else { + page_type = PTE_ACCESSED; + } + + err = eic_add_page(eic, addr, addr + PAGE_SIZE, page_type); + if (err) + break; + } while (pte++, addr += PAGE_SIZE, addr != next); + + return err; +} + +static int mm_idle_pmd_entry(pmd_t *pmd, unsigned long addr, +unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + enum ProcIdlePageType page_type; + enum ProcIdlePageType pte_page_type; + int err; + + /* +* Skip duplicate PMD_IDLE_PTES: when the PMD crosses VMA boundary, +* walk_page_range() can call on the same PMD twice. +*/ + if ((addr & PMD_MASK) == (eic->last_va & PMD_MASK)) { + debug_printk("ignore duplicate addr %lx %lx\n", +addr, eic->last_va); + return 0; + } + eic->last_va = addr; + + if (eic->flags & SCAN_HUGE_PAGE) + pte_page_type = PMD_IDLE_PTES; + else + pte_page_type = IDLE_PAGE_TYPE_MAX; + + if (!pmd_present(*pmd)) + page_type = PMD_HOLE; + else if (!test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmd)) { + if (pmd_large(*pmd)) + page_type = PMD_IDLE; + else if (eic->flags & SCAN_SKIM_IDLE) + page_type = PMD_IDLE_PTES; + else + page_type = pte_page_type; + } else if (pmd_large(*pmd)) { + page_type = PMD_ACCESSED; + } else + page_type = pte_page_type; + + if (page_type != IDLE_PAGE_TYPE_MAX) + err = eic_add_page(eic, addr, next, page_type); + else + err = mm_idle_pte_range(eic, pmd, addr, next); + + return err; +} + +static int mm_idle_pud_entry(pud_t *pud, unsigned long addr, +unsigned long next, struct mm_walk *walk) +{ + struct ept_idle_ctrl *eic = walk->private; + + if ((addr & PUD_MASK) != (eic->last_va & PUD_MASK)) { + eic_add_page(eic, addr, next, PUD_PRESENT); + eic->last_va = addr; + } + return 1; +} + +static int mm_idle_test_walk(unsigned long start, unsigned long end, +struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + + if (vma->vm_file) { + if ((vma->vm_flags & (VM_WRITE|VM_MAYSHARE)) == VM_WRITE) + return 0; + return 1; + } + + return 0; +} + +static int mm_idle_walk_range(struct ept_idle_ctrl *eic, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma; + int ret; + + init_ept_idle_ctrl_buffer(eic); + + for (; start < end;) + { + down_read(&walk->mm->mmap_sem); + vma = find_vma(walk->mm, start); + if (vma) { + if (end > vma->vm_start) { + local_irq_disable(); + ret = walk_page_range(start, end, walk); + local_irq_enable(); + }
[RFC][PATCH v2 17/21] proc: introduce /proc/PID/idle_pages
This will be similar to /sys/kernel/mm/page_idle/bitmap documented in Documentation/admin-guide/mm/idle_page_tracking.rst, however indexed by process virtual address. When using the global PFN-indexed idle bitmap, we find 2 kinds of overheads: - to track a task's working set, Brendan Gregg ended up writing wss-v1 for small tasks and wss-v2 for large tasks: https://github.com/brendangregg/wss That's because VAs may point to random PAs throughout the physical address space. So we either query /proc/pid/pagemap first and access lots of random PFNs (with lots of syscalls) in the bitmap, or write+read the whole system idle bitmap beforehand. - page table walking by PFN has much more overhead than walking a page table in its natural order: - rmap queries - more locking - random memory reads/writes This interface provides a cheap path for the majority of non-shared mapping pages. To walk 1TB memory of 4k active pages, it costs 2s vs 15s system time to scan the per-task/global idle bitmaps, which means a ~7x speedup. The gap will be enlarged if we consider - the extra /proc/pid/pagemap walk - that natural page table walks can skip the whole 512 PTEs if the PMD is idle OTOH, the per-task idle bitmap is not suitable in some situations: - not accurate for shared pages - doesn't work with non-mapped file pages - doesn't perform well for sparse page tables (pointed out by Huang Ying) So it's more about complementing the existing global idle bitmap. CC: Huang Ying CC: Brendan Gregg Signed-off-by: Fengguang Wu --- fs/proc/base.c |2 fs/proc/internal.h |1 fs/proc/task_mmu.c | 54 +++ 3 files changed, 57 insertions(+) --- linux.orig/fs/proc/base.c 2018-12-23 20:08:14.228919325 +0800 +++ linux/fs/proc/base.c2018-12-23 20:08:14.224919327 +0800 @@ -2969,6 +2969,7 @@ static const struct pid_entry tgid_base_ REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap",S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), @@ -3357,6 +3358,7 @@ static const struct pid_entry tid_base_s REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap",S_IRUSR, proc_pagemap_operations), + REG("idle_pages", S_IRUSR|S_IWUSR, proc_mm_idle_operations), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), --- linux.orig/fs/proc/internal.h 2018-12-23 20:08:14.228919325 +0800 +++ linux/fs/proc/internal.h2018-12-23 20:08:14.224919327 +0800 @@ -298,6 +298,7 @@ extern const struct file_operations proc extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; +extern const struct file_operations proc_mm_idle_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, --- linux.orig/fs/proc/task_mmu.c 2018-12-23 20:08:14.228919325 +0800 +++ linux/fs/proc/task_mmu.c2018-12-23 20:08:14.224919327 +0800 @@ -1559,6 +1559,60 @@ const struct file_operations proc_pagema .open = pagemap_open, .release= pagemap_release, }; + +/* will be filled when kvm_ept_idle module loads */ +struct file_operations proc_ept_idle_operations = { +}; +EXPORT_SYMBOL_GPL(proc_ept_idle_operations); + +static ssize_t mm_idle_read(struct 
file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + if (proc_ept_idle_operations.read) + return proc_ept_idle_operations.read(file, buf, count, ppos); + + return 0; +} + + +static int mm_idle_open(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = proc_mem_open(inode, PTRACE_MODE_READ); + + if (IS_ERR(mm)) + return PTR_ERR(mm); + + file->private_data = mm; + + if (proc_ept_idle_operations.open) + return proc_ept_idle_operations.open(inode, file); + + return 0; +} + +static int mm_idle_release(struct inode *inode, struct file *file) +{ + struct mm_struct *mm = file->private_data; + + if (mm) { + if (!mm_kvm(mm)) + flush_tlb_mm(mm); + mmdrop(mm); + } + + if (proc_ept_idle_operations.release) + return proc_ept_idle_operations.release(inode, file); + + return 0; +} + +const struct file_o
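For reference, a user space consumer might drive the new file like the sketch below (not part of the patch). The record format is the one produced by the ept_idle/mm_idle code earlier in this series and is still subject to change, so decoding is left as a stub:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static void parse_records(const uint8_t *buf, ssize_t len)
{
	/* decode PIP_CMD_SET_HVA / page-type records here */
}

int main(int argc, char **argv)
{
	uint8_t buf[4096];
	char path[64];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/proc/%s/idle_pages",
		 argc > 1 ? argv[1] : "self");
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		parse_records(buf, n);
	close(fd);
	return 0;
}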
[RFC][PATCH v2 14/21] kvm: register in mm_struct
A VM is associated with an address space and not a specific thread. From Documentation/virtual/kvm/api.txt: Only run VM ioctls from the same process (address space) that was used to create the VM. CC: Nikita Leshenko CC: Christian Borntraeger Signed-off-by: Fengguang Wu --- include/linux/mm_types.h | 11 +++ virt/kvm/kvm_main.c |3 +++ 2 files changed, 14 insertions(+) --- linux.orig/include/linux/mm_types.h 2018-12-23 19:58:06.993417137 +0800 +++ linux/include/linux/mm_types.h 2018-12-23 19:58:06.993417137 +0800 @@ -27,6 +27,7 @@ typedef int vm_fault_t; struct address_space; struct mem_cgroup; struct hmm; +struct kvm; /* * Each physical page in the system has a struct page associated with @@ -496,6 +497,10 @@ struct mm_struct { /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif + +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; +#endif } __randomize_layout; /* @@ -507,6 +512,12 @@ struct mm_struct { extern struct mm_struct init_mm; +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; } +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; } +#endif + /* Pointer magic because the dynamic array size confuses some compilers. */ static inline void mm_init_cpumask(struct mm_struct *mm) { --- linux.orig/virt/kvm/kvm_main.c 2018-12-23 19:58:06.993417137 +0800 +++ linux/virt/kvm/kvm_main.c 2018-12-23 19:58:06.993417137 +0800 @@ -727,6 +727,7 @@ static void kvm_destroy_vm(struct kvm *k struct mm_struct *mm = kvm->mm; kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); + mm->kvm = NULL; kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); @@ -3224,6 +3225,8 @@ static int kvm_dev_ioctl_create_vm(unsig fput(file); return -ENOMEM; } + + kvm->mm->kvm = kvm; kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(r, file);
Re: [PATCH 0/3] RISC-V: A few build/warning fixes and cleanup
CC Philip, current kbuild test robot maintainer. On Wed, Oct 31, 2018 at 12:12:21PM -0600, Logan Gunthorpe wrote: On 2018-10-31 11:52 a.m., Olof Johansson wrote: I've migrated machines to a new system and that CI is up and running. Right now the emails don't go anywhere but me, but let me know where you want them sent and I'll be happy to do so (probably not linux-riscv due to volume/noise). They're also uploaded to http://arm-soc.lixom.net/buildlogs/riscv/. Since you keep single-patch branches in your tree, and I compile everything for every branch, I've restricted what branches I poll and build. Right now it's "for*" and "riscv*". I think it would also be very nice to get the existing kbuild test robot to start compile testing a few riscv configs. It already does most of the other arches. This would help us catch these kinds of issues even earlier seeing that test robot tests patches on mailing lists and many other repos before Palmer would even pull them into his branch. (Including a repo I routinely push branches to before sending out patches.) I'm not sure who's currently responsible for the robot but I've copied Fenguang who, I believe, set it up originally as well as the kbuild lists. Hopefully he can point us in the right direction to help get this set up. Thanks, Logan
Re: [PATCH] mm: simplify get_next_ra_size
Looks good to me, thanks! Reviewed-by: Fengguang Wu On Sun, Oct 28, 2018 at 02:13:26PM +0800, Gao Xiang wrote: It's a trivial simplification for get_next_ra_size and clear enough for humans to understand. It also fixes potential overflow if ra->size(< ra_pages) is too large. Cc: Fengguang Wu Signed-off-by: Gao Xiang --- mm/readahead.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 4e63014..205ac34 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -272,17 +272,15 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) * return it as the new window size. */ static unsigned long get_next_ra_size(struct file_ra_state *ra, - unsigned long max) + unsigned long max) { unsigned long cur = ra->size; - unsigned long newsize; if (cur < max / 16) - newsize = 4 * cur; - else - newsize = 2 * cur; - - return min(newsize, max); + return 4 * cur; + if (cur <= max / 2) + return 2 * cur; + return max; } /* -- 2.7.4
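For illustration, lifting the simplified helper into user space shows the window ramp-up (a sketch, not part of the patch), e.g. for max = 512 pages:

#include <stdio.h>

static unsigned long get_next_ra_size(unsigned long cur, unsigned long max)
{
	if (cur < max / 16)
		return 4 * cur;
	if (cur <= max / 2)
		return 2 * cur;
	return max;
}

int main(void)
{
	unsigned long max = 512, size = 4;	/* units: pages */

	while (size < max) {
		printf("%lu -> ", size);
		size = get_next_ra_size(size, max);
	}
	printf("%lu\n", size);	/* prints: 4 -> 16 -> 64 -> 128 -> 256 -> 512 */
	return 0;
}

Note that the doubling branch only runs when cur <= max/2, so 2 * cur can never exceed max, which is how the rewrite avoids the overflow the changelog mentions.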
Re: [PATCH v4.19-rc7] treewide: Replace more open-coded allocation size multiplications
On Sat, Oct 06, 2018 at 08:51:16AM -0700, Kees Cook wrote: On Sat, Oct 6, 2018 at 1:49 AM, Fengguang Wu wrote: On Fri, Oct 05, 2018 at 08:14:34PM -0700, Joel Fernandes wrote: On Fri, Oct 05, 2018 at 05:22:35PM -0700, Greg KH wrote: And do we have a way to add a rule to 0-day to catch these so that they get a warning when they are added again? They could just be added to scripts/coccinelle and 0-day will report them? For example, 0-day ran scripts/coccinelle/api/platform_no_drv_owner.cocci on a recently submitted patch and reported it here: https://lore.kernel.org/lkml/201808301856.vmnjerss%25fengguang...@intel.com/ But I'm not sure if 0-day runs make coccicheck on specific semantic patches, or runs all of them (CC'd Fengguang). 0-day runs all coccinelle scripts. However only auto report out warnings that are known to have low false positives. So if you add new coccinelle scripts that emit accurate enough warnings, it'd be good to inform the LKP team to add the new warnings to our auto-report-out white list. It runs with MODE=report by default, yes? I'd need to expand the cases to cover that (it is patch-only currently) so that would be a roughly 10,000 line Coccinelle script. :) It first runs with "-D report", then with "-D patch" to create possible patches. Thanks, Fengguang
Re: [PATCH v4.19-rc7] treewide: Replace more open-coded allocation size multiplications
On Fri, Oct 05, 2018 at 08:14:34PM -0700, Joel Fernandes wrote: On Fri, Oct 05, 2018 at 05:22:35PM -0700, Greg KH wrote: On Fri, Oct 05, 2018 at 05:04:16PM -0700, Kees Cook wrote: > On Fri, Oct 5, 2018 at 4:51 PM, Greg KH wrote: > > On Fri, Oct 05, 2018 at 04:35:59PM -0700, Kees Cook wrote: > >> As done treewide earlier, this catches several more open-coded > >> allocation size calculations that were added to the kernel during the > >> merge window. This performs the following mechanical transformations > >> using Coccinelle: > >> > >> kvmalloc(a * b, ...) -> kvmalloc_array(a, b, ...) > >> kvzalloc(a * b, ...) -> kvcalloc(a, b, ...) > >> devm_kzalloc(..., a * b, ...) -> devm_kcalloc(..., a, b, ...) > >> > >> Signed-off-by: Kees Cook > > > > Has this had any testing in linux-next? > > No; they're mechanical transformations (though I did build test them). > If you want I could add this to linux-next for a week? That would be good, thanks. > > And when was "earlier"? > > v4.18, when all of these were originally eliminated: > > 026f05079b00 treewide: Use array_size() in f2fs_kzalloc() > c86065938aab treewide: Use array_size() in f2fs_kmalloc() > 76e43e37a407 treewide: Use array_size() in sock_kmalloc() > 84ca176bf54a treewide: Use array_size() in kvzalloc_node() > fd7becedb1f0 treewide: Use array_size() in vzalloc_node() > fad953ce0b22 treewide: Use array_size() in vzalloc() > 42bc47b35320 treewide: Use array_size() in vmalloc() > a86854d0c599 treewide: devm_kzalloc() -> devm_kcalloc() > 3c4211ba8ad8 treewide: devm_kmalloc() -> devm_kmalloc_array() > 778e1cdd81bb treewide: kvzalloc() -> kvcalloc() > 344476e16acb treewide: kvmalloc() -> kvmalloc_array() > 590b5b7d8671 treewide: kzalloc_node() -> kcalloc_node() > 6396bb221514 treewide: kzalloc() -> kcalloc() > 6da2ec56059c treewide: kmalloc() -> kmalloc_array() > > The new patch is catching new open-coded multiplications introduced in v4.19. As this is getting smaller, why not just break it up and do it through all of the different subsystems instead of one large patch? And do we have a way to add a rule to 0-day to catch these so that they get a warning when they are added again? They could just be added to scripts/coccinelle and 0-day will report them? For example, 0-day ran scripts/coccinelle/api/platform_no_drv_owner.cocci on a recently submitted patch and reported it here: https://lore.kernel.org/lkml/201808301856.vmnjerss%25fengguang...@intel.com/ But I'm not sure if 0-day runs make coccicheck on specific semantic patches, or runs all of them (CC'd Fengguang). 0-day runs all coccinelle scripts. However only auto report out warnings that are known to have low false positives. So if you add new coccinelle scripts that emit accurate enough warnings, it'd be good to inform the LKP team to add the new warnings to our auto-report-out white list. Thanks, Fengguang
[PATCH] mm: fix __get_user_pages_fast() comment
mmu_gather_tlb no longer exists. Replace it with mmu_table_batch. CC: triv...@kernel.org Signed-off-by: Fengguang Wu --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index fc5f98069f4e..69194043ddd4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1798,8 +1798,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * interrupts disabled by get_futex_key. * * With interrupts disabled, we block page table pages from being -* freed from under us. See mmu_gather_tlb in asm-generic/tlb.h -* for more details. +* freed from under us. See struct mmu_table_batch comments in +* include/asm-generic/tlb.h for more details. * * We do not adopt an rcu_read_lock(.) here as we also want to * block IPIs that come from THPs splitting. -- 2.15.0
[PATCH] mm: fix comment typo in inc_tlb_flush_pending()
CC: triv...@kernel.org Signed-off-by: Fengguang Wu --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 27c5446f3deb..d4f99f5f677c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -552,7 +552,7 @@ static inline void inc_tlb_flush_pending(struct mm_struct *mm) * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * -* Where the increment if constrained by the PTL unlock, it thus +* Where the increment is constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. -- 2.15.0
Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct
On Tue, Sep 04, 2018 at 09:43:50AM +0200, Christian Borntraeger wrote: On 09/04/2018 09:15 AM, Fengguang Wu wrote: On Tue, Sep 04, 2018 at 08:37:03AM +0200, Nikita Leshenko wrote: On 4 Sep 2018, at 2:46, Fengguang Wu wrote: Here it goes: diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce070e7dcb..27c5446f3deb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -27,6 +27,7 @@ typedef int vm_fault_t; struct address_space; struct mem_cgroup; struct hmm; +struct kvm; /* * Each physical page in the system has a struct page associated with @@ -489,10 +490,19 @@ struct mm_struct { /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; +#endif } __randomize_layout; extern struct mm_struct init_mm; +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; } +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; } +#endif + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c483720de8d..dca6156a7b35 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3892,7 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); - current->kvm = kvm; + current->mm->kvm = kvm; I think you also need to reset kvm to NULL once the VM is destroyed, otherwise it would point to dangling memory. Good point! Here is the incremental patch: --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3894,6 +3894,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) kvm->userspace_pid = task_pid_nr(current); current->mm->kvm = kvm; } else if (type == KVM_EVENT_DESTROY_VM) { + current->mm->kvm = NULL; add_uevent_var(env, "EVENT=destroy"); } add_uevent_var(env, "PID=%d", kvm->userspace_pid); I think you should put both code snippets somewhere else. This has probably nothing to do with the uevent. Instead this should go into kvm_destroy_vm and kvm_create_vm. Make sure to take care of the error handling. OK. Will set the pointer late and reset it early like this. Since there are several error conditions after kvm_create_vm(), it may be more convenient to set it in kvm_dev_ioctl_create_vm(), when there are no more errors to handle: --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -724,6 +724,7 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); + current->mm->kvm = NULL; kvm_destroy_vm_debugfs(kvm); kvm_arch_sync_events(kvm); spin_lock(&kvm_lock); @@ -3206,6 +3207,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) fput(file); return -ENOMEM; } + current->mm->kvm = kvm; kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); fd_install(r, file); Can you point us to the original discussion about the why and what you are trying to achieve? It's the initial RFC post. [PATCH 0] describes some background info. Basically we're implementing /proc/PID/idle_bitmap for user space to walk page tables and get "accessed" bits. Since VM's "accessed" bits will be reflected in EPT (or AMD NPT), we'll need to walk EPT when detected it is QEMU main process. Thanks, Fengguang
Re: [RFC][PATCH 3/5] [PATCH 3/5] kvm-ept-idle: HVA indexed EPT read
Yeah thanks! Currently we are restructuring the related functions, will add these calls when sorted out the walk order and hole issues. Thanks, Fengguang On Tue, Sep 04, 2018 at 04:12:00PM +0800, Peng Dong wrote: kvm_get_kvm() kvm_put_kvm() -Original Message- From: Nikita Leshenko [mailto:nikita.leshche...@oracle.com] Sent: Tuesday, September 4, 2018 3:57 PM To: Wu, Fengguang Cc: Andrew Morton ; Linux Memory Management List ; Peng, DongX ; Liu, Jingqi ; Dong, Eddie ; Hansen, Dave ; Huang, Ying ; Brendan Gregg ; k...@vger.kernel.org; LKML Subject: Re: [RFC][PATCH 3/5] [PATCH 3/5] kvm-ept-idle: HVA indexed EPT read On 1 Sep 2018, at 13:28, Fengguang Wu wrote: +static ssize_t ept_idle_read(struct file *file, char *buf, +size_t count, loff_t *ppos) +{ + struct task_struct *task = file->private_data; + struct ept_idle_ctrl *eic; + unsigned long hva_start = *ppos << BITMAP_BYTE2PVA_SHIFT; + unsigned long hva_end = hva_start + (count << BITMAP_BYTE2PVA_SHIFT); + int ret; + + if (*ppos % IDLE_BITMAP_CHUNK_SIZE || + count % IDLE_BITMAP_CHUNK_SIZE) + return -EINVAL; + + eic = kzalloc(sizeof(*eic), GFP_KERNEL); + if (!eic) + return -EBUSY; + + eic->buf = buf; + eic->buf_size = count; + eic->kvm = task_kvm(task); + if (!eic->kvm) { + ret = -EINVAL; + goto out_free; + } I think you need to increment the refcount while using kvm, otherwise kvm can be destroyed from another thread while you're walking it. -Nikita + + ret = ept_idle_walk_hva_range(eic, hva_start, hva_end); + if (ret) + goto out_free; + + ret = eic->bytes_copied; + *ppos += ret; +out_free: + kfree(eic); + + return ret; +}
Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct
On Tue, Sep 04, 2018 at 08:37:03AM +0200, Nikita Leshenko wrote: On 4 Sep 2018, at 2:46, Fengguang Wu wrote: Here it goes: diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce070e7dcb..27c5446f3deb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -27,6 +27,7 @@ typedef int vm_fault_t; struct address_space; struct mem_cgroup; struct hmm; +struct kvm; /* * Each physical page in the system has a struct page associated with @@ -489,10 +490,19 @@ struct mm_struct { /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; +#endif } __randomize_layout; extern struct mm_struct init_mm; +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; } +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; } +#endif + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c483720de8d..dca6156a7b35 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3892,7 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); - current->kvm = kvm; + current->mm->kvm = kvm; I think you also need to reset kvm to NULL once the VM is destroyed, otherwise it would point to dangling memory. Good point! Here is the incremental patch: --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3894,6 +3894,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) kvm->userspace_pid = task_pid_nr(current); current->mm->kvm = kvm; } else if (type == KVM_EVENT_DESTROY_VM) { + current->mm->kvm = NULL; add_uevent_var(env, "EVENT=destroy"); } add_uevent_var(env, "PID=%d", kvm->userspace_pid); Thanks, Fengguang
Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct
On Tue, Sep 04, 2018 at 08:28:18AM +0800, Fengguang Wu wrote: Hi Christian and Nikita, On Mon, Sep 03, 2018 at 06:03:49PM +0200, Christian Borntraeger wrote: On 09/03/2018 04:10 PM, Nikita Leshenko wrote: On September 2, 2018 5:21:15 AM, fengguang...@intel.com wrote: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b47507faab5..0c483720de8d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3892,6 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); + current->kvm = kvm; Is it OK to store `kvm` on the task_struct? What if the thread that originally created the VM exits? From the documentation it seems like a VM is associated with an address space and not a specific thread, so maybe it should be stored on mm_struct? Yes, ioctls accessing the kvm can happen from all threads. Good point, thank you for the tips! I'll move kvm pointer to mm_struct. From Documentation/virtual/kvm/api.txt: Only run VM ioctls from the same process (address space) that was used to create the VM. -Nikita Here it goes: diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 99ce070e7dcb..27c5446f3deb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -27,6 +27,7 @@ typedef int vm_fault_t; struct address_space; struct mem_cgroup; struct hmm; +struct kvm; /* * Each physical page in the system has a struct page associated with @@ -489,10 +490,19 @@ struct mm_struct { /* HMM needs to track a few things per mm */ struct hmm *hmm; #endif +#if IS_ENABLED(CONFIG_KVM) + struct kvm *kvm; +#endif } __randomize_layout; extern struct mm_struct init_mm; +#if IS_ENABLED(CONFIG_KVM) +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; } +#else +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; } +#endif + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0c483720de8d..dca6156a7b35 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3892,7 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); - current->kvm = kvm; + current->mm->kvm = kvm; } else if (type == KVM_EVENT_DESTROY_VM) { add_uevent_var(env, "EVENT=destroy"); }
Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct
Hi Christian and Nikita, On Mon, Sep 03, 2018 at 06:03:49PM +0200, Christian Borntraeger wrote: On 09/03/2018 04:10 PM, Nikita Leshenko wrote: On September 2, 2018 5:21:15 AM, fengguang...@intel.com wrote: diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8b47507faab5..0c483720de8d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3892,6 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) if (type == KVM_EVENT_CREATE_VM) { add_uevent_var(env, "EVENT=create"); kvm->userspace_pid = task_pid_nr(current); + current->kvm = kvm; Is it OK to store `kvm` on the task_struct? What if the thread that originally created the VM exits? From the documentation it seems like a VM is associated with an address space and not a specific thread, so maybe it should be stored on mm_struct? Yes, ioctls accessing the kvm can happen from all threads. Good point, thank you for the tips! I'll move kvm pointer to mm_struct. From Documentation/virtual/kvm/api.txt: Only run VM ioctls from the same process (address space) that was used to create the VM. -Nikita Regards, Fengguang
Re: [RFC][PATCH 0/5] introduce /proc/PID/idle_bitmap
Here is the diffstat:

 arch/x86/kvm/Kconfig    |  11 +
 arch/x86/kvm/Makefile   |   4
 arch/x86/kvm/ept_idle.c | 329 ++
 arch/x86/kvm/ept_idle.h |  79 +
 fs/proc/base.c          |   2
 fs/proc/internal.h      |   1
 fs/proc/task_mmu.c      |  63 +++
 include/linux/sched.h   |  10 +
 virt/kvm/kvm_main.c     |   1
 9 files changed, 500 insertions(+)

Regards,
Fengguang
[RFC][PATCH 2/5] [PATCH 2/5] proc: introduce /proc/PID/idle_bitmap
This will be similar to /sys/kernel/mm/page_idle/bitmap documented in
Documentation/admin-guide/mm/idle_page_tracking.rst, however indexed by
process virtual address.

When using the global PFN-indexed idle bitmap, we found 2 kinds of
overhead:

- To track a task's working set, Brendan Gregg ended up writing wss-v1
  for small tasks and wss-v2 for large tasks:

      https://github.com/brendangregg/wss

  That's because VAs may point to random PAs throughout the physical
  address space. So we either query /proc/pid/pagemap first and access
  lots of random PFNs (with lots of syscalls) in the bitmap, or
  write+read the whole system idle bitmap beforehand.

- Page table walking by PFN has much more overhead than walking a page
  table in its natural order:
  - rmap queries
  - more locking
  - random memory reads/writes

This interface provides a cheap path for the majority of non-shared
mapping pages. To walk 1TB memory of 4k active pages, it costs 2s vs 15s
system time to scan the per-task/global idle bitmaps. That means ~7x
speedup. The gap will be enlarged if we consider
- the extra /proc/pid/pagemap walk
- that natural page table walks can skip a whole 512 PTEs if the PMD is idle

OTOH, the per-task idle bitmap is not suitable in some situations:

- not accurate for shared pages
- doesn't work with non-mapped file pages
- doesn't perform well for sparse page tables (pointed out by Huang Ying)

So it's more about complementing the existing global idle bitmap.

CC: Huang Ying
CC: Brendan Gregg
Signed-off-by: Fengguang Wu
---
 fs/proc/base.c     |  2 ++
 fs/proc/internal.h |  1 +
 fs/proc/task_mmu.c | 63 ++
 3 files changed, 66 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index aaffc0c30216..d81322b5b8d2 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2942,6 +2942,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("idle_bitmap", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3327,6 +3328,7 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("smaps",      S_IRUGO, proc_tid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+	REG("idle_bitmap", S_IRUSR|S_IWUSR, proc_mm_idle_operations),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index da3dbfa09e79..732a502acc27 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -305,6 +305,7 @@ extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_tid_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_mm_idle_operations;
 
 extern unsigned long task_vsize(struct mm_struct *);
 extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfd73a4616ce..376406a9cf45 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1564,6 +1564,69 @@ const struct file_operations proc_pagemap_operations = {
 	.open		= pagemap_open,
 	.release	= pagemap_release,
 };
+
+/* will be filled when kvm_ept_idle module loads */
+struct file_operations proc_ept_idle_operations = {
+};
+EXPORT_SYMBOL_GPL(proc_ept_idle_operations);
+
+static ssize_t mm_idle_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct *task = file->private_data;
+	ssize_t ret = -ESRCH;
+
+	// TODO: implement mm_walk for normal tasks
+
+	if (task_kvm(task)) {
+		if (proc_ept_idle_operations.read)
+			return proc_ept_idle_operations.read(file, buf, count, ppos);
+	}
+
+	return ret;
+}
+
+static int mm_idle_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = get_proc_task(inode);
+
+	if (!task)
+		return -ESRCH;
+
+	file->private_data = task;
+
+	if (task_kvm(task)) {
+		if (proc_ept_idle_operations.open)
+			return proc_ept_idle_operations.open(inode, file);
+	}
+
+	return 0;
+}
+
+static int mm_idle_release(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = file->private_data;
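For a feel of the intended usage, here is a user-space sketch that scans
one task's bitmap and counts accessed pages. The bit encoding (one bit
per page, a set bit meaning idle, LSB first) is an assumption carried
over from the global page_idle bitmap, not something this patch nails
down:

	/* Sketch: count accessed (non-idle) pages of a task. */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char path[64];
		uint64_t chunk, accessed = 0;
		int fd;

		if (argc != 2)
			return 1;
		snprintf(path, sizeof(path), "/proc/%s/idle_bitmap", argv[1]);
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			perror(path);
			return 1;
		}
		/* one u64 chunk covers 64 virtually contiguous pages */
		while (read(fd, &chunk, sizeof(chunk)) == sizeof(chunk))
			accessed += 64 - __builtin_popcountll(chunk);
		printf("accessed pages: %llu\n", (unsigned long long)accessed);
		close(fd);
		return 0;
	}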
[RFC][PATCH 3/5] [PATCH 3/5] kvm-ept-idle: HVA indexed EPT read
For virtual machines, "accessed" bits will be set in guest page tables
and EPT/NPT. So for a qemu-kvm process, convert HVA to GFN to GPA, then
do EPT/NPT walks.

Thanks to the in-memslot linear HVA-GPA mapping, the conversion can be
done efficiently, outside of the loops for page table walks. In this
manner, we provide a uniform interface for both virtual machines and
normal processes.

The use scenario would be per task/VM working set tracking and
migration. Very convenient for applying task/vma and VM granularity
policies.

Signed-off-by: Peng DongX
Signed-off-by: Fengguang Wu
---
 arch/x86/kvm/ept_idle.c | 118
 arch/x86/kvm/ept_idle.h |  24 ++
 2 files changed, 142 insertions(+)
 create mode 100644 arch/x86/kvm/ept_idle.c
 create mode 100644 arch/x86/kvm/ept_idle.h

diff --git a/arch/x86/kvm/ept_idle.c b/arch/x86/kvm/ept_idle.c
new file mode 100644
index ..5b97dd01011b
--- /dev/null
+++ b/arch/x86/kvm/ept_idle.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ept_idle.h"
+
+
+// mindless copy from kvm_handle_hva_range().
+// TODO: handle order and hole.
+static int ept_idle_walk_hva_range(struct ept_idle_ctrl *eic,
+				   unsigned long start,
+				   unsigned long end)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int ret = 0;
+
+	slots = kvm_memslots(eic->kvm);
+	kvm_for_each_memslot(memslot, slots) {
+		unsigned long hva_start, hva_end;
+		gfn_t gfn_start, gfn_end;
+
+		hva_start = max(start, memslot->userspace_addr);
+		hva_end = min(end, memslot->userspace_addr +
+			      (memslot->npages << PAGE_SHIFT));
+		if (hva_start >= hva_end)
+			continue;
+		/*
+		 * {gfn(page) | page intersects with [hva_start, hva_end)} =
+		 * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+		 */
+		gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+		gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+		ret = ept_idle_walk_gfn_range(eic, gfn_start, gfn_end);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
+static ssize_t ept_idle_read(struct file *file, char *buf,
+			     size_t count, loff_t *ppos)
+{
+	struct task_struct *task = file->private_data;
+	struct ept_idle_ctrl *eic;
+	unsigned long hva_start = *ppos << BITMAP_BYTE2PVA_SHIFT;
+	unsigned long hva_end = hva_start + (count << BITMAP_BYTE2PVA_SHIFT);
+	int ret;
+
+	if (*ppos % IDLE_BITMAP_CHUNK_SIZE ||
+	    count % IDLE_BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	eic = kzalloc(sizeof(*eic), GFP_KERNEL);
+	if (!eic)
+		return -EBUSY;
+
+	eic->buf = buf;
+	eic->buf_size = count;
+	eic->kvm = task_kvm(task);
+	if (!eic->kvm) {
+		ret = -EINVAL;
+		goto out_free;
+	}
+
+	ret = ept_idle_walk_hva_range(eic, hva_start, hva_end);
+	if (ret)
+		goto out_free;
+
+	ret = eic->bytes_copied;
+	*ppos += ret;
+out_free:
+	kfree(eic);
+
+	return ret;
+}
+
+static int ept_idle_open(struct inode *inode, struct file *file)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -EBUSY;
+
+	return 0;
+}
+
+static int ept_idle_release(struct inode *inode, struct file *file)
+{
+	module_put(THIS_MODULE);
+	return 0;
+}
+
+extern struct file_operations proc_ept_idle_operations;
+
+static int ept_idle_entry(void)
+{
+	proc_ept_idle_operations.owner = THIS_MODULE;
+	proc_ept_idle_operations.read = ept_idle_read;
+	proc_ept_idle_operations.open = ept_idle_open;
+	proc_ept_idle_operations.release = ept_idle_release;
+
+	return 0;
+}
+
+static void ept_idle_exit(void)
+{
+	memset(&proc_ept_idle_operations, 0, sizeof(proc_ept_idle_operations));
+}
+
+MODULE_LICENSE("GPL");
+module_init(ept_idle_entry);
+module_exit(ept_idle_exit);
diff --git a/arch/x86/kvm/ept_idle.h b/arch/x86/kvm/ept_idle.h
new file mode 100644
index ..e0b9dcecf50b
--- /dev/null
+++ b/arch/x86/kvm/ept_idle.h
@@ -0,0 +1,24 @@
+#ifndef _EPT_IDLE_H
+#define _EPT_IDLE_H
+
+#define IDLE_BITMAP_CHUNK_SIZE	sizeof(u64)
+#define IDLE_BITMAP_CHUNK_BITS	(IDLE_BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+#define BITMAP_BYTE2PVA_SHIFT	(3 + PAGE_SHIFT)
+
+#define EPT_IDLE_KBUF_FULL	1
+#define EPT_IDLE_KBUF_BYTES	8000
+#define EPT_IDLE_KBUF_BITS	(EPT_IDLE_KBUF_BYTES * 8)
+
+struct ept_idle_ctrl {
+	struct kvm *kvm;
+
+	u64 kbuf[EPT_IDLE_K
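The conversion can be hoisted out of the walk loops because each memslot
maps a contiguous HVA range to a contiguous GFN range, so the
translation is pure arithmetic. Per the field names in
include/linux/kvm_host.h, hva_to_gfn_memslot() boils down to:

	gfn = slot->base_gfn + (hva - slot->userspace_addr) / PAGE_SIZE;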
[RFC][PATCH 5/5] [PATCH 5/5] kvm-ept-idle: enable module
Signed-off-by: Fengguang Wu
---
 arch/x86/kvm/Kconfig  | 11 +++
 arch/x86/kvm/Makefile |  4
 2 files changed, 15 insertions(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1bbec387d289..4c6dec47fac6 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,17 @@ config KVM_MMU_AUDIT
 	 This option adds a R/W kVM module parameter 'mmu_audit', which allows
 	 auditing of KVM MMU events at runtime.
 
+config KVM_EPT_IDLE
+	tristate "KVM EPT idle page tracking"
+	depends on KVM_INTEL
+	depends on PROC_PAGE_MONITOR
+	---help---
+	 Provides support for walking EPT to get the A bits on Intel
+	 processors equipped with the VT extensions.
+
+	 To compile this as a module, choose M here: the module
+	 will be called kvm-ept-idle.
+
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index dc4f2fdf5e57..5cad0590205d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -19,6 +19,10 @@ kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
 
 kvm-intel-y		+= vmx.o pmu_intel.o
 kvm-amd-y		+= svm.o pmu_amd.o
+kvm-ept-idle-y		+= ept_idle.o
 
 obj-$(CONFIG_KVM)	+= kvm.o
 obj-$(CONFIG_KVM_INTEL)	+= kvm-intel.o
 obj-$(CONFIG_KVM_AMD)	+= kvm-amd.o
+
+obj-$(CONFIG_KVM_EPT_IDLE)	+= kvm-ept-idle.o
-- 
2.15.0
[RFC][PATCH 0/5] introduce /proc/PID/idle_bitmap
This new /proc/PID/idle_bitmap interface aims to complement the current
global /sys/kernel/mm/page_idle/bitmap, enabling efficient user space
driven migrations. The pros and cons are discussed in the changelog of
"[PATCH] proc: introduce /proc/PID/idle_bitmap".

The driving force is to improve efficiency by 10+ times, so that
hot/cold page tracking can be done at regular intervals in user space
without too much overhead, making it possible for a user space daemon to
do regular page migration between NUMA nodes of different speeds.

Note it's not about NUMA migration between local and remote nodes -- we
already have NUMA balancing for that. This interface and user space
migration daemon target NUMA nodes made of different mediums -- i.e.
DIMM and NVDIMM(*) -- with larger performance gaps. The basic policy
will be "move hot pages to DIMM; cold pages to NVDIMM". Since NVDIMM
sizes can easily reach several Terabytes, working set tracking
efficiency will matter and be challenging.

(*) Here we use persistent memory (PMEM) w/o using its persistence.
Persistence is good to have, however it requires modifying applications.

Upcoming NVDIMM products like Intel Apache Pass (AEP) will be more cost
and energy effective than DRAM, but slower. Merely using it in the form
of a NUMA memory node could immediately benefit many workloads: for
example, warm but not hot apps, workloads with sharp hot/cold page
distribution (good for migration), or ones that rely more on memory size
than on latency and bandwidth and do more reads than writes.

This is an early RFC version to collect feedback. It's complete enough
to demo the basic ideas and performance, however it is not usable yet.

Regards, Fengguang
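As a concrete illustration of the "cold pages to NVDIMM" half of that
policy, a user-space daemon's demotion step could use the existing
move_pages(2) syscall. A minimal sketch; the node number and the
already-classified page list are assumptions, and a real daemon would
batch and rate-limit this:

	/* Sketch: demote a batch of cold pages of `pid` to the PMEM node.
	 * Build with -lnuma. */
	#include <numaif.h>

	#define PMEM_NODE 1	/* assumed node number of the NVDIMM node */

	static long demote_cold_pages(int pid, void **pages, unsigned long nr)
	{
		int nodes[nr], status[nr];

		for (unsigned long i = 0; i < nr; i++)
			nodes[i] = PMEM_NODE;

		/* returns 0 on success; per-page results land in status[] */
		return move_pages(pid, nr, pages, nodes, status, MPOL_MF_MOVE);
	}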
[RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct
The added pointer will be used by the /proc/PID/idle_bitmap code to
quickly identify the QEMU task and walk EPT/NPT accordingly. For virtual
machines, the A bits will be set in guest page tables and EPT/NPT,
rather than in the QEMU task page table.

This costs 8 bytes in task_struct, which could be wasteful for the
majority of normal tasks. The alternative is to add a flag only, and let
it find the corresponding VM in the kvm vm_list.

Signed-off-by: Fengguang Wu
---
 include/linux/sched.h | 10 ++
 virt/kvm/kvm_main.c   |  1 +
 2 files changed, 11 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..26c8549bbc28 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -38,6 +38,7 @@ struct cfs_rq;
 struct fs_struct;
 struct futex_pi_state;
 struct io_context;
+struct kvm;
 struct mempolicy;
 struct nameidata;
 struct nsproxy;
@@ -1179,6 +1180,9 @@ struct task_struct {
 	/* Used by LSM modules for access restriction: */
 	void				*security;
 #endif
+#if IS_ENABLED(CONFIG_KVM)
+	struct kvm			*kvm;
+#endif
 
 	/*
 	 * New fields for task_struct should be added above here, so that
@@ -1898,4 +1902,10 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+#if IS_ENABLED(CONFIG_KVM)
+static inline struct kvm *task_kvm(struct task_struct *t) { return t->kvm; }
+#else
+static inline struct kvm *task_kvm(struct task_struct *t) { return NULL; }
+#endif
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..0c483720de8d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -3892,6 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
 	if (type == KVM_EVENT_CREATE_VM) {
 		add_uevent_var(env, "EVENT=create");
 		kvm->userspace_pid = task_pid_nr(current);
+		current->kvm = kvm;
 	} else if (type == KVM_EVENT_DESTROY_VM) {
 		add_uevent_var(env, "EVENT=destroy");
 	}
-- 
2.15.0
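For reference, the flag-only alternative mentioned above could resolve
the VM on demand by scanning the global VM list. A sketch; kvm_lock,
vm_list and kvm->mm are existing KVM globals/fields (declared via
include/linux/kvm_host.h at the time), but this exact helper is not:

	/* Sketch: find the VM whose userspace shares this address space. */
	static struct kvm *find_kvm_by_mm(struct mm_struct *mm)
	{
		struct kvm *kvm, *found = NULL;

		spin_lock(&kvm_lock);
		list_for_each_entry(kvm, &vm_list, vm_list)
			if (kvm->mm == mm) {
				found = kvm;
				break;
			}
		spin_unlock(&kvm_lock);

		/* NB: no reference taken; lifetime is the caller's problem */
		return found;
	}

The trade-off is an O(number of VMs) search per lookup instead of 8
bytes per task.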
[RFC][PATCH 4/5] [PATCH 4/5] kvm-ept-idle: EPT page table walk for A bits
This borrows host page table walk macros/functions to do the EPT walk,
so it depends on them using the same levels.

Dave Hansen raised the concern that the hottest pages may be cached in
the TLB and don't frequently set the accessed bits. The solution would
be to invalidate the TLB for the mm being walked when one round of scan
is finished.

Warning: read() also clears the accessed bit, btw, in order to avoid one
more page table walk for write(). That may not be desirable for some use
cases, so we can avoid clearing the accessed bit when opened in readonly
mode.

The interface should be further improved to

1) report holes and huge pages in one go
2) represent huge pages and sparse page tables efficiently

(1) can be trivially fixed by extending the bitmap to more bits per
PAGE_SIZE. (2) would need fundamental changes to the interface. It seems
existing solutions for sparse files like SEEK_HOLE/SEEK_DATA and the
FIEMAP ioctl may not serve this situation well. The most efficient way
could be to fill the user space read() buffer with an array of small
extents:

	struct idle_extent {
		unsigned type : 4;
		unsigned nr   : 4;
	};

where type can be one of

	4K_HOLE
	4K_IDLE
	4K_ACCESSED
	2M_HOLE
	2M_IDLE
	2M_ACCESSED
	1G_OR_LARGER_PAGE
	...

There can be up to 16 types, so more page sizes can be defined. The
above names are just for easy understanding of the typical case. It's
also possible that PAGE_SIZE is not 4K, or a PMD represents 4M pages. In
those cases we'd change the type names to more suitable ones like
PTE_HOLE, PMD_ACCESSED. Since it's page table walking, user space should
better know the exact page sizes. Either the accessed bit or page
migration is tied to the real page size. Anyone interested in adding
PTE_DIRTY or more types?

The main problem with such an extent reporting interface is that the
number of bytes returned by read() (variable extents) will mismatch the
advanced file position (fixed VA indexes), which is not POSIX compliant.
Simple cp/cat may still work, as they don't lseek based on the read
return value. If that's really a concern, we may use ioctl() instead.
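To make the proposed encoding concrete, a user-space decode loop might
look like the sketch below. The numeric type values and the two page
sizes are illustrative assumptions, since the extent format above is
only a proposal:

	/* Sketch: walk a buffer of packed extents, tracking the VA cursor. */
	enum ext_type {			/* illustrative values only */
		T_4K_HOLE, T_4K_IDLE, T_4K_ACCESSED,
		T_2M_HOLE, T_2M_IDLE, T_2M_ACCESSED,
	};

	struct idle_extent {
		unsigned type : 4;
		unsigned nr   : 4;
	};

	static unsigned long walk_extents(const struct idle_extent *ext,
					  int n, unsigned long va)
	{
		for (int i = 0; i < n; i++) {
			unsigned long size = ext[i].type >= T_2M_HOLE ?
					     (2UL << 20) : (4UL << 10);

			/* [va, va + nr * size) is one run of state ext[i].type */
			va += (unsigned long)ext[i].nr * size;
		}
		return va;	/* first VA not covered by this buffer */
	}

Note how the VA cursor advances by a variable amount per extent, which
is exactly the read()-position mismatch discussed above.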
CC: Dave Hansen
Signed-off-by: Fengguang Wu
---
 arch/x86/kvm/ept_idle.c | 211
 arch/x86/kvm/ept_idle.h |  55 +
 2 files changed, 266 insertions(+)

diff --git a/arch/x86/kvm/ept_idle.c b/arch/x86/kvm/ept_idle.c
index 5b97dd01011b..8a233ab8656d 100644
--- a/arch/x86/kvm/ept_idle.c
+++ b/arch/x86/kvm/ept_idle.c
@@ -9,6 +9,217 @@
 
 #include "ept_idle.h"
 
+static int add_to_idle_bitmap(struct ept_idle_ctrl *eic,
+			      int idle, unsigned long addr_range)
+{
+	int nbits = addr_range >> PAGE_SHIFT;
+	int bits_left = EPT_IDLE_KBUF_BITS - eic->bits_read;
+	int ret = 0;
+
+	if (nbits >= bits_left) {
+		ret = EPT_IDLE_KBUF_FULL;
+		nbits = bits_left;
+	}
+
+	// TODO: this assumes u64 == unsigned long
+	if (!idle)
+		__bitmap_clear((unsigned long *)eic->kbuf, eic->bits_read, nbits);
+	eic->bits_read += nbits;
+
+	return ret;
+}
+
+static int ept_pte_range(struct ept_idle_ctrl *eic,
+			 pmd_t *pmd, unsigned long addr, unsigned long end)
+{
+	pte_t *pte;
+	int err = 0;
+	int idle;
+
+	pte = pte_offset_kernel(pmd, addr);
+	do {
+		if (!ept_pte_present(*pte) ||
+		    !ept_pte_accessed(*pte))
+			idle = 1;
+		else {
+			idle = 0;
+			pte_clear_flags(*pte, _PAGE_EPT_ACCESSED);
+		}
+
+		err = add_to_idle_bitmap(eic, idle, PAGE_SIZE);
+		if (err)
+			break;
+	} while (pte++, addr += PAGE_SIZE, addr != end);
+
+	return err;
+}
+
+static int ept_pmd_range(struct ept_idle_ctrl *eic,
+			 pud_t *pud, unsigned long addr, unsigned long end)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	int err = 0;
+	int idle;
+
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		idle = -1;
+		if (!ept_pmd_present(*pmd) ||
+		    !ept_pmd_accessed(*pmd)) {
+			idle = 1;
+		} else if (pmd_large(*pmd)) {
+			idle = 0;
+			pmd_clear_flags(*pmd, _PAGE_EPT_ACCESSED);
+		}
+		if (idle >= 0)
+			err = add_to_idle_bitmap(eic, idle, next - addr);
+		else
+			err = ept_pte_range(eic, pmd, addr, next);
+		if (err)
+			break;
+	} while (pmd++, addr = next, addr != end);
+
+	return err;
+}
+
+static int ept_pud_range(struct ept_idle_ctrl *eic,
+
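Regarding Dave Hansen's TLB concern above: in KVM context the natural
per-scan fix would be a remote TLB flush once a full pass completes, so
TLB-resident hot pages have to re-fault through the EPT and set their A
bits again. A sketch using the existing kvm_flush_remote_tlbs() helper;
the hook point is an assumption, nothing in this patch wires it up yet:

	/* Sketch: invalidate guest TLBs after one full idle-bit scan. */
	static void ept_idle_finish_scan(struct ept_idle_ctrl *eic)
	{
		kvm_flush_remote_tlbs(eic->kvm);
	}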
Re: [PATCH] printk: inject caller information into the body of message
On Wed, Jun 20, 2018 at 02:45:25PM +0200, Dmitry Vyukov wrote:
On Wed, Jun 20, 2018 at 2:41 PM, Fengguang Wu wrote:
On Wed, Jun 20, 2018 at 02:31:51PM +0200, Dmitry Vyukov wrote:
On Wed, Jun 20, 2018 at 1:37 PM, Fengguang Wu wrote:
On Wed, Jun 20, 2018 at 11:30:05AM +0200, Dmitry Vyukov wrote:
On Wed, Jun 20, 2018 at 11:06 AM, Sergey Senozhatsky wrote:

Hi Dmitry,

On (06/20/18 10:45), Dmitry Vyukov wrote:

Hi Sergey,

What are the visible differences between this patch and Tetsuo's patch?

I guess none, and looking at your requirements below I tend to agree
that Tetsuo's approach is probably what you need at the end of the day.

The only thing that will matter for syzkaller parsing in the end is the
resulting text format as it appears on console. But you say "I'm not
pushing for this particular message format", so what exactly do you want
me to provide feedback on? I guess we need to handle pr_cont properly
whatever approach we take.

Mostly, I was wondering about whether:

a) you need pr_cont() handling
b) you need printk_safe() handling

The reasons I left those things behind:

a) pr_cont() is officially hated. It was never supposed to be used on
SMP systems. So I wasn't sure if we need all that effort and tricky code
to handle pr_cont(), given that syzkaller is probably the only user of
that functionality.

Well, if I put my syzkaller hat on, then I don't care what exactly
happens in the kernel; the only thing I care about is well-formed output
on the console that can be parsed unambiguously in all cases.

+1 for 0day kernel testing. I admit that goal may never be 100%
achievable -- at least some serial console logs can sometimes become
messy. So we'll have to write dmesg parsing code in defensive ways. But
some unnecessary pr_cont() broken-up messages can obviously be avoided.
For example, arch/x86/mm/fault.c:

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

I've actually proposed to remove the above KERN_CONT; unfortunately the
patch was silently ignored.

I've just cooked this change too, but do you mind reviving your patch?

Yes, sure. My version is more dumb, since I'm not sure if it's OK to do
string formatting at this critical point. Let's see how others think
about the 2 approaches.

I'm fine as long as our problem is fixed. :) It already does string
formatting for the address. And I think we also need to get rid of
KERN_CONT for the address while we are here.

Ah yes, sorry I overlooked the next KERN_CONT..
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9a84a0d08727..c7b068c6b010 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -671,11 +671,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
 	}
 
-	printk(KERN_ALERT "BUG: unable to handle kernel ");
 	if (address < PAGE_SIZE)
-		printk(KERN_CONT "NULL pointer dereference");
+		printk(KERN_ALERT "BUG: unable to handle kernel NULL pointer dereference");
 	else
-		printk(KERN_CONT "paging request");
+		printk(KERN_ALERT "BUG: unable to handle kernel paging request");
 
 	printk(KERN_CONT " at %px\n", (void *) address);

It actually makes the code even shorter, which is nice:

--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -671,13 +671,9 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 		printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
 	}
 
-	printk(KERN_ALERT "BUG: unable to handle kernel ");
-	if (address < PAGE_SIZE)
-		printk(KERN_CONT "NULL pointer dereference");
-	else
-		printk(KERN_CONT "paging request");
-
-	printk(KERN_CONT " at %px\n", (void *) address);
+	printk(KERN_ALERT "BUG: unable to handle kernel %s at %px\n",
+	       (address < PAGE_SIZE ? "NULL pointer dereference" :
+		"paging request"), (void *) address);
 
 	dump_pagetable(address);
 }
Re: [PATCH] printk: inject caller information into the body of message
On Wed, Jun 20, 2018 at 11:30:05AM +0200, Dmitry Vyukov wrote:
On Wed, Jun 20, 2018 at 11:06 AM, Sergey Senozhatsky wrote:

Hi Dmitry,

On (06/20/18 10:45), Dmitry Vyukov wrote:

Hi Sergey,

What are the visible differences between this patch and Tetsuo's patch?

I guess none, and looking at your requirements below I tend to agree
that Tetsuo's approach is probably what you need at the end of the day.

The only thing that will matter for syzkaller parsing in the end is the
resulting text format as it appears on console. But you say "I'm not
pushing for this particular message format", so what exactly do you want
me to provide feedback on? I guess we need to handle pr_cont properly
whatever approach we take.

Mostly, I was wondering about whether:

a) you need pr_cont() handling
b) you need printk_safe() handling

The reasons I left those things behind:

a) pr_cont() is officially hated. It was never supposed to be used on
SMP systems. So I wasn't sure if we need all that effort and tricky code
to handle pr_cont(), given that syzkaller is probably the only user of
that functionality.

Well, if I put my syzkaller hat on, then I don't care what exactly
happens in the kernel; the only thing I care about is well-formed output
on the console that can be parsed unambiguously in all cases.

+1 for 0day kernel testing. I admit that goal may never be 100%
achievable -- at least some serial console logs can sometimes become
messy. So we'll have to write dmesg parsing code in defensive ways. But
some unnecessary pr_cont() broken-up messages can obviously be avoided.
For example, arch/x86/mm/fault.c:

	printk(KERN_ALERT "BUG: unable to handle kernel ");
	if (address < PAGE_SIZE)
		printk(KERN_CONT "NULL pointer dereference");
	else
		printk(KERN_CONT "paging request");

I've actually proposed to remove the above KERN_CONT; unfortunately the
patch was silently ignored.

From this point of view I guess pr_cont is actually syzkaller's worst
enemy. If pr_cont is officially hated, and it causes corrupted crash
reports, then we can resolve it by just getting rid of more pr_cont's.
So potentially we do not need any support for pr_cont in this patch.
However, we also need to be practical, and if there are tons of
pr_cont's then we need some intermediate support for them, just because
we won't be able to get rid of all of them overnight. But even if we
attach context to pr_cont, it still causes problems for crash parsing,
because today we see:

	BUG: unable to handle
	... 10 lines ...
	kernel
	... 10 lines ...
	paging request
	... 10 lines ...
	at ADDR

Which is not too friendly for parsing regardless of contexts.

We met exactly the same issue and ended up with special handling in
https://github.com/intel/lkp-tests/blob/master/lib/dmesg.rb:

	/(BUG: unable to handle kernel)/,
	/(BUG: unable to handle kernel) NULL pointer dereference/,
	/(BUG: unable to handle kernel) paging request/,

So I am leaning towards getting rid of pr_cont's as the solution to the
problem.

+1 for reducing unnecessary pr_cont() uses.

Thanks, Fengguang
Re: [Ksummit-discuss] bug-introducing patches
On Mon, May 14, 2018 at 10:48:03AM +0200, Boris Brezillon wrote:

+Fengguang

On Mon, 14 May 2018 10:40:10 +0200 Geert Uytterhoeven wrote:

Hi Boris,

On Mon, May 14, 2018 at 10:34 AM, Boris Brezillon wrote:
> On Mon, 14 May 2018 10:29:04 +0200 Geert Uytterhoeven wrote:
>> On Mon, May 14, 2018 at 10:12 AM, Boris Brezillon wrote:
>> > On Mon, 14 May 2018 10:00:30 +0200 Geert Uytterhoeven wrote:
>> >> On Tue, May 1, 2018 at 10:00 PM, Sasha Levin wrote:
>> >> > On Tue, May 01, 2018 at 03:44:50PM -0400, Theodore Y. Ts'o wrote:
>> >> >> On Tue, May 01, 2018 at 04:38:21PM +, Sasha Levin wrote:
>> >> > What's worse is that that commit is tagged for stable, which means
>> >> > that (given Greg's schedule) it may find its way to -stable users
>> >> > even before some -next users/bots had a chance to test it out.
>> >>
>> >> I just noticed a case where a commit was picked up for stable, while a
>> >> bot had flagged it as a build regression 18 hours earlier (with a CC to
>> >> lkml).
>> >
>> > Also, this patch has been on a tree that I know is tested by Fengguang's
>> > robots for more than a week (and in linux-next for 2 days, which, I
>> > agree, is probably not enough), and still, I only received the bug
>> > report when the patch reached mainline. Are there tests that are only
>> > run on Linus' tree?
>>
>> Have you received a success report from Fengguang's bot, listing all
>> configs tested (the broken one should be included; it is included in the
>> configs tested on my branches)?
>
> Yes I did (see below).
>
> -->8--
> From: kbuild test robot
> To: Boris Brezillon
> Subject: [bbrezillon-0day:mtd/fixes] BUILD SUCCESS fc3a9e15b492eef707afd56b7478001fdecfe53f
> Date: Mon, 07 May 2018 20:05:52 +0800
> User-Agent: Heirloom mailx 12.5 6/20/10
>
> tree/branch: https://github.com/bbrezillon/linux-0day mtd/fixes
> branch HEAD: fc3a9e15b492eef707afd56b7478001fdecfe53f  mtd: rawnand: Make sure we wait tWB before polling the STATUS reg
>
> elapsed time: 49m
>
> configs tested: 142

But the failed config (m68k/allmodconfig) is not listed?

Yes, that's my point. It seems that some configs are only rarely
(never?) tested on my linux-0day tree (probably because they take longer
to build), and I should only take kbuild robot results as an indication,
not a guarantee.

Yeah sorry, there is no 100% guarantee. There are 2 main aspects to this
problem.

- Response time vs coverage. Most build errors can be caught within 1
  day. The build success notification email is typically sent within
  half a day (a reasonable feedback time). At that point it can only be
  a rough indication, not a guarantee. After sending the 0day build
  success notification, the build tests will actually continue for about
  1 week to increase test coverage.

- Merge-test-bisect based workflow. If one branch is hard to merge with
  others, especially if it's based on an old kernel, it will receive
  much less test coverage. Branches with known build/boot errors will be
  excluded from further merges, too.

Thanks, Fengguang
Re: cross-compiling a 64-bit kernel on a 32-bit host
Hi Josh,

CC LKP team.

On Thu, May 10, 2018 at 05:36:19PM -0500, Josh Poimboeuf wrote:

> Hi Fengguang,
>
> I occasionally get compilation bug reports from people who are
> cross-compiling an x86-64 kernel target on an x86-32 host. Any chance
> the 0-day build bot could test that configuration? I think just
> building a defconfig would be sufficient. It would help sort out
> issues in objtool and other host-built scripts.

To do that we'll need to create a new build chroot. Julie/Philip should
be able to evaluate the effort and make a plan.

Thanks, Fengguang
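For context, the requested test boils down to something like running
"make ARCH=x86_64 CROSS_COMPILE=x86_64-linux-gnu- defconfig" followed by
a full build inside a 32-bit chroot; the exact cross-compiler prefix is
an assumption and depends on the toolchain installed in that chroot. The
point is that objtool and the other scripts under scripts/ then get
built and run as 32-bit host binaries.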
Re: [v2] mm: access to uninitialized struct page
Hi Pavel,

FYI here is 0day's bisect result. The attached dmesg has the reproduce
script at the bottom.

[27e2ce5dba4c30db031744c8140675d03d2ae7aa] mm: access to uninitialized struct page
git://git.cmpxchg.org/linux-mmotm.git devel-catchup-201805041701

git bisect start 53eff77ad4b0adaf1ca6e1ecc6acf3804c344531 6da6c0db5316275015e8cc2959f12a17584aeb64 --
git bisect bad  3fc24705ffb48c18b23ce2c229f5018d39b18ab0  # 20:22  B  2  Merge 'djwong-xfs/djwong-devel' into devel-catchup-201805041701
git bisect bad  78bd9ee71ffbbe5ab169cbe469503af2dfb913f9  # 20:35  B  2  Merge 'linux-review/Geert-Uytterhoeven/dt-bindings-can-rcar_can-Fix-R8A7796-SoC-name/20180504-154952' into devel-catchup-201805041701
git bisect bad  1deba87932c5d0adcffe63d8ce4847f39e864775  # 20:56  B  2  Merge 'yhuang/fix_thp_swap' into devel-catchup-201805041701
git bisect good e0997365e1e89e8bc9f5ed4a58a6cd2500b58668  # 21:09  G  2  0day base guard for 'devel-catchup-201805041701'
git bisect bad  98815bfa9156a8d1da1f1ca5f3748e250fa19a88  # 21:22  B  2  Merge 'yhuang/thp_delay_split3_r1a' into devel-catchup-201805041701
git bisect good 6da6c0db5316275015e8cc2959f12a17584aeb64  # 21:22  G  3  Linux v4.17-rc3
git bisect bad  97c561bb48a33e135a90573c596bc755ed4eab32  # 21:36  B  2  mm, pagemap: Hide swap entry for unprivileged users
git bisect bad  466b08a3a87e8e43af677375a4cf8eb105f50007  # 21:47  B  2  mm, swap: fix race between swapoff and some swap operations
git bisect bad  44ea77b7788384a7c27a960ec5752de08e47882a  # 21:58  B  2  zram-introduce-zram-memory-tracking-fix
git bisect bad  7a53abd52e920e8a2c16ea2cb81439e6e87a7ea4  # 22:06  B  2  prctl: add PR_[GS]ET_PDEATHSIG_PROC
git bisect good 1fda92fccc022924575edb98191f1ad2c0477c31  # 22:15  G  2  z3fold-fix-reclaim-lock-ups-checkpatch-fixes
git bisect bad  685cc80b632235416b72869247df7d6ae2816d61  # 22:30  B  2  mm: migrate: fix double call of radix_tree_replace_slot()
git bisect bad  27e2ce5dba4c30db031744c8140675d03d2ae7aa  # 22:55  B  2  mm: access to uninitialized struct page
git bisect good 7a0e68e17b8aa41aa33e8c80015e36d47dde390a  # 23:11  G  2  mm: sections are not offlined during memory hotremove
# extra tests on first bad commit
#  bad: [27e2ce5dba4c30db031744c8140675d03d2ae7aa] mm: access to uninitialized struct page
# extra tests on parent commit
# good: [7a0e68e17b8aa41aa33e8c80015e36d47dde390a] mm: sections are not offlined during memory hotremove

tests: 2

testcase/path_params/tbox_group/run: boot/1/vm-vp-quantal-x86_64

7a0e68e17b8aa41a  27e2ce5dba4c30db031744c814
----------------  --------------------------
       fail:runs     %reproduction    fail:runs
           |               |              |
          :2             100%            2:2     dmesg.BUG:kernel_reboot-without-warning_in_boot_stage

testcase/path_params/tbox_group/run: boot/1/vm-lkp-nex04-yocto-x86_64

7a0e68e17b8aa41a  27e2ce5dba4c30db031744c814
----------------  --------------------------
          :4             100%            4:4     dmesg.BUG:kernel_reboot-without-warning_in_boot_stage

Thanks, Fengguang

[dmesg.xz and kconfig attachments elided]
[async_page_fault] PANIC: double fault, error_code: 0x0
Hello,

FYI this happens in mainline kernel 4.17.0-rc3. It at least dates back
to v4.16 . It occurs in 2 out of 2 boots. It happens only with
CONFIG_IA32_EMULATION enabled.

[0.001000] Good, all 261 testcases passed! |
[0.001000] -
[0.001000] ACPI: Core revision 20180313
[0.001000] clocksource: hpet: mask: 0x max_cycles: 0x, max_idle_ns: 19112604467 ns
[0.001000] hpet clockevent registered
[0.001000] PANIC: double fault, error_code: 0x0
[0.001000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.17.0-rc3 #248
[0.001000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[0.001000] RIP: 0010:async_page_fault+0x3/0x30: async_page_fault at arch/x86/entry/entry_64.S:1163
[0.001000] RSP: :c900 EFLAGS: 00010082
[0.001000] RAX: f52e RBX: 0003 RCX: 82a00a20
[0.001000] RDX: dc00 RSI: 0003 RDI: 8342c368
[0.001000] RBP: c9f8 R08:  R09: 
[0.001000] R10:  R11:  R12: c9000158
[0.001000] R13: f5200048 R14: 8342bb80 R15: 
[0.001000] FS: () GS:c900() knlGS:
[0.001000] CS: 0010 DS: ES: CR0: 80050033
[0.001000] CR2: c8f8 CR3: 03424000 CR4: 06b0
[0.001000] Call Trace:
[0.001000] Code: 48 89 e7 48 8b 74 24 78 48 c7 44 24 78 ff ff ff ff e8 02 1b 6d fe e9 fd 01 00 00 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 08 01 00 00 48 89 e7 48 8b 74 24 78 48 c7 44 24 78 ff ff ff
[0.001000] Kernel panic - not syncing: Machine halted.
[0.001000] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.17.0-rc3 #248
[0.001000] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[0.001000] Call Trace:
[0.001000] <#DF>
[0.001000] dump_stack+0x162/0x221: dump_stack at lib/dump_stack.c:115
[0.001000] ? arch_local_irq_restore+0x44/0x44: rcu_read_lock at include/linux/rcupdate.h:629
		arch_local_irq_restore+0x44/0x44: cr4_set_bits at arch/x86/include/asm/tlbflush.h:264
		arch_local_irq_restore+0x44/0x44: dump_header at mm/oom_kill.c:423
		arch_local_irq_restore+0x44/0x44: dump_stack at lib/dump_stack.c:89
[0.001000] ? trace_hardirqs_off_caller+0x14f/0x350: trace_hardirqs_off_caller at kernel/locking/lockdep.c:2922
[0.001000] panic+0x1ca/0x380: panic at kernel/panic.c:195
[0.001000] ? refcount_error_report+0x290/0x290: panic at kernel/panic.c:136
[0.001000] df_debug+0x2d/0x30: df_debug at ??:?
[0.001000] do_double_fault+0xa0/0xc0: do_double_fault at arch/x86/kernel/traps.c:450 (discriminator 1)
[0.001000] double_fault+0x23/0x30: double_fault at arch/x86/entry/entry_64.S:994
[0.001000] RIP: 0010:async_page_fault+0x3/0x30: async_page_fault at arch/x86/entry/entry_64.S:1163
[0.001000] RSP: :c900 EFLAGS: 00010082
[0.001000] RAX: f52e RBX: 0003 RCX: 82a00a20
[0.001000] RDX: dc00 RSI: 0003 RDI: 8342c368
[0.001000] RBP: c9f8 R08:  R09: 
[0.001000] R10:  R11:  R12: c9000158
[0.001000] R13: f5200048 R14: 8342bb80 R15: 
[0.001000] ? restore_regs_and_return_to_kernel+0x2e/0x2e: native_irq_return_iret at arch/x86/entry/entry_64.S:752
[0.001000]

Attached the full dmesg, kconfig and reproduce scripts. Out of them,
there are 2 occurrences of "BUG: stack guard page was hit":

[1.717675] gfs2: path_lookup on rootfs returned error -2
[1.719152] mount (320) used greatest stack depth: 13960 bytes left
Configuring network interfaces... Kernel tests: Boot OK!
[ 12.877799] trinity-main uses obsolete (PF_INET,SOCK_PACKET)
[ 12.915462] BUG: stack g
Re: [per_cpu_ptr_to_phys] PANIC: early exception 0x0d IP 10:ffffffffa892f15f error 0 cr2 0xffff88001fbff000
Hi all,

On Wed, Apr 18, 2018 at 06:38:25PM -0500, Dennis Zhou wrote:
> Hi,
>
> On Wed, Apr 18, 2018 at 09:55:53PM +0800, Fengguang Wu wrote:
>> Hello,
>>
>> FYI here is a slightly different boot error in mainline kernel
>> 4.17.0-rc1. It also dates back to v4.16 .

Now I find 2 more occurrences in the v4.15 kernel. Here are the
statistics:

kernel      count  error-id
v4.15:          2  RIP:per_cpu_ptr_to_phys
v4.16:         12  RIP:per_cpu_ptr_to_phys
v4.16:          1  BUG:KASAN:null-ptr-deref-in-per_cpu_ptr_to_phys
v4.16-rc7:      2  RIP:per_cpu_ptr_to_phys
v4.17-rc1:    217  RIP:per_cpu_ptr_to_phys
v4.17-rc1:      5  BUG:KASAN:null-ptr-deref-in-per_cpu_ptr_to_phys
v4.17-rc2:     46  RIP:per_cpu_ptr_to_phys
v4.17-rc2:     15  BUG:KASAN:null-ptr-deref-in-per_cpu_ptr_to_phys
v4.17-rc3:     12  RIP:per_cpu_ptr_to_phys

>> It occurs in 4 out of 4 boots.

[0.00] Built 1 zonelists, mobility grouping on. Total pages: 128873
[0.00] Kernel command line: root=/dev/ram0 hung_task_panic=1 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 earlyprintk=ttyS0,115200 console=ttyS0,115200 vga=normal rw link=/kbuild-tests/run-queue/kvm/x86_64-randconfig-a0-04172313/linux-devel:devel-hourly-2018041714:60cc43fc888428bb2f18f08997432d426a243338/.vmlinuz-60cc43fc888428bb2f18f08997432d426a243338-20180418000325-19:yocto-lkp-nhm-dp2-4 branch=linux-devel/devel-hourly-2018041714 BOOT_IMAGE=/pkg/linux/x86_64-randconfig-a0-04172313/gcc-7/60cc43fc888428bb2f18f08997432d426a243338/vmlinuz-4.17.0-rc1 drbd.minor_count=8 rcuperf.shutdown=0
[0.00] sysrq: sysrq always enabled.
[0.00] Dentry cache hash table entries: 65536 (order: 7, 524288 bytes)
[0.00] Inode-cache hash table entries: 32768 (order: 6, 262144 bytes)
PANIC: early exception 0x0d IP 10:a892f15f error 0 cr2 0x88001fbff000
[0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GT 4.17.0-rc1 #238
[0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[0.00] RIP: 0010:per_cpu_ptr_to_phys+0x16a/0x298: __section_mem_map_addr at include/linux/mmzone.h:1188 (inlined by) per_cpu_ptr_to_phys at mm/percpu.c:1849
[0.00] RSP: :ab407e50 EFLAGS: 00010046 ORIG_RAX:
[0.00] RAX: dc00 RBX: 88001f17c340 RCX: 000f
[0.00] RDX:  RSI: 0001 RDI: acfbf580
[0.00] RBP: ab40d000 R08: fbfff57c4eca R09: 
[0.00] R10: 880015421000 R11: fbfff57c4ec9 R12: 
[0.00] R13: 88001fb03ff8 R14: 88001fc051c0 R15: 
[0.00] FS: () GS:ab4c5000() knlGS:
[0.00] CS: 0010 DS: ES: CR0: 80050033
[0.00] CR2: 88001fbff000 CR3: 1a06c000 CR4: 06b0
[0.00] Call Trace:
[0.00] setup_cpu_entry_areas+0x7b/0x27b: setup_cpu_entry_area at arch/x86/mm/cpu_entry_area.c:104 (inlined by) setup_cpu_entry_areas at arch/x86/mm/cpu_entry_area.c:177
[0.00] trap_init+0xb/0x13d: trap_init at arch/x86/kernel/traps.c:949
[0.00] start_kernel+0x2a5/0x91d: mm_init at init/main.c:519 (inlined by) start_kernel at init/main.c:589
[0.00] ? thread_stack_cache_init+0x6/0x6
[0.00] ? memcpy_orig+0x16/0x110: memcpy_orig at arch/x86/lib/memcpy_64.S:77
[0.00] ? x86_family+0x5/0x1d: x86_family at arch/x86/lib/cpu.c:8
[0.00] ? load_ucode_bsp+0x42/0x13e: load_ucode_bsp at arch/x86/kernel/cpu/microcode/core.c:183
[0.00] secondary_startup_64+0xa5/0xb0: secondary_startup_64 at arch/x86/kernel/head_64.S:242
[0.00] Code: 78 06 00 49 8b 45 00 48 85 c0 74 a5 49 c1 ec 28 41 81 e4 e0 0f 00 00 49 01 c4 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 74 08 4c 89 e7 e8 63 78 06 00 49 8b 04 24 81 e5 ff
BUG: kernel hang in boot stage

> I spent some time bisecting this one and it seems to be an intermittent
> issue starting with this commit for me: c9e97a1997, mm: initialize
> pages on demand during boot
Re: [crng_reseed] WARNING: inconsistent lock state
On Sun, Apr 29, 2018 at 03:07:06AM +, Linus Torvalds wrote:
> On Sat, Apr 28, 2018 at 7:26 PM Fengguang Wu wrote:
>> FYI this happens in mainline kernel 4.17.0-rc2. It looks like a new
>> regression. It occurs in 3 out of 3 boots.
>>
>> There is another "[ 294.642506] BUG: sleeping function called from
>> invalid context at mm/slab.h:421" at the bottom of this long dmesg:
>
> This should be fixed by commit 6c1e851c4edc ("random: fix possible
> sleeping allocation from irq context").

Yes, that fixes the bug. Sorry for the late report!

Regards, Fengguang
Re: [llc_ui_release] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
On Sun, Apr 29, 2018 at 03:30:48AM +, Linus Torvalds wrote:
> On Sat, Apr 28, 2018 at 7:12 PM Fengguang Wu wrote:
>> FYI this happens in mainline kernel 4.17.0-rc2. It looks like a new
>> regression. It occurs in 5 out of 5 boots.
>>
>> [main] 375 sockets created based on info from socket cachefile.
>> [main] Generating file descriptors
>> [main] Added 83 filenames from /dev
>> udevd[507]: failed to execute '/sbin/modprobe' '/sbin/modprobe -bv platform:regulatory': No such file or directory
>> [ 372.057947] caif:caif_disconnect_client(): nothing to disconnect
>> [ 372.082415] BUG: unable to handle kernel NULL pointer dereference at 0004
>
> I think this is fixed by commit 3a04ce7130a7 ("llc: fix NULL pointer
> deref for SOCK_ZAPPED")

Confirmed. Sorry for the late report!

Regards, Fengguang
Re: ed74ae0342 ("blk-mq: Avoid that a completion can be ignored .."): BUG: kernel hang in test stage
Hi Jens,

On Fri, Apr 27, 2018 at 06:52:58PM -0600, Jens Axboe wrote:
> On 4/24/18 3:00 PM, kernel test robot wrote:
>> Greetings,
>>
>> 0day kernel testing robot got the below dmesg and the first bad commit is
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-linus
>>
>> commit ed74ae03424684a6ad8a973c3fa727c6b4162432
>> Author: Bart Van Assche
>> AuthorDate: Thu Apr 19 09:43:53 2018 -0700
>> Commit: Jens Axboe
>> CommitDate: Thu Apr 19 14:21:47 2018 -0600
>>
>>     blk-mq: Avoid that a completion can be ignored for BLK_EH_RESET_TIMER
>
> Any chance you can try with the newer version?
>
> https://github.com/bvanassche/linux/commit/4acd555fa13087

That works!

Tested-by: Fengguang Wu
Re: Makefile:636: arch/score/Makefile: No such file or directory
CC Shun Hao.

On Thu, Apr 26, 2018 at 09:15:01AM +0200, Arnd Bergmann wrote:
> On Thu, Apr 26, 2018 at 4:23 AM, kbuild test robot wrote:
>> Hi Arnd,
>>
>> FYI, the error/warning still remains.
>>
>> tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
>> head: 26ed24e429d89e045e5eb2d2b75215244347b7f2
>> commit: b8c9c8f0190f4004d3d4364edb2dea5978dfc824 arch: remove score port
>> date: 7 weeks ago
>> config: score-spct6600_defconfig
>
> Please stop trying to build the removed architectures: tile, blackfin,
> metag, m32r, frv, mn10300, cris and score.
> https://lwn.net/Articles/748074/ has details about why this broke.

Sorry about that! It looks like the architecture removal was not
thorough enough on our side. Hao, would you help take a look?

> Note that we do have a few new architectures that you may want to add
> if you haven't done this already: nds32 and riscv were added this year,
> and a couple of older architectures that didn't have upstream gcc
> releases now have those, and I uploaded gcc binaries to
> https://cdn.kernel.org/pub/tools/crosstool/. This should allow building
> everything we have in the kernel now.
>
> Unfortunately the nds32 toolchain I uploaded is a little incomplete,
> but I have a gcc-6.4 build that (mostly) works and gcc-8 should be fine
> as well once it gets released.

OK, thanks for the tips! Philip may help plan adding the 2 new archs
nds32 and riscv.

Thanks, Fengguang
Re: [gcov_module_notifier] WARNING: CPU: 0 PID: 155 at mm/slab_common.c:996 kmalloc_slab+0x1f/0x79
Hi Peter,

On Fri, Apr 20, 2018 at 01:08:07PM +0200, Peter Oberparleiter wrote:
> On 19.04.2018 04:58, Fengguang Wu wrote:
>> FYI this happens in mainline kernel 4.17.0-rc1. It at least dates back
>> to v4.5 .
>
> This is likely the result of compiling the kernel with GCC 7 while
> specifying that gcov-kernel should expect GCC <= 3.4 format data:
>
> dmesg:
>   (gcc version 7.3.0 (Debian 7.3.0-1)) #31 Mon Apr 16 23:17:50 CST 2018
>
> .config:
>   # CONFIG_GCOV_FORMAT_AUTODETECT is not set
>   CONFIG_GCOV_FORMAT_3_4=y
>   # CONFIG_GCOV_FORMAT_4_7 is not set
>
> As such, it is an expected test failure. Assuming that this is a
> randconfig test, my recommendation would be to blacklist
> "CONFIG_GCOV_FORMAT_3_4=y" when using GCC >3.4.

Ah OK, thanks! That looks easy to fix on our side. Interestingly there
is the default GCOV_FORMAT_AUTODETECT option; however, the randconfig
tests will blindly try the other options whenever they are selectable.

Thanks, Fengguang
Re: [console_unlock] BUG: KASAN: use-after-scope in console_unlock+0x9cd/0xd10
On Thu, Apr 19, 2018 at 03:20:50PM +0900, Sergey Senozhatsky wrote:
> On (04/19/18 08:04), Dmitry Vyukov wrote:
> [..]
>> We could also make them mutually exclusive in config to prevent
>> people from hitting these false positives again and again.
>
> Let's do it. Ard and Kees agreed on making them mutually exclusive
> [1][2]. Dmitry, could you send out a patch?
>
> [1] lkml.kernel.org/r/cakv+gu8hn-t2om8scfjxcwbsgsir54fzw222dsed0xwqph2...@mail.gmail.com
> [2] lkml.kernel.org/r/CAGXu5j+mcfo4aB3PM1We6O62bFBJcMFX-9obJE4jFU1Dp=g...@mail.gmail.com

That'd be great, thank you very much!

Cheers, Fengguang
Re: [string_selftest_init] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:1]
On Thu, Apr 19, 2018 at 08:22:24AM +0200, Geert Uytterhoeven wrote:
> Hi Fengguang,
>
> On Thu, Apr 19, 2018 at 4:50 AM, Fengguang Wu wrote:
>> FYI this happens in mainline kernel 4.17.0-rc1. It dates back to
>> v4.14-rc1, perhaps since the test was introduced. It occurs in 1 out
>> of 1 boots.
>
> I guess the test just takes too long, causing a panic if
> BOOTPARAM_HUNG_TASK_PANIC=y?

That does the trick of saving test time. ;) Another option is to add
cond_resched(); would that be feasible?

Thanks, Fengguang

[  83.615461] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 246)
[  83.627616] io scheduler noop registered
[  83.634396] io scheduler deadline registered (default)
[  83.643012] io scheduler mq-deadline registered (default)
[  83.652509] io scheduler bfq registered
[ 108.580122] watchdog: BUG: soft lockup - CPU#0 stuck for 23s! [swapper/0:1]
[ 108.580122] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.17.0-rc1 #210
[ 108.580122] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 108.580122] RIP: 0010:check_kcov_mode+0x24/0x37: __read_once_size at include/linux/compiler.h:188 (discriminator 1) (inlined by) check_kcov_mode at kernel/kcov.c:69 (discriminator 1)
[ 108.580122] RSP: :88000fd43e48 EFLAGS: 0206 ORIG_RAX: ff13
[ 108.580122] RAX:  RBX: 88000fd3e000 RCX: 
[ 108.580122] RDX:  RSI: 88000fd3e000 RDI: 0002
[ 108.580122] RBP: 853b9f96 R08: 014000c0 R09: 
[ 108.580122] R10: 88000fd3e000 R11: 0002 R12: 88001a6c0458
[ 108.580122] R13: 0011 R14: 88001a6c0530 R15: a1a1a1a1
[ 108.580122] FS: () GS:88001f60() knlGS:
[ 108.580122] CS: 0010 DS: ES: CR0: 80050033
[ 108.580122] CR2:  CR3: 03c6a000 CR4: 06b0
[ 108.580122] Call Trace:
[ 108.580122] __sanitizer_cov_trace_pc+0x2b/0x66: __sanitizer_cov_trace_pc at kernel/kcov.c:101
[ 108.580122] string_selftest_init+0x25e/0x5c0: memset32_selftest at lib/test_string.c:60 (inlined by) string_selftest_init at lib/test_string.c:124
[ 108.580122] ? prandom_reseed+0x52/0x52: string_selftest_init at lib/test_string.c:115
[ 108.580122] do_one_initcall+0x18e/0x3df: do_one_initcall at init/main.c:883
[ 108.580122] ? check_kcov_mode+0x5/0x37: check_kcov_mode at kernel/kcov.c:60
[ 108.580122] ? __sanitizer_cov_trace_pc+0x2b/0x66: __sanitizer_cov_trace_pc at kernel/kcov.c:101
[ 108.580122] ? check_kcov_mode+0x5/0x37: check_kcov_mode at kernel/kcov.c:60
[ 108.580122] kernel_init_freeable+0x250/0x3a5: do_initcall_level at init/main.c:951 (inlined by) do_initcalls at init/main.c:959 (inlined by) do_basic_setup at init/main.c:977 (inlined by) kernel_init_freeable at init/main.c:1127
[ 108.580122] ? rest_init+0x13a/0x13a: kernel_init at init/main.c:1050
[ 108.580122] kernel_init+0x17/0x218: kernel_init at init/main.c:1053
[ 108.580122] ? rest_init+0x13a/0x13a: kernel_init at init/main.c:1050
[ 108.580122] ret_from_fork+0x35/0x40: ret_from_fork at arch/x86/entry/entry_64.S:418
[ 108.580122] Code: 41 5c 41 5d 41 5e c3 e8 e8 c6 a3 01 48 ff 05 71 33 fd 04 31 c0 65 8b 15 78 00 e5 7e 81 e2 00 01 1f 00 75 19 48 ff 05 61 33 fd 04 <8b> 86 18 1c 00 00 39 c7 0f 94 c0 48 ff 05 57 33 fd 04 c3 e8 b1
[ 108.580122] Kernel panic - not syncing: softlockup: hung tasks
[ 108.580122] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G L 4.17.0-rc1 #210
[ 108.580122] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[ 108.580122] Call Trace:
[ 108.580122]
[ 108.580122] dump_stack+0xf3/0x135: __dump_stack at lib/dump_stack.c:77 (inlined by) dump_stack at lib/dump_stack.c:113
[ 108.580122] panic+0x173/0x440: panic at kernel/panic.c:195
[ 108.580122] watchdog_timer_fn+0x343
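For reference, the cond_resched() option would amount to yielding inside
the selftest's long-running loops so the soft-lockup watchdog is never
starved. A sketch only, since the actual loop structure in
lib/test_string.c may differ:

	/* Sketch: keep the watchdog quiet during a long selftest. */
	int i, j;

	for (i = 0; i < 256; i++) {
		for (j = 0; j < 256; j++) {
			/* ... exercise one memset32()/memset64() pattern ... */
		}
		cond_resched();	/* give the scheduler a chance between rounds */
	}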
Re: WARNING: stack going in the wrong direction? ip=__schedule+0x489/0x830
On Thu, Apr 19, 2018 at 01:49:41PM +0800, Fengguang Wu wrote:
> Hello,
>
> FYI this warning dates back to v4.16-rc5 . It's rather rare and often
> happens together with other errors.

Sorry, I should have said that 0day didn't catch this particular WARNING
by itself -- it just occasionally shows up in the context of other
errors. I just added that WARNING pattern to 0day and hope we can get
more information about it.

Thanks, Fengguang
d17a1d97dc ("x86/mm/kasan: don't use vmemmap_populate() to initialize shadow"): [ 0.001000] BUG: KASAN: use-after-scope in console_unlock
On Thu, Apr 19, 2018 at 10:17:57AM +0800, Fengguang Wu wrote:
>Hello,
>
>FYI this happens in mainline kernel 4.17.0-rc1.
>It at least dates back to v4.15-rc1.
>
>The regression was reported before
>
>    https://lkml.org/lkml/2017/11/30/33
>
>Where the last message from Dmitry mentions that use-after-scope has
>known false positives with CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y.
>If so, what would be the best way to work around such false positives
>in boot testing? Disable the above config?
>
>0day bisects produce diverged results, with 2 of them converging to
>commit d17a1d97dc ("x86/mm/kasan: don't use vmemmap_populate() to
>initialize shadow") and 1 bisected to the earlier a4a3ede213 ("mm:
>zero reserved and unavailable struct pages"). I'll send the bisect
>reports in follow up emails.

Here is the bisect report for

commit d17a1d97dc208d664c91cc387ffb752c7f85dc61
Author:     Andrey Ryabinin
AuthorDate: Wed Nov 15 17:36:35 2017 -0800
Commit:     Linus Torvalds
CommitDate: Wed Nov 15 18:21:05 2017 -0800

    x86/mm/kasan: don't use vmemmap_populate() to initialize shadow

    The kasan shadow is currently mapped using vmemmap_populate() since
    that provides a semi-convenient way to map pages into init_top_pgt.
    However, since that no longer zeroes the mapped pages, it is not
    suitable for kasan, which requires zeroed shadow memory.

    Add kasan_populate_shadow() interface and use it instead of
    vmemmap_populate().  Besides, this allows us to take advantage of
    gigantic pages and use them to populate the shadow, which should
    save us some memory wasted on page tables and reduce TLB pressure.

    Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatas...@oracle.com
    Signed-off-by: Andrey Ryabinin
    Signed-off-by: Pavel Tatashin
    Cc: Steven Sistare
    Cc: Daniel Jordan
    Cc: Bob Picco
    Cc: Michal Hocko
    Cc: Alexander Potapenko
    Cc: Ard Biesheuvel
    Cc: Catalin Marinas
    Cc: Christian Borntraeger
    Cc: David S. Miller
    Cc: Dmitry Vyukov
    Cc: Heiko Carstens
    Cc: "H. Peter Anvin"
    Cc: Ingo Molnar
    Cc: Mark Rutland
    Cc: Matthew Wilcox
    Cc: Mel Gorman
    Cc: Michal Hocko
    Cc: Sam Ravnborg
    Cc: Thomas Gleixner
    Cc: Will Deacon
    Signed-off-by: Andrew Morton
    Signed-off-by: Linus Torvalds

a4a3ede213  mm: zero reserved and unavailable struct pages
d17a1d97dc  x86/mm/kasan: don't use vmemmap_populate() to initialize shadow
d6bbd51587  Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
73005e1a35  Add linux-next specific files for 20180103

+--------------------------------+------------+------------+------------+---------------+
|                                | a4a3ede213 | d17a1d97dc | d6bbd51587 | next-20180103 |
+--------------------------------+------------+------------+------------+---------------+
| boot_successes                 | 35         | 0          | 0          | 10            |
| boot_failures                  | 0          | 15         | 17         |               |
| BUG:KASAN:use-after-scope_in_c | 0          | 15         | 17         |               |
+--------------------------------+------------+------------+------------+---------------+

[0.004000] Tasks RCU enabled.
[0.004000] RCU: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2
[0.004000] NR_IRQS: 4352, nr_irqs: 440, preallocated irqs: 16
[0.004000] Offload RCU callbacks from CPUs: .
[0.004000] ==
[0.004000] BUG: KASAN: use-after-scope in console_unlock+0x516/0x7bf
[0.004000] Write of size 4 at addr af207aa0 by task swapper/0
[0.004000]
[0.004000] CPU: 0 PID: 0 Comm: swapper Not tainted 4.14.0-04319-gd17a1d9 #2
[0.004000] Call Trace:
[0.004000]  ? dump_stack+0xd1/0x178
[0.004000]  ? _atomic_dec_and_lock+0x11a/0x11a
[0.004000]  ? show_regs_print_info+0x51/0x51
[0.004000]  ? do_raw_spin_unlock+0x223/0x247
[0.004000]  ? print_address_description+0x94/0x2d9
[0.004000]  ? console_unlock+0x516/0x7bf
[0.004000]  ? kasan_report+0x21e/0x244
[0.004000]  ? console_unlock+0x516/0x7bf
[0.004000]  ? wake_up_klogd+0xe6/0xe6
[0.004000]  ? vprintk_emit+0x3ee/0x426
[0.004000]  ? __down_trylock_console_sem+0x5d/0x6c
[0.004000]  ? vprintk_emit+0x3f7/0x426
[0.004000]  ? console_unlock+0x7bf/0x7bf
[0.004000]  ? memblock_virt_alloc_try_nid+0xd9/0x107
[0.004000]  ? zero_pud_populate+0x7f1/0x8e8
[0.004000]  ? printk+0x8f/0xab
[0.004000]  ? show_regs_print_info+0x51/0x51
[0.004000]  ? native_flush_tlb_global+0x71/0x7d
[0.004000]  ? setup_arch+0x2427/0x2770
[0.004000]  ? reserve_standard_io_resources+0x83/0x83
[0.004000]  ? debug
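For readers unfamiliar with this bug class: "use-after-scope" means a pointer to a block-scoped variable is dereferenced after that variable's lifetime has ended. A minimal user-space illustration (not taken from the kernel report above) that the address sanitizer catches with the same check:

    /* uas.c -- minimal use-after-scope example.
     * Build: gcc -fsanitize=address -fsanitize-address-use-after-scope uas.c */
    #include <stdio.h>

    int main(void)
    {
            int *p;

            {
                    int x = 42;     /* x's lifetime ends with this block */
                    p = &x;         /* the pointer escapes the scope */
            }

            printf("%d\n", *p);     /* use-after-scope: ASan reports here */
            return 0;
    }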
c9e97a1997 BUG: kernel reboot-without-warning in early-boot stage, last printk: early console in setup code
On Wed, Apr 18, 2018 at 06:38:25PM -0500, Dennis Zhou wrote: >Hi, > >On Wed, Apr 18, 2018 at 09:55:53PM +0800, Fengguang Wu wrote: >> >> Hello, >> >> FYI here is a slightly different boot error in mainline kernel 4.17.0-rc1. >> It also dates back to v4.16 . >> >> It occurs in 4 out of 4 boots. >> >> [0.00] Built 1 zonelists, mobility grouping on. Total pages: 128873 >> [0.00] Kernel command line: root=/dev/ram0 hung_task_panic=1 debug >> apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 >> net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 >> nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 >> drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 >> earlyprintk=ttyS0,115200 console=ttyS0,115200 vga=normal rw >> link=/kbuild-tests/run-queue/kvm/x86_64-randconfig-a0-04172313/linux-devel:devel-hourly-2018041714:60cc43fc888428bb2f18f08997432d426a243338/.vmlinuz-60cc43fc888428bb2f18f08997432d426a243338-20180418000325-19:yocto-lkp-nhm-dp2-4 >> branch=linux-devel/devel-hourly-2018041714 >> BOOT_IMAGE=/pkg/linux/x86_64-randconfig-a0-04172313/gcc-7/60cc43fc888428bb2f18f08997432d426a243338/vmlinuz-4.17.0-rc1 >> drbd.minor_count=8 rcuperf.shutdown=0 >> [0.00] sysrq: sysrq always enabled. >> [0.00] Dentry cache hash table entries: 65536 (order: 7, 524288 >> bytes) >> [0.00] Inode-cache hash table entries: 32768 (order: 6, 262144 bytes) >> PANIC: early exception 0x0d IP 10:a892f15f error 0 cr2 >> 0x88001fbff000 >> [0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GT >> 4.17.0-rc1 #238 >> [0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS >> 1.10.2-1 04/01/2014 >> [0.00] RIP: 0010:per_cpu_ptr_to_phys+0x16a/0x298: >> __section_mem_map_addr at >> include/linux/mmzone.h:1188 >> (inlined by) >> per_cpu_ptr_to_phys at mm/percpu.c:1849 >> [0.00] RSP: :ab407e50 EFLAGS: 00010046 ORIG_RAX: >> >> [0.00] RAX: dc00 RBX: 88001f17c340 RCX: >> 000f >> [0.00] RDX: RSI: 0001 RDI: >> acfbf580 >> [0.00] RBP: ab40d000 R08: fbfff57c4eca R09: >> >> [0.00] R10: 880015421000 R11: fbfff57c4ec9 R12: >> >> [0.00] R13: 88001fb03ff8 R14: 88001fc051c0 R15: >> >> [0.00] FS: () GS:ab4c5000() >> knlGS: >> [0.00] CS: 0010 DS: ES: CR0: 80050033 >> [0.00] CR2: 88001fbff000 CR3: 1a06c000 CR4: >> 06b0 >> [0.00] Call Trace: >> [0.00] setup_cpu_entry_areas+0x7b/0x27b: >> setup_cpu_entry_area at >> arch/x86/mm/cpu_entry_area.c:104 >> (inlined by) >> setup_cpu_entry_areas at arch/x86/mm/cpu_entry_area.c:177 >> [0.00] trap_init+0xb/0x13d: >> trap_init at >> arch/x86/kernel/traps.c:949 >> [0.00] start_kernel+0x2a5/0x91d: >> mm_init at init/main.c:519 >> (inlined by) start_kernel at >> init/main.c:589 >> [0.00] ? thread_stack_cache_init+0x6/0x6 >> [0.00] ? memcpy_orig+0x16/0x110: >> memcpy_orig at >> arch/x86/lib/memcpy_64.S:77 >> [0.00] ? x86_family+0x5/0x1d: >> x86_family at >> arch/x86/lib/cpu.c:8 >> [0.00] ? load_ucode_bsp+0x42/0x13e: >> load_ucode_bsp at >> arch/x86/kernel/cpu/microcode/core.c:183 >> [0.00] secondary_startup_64+0xa5/0xb0: >> secondary_startup_64 at >> arch/x86/kernel/head_64.S:242 >> [0.00] Code: 78 06 00 49 8b 45 00 48 85 c0 74 a5 49 c1 ec 28 41 81 >> e4 e0 0f 00 00 49 01 c4 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 >> <80> 3c 02 00 74 08 4c 89 e7 e8 63 78 06 00 49 8b 04 24 81 e5 ff >> BUG: kernel hang in boot stage >> > >I spent some time bisecting this one and it seemse to be an intermittent >issue starting with this commit for
Re: [cfs_trace_lock_tcd] BUG: unable to handle kernel NULL pointer dereference at 00000050
Hi James,

On Wed, Apr 18, 2018 at 02:59:15PM +0100, James Simmons wrote:

>> Hello,
>>
>> FYI this happens in mainline kernel 4.17.0-rc1.
>> It looks like a new regression.
>>
>> [7.587002] lnet_selftest_init+0x2c4/0x5d9:
>>     lnet_selftest_init at drivers/staging/lustre/lnet/selftest/module.c:134
>> [7.587002] ? lnet_selftest_exit+0x8d/0x8d:
>>     lnet_selftest_init at drivers/staging/lustre/lnet/selftest/module.c:90
>
> Are you running lnet selftest ?

Perhaps yes -- it's a randconfig boot test and the .config does include
CONFIG_LNET_SELFTEST:

    CONFIG_LNET=y
    CONFIG_LNET_MAX_PAYLOAD=1048576
==> CONFIG_LNET_SELFTEST=y
    CONFIG_LNET_XPRT_IB=y

> Is this a UMP setup?

Yes, .config has:

    # CONFIG_SMP is not set

> The reason I ask is that there is a SMP handling bug in lnet selftest.
> If you look at the mailing list I pushed a SMP patch series. Can you
> try that series and tell me if it works for you.

So it looks like your fixup patch is not for this case? Anyway the
reproduce-* script attached in the previous email should be fairly
straightforward to try out for reproducing the bug.

Thanks,
Fengguang
[__cpa_process_fault] CPA: called for zero pte.
Hello, FYI this happens in mainline kernel 4.17.0-rc1. It's a new regression. [0.00] Memory: 210548K/523752K available (16392K kernel code, 2355K rwdata, 5536K rodata, 2996K init, 21012K bss, 66128K reserved, 0K cma-reserved) [0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=1, Nodes=1 [0.00] Kernel/User page tables isolation: enabled [0.00] [ cut here ] [0.00] CPA: called for zero pte. vaddr = b240 cpa->vaddr = b240 [0.00] WARNING: CPU: 0 PID: 0 at arch/x86/mm/pageattr.c:1189 __cpa_process_fault+0x572/0x5a0: __cpa_process_fault at arch/x86/mm/pageattr.c:1187 (discriminator 1) [0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GT 4.17.0-rc1 #304 [0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [0.00] RIP: 0010:__cpa_process_fault+0x572/0x5a0: __cpa_process_fault at arch/x86/mm/pageattr.c:1187 (discriminator 1) [0.00] RSP: :b0a03c40 EFLAGS: 00010086 [0.00] RAX: RBX: b220 RCX: 0002 [0.00] RDX: RSI: 0002 RDI: 0046 [0.00] RBP: b0a03ca0 R08: 0001 R09: 0001 [0.00] R10: b0a884f8 R11: R12: b0a03df0 [0.00] R13: 00e3 R14: 8800 R15: b240 [0.00] FS: () GS:88001f80() knlGS: [0.00] CS: 0010 DS: ES: CR0: 80050033 [0.00] CR2: 88000bbe6000 CR3: 0a26a001 CR4: 000606b0 [0.00] DR0: DR1: DR2: [0.00] DR3: DR6: fffe0ff0 DR7: 0400 [0.00] Call Trace: [0.00] __change_page_attr_set_clr+0x1a0/0xde0: __change_page_attr at arch/x86/mm/pageattr.c:1218 (inlined by) __change_page_attr_set_clr at arch/x86/mm/pageattr.c:1374 [0.00] ? lock_release+0x350/0x380: lock_release at kernel/locking/lockdep.c:3943 [0.00] change_page_attr_set_clr+0x17c/0x3f0: change_page_attr_set_clr at arch/x86/mm/pageattr.c:1475 [0.00] ? 0xaf20 [0.00] set_memory_nonglobal+0x24/0x30: set_memory_nonglobal at arch/x86/mm/pageattr.c:1765 [0.00] pti_set_kernel_image_nonglobal+0x73/0x80: pti_set_kernel_image_nonglobal at arch/x86/mm/pti.c:464 [0.00] pti_init+0x4b/0x1fb: pti_clone_entry_text at arch/x86/mm/pti.c:385 (inlined by) pti_init at arch/x86/mm/pti.c:481 [0.00] start_kernel+0x2ee/0x4da: start_kernel at init/main.c:601 [0.00] x86_64_start_reservations+0x2a/0x2c: x86_64_start_reservations at arch/x86/kernel/head64.c:446 [0.00] x86_64_start_kernel+0x77/0x7a: x86_64_start_kernel at arch/x86/kernel/head64.c:427 [0.00] secondary_startup_64+0xa5/0xb0: secondary_startup_64 at arch/x86/kernel/head_64.S:242 [0.00] Code: 48 89 f7 31 db e8 5f 44 00 00 48 c1 e8 0c 49 89 44 24 30 eb 24 49 8b 04 24 4c 89 fe 48 c7 c7 f8 e8 80 b0 48 8b 10 e8 0e d9 07 00 <0f> 0b bb f2 ff ff ff eb 05 bb ff ff ff ff 48 83 c4 38 89 d8 5b [0.00] ---[ end trace 9c220c3d1fdf6e76 ]--- [0.00] [ cut here ] [0.00] [ cut here ] [0.00] kernel BUG at arch/x86/mm/pageattr.c:175! [0.00] invalid opcode: [#1] SMP PTI [0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GW T 4.17.0-rc1 #304 [0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [0.00] RIP: 0010:cpa_flush_all+0x10/0x30: cpa_flush_all at arch/x86/mm/pageattr.c:175 [0.00] RSP: :b0a03db0 EFLAGS: 00010046 [0.00] RAX: 0082 RBX: RCX: fff2 [0.00] RDX: RSI: RDI: [0.00] RBP: b0a03db0 R08: 0001 R09: [0.00] R10: 0001 R11: R12: b0a03e80 [0.00] R13: 3400 R14: R15: [
[per_cpu_ptr_to_phys] PANIC: early exception 0x0d IP 10:ffffffffa892f15f error 0 cr2 0xffff88001fbff000
Hello, FYI here is a slightly different boot error in mainline kernel 4.17.0-rc1. It also dates back to v4.16 . It occurs in 4 out of 4 boots. [0.00] Built 1 zonelists, mobility grouping on. Total pages: 128873 [0.00] Kernel command line: root=/dev/ram0 hung_task_panic=1 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 earlyprintk=ttyS0,115200 console=ttyS0,115200 vga=normal rw link=/kbuild-tests/run-queue/kvm/x86_64-randconfig-a0-04172313/linux-devel:devel-hourly-2018041714:60cc43fc888428bb2f18f08997432d426a243338/.vmlinuz-60cc43fc888428bb2f18f08997432d426a243338-20180418000325-19:yocto-lkp-nhm-dp2-4 branch=linux-devel/devel-hourly-2018041714 BOOT_IMAGE=/pkg/linux/x86_64-randconfig-a0-04172313/gcc-7/60cc43fc888428bb2f18f08997432d426a243338/vmlinuz-4.17.0-rc1 drbd.minor_count=8 rcuperf.shutdown=0 [0.00] sysrq: sysrq always enabled. [0.00] Dentry cache hash table entries: 65536 (order: 7, 524288 bytes) [0.00] Inode-cache hash table entries: 32768 (order: 6, 262144 bytes) PANIC: early exception 0x0d IP 10:a892f15f error 0 cr2 0x88001fbff000 [0.00] CPU: 0 PID: 0 Comm: swapper Tainted: GT 4.17.0-rc1 #238 [0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [0.00] RIP: 0010:per_cpu_ptr_to_phys+0x16a/0x298: __section_mem_map_addr at include/linux/mmzone.h:1188 (inlined by) per_cpu_ptr_to_phys at mm/percpu.c:1849 [0.00] RSP: :ab407e50 EFLAGS: 00010046 ORIG_RAX: [0.00] RAX: dc00 RBX: 88001f17c340 RCX: 000f [0.00] RDX: RSI: 0001 RDI: acfbf580 [0.00] RBP: ab40d000 R08: fbfff57c4eca R09: [0.00] R10: 880015421000 R11: fbfff57c4ec9 R12: [0.00] R13: 88001fb03ff8 R14: 88001fc051c0 R15: [0.00] FS: () GS:ab4c5000() knlGS: [0.00] CS: 0010 DS: ES: CR0: 80050033 [0.00] CR2: 88001fbff000 CR3: 1a06c000 CR4: 06b0 [0.00] Call Trace: [0.00] setup_cpu_entry_areas+0x7b/0x27b: setup_cpu_entry_area at arch/x86/mm/cpu_entry_area.c:104 (inlined by) setup_cpu_entry_areas at arch/x86/mm/cpu_entry_area.c:177 [0.00] trap_init+0xb/0x13d: trap_init at arch/x86/kernel/traps.c:949 [0.00] start_kernel+0x2a5/0x91d: mm_init at init/main.c:519 (inlined by) start_kernel at init/main.c:589 [0.00] ? thread_stack_cache_init+0x6/0x6 [0.00] ? memcpy_orig+0x16/0x110: memcpy_orig at arch/x86/lib/memcpy_64.S:77 [0.00] ? x86_family+0x5/0x1d: x86_family at arch/x86/lib/cpu.c:8 [0.00] ? load_ucode_bsp+0x42/0x13e: load_ucode_bsp at arch/x86/kernel/cpu/microcode/core.c:183 [0.00] secondary_startup_64+0xa5/0xb0: secondary_startup_64 at arch/x86/kernel/head_64.S:242 [0.00] Code: 78 06 00 49 8b 45 00 48 85 c0 74 a5 49 c1 ec 28 41 81 e4 e0 0f 00 00 49 01 c4 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 74 08 4c 89 e7 e8 63 78 06 00 49 8b 04 24 81 e5 ff BUG: kernel hang in boot stage Attached the full dmesg, kconfig and reproduce scripts. Thanks, Fengguang early console in setup code early console in extract_kernel input_data: 0x04cf62b6 input_len: 0x01686ad3 output: 0x0100 output_len: 0x0448ca6c kernel_total_size: 0x053b1000 trampoline_32bit: 0x0009d000 booted via startup_32() Physical KASLR using RDTSC... Virtual KASLR using RDTSC... Decompressing Linux... Parsing ELF... Performing relocations... done. Booting the kernel. 
[0.00] Linux version 4.17.0-rc1 (kbuild@athens) (gcc version 7.3.0 (Debian 7.3.0-1)) #238 Tue Apr 17 23:21:37 CST 2018 [0.00] Command line: root=/dev/ram0 hung_task_panic=1 debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 earlyprintk=ttyS0
[per_cpu_ptr_to_phys] BUG: unable to handle kernel NULL pointer dereference at 0000000000000000
Hello, FYI this happens in mainline kernel 4.17.0-rc1. It at least dates back to v4.16 . It occurs in 2 out of 6 boots. [0.00] Built 1 zonelists, mobility grouping on. Total pages: 128869 [0.00] Kernel command line: ip=vm-intel12-yocto-x86_64-2::dhcp root=/dev/ram0 user=lkp job=/lkp/scheduled/vm-intel12-yocto-x86_64-2/boot-1-yocto-minimal-x86_64-2016-04-22.cgz-60cc43fc888428bb2f18f08997432d426a243338-20180418-63270-jrkd3n-1.yaml ARCH=x86_64 kconfig=x86_64-randconfig-s0-04090505 branch=linux-devel/devel-spot-201804082042 commit=60cc43fc888428bb2f18f08997432d426a243338 BOOT_IMAGE=/pkg/linux/x86_64-randconfig-s0-04090505/gcc-6/60cc43fc888428bb2f18f08997432d426a243338/vmlinuz-4.17.0-rc1 max_uptime=600 RESULT_ROOT=/result/boot/1/vm-intel12-yocto-x86_64/yocto-minimal-x86_64-2016-04-22.cgz/x86_64-randconfig-s0-04090505/gcc-6/60cc43fc888428bb2f18f08997432d426a243338/2 LKP_SERVER=inn debug apic=debug sysrq_always_enabled rcupdate.rcu_cpu_stall_timeout=100 net.ifnames=0 printk.devkmsg=on panic=-1 softlockup_panic=1 nmi_watchdog=panic oops=panic load_ramdisk=2 prompt_ramdisk=0 drbd.minor_count=8 systemd.log_level=err ignore_loglevel console=tty0 earlyprintk=ttyS0,115200 co [0.00] sysrq: sysrq always enabled. [0.00] Dentry cache hash table entries: 65536 (order: 7, 524288 bytes) [0.00] Inode-cache hash table entries: 32768 (order: 6, 262144 bytes) [0.00] BUG: unable to handle kernel NULL pointer dereference at [0.00] PGD 0 P4D 0 [0.00] Oops: [#1] PREEMPT DEBUG_PAGEALLOC [0.00] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0-rc1 #1 [0.00] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [0.00] RIP: 0010:per_cpu_ptr_to_phys+0xf1/0x14e: __section_mem_map_addr at include/linux/mmzone.h:1188 (inlined by) per_cpu_ptr_to_phys at mm/percpu.c:1849 [0.00] RSP: :87803ea0 EFLAGS: 00010046 [0.00] RAX: 88001f00c240 RBX: 87809000 RCX: [0.00] RDX: RSI: 88001ffd2000 RDI: [0.00] RBP: 87803ea8 R08: R09: 88001f8c4000 [0.00] R10: b080 R11: 8868746c R12: 8161 [0.00] R13: 87a70940 R14: 87a772e0 R15: [0.00] FS: () GS:8783d000() knlGS: [0.00] CS: 0010 DS: ES: CR0: 80050033 [0.00] CR2: CR3: 1c21a000 CR4: 06b0 [0.00] Call Trace: [0.00] setup_cpu_entry_areas+0x35/0x15e: setup_cpu_entry_area at arch/x86/mm/cpu_entry_area.c:104 (inlined by) setup_cpu_entry_areas at arch/x86/mm/cpu_entry_area.c:177 [0.00] trap_init+0x9/0x6e: trap_init at arch/x86/kernel/traps.c:949 [0.00] start_kernel+0x241/0x54b: mm_init at init/main.c:519 (inlined by) start_kernel at init/main.c:589 [0.00] x86_64_start_reservations+0x2a/0x2c: x86_64_start_reservations at arch/x86/kernel/head64.c:446 [0.00] x86_64_start_kernel+0x76/0x79: x86_64_start_kernel at arch/x86/kernel/head64.c:427 [0.00] secondary_startup_64+0xa5/0xb0: secondary_startup_64 at arch/x86/kernel/head_64.S:242 [0.00] Code: c1 e1 05 48 01 ca 48 8b 12 eb 23 48 8b 10 48 89 d1 48 c1 e9 34 48 8b 0c ce 48 85 c9 74 0d 48 c1 ea 28 81 e2 e0 0f 00 00 48 01 d1 <48> 8b 11 48 83 e2 f8 81 e3 ff 0f 00 00 48 29 d0 48 c1 f8 06 48 [0.00] RIP: per_cpu_ptr_to_phys+0xf1/0x14e: __section_mem_map_addr at include/linux/mmzone.h:1188 (inlined by) per_cpu_ptr_to_phys at mm/percpu.c:1849 RSP: 87803ea0 [0.00] CR2: [0.00] ---[ end trace 142a0423c71f6258 ]--- [0.00] Kernel panic - not syncing: Fatal exception Attached the full dmesg, kconfig and reproduce scripts. 
Thanks, Fengguang early console in setup code early console in extract_kernel input_data: 0x030ab2b6 input_len: 0x00654010 output: 0x0100 output_len: 0x01bac4d4 kernel_total_size: 0x02733000 trampoline_32bit: 0x0009d000 booted via startup_32() Physical KASLR using RDTSC... Virtual KASLR using RDTSC... Decompressing Linux... Parsing ELF... Performing relocations... done. Booting the kernel. [0.00] Linux versio
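The symbol+offset annotations in these logs (e.g. per_cpu_ptr_to_phys+0xf1/0x14e resolved to mm/percpu.c:1849) can be reproduced by hand with the kernel's scripts/faddr2line, given the vmlinux that produced the oops and a build with debug info:

    # run from the kernel build tree; vmlinux must carry DEBUG_INFO
    ./scripts/faddr2line vmlinux per_cpu_ptr_to_phys+0xf1/0x14e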
[cfs_trace_lock_tcd] BUG: unable to handle kernel NULL pointer dereference at 00000050
Hello, FYI this happens in mainline kernel 4.17.0-rc1. It looks like a new regression. It occurs in 5 out of 5 boots. [6.524361] ledtrig-cpu: registered to indicate activity on CPUs [6.527658] NET: Registered protocol family 4 [6.528191] comedi: version 0.7.76 - http://www.comedi.org [6.528851] LNetError: 1:0:(module.c:546:libcfs_init()) misc_register: error -16 [7.220272] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input3 [7.586283] BUG: unable to handle kernel NULL pointer dereference at 0050 [7.586962] *pdpt = *pde = f000ff53f000ff53 [7.587002] Oops: [#1] PREEMPT [7.587002] CPU: 0 PID: 1 Comm: swapper Not tainted 4.17.0-rc1 #1 [7.587002] EIP: cfs_trace_lock_tcd+0xb/0xa0: cfs_trace_lock_tcd at drivers/staging/lustre/lnet/libcfs/linux/linux-tracefile.c:149 [7.587002] EFLAGS: 00210246 CPU: 0 [7.587002] EAX: EBX: ECX: 81fcb588 EDX: [7.587002] ESI: 1800 EDI: 8f5d1e08 EBP: 8f5d1d7c ESP: 8f5d1d70 [7.587002] DS: 007b ES: 007b FS: GS: 00e0 SS: 0068 [7.587002] CR0: 80050033 CR2: 0050 CR3: 022f CR4: 06b0 [7.587002] Call Trace: [7.587002] libcfs_debug_vmsg2+0x8f/0x82f: libcfs_debug_vmsg2 at drivers/staging/lustre/lnet/libcfs/tracefile.c:317 [7.587002] ? trace_irq_enable_rcuidle+0x25/0x62: static_key_false at include/linux/jump_label.h:206 (inlined by) trace_irq_enable_rcuidle at include/trace/events/preemptirq.h:40 [7.587002] ? slob_free+0x249/0x251: slob_free at mm/slob.c:421 [7.587002] libcfs_debug_msg+0x19/0x1b: libcfs_debug_msg at drivers/staging/lustre/lnet/libcfs/tracefile.c:287 [7.587002] ksocknal_startup+0xe77/0x12b2: ksocknal_startup at drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c:2845 (discriminator 3) [7.587002] ? lock_release+0x135/0x1ec: lock_release at kernel/locking/lockdep.c:3942 [7.587002] ? _raw_spin_unlock+0x3c/0x4b: __raw_spin_unlock at include/linux/spinlock_api_smp.h:152 (inlined by) _raw_spin_unlock at kernel/locking/spinlock.c:176 [7.587002] lnet_startup_lndni+0x4cd/0x9ec: lnet_startup_lndni at drivers/staging/lustre/lnet/lnet/api-ni.c:1304 [7.587002] LNetNIInit+0x880/0xa00: lnet_startup_lndnis at drivers/staging/lustre/lnet/lnet/api-ni.c:1385 (inlined by) LNetNIInit at drivers/staging/lustre/lnet/lnet/api-ni.c:1543 [7.587002] ? read_seqcount_retry+0x1b/0x22: read_seqcount_retry at include/linux/seqlock.h:222 read_seqcount_retry+0x1b/0x22: read_seqcount_retry at include/linux/seqlock.h:222 read_seqcount_retry+0x1b/0x22: read_seqcount_retry at include/linux/seqlock.h:222 [7.587002] srpc_startup+0x84/0x381: srpc_startup at drivers/staging/lustre/lnet/selftest/rpc.c:1613 [7.587002] lnet_selftest_init+0x2c4/0x5d9: lnet_selftest_init at drivers/staging/lustre/lnet/selftest/module.c:134 [7.587002] ? lnet_selftest_exit+0x8d/0x8d: lnet_selftest_init at drivers/staging/lustre/lnet/selftest/module.c:90 [7.587002] do_one_initcall+0x76/0x1d7: __read_once_size at include/linux/compiler.h:188 (inlined by) arch_atomic_read at arch/x86/include/asm/atomic.h:31 (inlined by) atomic_read at include/asm-generic/atomic-instrumented.h:22 (inlined by) static_key_count at include/linux/jump_label.h:194 (inlined by) static_key_false at include/linux/jump_label.h:206 (inlined by) trace_initcall_finish at include/trace/events/initcall.h:44 (inlined by) do_one_initcall at init/main.c:884 [7.587002] ? do_early_param+0
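A fault address like 00000050 with a NULL base pointer is the classic signature of dereferencing a member at offset 0x50 inside a struct reached through a NULL pointer. A minimal illustration of why the fault address equals the member's offset; the struct and field names below are made up, not the actual lustre trace-data layout:

    /* Sketch: a NULL struct pointer faults at the accessed member's offset. */
    #include <stddef.h>

    struct trace_cpu_data {
            char    pad[0x50];      /* fields preceding the lock */
            int     lock;           /* lives at offset 0x50 */
    };

    int broken(struct trace_cpu_data *tcd)  /* called with tcd == NULL */
    {
            return tcd->lock;       /* reads address NULL + 0x50 */
    }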
Re: [lkp-robot] [bisect done] e71e836f46 [ 17.567570] WARNING: CPU: 0 PID: 1 at arch/x86/include/asm/pgtable.h:540 __change_page_attr_set_clr
Hi Dave,

On Tue, Apr 10, 2018 at 01:59:21PM -0700, Dave Hansen wrote:

> On 04/09/2018 06:12 PM, kernel test robot wrote:
>
> +------------------------------------------------------------------------+------------+------------+------------+------------+
> |                                                                        | 64c8075940 | e71e836f46 | 0564258fb2 | 87e1e2f51c |
> +------------------------------------------------------------------------+------------+------------+------------+------------+
> | boot_successes                                                         | 35         | 0          | 19         | 11         |
> | boot_failures                                                          | 0          | 26         |            |            |
> | WARNING:at_arch/x86/include/asm/pgtable.h:#__change_page_attr_set_clr  | 0          | 26         |            |            |
> | RIP:__change_page_attr_set_clr                                         | 0          | 26         |            |            |
> +------------------------------------------------------------------------+------------+------------+------------+------------+
>
> LKP folks, does this mean that the system didn't boot in all the places
> that we saw this warning? Or does this just say that it *had* the
> warning 26 times?

The 'e71e836f46' column means there are 0 boots with clean dmesg (sorry,
boot_successes might be a bit confusing) and 26 boots with warning/error
dmesg. It indicates that the WARNING is 100% reproducible.

    boot_successes + boot_failures = total test boots

> I looked into this a bit. This LKP report points the finger at this
> commit which trips over a new debugging WARN_ON() I added:
>
>     [patch 06/11] x86/mm: Remove extra filtering in pageattr code
>
> It's because set_memory_nx() encounters the (unsupported) _PAGE_GLOBAL
> bit while clearing _PAGE_NX. I never saw this in testing because this
> patch:
>
>     [PATCH 10/11] x86/pti: never implicitly clear _PAGE_GLOBAL for kernel image
>
> explicitly clears out _PAGE_GLOBAL long before the set_memory_nx()
> call. So I *think* this is a transient issue resulting in bad ordering
> of the _PAGE_GLOBAL patch set. I believe it is harmless.

Right. The '0564258fb2' and '87e1e2f51c' columns mean the 'tip/x86/pti'
and 'tip/master' branch HEADs boot clean. So it's a transient warning
inside the branch.

Thanks,
Fengguang
Re: [PATCH 05/45] C++: Set compilation as C++ for .c files
Hi David,

On Tue, Apr 10, 2018 at 09:44:14AM +0100, David Howells wrote:

> Hi Fengguang,
>
> There's another kbuild test that's probably worth adding if you don't
> do it already: create a c++ source file that just #includes all the
> exported UAPI headers and then try to build it. This should catch
> words like "private" cropping up in structs.

That's a good idea! And it looks straightforward to do. CC Philip
(current LKP maintainer) for possible plans.

Thanks,
Fengguang
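The test David describes can be sketched in a few lines. This is a rough illustration of the idea only, not 0-day's actual implementation; the header list and install path are assumptions:

    // uapi-compile-test.cc -- hedged sketch of a C++ UAPI header check.
    // After "make headers_install INSTALL_HDR_PATH=/tmp/hdr", compile with:
    //   g++ -c -I /tmp/hdr/include uapi-compile-test.cc
    // If any exported struct uses a C++ keyword (e.g. a field named
    // "private"), this translation unit fails to compile.
    #include <linux/types.h>
    #include <linux/input.h>
    #include <linux/fs.h>
    /* ... in a real test, generate one #include line per header
     * exported under usr/include/linux/ ... */

    int main() { return 0; }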
Re: [PATCH 05/45] C++: Set compilation as C++ for .c files
On Tue, Apr 03, 2018 at 02:16:50PM +0100, David Howells wrote:

> kbuild test robot wrote:
>
>>    scripts/Makefile.kasan:17: Cannot use CONFIG_KASAN: -fsanitize=kernel-address is not supported by compiler
>>    cc1: warning: command line option '-fno-rtti' is valid for C++/ObjC++ but not for C
>>    cc1: warning: command line option '-fpermissive' is valid for C++/ObjC++ but not for C
>
> It would need to use g++-8 to compile it ;-)

Awesome! :)

Cheers,
Fengguang
Re: [PATCH v3] ANDROID: binder: change down_write to down_read
On Fri, Mar 30, 2018 at 09:37:36AM +0900, Minchan Kim wrote:

> On Fri, Mar 30, 2018 at 07:42:37AM +0800, kbuild test robot wrote:
>
>> Hi Minchan,
>>
>> I love your patch! Yet something to improve:
>
> Glad to hear. It's first time someone loves my patch. ;-)

FYI, that message originates from Linus. :-)

>> [auto build test ERROR on staging/staging-testing]
>> [also build test ERROR on v4.16-rc7 next-20180329]
>> [if your patch is applied to the wrong git tree, please drop us a note to help improve the system]
>>
>> url: https://github.com/0day-ci/linux/commits/Minchan-Kim/ANDROID-binder-change-down_write-to-down_read/20180330-043057
>> config: x86_64-randconfig-x014-201812 (attached as .config)
>> compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
>> reproduce:
>>     # save the attached .config to linux build tree
>>     make ARCH=x86_64
>>
>> All errors (new ones prefixed by >>):
>>
>>    drivers//android/binder.c: In function 'binder_mmap':
>> >> drivers//android/binder.c:4725:24: error: 'struct vm_area_struct' has no member named 'flags'; did you mean 'vm_flags'?
>>      vma->vm_flags = (vma->flags | VM_DONTCOPY | VM_MIXEDMAP) &
>>                           ^
>>                           vm_flags
>
> http://lkml.kernel.org/r/<20180329065424.203172-1-minc...@kernel.org>
>
> This time, I was little bit fast. :)

Hurrah, quick hands! :)

Cheers,
Fengguang
Re: [asm-generic:asm-generic] BUILD SUCCESS 18fd258e74835b40a4f89f11ae933267866cc7af
Hi Arnd,

CC Shun, who maintains the kbuild test code now.

On Wed, Mar 14, 2018 at 09:00:59PM +0100, Arnd Bergmann wrote:

> On Wed, Mar 14, 2018 at 8:34 PM, kbuild test robot wrote:
>
>> tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git asm-generic
>> branch HEAD: 18fd258e74835b40a4f89f11ae933267866cc7af  staging: iio: remove iio-trig-bfin-timer driver
>> elapsed time: 192m
>> configs tested: 176
>>
>> The following configs have been built successfully.
>> More configs may be tested in the coming days.
>>
>> blackfin  BF526-EZBRD_defconfig
>> blackfin  BF533-EZKIT_defconfig
>> blackfin  BF561-EZKIT-SMP_defconfig
>> blackfin  TCM-BF537_defconfig
>> cris      etrax-100lx_v2_defconfig
>> mn10300   asb2364_defconfig
>> frv       defconfig
>> tile      tilegx_defconfig
>> m32r      m32104ut_defconfig
>> m32r      mappi3.smp_defconfig
>> m32r      opsput_defconfig
>> m32r      usrv_defconfig
>> score     spct6600_defconfig
>
> Hi Fengguang,
>
> Something odd is going on with the reporting here. I got individual
> reports for a number of the above that they failed, but the summary
> for the branch reports the builds as successful.

Yes that looks like a problem. However I don't see obvious clues by
looking at the log file. Perhaps we'll need some more verbose logging
in the hope we can catch the root cause next time it shows up.

> This branch contains the removal of blackfin, cris, mn10300, frv,
> tile, score, m32r, and metag, so none of the above should be marked
> as successful. You may want to prevent them from being built in the
> future, but it also seems the reporting needs to be checked.

Good to know that -- I just sent a patch to remove tests on those archs.

Thanks,
Fengguang
Re: [linux-stable-rc:linux-4.9.y 6083/6211] warning: __mcount_loc already exists: drivers/net/wireless/intel/iwlwifi/mvm/rx.o
On Sun, Mar 04, 2018 at 09:29:17AM +0100, Arnd Bergmann wrote:

> On Sun, Mar 4, 2018 at 7:21 AM, kbuild test robot wrote:
>
>> tree: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.9.y
>> head: 85afb4e51ccfdb10f2969b24439ae2887fe0fcff
>> commit: c0ecbd663fe6961c6ff181a211f090005858656f [6083/6211] scsi: advansys: fix uninitialized data access
>> config: i386-allmodconfig (attached as .config)
>> compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
>> reproduce:
>>     git checkout c0ecbd663fe6961c6ff181a211f090005858656f
>>     # save the attached .config to linux build tree
>>     make ARCH=i386
>>
>> All warnings (new ones prefixed by >>):
>>
>>    warning: __mcount_loc already exists: drivers/net/wireless/intel/iwlwifi/mvm/rx.o
>
> I'm pretty sure this is unrelated to my patch, but it's a recurring
> problem that we should get to the bottom of.

CC Shun for checking if there are problems with the bisect.

> Olof's autobuilder has reported the same thing for the 3.18-stable
> tree but not any others (for every single file in the kernel, not
> sure if this warning only shows up for all of them, or just one
> module).

Thanks,
Fengguang
Re: ERROR: "ia64_delay_loop" [drivers/spi/spi-thunderx.ko] undefined!
On Thu, Mar 01, 2018 at 04:25:45PM +0000, Robin Murphy wrote:

> On 01/03/18 03:18, kbuild test robot wrote:
>
>> Hi Robin,
>>
>> First bad commit (maybe != root cause):
>
> I have to admit I'm slightly intrigued how this commit could possibly
> appear relevant to the IA64 build at all, let alone to this error :/

Yeah it looks like a bisect error -- the commit merely changed an ARM
header file.

Thanks,
Fengguang

> tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
> head: 97ace515f01439d4cf6e898b4094040dc12d36e7
> commit: e1a50de37860b3a93a9d643b09638db5aff47650 arm64: cputype: Silence Sparse warnings
> date: 12 days ago
> config: ia64-allmodconfig (attached as .config)
> compiler: ia64-linux-gcc (GCC) 7.2.0
> reproduce:
>     wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>     chmod +x ~/bin/make.cross
>     git checkout e1a50de37860b3a93a9d643b09638db5aff47650
>     # save the attached .config to linux build tree
>     make.cross ARCH=ia64
>
> All errors (new ones prefixed by >>):
>
>    ERROR: "ia64_delay_loop" [drivers/spi/spi-thunderx.ko] undefined!
>    ERROR: "ia64_delay_loop" [drivers/net/phy/mdio-cavium.ko] undefined!
>
> ---
> 0-DAY kernel test infrastructure                Open Source Technology Center
> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Re: [RFC PATCH tip] x86/acpi: u64_x86_init_noop() can be static
On Wed, Feb 21, 2018 at 09:51:48AM +0100, Ingo Molnar wrote:

> * kbuild test robot wrote:
>
>> Fixes: 62d8b7fba8d3 ("x86/acpi: Add a new x86_init_acpi structure to x86_init_ops")
>> Signed-off-by: Fengguang Wu
>> ---
>>
>>  0 files changed
>
> -ENOPATCH?

Ah the script crashed when it tries to indent the new code, since it
assumes "func_name() { }" while here it's "func_name() { }". I'll fix
it up.

Thanks,
Fengguang
Re: Kconfig:12: can't open file "arch/powerpc64/Kconfig"
Hi Masahiro,

Sorry it looks like a regression in the robot. CC Shun for looking
into this.

Thanks,
Fengguang

On Sun, Feb 11, 2018 at 01:07:32PM +0900, Masahiro Yamada wrote:

> Hi test robot,
>
> 2018-02-11 12:41 GMT+09:00 kbuild test robot :
>
>> tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
>> head: d48fcbd864a008802a90c58a9ceddd9436d11a49
>> commit: 9e3e10c725360b9d07018cfcd5b7b6b7d325fae5 kconfig: send error messages to stderr
>> date: 2 days ago
>> config: powerpc64-defconfig
>> compiler: powerpc64-linux-gcc (GCC) 7.2.0
>> reproduce:
>>     wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>>     chmod +x ~/bin/make.cross
>>     git checkout 9e3e10c725360b9d07018cfcd5b7b6b7d325fae5
>>     make.cross ARCH=powerpc64 defconfig
>>     make.cross ARCH=powerpc64
>
> I think this test setting is weird.
>
> With the following error, it is pointless to test this.
>
>     Makefile:499: arch/powerpc64/Makefile: No such file or directory
>
> arch/powerpc64/ does not exist in the first place.
>
> If you really want to give ARCH=powerpc64, you need to add something
> like follows in the top Makefile (but I doubt this is the right thing
> to do)
>
>     ifeq ($(ARCH),powerpc64)
>     SRCARCH := powerpc
>     endif
>
> Could you check your test setting, please?
>
>> All errors (new ones prefixed by >>):
>>
>>    Makefile:499: arch/powerpc64/Makefile: No such file or directory
>>    make[1]: *** No rule to make target 'arch/powerpc64/Makefile'.
>>    make[1]: Failed to remake makefile 'arch/powerpc64/Makefile'.
>>    Kconfig:12: can't open file "arch/powerpc64/Kconfig"
>>    make[2]: *** [defconfig] Error 1
>>    make[1]: *** [defconfig] Error 2
>>    make: *** [sub-make] Error 2
>> --
>>    Makefile:499: arch/powerpc64/Makefile: No such file or directory
>>    make[1]: *** No rule to make target 'arch/powerpc64/Makefile'.
>>    make[1]: Failed to remake makefile 'arch/powerpc64/Makefile'.
>>    Kconfig:12: can't open file "arch/powerpc64/Kconfig"
>>    make[2]: *** [oldconfig] Error 1
>>    make[1]: *** [oldconfig] Error 2
>>    make: *** [sub-make] Error 2
>> --
>>    Makefile:499: arch/powerpc64/Makefile: No such file or directory
>>    make[1]: *** No rule to make target 'arch/powerpc64/Makefile'.
>>    make[1]: Failed to remake makefile 'arch/powerpc64/Makefile'.
>>    Kconfig:12: can't open file "arch/powerpc64/Kconfig"
>>    make[2]: *** [olddefconfig] Error 1
>>    make[2]: Target 'oldnoconfig' not remade because of errors.
>>    make[1]: *** [oldnoconfig] Error 2
>>    make: *** [sub-make] Error 2
>>
>> vim +12 Kconfig
>>
>> 838a2e55 Arnaud Lacombe 2010-09-04   7
>> 838a2e55 Arnaud Lacombe 2010-09-04   8  config SRCARCH
>> 838a2e55 Arnaud Lacombe 2010-09-04   9          string
>> 838a2e55 Arnaud Lacombe 2010-09-04  10          option env="SRCARCH"
>> 838a2e55 Arnaud Lacombe 2010-09-04  11
>> 838a2e55 Arnaud Lacombe 2010-09-04 @12  source "arch/$SRCARCH/Kconfig"
>>
>> :: The code at line 12 was first introduced by commit
>> :: 838a2e55e6a4e9e8a10451ed2ef0f7a08dabdb04 kbuild: migrate all arch to the kconfig mainmenu upgrade
>> :: TO: Arnaud Lacombe
>> :: CC: Arnaud Lacombe
>>
>> ---
>> 0-DAY kernel test infrastructure                Open Source Technology Center
>> https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
>
> --
> Best Regards
> Masahiro Yamada
Re: what trees/branches to test on syzbot
Hi Dmitry,

On Tue, Jan 16, 2018 at 10:58:51AM +0100, Dmitry Vyukov wrote:

> On Tue, Jan 16, 2018 at 10:45 AM, Guenter Roeck wrote:
>
>> On Mon, Jan 15, 2018 at 11:51 PM, Dmitry Vyukov wrote:
>>
>>> Hello,
>>>
>>> Several people proposed that linux-next should not be tested on
>>> syzbot. While some people suggested that it needs to test as many
>>> trees as possible. I've initially included linux-next as it is a
>>> staging area before the upstream tree, with the intention that
>>> patches are _tested_ there; if they are not tested there, bugs enter
>>> the upstream tree. And then it takes much longer to get a fix into
>>> other trees. So the question is: what trees/branches should be
>>> tested? Preferably in priority order as syzbot can't test all of
>>> them.
>>
>> I always thought that -next existed specifically to give people a
>> chance to test the code in it. Maybe the question is where to report
>> the test results ?
>
> FTR, from Guenter on another thread:
>
>> Interesting. Assuming that refers to linux-next, not linux-net, that
>> may explain why linux-next tends to deteriorate. I wonder if I
>> should drop it from my testing as well.
>
> I'll be happy to follow whatever the result of this exchange is and
> do the same. If we agree on some list of important branches, and what
> branches specifically should not be tested with automatic reporting,
> I think it will benefit everybody.
>
> +Fengguang, can you please share your list and rationale behind it?

0-day aims to aggressively test as many trees and branches as possible,
including various developer trees, maintainer trees, linux-next,
mainline and stable trees. Here is the complete list of 800+ trees we
monitor:

    https://git.kernel.org/pub/scm/linux/kernel/git/wfg/lkp-tests.git/tree/repo/linux

The rationale is obvious. IMHO what really matters here is capability
rather than rationale: that policy heavily relies on the fundamental
capability of auto bisecting. Once regressions are bisected, we know
the owners of the problem to auto send reports to, i.e. the first bad
commit's author and committer.

For the bugs that cannot be bisected, they tend to be old ones and we
report more often on the mainline tree than linux-next.

Thanks,
Fengguang
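The auto-bisect capability described above can be approximated with stock git. A rough sketch of the idea only; the script name and test command are hypothetical, not 0-day's actual tooling:

    # mark the known-bad and known-good endpoints, then let git drive
    git bisect start v4.17-rc1 v4.16
    git bisect run ./boot-test.sh
    # boot-test.sh (assumed helper) builds the kernel, boots it in QEMU,
    # and exits non-zero when the failure signature appears in the log;
    # "git bisect run" treats exit 0 as good and 1-124 as bad.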
Re: LKML admins (syzbot emails are not delivered)
On Tue, Jan 16, 2018 at 08:59:36AM +0100, Dmitry Vyukov wrote:

> On Tue, Jan 16, 2018 at 8:12 AM, Theodore Ts'o wrote:
>
>> On Mon, Jan 15, 2018 at 10:38:42AM -0600, Eric W. Biederman wrote:
>>
>>> Sometimes the branches on linux-next are experimental crap. If
>>> someone adds an experimental memory allocator to linux-next before
>>> discovering it causes all kinds of problems I don't want bug
>>> reports about my code not being able to allocate memory because the
>>> memory allocator was bad.
>>>
>>> If you don't have the resources to test the individual branches of
>>> linux-next please just test Linus's tree. That will be much more
>>> meaningful and productive.
>>
>> I have to agree with Eric here, the reason why Fengguang Wu's 0-day
>> testing robot is much better received by developers is that he does
>> not test linux-next, but rather individual subsystem git trees and
>> branches. His test automation also does an automatic bisection
>> search, and can point at a specific commit --- at which point e-mail
>> goes out to the owner of the subsystem git tree, and to the people
>> who authored and/or reviewed the guilty commit.
>>
>> Dmitry, perhaps you could collaborate with Intel's 0-day testing
>> folks? They have code which does all of this, and perhaps it can be
>> leveraged.
>
> +Fengguang
>
> Please note that in most cases 0-day solves an order of magnitude
> simpler problem. Build/sparse errors are much faster to find, always
> possible to precisely bisect and attribute. Yes, for that you just
> test every commit, bisect and send targeted emails. syzbot only finds
> runtime bugs, lots of them are related to races and can't be reliably
> reproduced, bisected, etc. Lots of them are old (e.g. predate KASAN
> that detects them). But they still can be fixed. In ~half of cases
> developers fix them looking only at the oops report. The last time I
> checked 0-day infrastructure was closed source.
>
> Fengguang, what do you do with trinity crashes that happen
> episodically, but you can't reliably reproduce, bisect and attribute?

0-day runs most trinity tests in QEMU machines, which can run massively
in parallel. Which means we can afford to bisect them by running up to
1000 boots in each bisect step. Ditto for KASAN errors.

Since Xiaolong (CCed) has enabled syzkaller in 0-day, it could in
theory utilize the same auto bisect infrastructure, if we can make sure
syzkaller runs effectively in the 0-day test farm.

Thanks,
Fengguang
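Bisecting a crash that reproduces only occasionally works by turning each bisect step into a statistical test. A hedged sketch of what "up to 1000 boots per step" might look like as a "git bisect run" step; the helper script names are assumptions:

    #!/bin/sh
    # bisect-step.sh: hypothetical bisect step for a flaky boot crash.
    # Boot the kernel repeatedly; declare the commit bad on the first
    # oops, good only if all boots come up clean.
    for i in $(seq 1 1000); do
            ./qemu-boot.sh > boot.log 2>&1          # assumed helper
            if grep -q 'BUG:\|Oops' boot.log; then
                    exit 1                          # bad commit
            fi
    done
    exit 0                                          # good commit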
arch/c6x/platforms/plldata.c:279:33: error: implicit declaration of function 'get_coreid'; did you mean 'get_order'?
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
head:   328b4ed93b69a6f2083d52f31a240a09e5de386a
commit: 71af2ed5eeea639339e3a1497a0196bab7de4b57 kasan, sched/headers: Remove <linux/kasan.h> from <linux/sched.h>
date:   9 months ago
config: c6x-evmc6472_defconfig (attached as .config)
compiler: c6x-elf-gcc (GCC) 7.2.0
reproduce:
    wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
    chmod +x ~/bin/make.cross
    git checkout 71af2ed5eeea639339e3a1497a0196bab7de4b57
    # save the attached .config to linux build tree
    make.cross ARCH=c6x

All errors (new ones prefixed by >>):

   arch/c6x/platforms/plldata.c: In function 'c6472_setup_clocks':
>> arch/c6x/platforms/plldata.c:279:33: error: implicit declaration of function 'get_coreid'; did you mean 'get_order'? [-Werror=implicit-function-declaration]
     c6x_core_clk.parent = &sysclks[get_coreid() + 1];
                                    ^~
                                    get_order
   cc1: some warnings being treated as errors

# https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=71af2ed5eeea639339e3a1497a0196bab7de4b57
git remote add linus https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
git remote update linus
git checkout 71af2ed5eeea639339e3a1497a0196bab7de4b57

vim +279 arch/c6x/platforms/plldata.c

81ec9889 Mark Salter 2011-10-04  257
81ec9889 Mark Salter 2011-10-04  258  static void __init c6472_setup_clocks(struct device_node *node)
81ec9889 Mark Salter 2011-10-04  259  {
81ec9889 Mark Salter 2011-10-04  260  	struct pll_data *pll = &c6x_soc_pll1;
81ec9889 Mark Salter 2011-10-04  261  	struct clk *sysclks = pll->sysclks;
81ec9889 Mark Salter 2011-10-04  262  	int i;
81ec9889 Mark Salter 2011-10-04  263
81ec9889 Mark Salter 2011-10-04  264  	pll->flags = PLL_HAS_MUL;
81ec9889 Mark Salter 2011-10-04  265
81ec9889 Mark Salter 2011-10-04  266  	for (i = 1; i <= 6; i++) {
81ec9889 Mark Salter 2011-10-04  267  		sysclks[i].flags |= FIXED_DIV_PLL;
81ec9889 Mark Salter 2011-10-04  268  		sysclks[i].div = 1;
81ec9889 Mark Salter 2011-10-04  269  	}
81ec9889 Mark Salter 2011-10-04  270
81ec9889 Mark Salter 2011-10-04  271  	sysclks[7].flags |= FIXED_DIV_PLL;
81ec9889 Mark Salter 2011-10-04  272  	sysclks[7].div = 3;
81ec9889 Mark Salter 2011-10-04  273  	sysclks[8].flags |= FIXED_DIV_PLL;
81ec9889 Mark Salter 2011-10-04  274  	sysclks[8].div = 6;
81ec9889 Mark Salter 2011-10-04  275  	sysclks[9].flags |= FIXED_DIV_PLL;
81ec9889 Mark Salter 2011-10-04  276  	sysclks[9].div = 2;
81ec9889 Mark Salter 2011-10-04  277  	sysclks[10].div = PLLDIV10;
81ec9889 Mark Salter 2011-10-04  278
81ec9889 Mark Salter 2011-10-04 @279  	c6x_core_clk.parent = &sysclks[get_coreid() + 1];
81ec9889 Mark Salter 2011-10-04  280  	c6x_i2c_clk.parent = &sysclks[8];
81ec9889 Mark Salter 2011-10-04  281  	c6x_watchdog_clk.parent = &sysclks[8];
81ec9889 Mark Salter 2011-10-04  282  	c6x_mdio_clk.parent = &sysclks[5];
81ec9889 Mark Salter 2011-10-04  283
81ec9889 Mark Salter 2011-10-04  284  	c6x_clks_init(c6472_clks);
81ec9889 Mark Salter 2011-10-04  285  }
81ec9889 Mark Salter 2011-10-04  286  #endif /* CONFIG_SOC_TMS320C6472 */
81ec9889 Mark Salter 2011-10-04  287
81ec9889 Mark Salter 2011-10-04  288

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Re: [linus:master] BUILD REGRESSION d1f854ac240ea3928a99294390048e9b2aa6fa0e
On Sat, Dec 23, 2017 at 07:33:37PM -0800, Linus Torvalds wrote:

> On Sat, Dec 23, 2017 at 4:28 PM, kbuild test robot wrote:
>
>> Regressions in current branch:
>
> This looks more like some odd compiler regression than a kernel one.

Yeah sorry. We probably should avoid reporting them here. I'll teach
the robot to filter them out.

Regards,
Fengguang
[PATCH kernel-tests] ignore compiler errors
> This looks more like some odd compiler regression than a kernel one.
>
>                                         Linus

The original report is:

To: Linus Torvalds
Cc: LKML
Subject: [linus:master] BUILD REGRESSION d1f854ac240ea3928a99294390048e9b2aa6fa0e

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
branch HEAD: d1f854ac240ea3928a99294390048e9b2aa6fa0e  Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Regressions in current branch:

arch/c6x/platforms/plldata.c:279:33: error: implicit declaration of function 'get_coreid'; did you mean 'get_order'? [-Werror=implicit-function-declaration]
drivers/tty/serial/8250/8250_core.c:1094:1: error: unrecognizable insn:
drivers/tty/serial/8250/8250_core.c:1094:1: internal compiler error: in extract_insn, at recog.c:2311
fs//xfs/xfs_ioctl.c:1624:1: internal compiler error: in change_address_1, at emit-rtl.c:2150
fs/xfs/xfs_ioctl.c:1629:1: internal compiler error: in change_address_1, at emit-rtl.c:2150
Please submit a full bug report,
{standard input}:1226: Error: displacement to undefined symbol .L329 overflows 12-bit field
{standard input}:1233: Error: displacement to undefined symbol .L331 overflows 12-bit field
{standard input}:1253: Error: displacement to undefined symbol .L359 overflows 12-bit field
{standard input}:1278: Error: displacement to undefined symbol .L360 overflows 12-bit field
{standard input}:1405: Error: displacement to undefined symbol .L255 overflows 12-bit field
{standard input}:1408: Error: invalid operands for opcode
{standard input}:1408: Error: missing operand
{standard input}:1453: Error: displacement to undefined symbol .L285 overflows 12-bit field
{standard input}:1457: Error: displacement to undefined symbol .L286 overflows 12-bit field
{standard input}:1467: Error: displacement to undefined symbol .L257 overflows 12-bit field
{standard input}:1893: Error: displacement to undefined symbol .L229 overflows 12-bit field
{standard input}:199: Error: unknown opcode
{standard input}:2013: Error: displacement to undefined symbol .L235 overflows 12-bit field
{standard input}:9613: Error: invalid operands for opcode
{standard input}:9613: Error: missing operand
{standard input}: Error: open CFI at the end of file; missing .cfi_endproc directive
verifier.c:(.text+0x31ec): undefined reference to `__multi3'

Error ids grouped by kconfigs:

recent_errors
├── c6x-evmc6472_defconfig
│   └── arch-c6x-platforms-plldata.c:error:implicit-declaration-of-function-get_coreid-did-you-mean-get_order
├── cris-allyesconfig
│   ├── drivers-tty-serial-8250_core.c:error:unrecognizable-insn:
│   └── drivers-tty-serial-8250_core.c:internal-compiler-error:in-extract_insn-at-recog.c
├── mips-64r6el_defconfig
│   └── verifier.c:(.text):undefined-reference-to-__multi3
├── sh-allyesconfig
│   ├── fs-xfs-xfs_ioctl.c:internal-compiler-error:in-change_address_1-at-emit-rtl.c
│   ├── Please-submit-a-full-bug-report
│   ├── standard-input:Error:displacement-to-undefined-symbol-.L229-overflows-bit-field
│   ├── standard-input:Error:displacement-to-undefined-symbol-.L235-overflows-bit-field
│   ├── standard-input:Error:invalid-operands-for-opcode
│   ├── standard-input:Error:missing-operand
│   └── standard-input:Error:open-CFI-at-the-end-of-file-missing-.cfi_endproc-directive
├── sh-j2_defconfig
│   └── standard-input:Error:unknown-opcode
├── sh-sdk7786_defconfig
│   ├── standard-input:Error:displacement-to-undefined-symbol-.L255-overflows-bit-field
│   ├── standard-input:Error:displacement-to-undefined-symbol-.L257-overflows-bit-field
│   ├── standard-input:Error:displacement-to-undefined-symbol-.L285-overflows-bit-field
│   └── standard-input:Error:displacement-to-undefined-symbol-.L286-overflows-bit-field
└── sh-titan_defconfig
    ├── fs-xfs-xfs_ioctl.c:internal-compiler-error:in-change_address_1-at-emit-rtl.c
    ├── Please-submit-a-full-bug-report
    ├── standard-input:Error:displacement-to-undefined-symbol-.L329-overflows-bit-field
    ├── standard-input:Error:displacement-to-undefined-symbol-.L331-overflows-bit-field
    ├── standard-input:Error:displacement-to-undefined-symbol-.L359-overflows-bit-field
    ├── standard-input:Error:displacement-to-undefined-symbol-.L360-overflows-bit-field
    ├── standard-input:Error:invalid-operands-for-opcode
    └── standard-input:Error:missing-operand

Signed-off-by: Fengguang Wu
---
 ignore-errors | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/ignore-errors b/ignore-errors
index 02e0163..c56e228 100755
--- a/ignore-errors
+++ b/ignore-errors
@@ -1,7 +1,9 @@
 \[-Werror\]
 \[-Werror=return-type\]
-gcc: internal compiler error:
+: internal compiler error:
+Please submit a full bug report,
 error: insn does not satisfy its constraints:
+error: unrecognizable insn:
 relocation truncated to fit:
 dangerous relocation:
 additional relocation overflows omitted from the
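The ignore-errors file is a list of regular expressions matching known non-kernel failures. How kernel-tests actually consumes it is not shown here; presumably it is applied to build logs with something along the lines of:

    # hypothetical usage: drop compiler/binutils noise before diffing logs
    grep -v -f ignore-errors build-errors.log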
Re: running leaking_addresses.pl
Hi Tobin,

On Tue, Dec 05, 2017 at 10:36:13AM +1100, Tobin C. Harding wrote:

> Hi,
>
> Recently scripts/leaking_addresses.pl was merged into the mainline
> with the hope of catching leaking kernel addresses. Would it be in
> scope for this script to be run by the kbuild test robot? Excuse my
> very little knowledge of the kbuild test robot but would this lead to
> the script being run on a number of kernels with varying
> configuration?

Yes it's in our plan. Bill Roberts also requested testing
leaking_addresses.pl and Philip has scheduled it for next quarter.

> In the event that this may be a possibility can I add that I do not
> have a suggestion for what to do with the output, can it go to LKML?
> I'm happy to be CC'd on the output to help investigate.

That'd be great! In general we are mainly responsible for catching
regressions. Due to limits of time and expertise, we leave the
analysis and bug fixing work to the first bad commit's authors and
anyone who is interested in looking into a class of problems. For
example, we'll CC some smatch and coccinelle regressions (possibly
false positives) to Dan and Julia.

Thanks,
Fengguang
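For reference, leaking_addresses.pl scans the running system (/proc, /sys and friends) for kernel addresses rather than a build tree, so it needs a booted kernel and root privileges to read the relevant files. A minimal invocation from a kernel source checkout:

    cd linux
    sudo perl scripts/leaking_addresses.pl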
Re: [tip:WIP.x86/kpti 50/65] include/linux/compiler.h:319:38: error: call to '__compiletime_assert_321' declared with attribute error: BUILD_BUG_ON failed: MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE
On Mon, Dec 04, 2017 at 06:19:34PM +0100, Thomas Gleixner wrote:

> On Mon, 4 Dec 2017, kbuild test robot wrote:
>
>> tree: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.x86/kpti
>> head: c7ddf30cab554658b154ee16ae5e5d577ff530bf
>> commit: 9ebd9d9cdbc90021a5e320fb054cf48c027e6d34 [50/65] x86/fixmap: Add ldt entries to user shared fixmap
>> config: x86_64-allmodconfig (attached as .config)
>
> Yes, we figured that out 10 minutes before you. Working on a fix.

OK, we'll need to act fast in order to catch up with you. :)

Thanks,
Fengguang
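The error in the subject line is how BUILD_BUG_ON surfaces: the condition is evaluated at compile time by declaring a function carrying __attribute__((error)) and calling it only when the condition holds, so the call either gets optimized away or fails the build. A simplified sketch of the mechanism; the kernel's real macros in include/linux/build_bug.h are more elaborate:

    /* Simplified model of BUILD_BUG_ON: if cond is true at compile time,
     * the call to the error-attributed function survives optimization
     * (the kernel always builds with -O2) and the build fails with the
     * message seen in the subject line above. */
    #define BUILD_BUG_ON_MSG(cond, msg)                             \
            do {                                                    \
                    extern void __compiletime_assert(void)          \
                            __attribute__((error(msg)));            \
                    if (cond)                                       \
                            __compiletime_assert();                 \
            } while (0)

    #define BUILD_BUG_ON(cond) \
            BUILD_BUG_ON_MSG(cond, "BUILD_BUG_ON failed: " #cond)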
Re: [e1000_shutdown] e1000 0000:00:03.0: disabling already-disabled device
Hi Tushar, On Tue, Nov 28, 2017 at 01:01:23AM +0530, Tushar Dave wrote: On 11/23/2017 04:43 AM, Fengguang Wu wrote: On Wed, Nov 22, 2017 at 03:40:52AM +0530, Tushar Dave wrote: On 11/21/2017 06:11 PM, Fengguang Wu wrote: Hello, FYI this happens in mainline kernel 4.14.0-01330-g3c07399. It happens since 4.13 . It occurs in 3 out of 162 boots. [ 44.637743] advantechwdt: Unexpected close, not stopping watchdog! [ 44.997548] input: ImExPS/2 Generic Explorer Mouse as /devices/platform/i8042/serio1/input/input6 [ 45.013419] e1000 :00:03.0: disabling already-disabled device [ 45.013447] [ cut here ] [ 45.014868] WARNING: CPU: 1 PID: 71 at drivers/pci/pci.c:1641 pci_disable_device+0xa1/0x105: pci_disable_device at drivers/pci/pci.c:1640 [ 45.016171] CPU: 1 PID: 71 Comm: rcu_perf_shutdo Not tainted 4.14.0-01330-g3c07399 #1 [ 45.017197] task: 88011bee9e40 task.stack: c986 [ 45.017987] RIP: 0010:pci_disable_device+0xa1/0x105: pci_disable_device at drivers/pci/pci.c:1640 [ 45.018603] RSP: :c9863e30 EFLAGS: 00010286 [ 45.019282] RAX: 0035 RBX: 88013a230008 RCX: [ 45.020182] RDX: RSI: RDI: 0203 [ 45.021084] RBP: 88013a3f31e8 R08: 0001 R09: [ 45.021986] R10: 827ec29c R11: 0002 R12: 0001 [ 45.022946] R13: 88013a230008 R14: 880117802b20 R15: c9863e8f [ 45.023842] FS: () GS:88013fd0() knlGS: [ 45.024863] CS: 0010 DS: ES: CR0: 80050033 [ 45.025583] CR2: c96d4000 CR3: 0220f000 CR4: 06a0 [ 45.026478] Call Trace: [ 45.026811] __e1000_shutdown+0x1d4/0x1e2: __e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5162 [ 45.027344] ? rcu_perf_cleanup+0x2a1/0x2a1: rcu_perf_shutdown at kernel/rcu/rcuperf.c:627 [ 45.027883] e1000_shutdown+0x14/0x3a: e1000_shutdown at drivers/net/ethernet/intel/e1000/e1000_main.c:5235 [ 45.028351] device_shutdown+0x110/0x1aa: device_shutdown at drivers/base/core.c:2807 [ 45.028858] kernel_power_off+0x31/0x64: kernel_power_off at kernel/reboot.c:260 [ 45.029343] rcu_perf_shutdown+0x9b/0xa7: rcu_perf_shutdown at kernel/rcu/rcuperf.c:637 [ 45.029852] ? __wake_up_common_lock+0xa2/0xa2: autoremove_wake_function at kernel/sched/wait.c:376 [ 45.030414] kthread+0x126/0x12e: kthread at kernel/kthread.c:233 [ 45.030834] ? __kthread_bind_mask+0x8e/0x8e: kthread at kernel/kthread.c:190 [ 45.031399] ? ret_from_fork+0x1f/0x30: ret_from_fork at arch/x86/entry/entry_64.S:443 [ 45.031883] ? kernel_init+0xa/0xf5: kernel_init at init/main.c:997 [ 45.032325] ret_from_fork+0x1f/0x30: ret_from_fork at arch/x86/entry/entry_64.S:443 [ 45.032777] Code: 00 48 85 ed 75 07 48 8b ab a8 00 00 00 48 8d bb 98 00 00 00 e8 aa d1 11 00 48 89 ea 48 89 c6 48 c7 c7 d8 e4 0b 82 e8 55 7d da ff <0f> ff b9 01 00 00 00 31 d2 be 01 00 00 00 48 c7 c7 f0 b1 61 82 [ 45.035222] ---[ end trace c257137b1b1976ef ]--- [ 45.037838] ACPI: Preparing to enter system sleep state S5 Attached the full dmesg, kconfig and reproduce scripts. Looks like e1000 pci/pxi-x device is already suspended. And therefore call to e1000_suspend() -> __e1000_shutdown() -> pci_disable_device() already had disabled the device. Disabling device again by e1000_shutdown handler during system shutdown causes warning at drivers/pci/pci.c:1641. I think function __e1000_shutdown should just return if device is already suspended! I don't have e1000 hardware to test right now. So if this seems logical to others I will send a patch. Tushar, it happens on QEMU boot testing, so do not rely on e1000 HW. Unless you'd like to prevent regressions on real HW. The original report attached a reproduce script to run the QEMU test. Or you may send me the patch for testing. 
> Fengguang,
>
> Would you please try this patch and test. The patch is compile tested
> only. The patch is similar to how ixgbe handled the issue. Thanks.
>
> e1000: fix disabling already-disabled warning
>
> This patch adds a check so that the driver does not disable an
> already-disabled device.

It works! I tried 100 boots and the "e1000 0000:00:03.0: disabling
already-disabled device" error no longer shows up.

Tested-by: Fengguang Wu

Thanks,
Fengguang

> Signed-off-by: Tushar Dave
> ---
>  drivers/net/ethernet/intel/e1000/e1000.h      |  3 ++-
>  drivers/net/ethernet/intel/e1000/e1000_main.c | 23 ++-
>  2 files changed, 20 insertions(+), 6 deletions(-)
> d
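The diff itself is truncated above. For context, this is roughly the shape such a fix takes, modeled on the ixgbe approach Tushar mentions: a state bit ensures that of the suspend and shutdown paths, only the first actually disables the PCI device. The flag name below is an assumption for this sketch; the real patch may differ in detail:

    /* Sketch: guard pci_disable_device() with a state bit so the suspend
     * and shutdown paths cannot both disable the same device. */
    static void __e1000_disable_pci(struct e1000_adapter *adapter)
    {
            /* __E1000_DISABLED is an assumed flag bit for this sketch */
            if (!test_and_set_bit(__E1000_DISABLED, &adapter->flags))
                    pci_disable_device(adapter->pdev);
    }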