Re: [PATCH v5] x86: fix kaslr and memmap collision

2017-01-05 Thread Baoquan He
Add Kees to let him have a look at this too.

On 01/05/17 at 05:21pm, Baoquan He wrote:
> On 01/04/17 at 11:29am, Dave Jiang wrote:
> > CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> > However it does not take into account the memmap= parameter passed in from
> > the kernel cmdline. This results in the kernel sometimes being put in
> > the middle of a memmap region. Teach kaslr not to insert the kernel in
> > memmap-defined regions. We will support up to 4 memmap regions; any
> > additional regions will cause kaslr to be disabled. The mem_avoid set has
> > been augmented to add up to 4 unusable memmap regions provided by the
> > user, to exclude those regions from the set of valid address ranges for
> > placing the uncompressed kernel image. The nn@ss ranges will be skipped by
> > the mem_avoid set since they indicate usable memory.
> > 
> > Signed-off-by: Dave Jiang 
> > ---
> > 
> > v2:
> > Addressing comments from Ingo.
> > - Handle entire list of memmaps
> > v3:
> > Fix 32bit build issue
> > v4:
> > Addressing comments from Baoquan
> > - Not exclude nn@ss ranges
> > v5:
> > Addressing additional comments from Baoquan
> > - Update commit header and various coding style changes
> > 
> > diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> > index e5612f3..59c2075 100644
> > --- a/arch/x86/boot/boot.h
> > +++ b/arch/x86/boot/boot.h
> > @@ -332,7 +332,10 @@ int strncmp(const char *cs, const char *ct, size_t 
> > count);
> >  size_t strnlen(const char *s, size_t maxlen);
> >  unsigned int atou(const char *s);
> >  unsigned long long simple_strtoull(const char *cp, char **endp, unsigned 
> > int base);
> > +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int 
> > base);
> > +long simple_strtol(const char *cp, char **endp, unsigned int base);
> >  size_t strlen(const char *s);
> > +char *strchr(const char *s, int c);
> >  
> >  /* tty.c */
> >  void puts(const char *);
> > diff --git a/arch/x86/boot/compressed/kaslr.c 
> > b/arch/x86/boot/compressed/kaslr.c
> > index a66854d..036b514 100644
> > --- a/arch/x86/boot/compressed/kaslr.c
> > +++ b/arch/x86/boot/compressed/kaslr.c
> > @@ -11,6 +11,7 @@
> >   */
> >  #include "misc.h"
> >  #include "error.h"
> > +#include "../boot.h"
> >  
> >  #include 
> >  #include 
> > @@ -56,11 +57,16 @@ struct mem_vector {
> > unsigned long size;
> >  };
> >  
> > +/* only supporting at most 4 unusable memmap regions with kaslr */
> > +#define MAX_MEMMAP_REGIONS 4
> > +
> >  enum mem_avoid_index {
> > MEM_AVOID_ZO_RANGE = 0,
> > MEM_AVOID_INITRD,
> > MEM_AVOID_CMDLINE,
> > MEM_AVOID_BOOTPARAMS,
> > +   MEM_AVOID_MEMMAP_BEGIN,
> > +   MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
> > MEM_AVOID_MAX,
> >  };
> >  
> > @@ -77,6 +83,121 @@ static bool mem_overlaps(struct mem_vector *one, struct 
> > mem_vector *two)
> > return true;
> >  }
> >  
> > +/**
> > + * _memparse - parse a string with mem suffixes into a number
> > + * @ptr: Where parse begins
> > + * @retptr: (output) Optional pointer to next char after parse completes
> > + *
> > + * Parses a string into a number.  The number stored at @ptr is
> > + * potentially suffixed with K, M, G, T, P, E.
> > + */
> > +static unsigned long long _memparse(const char *ptr, char **retptr)
> > +{
> > +   char *endptr;   /* local pointer to end of parsed string */
> > +
> > +   unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
> > +
> > +   switch (*endptr) {
> > +   case 'E':
> > +   case 'e':
> > +   ret <<= 10;
> > +   case 'P':
> > +   case 'p':
> > +   ret <<= 10;
> > +   case 'T':
> > +   case 't':
> > +   ret <<= 10;
> > +   case 'G':
> > +   case 'g':
> > +   ret <<= 10;
> > +   case 'M':
> > +   case 'm':
> > +   ret <<= 10;
> > +   case 'K':
> > +   case 'k':
> > +   ret <<= 10;
> > +   endptr++;
> > +   default:
> > +   break;
> > +   }
> > +
> > +   if (retptr)
> > +   *retptr = endptr;
> > +
> > +   return ret;
> > +}
> > +
> > +static int
> > +parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
> > +{
> > +   char *oldp;
> > +
> > +   if (!p)
> > +   return -EINVAL;
> > +
> > +   /* we don't care about this option here */
> > +   if (!strncmp(p, "exactmap", 8))
> > +   return -EINVAL;
> > +
> > +   oldp = p;
> > +   *size = _memparse(p, &p);
> > +   if (p == oldp)
> > +   return -EINVAL;
> > +
> > +   switch (*p) {
> > +   case '@':
> > +   /* skip this region, usable */
> > +   *start = 0;
> > +   *size = 0;
> > +   return 0;
> > +   case '#':
> > +   case '$':
> > +   case '!':
> > +   *start = _memparse(p + 1, &p);
> > +   return 0;
> > +   }
> > +
> > +   return -EINVAL;
> > +}
> > +
> > +static int mem_avoid_memmap(void)
> > +{
> > +   char arg[128];
> > +   int rc = 0;
> > +
> > +   /* see if we have any memmap areas */
> > +   if 
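
As a quick illustration of the memmap= handling in the quoted patch: the
following standalone userspace sketch mirrors the suffix fall-through in
_memparse() and the '@' versus '#'/'$'/'!' classification in parse_memmap().
It is not part of the patch; it uses strtoull() from libc instead of the boot
simple_strtoull(), covers only the K/M/G suffixes, and the example memmap=
strings are hypothetical.

#include <stdio.h>
#include <stdlib.h>

static unsigned long long memparse_sketch(const char *p, char **retp)
{
	char *end;
	unsigned long long ret = strtoull(p, &end, 0);

	switch (*end) {
	case 'G': case 'g':
		ret <<= 10;	/* fall through */
	case 'M': case 'm':
		ret <<= 10;	/* fall through */
	case 'K': case 'k':
		ret <<= 10;
		end++;
	default:
		break;
	}
	if (retp)
		*retp = end;
	return ret;
}

int main(void)
{
	/* hypothetical memmap= arguments, not taken from the thread */
	const char *examples[] = { "512M@0x100000000", "16M$0x200000000", "64M!0x300000000" };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		char *p;
		unsigned long long size = memparse_sketch(examples[i], &p);

		if (*p == '@')
			printf("%-20s size=%#llx  usable (nn@ss) -> skipped\n",
			       examples[i], size);
		else if (*p == '#' || *p == '$' || *p == '!')
			printf("%-20s size=%#llx  reserved -> added to mem_avoid\n",
			       examples[i], size);
	}
	return 0;
}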

Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jerome Glisse
On Thu, Jan 05, 2017 at 05:30:34PM -0700, Jason Gunthorpe wrote:
> On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote:
> 
> > > I still don't understand what you're driving at - you've said in both
> > > cases a user VMA exists.
> > 
> > In the former case no, there is no VMA directly but if you want one then
> > a device can provide one. But such VMA is useless as CPU access is not
> > expected.
> 
> I disagree it is useless, the VMA is going to be necessary to support
> upcoming things like CAPI, you need it to support O_DIRECT from the
> filesystem, DPDK, etc. This is why I am opposed to any model that is
> not VMA based for setting up RDMA - that is short-sighted and does
> not seem to reflect where the industry is going.
> 
> So focus on having VMA backed by actual physical memory that covers
> your GPU objects and ask how do we wire up the '__user *' to the DMA
> API in the best way so the DMA API still has enough information to
> setup IOMMUs and whatnot.

I am talking about two different things. Existing hardware and APIs where you
_do not_ have a vma and you do not need one. This is just existing stuff.
Some closed drivers provide functionality on top of this design. The question
is, do we want to do the same? If yes, and you insist on having a vma, we
could provide one, but this does not apply and is useless for where we
are going with new hardware.

With new hardware you just use malloc or mmap to allocate memory and then
you use it directly with the device. Device driver can migrate any part of
the process address space to device memory. In this scheme you have your
usual VMAs but there is nothing special about them.

Now when you try to do get_user_page() on any page that is inside the
device it will fail, because we do not allow any device memory to be pinned.
There are various reasons for that and they are not going away in any hw
in the planning (so for the next few years).

Still, we do want to support peer to peer mapping. The plan is to only do so
with ODP capable hardware. We still need to solve the IOMMU issue, and
it needs special handling inside the RDMA device. The way it works is
that RDMA asks for a GPU page, the GPU checks whether it has room inside its
PCI bar to map this page for the device, and this can fail. If it succeeds
then you need the IOMMU to let the RDMA device access the GPU PCI bar.

So here we have two orthogonal problems. The first is how to make two drivers
talk to each other to set up a mapping to allow peer to peer, and the second
is about the IOMMU.


> > What I was trying to get across is that no matter what level you
> > consider, in the end you still need something at the DMA API level.
> > And that the two different use cases (device vma or regular vma) mean
> > two different APIs for the device driver.
> 
> I agree we need new stuff at the DMA API level, but I am opposed to
> the idea we need two API paths that the *driver* has to figure out.
> That is fundamentally not what I want as a driver developer.
> 
> Give me a common API to convert '__user *' to a scatter list and pin
> the pages. This needs to figure out your two cases. And Huge
> Pages. And ZONE_DIRECT.. (a better get_user_pages)

Pinning is not going to happen; like I said, it would hinder the GPU to the
point it would become useless.


> Give me an API to take the scatter list and DMA map it, handling all
> the stuff associated with peer-peer. (a better dma_map_sg)
> 
> Give me a notifier scheme to rework my scatter list when physical
> pages need to change (mmu notifiers)
> 
> Use the scatter list memory to convey needed information from the
> first step to the second.
> 
> Do not bother the driver with distinctions on what kind of memory is
> behind that VMA. Don't ask me to use get_user_pages or
> gpu_get_user_pages, do not ask me to use dma_map_sg or
> dma_map_sg_peer_direct. The Driver Doesn't Need To Know.

I understand you want it easy, but some part must be aware of this,
at the very least the ODP logic. Creating a peer to peer mapping is a multi
step process and some of those steps can fail. The fallback is always to
migrate back to system memory as a default path that cannot fail, except
if we are out of memory.


> IMHO this is why GPU direct is not mergable - it creates a crazy
> parallel mini-mm subsystem inside RDMA and uses that to connect to a
> GPU driver, everything is expected to have parallel paths for GPU
> direct and normal MM. No good at all.

Existing hardware and new hardware work differently. I am trying to
explain the two different designs needed for each one. You understandably
dislike the existing hardware, which has more stringent requirements,
cannot be supported transparently, and needs dedicated communication
between the two drivers.

New hardware has a completely different API in userspace. We can
decide to only support the latter and forget about the former.


> > > So, how do you identify these GPU objects? How do you expect RDMA
> > > convert them to scatter lists? How will ODP work?
> > 
> > No ODP on 

Re: [PATCH v2 0/4] Write protect DAX PMDs in *sync path

2017-01-05 Thread Andrew Morton
On Tue, 3 Jan 2017 17:13:49 -0700 Ross Zwisler  
wrote:

> On Thu, Dec 22, 2016 at 02:18:52PM -0700, Ross Zwisler wrote:
> > Currently dax_mapping_entry_mkclean() fails to clean and write protect the
> > pmd_t of a DAX PMD entry during an *sync operation.  This can result in
> > data loss, as detailed in patch 4.
> > 
> > You can find a working tree here:
> > 
> > https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean_v2
> > 
> > This series applies cleanly to mmotm-2016-12-19-16-31.
> > 
> > Changes since v1:
> >  - Included Dan's patch to kill DAX support for UML.
> >  - Instead of wrapping the DAX PMD code in dax_mapping_entry_mkclean() in
> >an #ifdef, we now create a stub for pmdp_huge_clear_flush() for the case
> >when CONFIG_TRANSPARENT_HUGEPAGE isn't defined. (Dan & Jan)
> > 
> > Dan Williams (1):
> >   dax: kill uml support
> > 
> > Ross Zwisler (3):
> >   dax: add stub for pmdp_huge_clear_flush()
> >   mm: add follow_pte_pmd()
> >   dax: wrprotect pmd_t in dax_mapping_entry_mkclean
> > 
> >  fs/Kconfig|  2 +-
> >  fs/dax.c  | 49 
> > ++-
> >  include/asm-generic/pgtable.h | 10 +
> >  include/linux/mm.h|  4 ++--
> >  mm/memory.c   | 41 
> >  5 files changed, 79 insertions(+), 27 deletions(-)
> 
> Well, 0-day found another architecture that doesn't define pmd_pfn() et al.,
> so we'll need some more fixes. (Thank you, 0-day, for the coverage!)
> 
> I have to apologize, I didn't understand that Dan intended his "dax: kill uml
> support" patch to land in v4.11.  I thought he intended it as a cleanup to my
> series, which really needs to land in v4.10.  That's why I folded them
> together into this v2, along with the wrapper suggested by Jan.
> 
> Andrew, does it work for you to just keep v1 of this series, and eventually
> send that to Linus for v4.10?
> 
> https://lkml.org/lkml/2016/12/20/649
> 
> You've already pulled that one into -mm, and it does correctly solve the data
> loss issue.
> 
> That would let us deal with getting rid of the #ifdef, blacklisting
> architectures and introducing the pmdp_huge_clear_flush() stub in a follow-on
> series for v4.11.

I have mm-add-follow_pte_pmd.patch and
dax-wrprotect-pmd_t-in-dax_mapping_entry_mkclean.patch queued for 4.10.
Please (re)send any additional patches, indicating for each one
whether you believe it should also go into 4.10?




Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Serguei Sagalovitch

On 2017-01-05 07:30 PM, Jason Gunthorpe wrote:

 but I am opposed to
the idea we need two API paths that the *driver* has to figure out.
That is fundamentally not what I want as a driver developer.

Give me a common API to convert '__user *' to a scatter list and pin
the pages.

Completely agreed. IMHO there is no sense in duplicating the same logic
everywhere, as well as trying to find places where it is missing.

Sincerely yours,
Serguei Sagalovitch



Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jason Gunthorpe
On Thu, Jan 05, 2017 at 06:23:52PM -0500, Jerome Glisse wrote:

> > I still don't understand what you're driving at - you've said in both
> > cases a user VMA exists.
> 
> In the former case no, there is no VMA directly but if you want one then
> a device can provide one. But such VMA is useless as CPU access is not
> expected.

I disagree it is useless, the VMA is going to be necessary to support
upcoming things like CAPI, you need it to support O_DIRECT from the
filesystem, DPDK, etc. This is why I am opposed to any model that is
not VMA based for setting up RDMA - that is short-sighted and does
not seem to reflect where the industry is going.

So focus on having VMA backed by actual physical memory that covers
your GPU objects and ask how do we wire up the '__user *' to the DMA
API in the best way so the DMA API still has enough information to
setup IOMMUs and whatnot.

> What I was trying to get across is that no matter what level you
> consider, in the end you still need something at the DMA API level.
> And that the two different use cases (device vma or regular vma) mean
> two different APIs for the device driver.

I agree we need new stuff at the DMA API level, but I am opposed to
the idea we need two API paths that the *driver* has to figure out.
That is fundamentally not what I want as a driver developer.

Give me a common API to convert '__user *' to a scatter list and pin
the pages. This needs to figure out your two cases. And Huge
Pages. And ZONE_DIRECT.. (a better get_user_pages)

Give me an API to take the scatter list and DMA map it, handling all
the stuff associated with peer-peer. (a better dma_map_sg)

Give me a notifier scheme to rework my scatter list when physical
pages need to change (mmu notifiers)

Use the scatter list memory to convey needed information from the
first step to the second.

Do not bother the driver with distinctions on what kind of memory is
behind that VMA. Don't ask me to use get_user_pages or
gpu_get_user_pages, do not ask me to use dma_map_sg or
dma_map_sg_peer_direct. The Driver Doesn't Need To Know.

IMHO this is why GPU direct is not mergable - it creates a crazy
parallel mini-mm subsystem inside RDMA and uses that to connect to a
GPU driver, everything is expected to have parallel paths for GPU
direct and normal MM. No good at all.

> > So, how do you identify these GPU objects? How do you expect RDMA
> > convert them to scatter lists? How will ODP work?
> 
> No ODP on those. If you want vma, the GPU device driver can provide

You said you needed invalidate, that has to be done via ODP.

Jason


Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jerome Glisse
On Thu, Jan 05, 2017 at 03:42:15PM -0700, Jason Gunthorpe wrote:
> On Thu, Jan 05, 2017 at 03:19:36PM -0500, Jerome Glisse wrote:
> 
> > > Always having a VMA changes the discussion - the question is how to
> > > create a VMA that represents IO device memory, and how do DMA
> > > consumers extract the correct information from that VMA to pass to the
> > > kernel DMA API so it can setup peer-peer DMA.
> > 
> > Well my point is that it can't be. In HMM case inside a single VMA
> > you
> [..]
> 
> > In the GPUDirect case the idea is that you have a specific device vma
> > that you map for peer to peer.
> 
> [..]
> 
> I still don't understand what you're driving at - you've said in both
> cases a user VMA exists.

In the former case no, there is no VMA directly but if you want one then
a device can provide one. But such VMA is useless as CPU access is not
expected.

> 
> From my perspective in RDMA, all I want is a core kernel flow to
> convert a '__user *' into a scatter list of DMA addresses, that works no
> matter what is backing that VMA, be it HMM, a 'hidden' GPU object, or
> struct page memory.
> 
> A '__user *' pointer is the only way to setup a RDMA MR, and I see no
> reason to have another API at this time.
> 
> The details of how to translate to a scatter list are a MM subject,
> and the MM folks need to get 
> 
> I just don't care if that routine works at a page level, or a whole
> VMA level, or some combination of both, that is up to the MM team to
> figure out :)

And that's what I am trying to get across. There are two cases here:
what exists on today's hardware, things like GPU Direct, that work at
the VMA level; versus where some new hardware is going, where we want to
do things at the page level. Both require different APIs at different levels.

What I was trying to get across is that no matter what level you
consider, in the end you still need something at the DMA API level.
And that the two different use cases (device vma or regular vma) mean
two different APIs for the device driver.

> 
> > a page level. The expectation here is that the GPU userspace exposes a special
> > API to allow RDMA to happen directly on GPU objects allocated through
> > a GPU-specific API (ie it is not regular memory and it is not accessible
> > by the CPU).
> 
> So, how do you identify these GPU objects? How do you expect RDMA
> convert them to scatter lists? How will ODP work?

No ODP on those. If you want a vma, the GPU device driver can provide
one. GPU objects are disjoint from regular memory (coming from some
form of mmap). They are created through ioctl and in many cases are
never exposed to the CPU. They only exist inside the GPU driver realm.

Nonetheless there are use cases where exchanging those objects across
computers over a network makes sense. I am not an end user here :)


> > > We have MMU notifiers to handle this today in RDMA. Async RDMA MR
> > > Invalidate like you see in the above out of tree patches is totally
> > > crazy and shouldn't be in mainline. Use ODP capable RDMA hardware.
> > 
> > Well there is still a large base of hardware that does not have such
> > a feature and some people would like to be able to keep using it.
> 
> Hopefully someone will figure out how to do that without the crazy
> async MR invalidation.

Personally I don't care too much about this old hardware and thus I am
fine without supporting it. The open source userspace is playing
catch-up and doing features for old hardware probably does not make sense.

Cheers,
Jérôme


Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jason Gunthorpe
On Thu, Jan 05, 2017 at 03:19:36PM -0500, Jerome Glisse wrote:

> > Always having a VMA changes the discussion - the question is how to
> > create a VMA that represents IO device memory, and how do DMA
> > consumers extract the correct information from that VMA to pass to the
> > kernel DMA API so it can setup peer-peer DMA.
> 
> Well my point is that it can't be. In HMM case inside a single VMA
> you
[..]

> In the GPUDirect case the idea is that you have a specific device vma
> that you map for peer to peer.

[..]

I still don't understand what you're driving at - you've said in both
cases a user VMA exists.

From my perspective in RDMA, all I want is a core kernel flow to
convert a '__user *' into a scatter list of DMA addresses, that works no
matter what is backing that VMA, be it HMM, a 'hidden' GPU object, or
struct page memory.

A '__user *' pointer is the only way to setup a RDMA MR, and I see no
reason to have another API at this time.

The details of how to translate to a scatter list are a MM subject,
and the MM folks need to get 

I just don't care if that routine works at a page level, or a whole
VMA level, or some combination of both, that is up to the MM team to
figure out :)

> a page level. The expectation here is that the GPU userspace exposes a special
> API to allow RDMA to happen directly on GPU objects allocated through
> a GPU-specific API (ie it is not regular memory and it is not accessible
> by the CPU).

So, how do you identify these GPU objects? How do you expect RDMA
convert them to scatter lists? How will ODP work?

> > We have MMU notifiers to handle this today in RDMA. Async RDMA MR
> > Invalidate like you see in the above out of tree patches is totally
> > crazy and shouldn't be in mainline. Use ODP capable RDMA hardware.
> 
> Well there is still a large base of hardware that does not have such
> a feature and some people would like to be able to keep using it.

Hopefully someone will figure out how to do that without the crazy
async MR invalidation.

Jason


Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jerome Glisse
On Thu, Jan 05, 2017 at 01:07:19PM -0700, Jason Gunthorpe wrote:
> On Thu, Jan 05, 2017 at 02:54:24PM -0500, Jerome Glisse wrote:
> 
> > Mellanox and NVidia support peer to peer with what they market as
> > GPUDirect. It only works without IOMMU. It is probably not upstream :
> > 
> > https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg21402.html
> > 
> > I thought it was but it seems it requires an out of tree driver to work.
> 
> Right, it is out of tree and not under consideration for mainline.
> 
> > Whether there is a vma or not isn't important to the issue anyway. If
> > you want to enforce VMA rule for RDMA it is an RDMA specific discussion
> > in which i don't want to be involve, it is not my turf :)
> 
> Always having a VMA changes the discussion - the question is how to
> create a VMA that represents IO device memory, and how do DMA
> consumers extract the correct information from that VMA to pass to the
> kernel DMA API so it can setup peer-peer DMA.

Well my point is that it can't be. In the HMM case, inside a single VMA you
can have one page inside GPU memory at address A but the next page inside
regular memory at A+4k. So handling this at the VMA level does not make
sense. In this case you would get the device from the struct page
and you would query through a common API to determine whether you can do peer
to peer. If not, it would trigger migration back to regular memory.
If yes, then you still have to solve the IOMMU issue and hence the DMA
API changes that were proposed.

In the GPUDirect case the idea is that you have a specific device vma
that you map for peer to peer. Here things can be at the vma level and not at
a page level. The expectation here is that the GPU userspace exposes a special
API to allow RDMA to happen directly on GPU objects allocated through
a GPU-specific API (ie it is not regular memory and it is not accessible
by the CPU).


Both cases are disjoint. Both cases need to solve the IOMMU issue, which
seems to be best solved at the DMA API level.


> > What matters is the back channel API between peer-to-peer devices. Like
> > the above patchset points out, for GPU we need to be able to invalidate
> > a mapping at any point in time. Pinning is not something we want to
> > live with.
> 
> We have MMU notifiers to handle this today in RDMA. Async RDMA MR
> Invalidate like you see in the above out of tree patches is totally
> crazy and shouldn't be in mainline. Use ODP capable RDMA hardware.

Well there is still a large base of hardware that does not have such
a feature and some people would like to be able to keep using it.
I believe allowing direct access to GPU objects that are otherwise
hidden from regular kernel memory management is still meaningful.

Cheers,
Jérôme



Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jason Gunthorpe
On Thu, Jan 05, 2017 at 02:54:24PM -0500, Jerome Glisse wrote:

> Mellanox and NVidia support peer to peer with what they market as
> GPUDirect. It only works without IOMMU. It is probably not upstream :
> 
> https://www.mail-archive.com/linux-rdma@vger.kernel.org/msg21402.html
> 
> I thought it was but it seems it requires an out of tree driver to work.

Right, it is out of tree and not under consideration for mainline.

> Whether there is a vma or not isn't important to the issue anyway. If
> you want to enforce VMA rule for RDMA it is an RDMA specific discussion
> in which i don't want to be involve, it is not my turf :)

Always having a VMA changes the discussion - the question is how to
create a VMA that represents IO device memory, and how do DMA
consumers extract the correct information from that VMA to pass to the
kernel DMA API so it can setup peer-peer DMA.

> What matters is the back channel API between peer-to-peer devices. Like
> the above patchset points out, for GPU we need to be able to invalidate
> a mapping at any point in time. Pinning is not something we want to
> live with.

We have MMU notifiers to handle this today in RDMA. Async RDMA MR
Invalidate like you see in the above out of tree patches is totally
crazy and shouldn't be in mainline. Use ODP capable RDMA hardware.

Jason


[ndctl PATCH] test: document workaround for unit-test-module load priority

2017-01-05 Thread Dan Williams
On some distributions it appears the recommended kernel build procedure
can sometimes prefer the in-tree module over an out-of-tree module of
the same name in the /lib/modules//extra directory. This can be
worked around with an explicit depmod.d policy. Document how to set up
this workaround.

Reported-by: Dave Jiang 
Signed-off-by: Dan Williams 
---
 README.md |   26 ++
 1 file changed, 26 insertions(+)

diff --git a/README.md b/README.md
index e9dec67a0ea2..38fc050210c3 100644
--- a/README.md
+++ b/README.md
@@ -56,3 +56,29 @@ 
git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git`
 5. Now run `make check` in the ndctl source directory, or `ndctl test`,
if ndctl was built with `--enable-test`.
 
+Troubleshooting
+===
+
+The unit tests will validate that the environment is set up correctly
+before they try to run. If the platform is misconfigured, i.e. the unit
+test modules are not available, or the test versions of the modules are
+superseded by the "in-tree/production" version of the modules, `make
+check` will skip tests and report a message like the following in
+test/test-suite.log:  
+`SKIP: libndctl`  
+`==`  
`test/init: nfit_test_init: nfit.ko: appears to be production version: /lib/modules/4.8.8-200.fc24.x86_64/kernel/drivers/acpi/nfit/nfit.ko.xz`  
+`__ndctl_test_skip: explicit skip test_libndctl:2684`  
+`nfit_test unavailable skipping tests`  
+
+If the unit test modules are indeed available in the modules 'extra'
+directory the default depmod policy can be overridden by adding a file
+to /etc/depmod.d with the following contents:  
+`override nfit * extra`  
+`override dax * extra`  
+`override dax_pmem * extra`  
+`override libnvdimm * extra`  
+`override nd_blk * extra`  
+`override nd_btt * extra`  
+`override nd_e820 * extra`  
+`override nd_pmem * extra`  
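
A possible follow-up step, assuming a typical kmod setup (an example, not part
of the patch; the file name is arbitrary): put the override lines above into a
file such as /etc/depmod.d/nfit_test.conf and then rebuild the module
dependency information so the overrides take effect:  
`sudo depmod -a`  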



Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jason Gunthorpe
On Thu, Jan 05, 2017 at 01:39:29PM -0500, Jerome Glisse wrote:

>   1) peer-to-peer because of userspace specific API like NVidia GPU
> direct (AMD is pushing its own similar API i just can't remember
> marketing name). This does not happen through a vma, this happens
> through specific device driver call going through device specific
> ioctl on both side (GPU and RDMA). So both kernel driver are aware
> of each others.

Today you can only do user-initiated RDMA operations in conjunction
with a VMA.

We'd need a really big and strong reason to create an entirely new
non-VMA based memory handle scheme for RDMA.

So my inclination is to just completely push back on this idea. You
need a VMA to do RDMA.

GPUs need to create VMAs for the memory they want to RDMA from, even
if the VMA handle just causes SIGBUS for any CPU access.

Jason
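
A minimal sketch of what such a handle-only VMA could look like, assuming a
character-device style driver (the gpu_obj_* names are illustrative, not from
any driver discussed in this thread): the mmap handler installs vm_ops whose
fault callback simply returns VM_FAULT_SIGBUS, so the mapping exists as a
'__user *' handle while any CPU touch of it faults.

#include <linux/fs.h>
#include <linux/mm.h>

/* Any CPU access to the mapping raises SIGBUS. */
static int gpu_obj_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}

static const struct vm_operations_struct gpu_obj_vm_ops = {
	.fault = gpu_obj_vm_fault,
};

static int gpu_obj_mmap(struct file *filp, struct vm_area_struct *vma)
{
	/* No pages are inserted; the VMA only exists as a handle. */
	vma->vm_ops = &gpu_obj_vm_ops;
	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
	return 0;
}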


Re: [PATCH v5 1/2] mm, dax: make pmd_fault() and friends to be the same as fault()

2017-01-05 Thread Dave Jiang


On 12/19/2016 11:27 AM, Dave Jiang wrote:
> Instead of passing in multiple parameters in the pmd_fault() handler,
> a vmf can be passed in just like a fault() handler. This will simplify
> code and remove the need for the actual pmd fault handlers to allocate a
> vmf. Related functions are also modified to do the same.
> 
> Signed-off-by: Dave Jiang 
> Reviewed-by: Ross Zwisler 
> Reviewed-by: Jan Kara 

Andrew,
I haven't seen any more comments on these 2 patches. Will you pick them
up for the mm tree? Thanks!

> ---
>  drivers/dax/dax.c |   16 +++-
>  fs/dax.c  |   42 
> ++---
>  fs/ext4/file.c|9 -
>  fs/xfs/xfs_file.c |   10 --
>  include/linux/dax.h   |7 +++
>  include/linux/mm.h|3 +--
>  include/trace/events/fs_dax.h |   15 +++
>  mm/memory.c   |6 ++
>  8 files changed, 46 insertions(+), 62 deletions(-)
> 
> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
> index c753a4c..947e49a 100644
> --- a/drivers/dax/dax.c
> +++ b/drivers/dax/dax.c
> @@ -379,10 +379,9 @@ static int dax_dev_fault(struct vm_area_struct *vma, 
> struct vm_fault *vmf)
>  }
>  
>  static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
> - struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
> - unsigned int flags)
> + struct vm_area_struct *vma, struct vm_fault *vmf)
>  {
> - unsigned long pmd_addr = addr & PMD_MASK;
> + unsigned long pmd_addr = vmf->address & PMD_MASK;
>   struct device *dev = &dax_dev->dev;
>   struct dax_region *dax_region;
>   phys_addr_t phys;
> @@ -414,23 +413,22 @@ static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
>  
>   pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>  
> - return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
> - flags & FAULT_FLAG_WRITE);
> + return vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
> + vmf->flags & FAULT_FLAG_WRITE);
>  }
>  
> -static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
> - pmd_t *pmd, unsigned int flags)
> +static int dax_dev_pmd_fault(struct vm_area_struct *vma, struct vm_fault 
> *vmf)
>  {
>   int rc;
>   struct file *filp = vma->vm_file;
>   struct dax_dev *dax_dev = filp->private_data;
>  
>   dev_dbg(&dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
> - current->comm, (flags & FAULT_FLAG_WRITE)
> + current->comm, (vmf->flags & FAULT_FLAG_WRITE)
>   ? "write" : "read", vma->vm_start, vma->vm_end);
>  
>   rcu_read_lock();
> - rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
> + rc = __dax_dev_pmd_fault(dax_dev, vma, vmf);
>   rcu_read_unlock();
>  
>   return rc;
> diff --git a/fs/dax.c b/fs/dax.c
> index d3fe880..446e861 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1310,18 +1310,17 @@ static int dax_pmd_load_hole(struct vm_area_struct 
> *vma, pmd_t *pmd,
>   return VM_FAULT_FALLBACK;
>  }
>  
> -int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
> - pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
> +int dax_iomap_pmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> + struct iomap_ops *ops)
>  {
>   struct address_space *mapping = vma->vm_file->f_mapping;
> - unsigned long pmd_addr = address & PMD_MASK;
> - bool write = flags & FAULT_FLAG_WRITE;
> + unsigned long pmd_addr = vmf->address & PMD_MASK;
> + bool write = vmf->flags & FAULT_FLAG_WRITE;
>   unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
>   struct inode *inode = mapping->host;
>   int result = VM_FAULT_FALLBACK;
>   struct iomap iomap = { 0 };
> - pgoff_t max_pgoff, pgoff;
> - struct vm_fault vmf;
> + pgoff_t max_pgoff;
>   void *entry;
>   loff_t pos;
>   int error;
> @@ -1331,10 +1330,10 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, 
> unsigned long address,
>* supposed to hold locks serializing us with truncate / punch hole so
>* this is a reliable test.
>*/
> - pgoff = linear_page_index(vma, pmd_addr);
> + vmf->pgoff = linear_page_index(vma, pmd_addr);
>   max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
>  
> - trace_dax_pmd_fault(inode, vma, address, flags, pgoff, max_pgoff, 0);
> + trace_dax_pmd_fault(inode, vma, vmf, max_pgoff, 0);
>  
>   /* Fall back to PTEs if we're going to COW */
>   if (write && !(vma->vm_flags & VM_SHARED))
> @@ -1346,13 +1345,13 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, 
> unsigned long address,
>   if ((pmd_addr + PMD_SIZE) > vma->vm_end)
>   goto fallback;
>  
> - if (pgoff > max_pgoff) {
> + if 

Re: Enabling peer to peer device transactions for PCIe devices

2017-01-05 Thread Jerome Glisse
Sorry to revive this thread but it fell through my filters and I
missed it. I have been going through it and I think the discussion
has been hindered by the fact that distinct problems were merged while
they should be addressed separately.

First, for peer-to-peer we need to be clear on how this happens. Two
cases here:
  1) peer-to-peer because of a userspace-specific API like NVidia GPU
direct (AMD is pushing its own similar API, I just can't remember the
marketing name). This does not happen through a vma; this happens
through specific device driver calls going through device-specific
ioctls on both sides (GPU and RDMA). So both kernel drivers are aware
of each other.
  2) peer-to-peer because the RDMA/device is trying to access a regular
vma (ie non-special: either private anonymous or shared memory, or an
mmap of a regular file, not a device file).

For 1) there is no need to overcomplicate things. The device drivers must
have a back-channel between them and must be able to invalidate their
respective mappings (ie the GPU must be able to ask the RDMA device to kill/
stop its MR).

So the remaining issue for 1) is how to enable effective peer-to-peer
mapping given that it might not work reliably on all platforms. Here
Alex was listing existing proposals:
  A P2P DMA DMA-API/PCI map_peer_resource support for peer-to-peer
http://www.spinics.net/lists/linux-pci/msg44560.html
  B ZONE_DEVICE IO Direct I/O and DMA for persistent memory
https://lwn.net/Articles/672457/
  C DMA-BUF RDMA subsystem DMA-BUF support
http://www.spinics.net/lists/linux-rdma/msg38748.html
  D iopmem iopmem : A block device for PCIe memory
https://lwn.net/Articles/703895/
  E HMM (not interesting for case 1)
  F Something new

Of the above, D is ill suited for GPU as we do not want to pin
GPU memory, and D is designed around long-lived objects that do not move.
Also I do not think that exposing a device PCIe bar through a new
/dev/somefilename is a good idea for GPU. So I think this should
be discarded.

HMM should be discarded in respect of case 1 too. It is useful for
case 2. I don't think dma-buf is the right path either.

So I think only A and B make sense. Now for use
case 1 I think A is the best solution. There is no need to have struct page,
and it requires explicit knowledge in the device driver that it is
mapping another device's memory, which is a given in use case 1.


If we look at case 2 the situation is a bit more complex. Here RDMA
is just trying to access a regular VMA, but it might happen that
some memory inside that VMA resides inside device memory. When
that happens we would like to avoid moving that memory back to
system memory, assuming that a peer mapping is doable.

Use case 2 assumes that the GPU is either on a platform with CAPI or
CCTX (or something similar), in which case it is easy as device
memory will have struct page and is always accessible by the CPU and
transparent for device to device access (AFAICT).

So we are left with platforms that do not have proper support for
device memory (ie the CPU cannot access it the same as DDR, or only
with limited access), which applies to x86 for the foreseeable future.

This is the problem HMM addresses, allowing device memory to be used
transparently inside a process even if direct CPU access is not
permitted. I plan to support peer-to-peer with HMM because
it is an important use case. The idea is to have the device driver
fault against the ZONE_DEVICE page and communicate through a common API
to establish the mapping. HMM will only handle keeping track of device
to device mappings and allowing such mappings to be invalidated at any
time to allow memory to be migrated.

I do not intend to solve the IOMMU side of the problem or even
the PCI hierarchy issue where you can't do peer-to-peer between devices
across some PCI bridges. I believe this is an orthogonal problem
and that it is best solved inside the DMA API, ie with solution A.


I do not think we should try to solve all the problems with a
common solution. They are too disparate in capabilities (what
the hardware can and can't do).

From my point of view there are a few takeaways:
  - a device should only access regular vmas
  - a device should never try to access a vma that points to another
device (mmap of any file in /dev)
  - peer to peer access through a dedicated userspace API must
involve a dedicated API between the kernel drivers taking part in
the peer to peer access
  - peer to peer on a regular vma must involve a common API for
drivers to interact so no driver can block the other


So I think the DMA-API proposal is the one to pursue, and the other
problems relating to handling GPU memory and how to use it are a
different kind of problem, one with either a hardware solution
(CAPI, CCTX, ...) or a software solution (HMM so far).

I don't think we should conflate the two problems into one. Anyway,
I think this is something worth discussing face to face
with the interested parties to flesh out a solution (can be at LSF/MM
or in another forum).

Cheers,
Jérôme

[ndctl PATCH] test: validate nfit_test modules before running tests

2017-01-05 Thread Dan Williams
System crashes and other failures can occur if an environment attempts
to run the unit tests with the "production" / in-tree versions of the
nfit and libnvdimm modules. Ensure that all required test modules are
supplied by out-of-tree versions.

Granted, this detection mechanism will not differentiate the proper
tools/testing/nvdimm/ out-of-tree modules from something like a "driver
update" version of the modules, but it should catch the common case of a
platform being misconfigured for unit-test execution.

Reported-by: Dave Jiang 
Signed-off-by: Dan Williams 
---
 test/Makefile.am |   25 ++
 test/core.c  |   76 ++
 2 files changed, 90 insertions(+), 11 deletions(-)

diff --git a/test/Makefile.am b/test/Makefile.am
index 592308ee5f07..524fafa4a28a 100644
--- a/test/Makefile.am
+++ b/test/Makefile.am
@@ -40,30 +40,33 @@ LIBNDCTL_LIB =\
../ndctl/lib/libndctl.la \
../daxctl/lib/libdaxctl.la
 
-libndctl_SOURCES = libndctl.c core.c
+testcore =\
+   core.c \
+   ../util/log.c \
+   ../util/sysfs.c
+
+libndctl_SOURCES = libndctl.c $(testcore)
 libndctl_LDADD = $(LIBNDCTL_LIB) $(UUID_LIBS) $(KMOD_LIBS)
 
 dsm_fail_SOURCES =\
dsm-fail.c \
-   core.c \
-   ../util/log.c \
-   ../util/sysfs.c
+   $(testcore)
 
 dsm_fail_LDADD = $(LIBNDCTL_LIB) $(KMOD_LIBS)
 
-blk_ns_SOURCES = blk_namespaces.c core.c
+blk_ns_SOURCES = blk_namespaces.c $(testcore)
 blk_ns_LDADD = $(LIBNDCTL_LIB) $(KMOD_LIBS)
 
-pmem_ns_SOURCES = pmem_namespaces.c core.c
+pmem_ns_SOURCES = pmem_namespaces.c $(testcore)
 pmem_ns_LDADD = $(LIBNDCTL_LIB) $(KMOD_LIBS)
 
-dpa_alloc_SOURCES = dpa-alloc.c core.c
+dpa_alloc_SOURCES = dpa-alloc.c $(testcore)
 dpa_alloc_LDADD = $(LIBNDCTL_LIB) $(UUID_LIBS) $(KMOD_LIBS)
 
-parent_uuid_SOURCES = parent-uuid.c core.c
+parent_uuid_SOURCES = parent-uuid.c $(testcore)
 parent_uuid_LDADD = $(LIBNDCTL_LIB) $(UUID_LIBS) $(KMOD_LIBS)
 
-dax_dev_SOURCES = dax-dev.c core.c
+dax_dev_SOURCES = dax-dev.c $(testcore)
 dax_dev_LDADD = $(LIBNDCTL_LIB) $(KMOD_LIBS)
 
 dax_pmd_SOURCES = dax-pmd.c
@@ -73,7 +76,7 @@ device_dax_SOURCES = \
device-dax.c \
dax-dev.c \
dax-pmd.c \
-   core.c \
+   $(testcore) \
../ndctl/builtin-xaction-namespace.c \
../util/json.c
 device_dax_LDADD = \
@@ -84,7 +87,7 @@ device_dax_LDADD = \
 
 multi_pmem_SOURCES = \
multi-pmem.c \
-   core.c \
+   $(testcore) \
../ndctl/builtin-xaction-namespace.c \
../util/json.c
 multi_pmem_LDADD = \
diff --git a/test/core.c b/test/core.c
index 3119fd826a94..d1c38902feb4 100644
--- a/test/core.c
+++ b/test/core.c
@@ -6,6 +6,10 @@
 #include 
 #include 
 
+#include 
+#include 
+#include 
+
 #define KVER_STRLEN 20
 
 struct ndctl_test {
@@ -103,12 +107,84 @@ int nfit_test_init(struct kmod_ctx **ctx, struct 
kmod_module **mod,
int log_level)
 {
int rc;
+   unsigned int i;
+   struct log_ctx log_ctx;
+   const char *list[] = {
+   "nfit",
+   "dax",
+   "dax_pmem",
+   "libnvdimm",
+   "nd_blk",
+   "nd_btt",
+   "nd_e820",
+   "nd_pmem",
+   };
+
+   log_init(&log_ctx, "test/init", "NDCTL_TEST");
+   log_ctx.log_priority = log_level;
 
*ctx = kmod_new(NULL, NULL);
if (!*ctx)
return -ENXIO;
kmod_set_log_priority(*ctx, log_level);
 
+   /*
+* Check that all nfit, libnvdimm, and device-dax modules are
+* the mocked versions. If they are loaded, check that they have
+* the "out-of-tree" kernel taint, otherwise check that they
+* come from the "/lib/modules//extra" directory.
+*/
+   for (i = 0; i < ARRAY_SIZE(list); i++) {
+   char attr[SYSFS_ATTR_SIZE];
+   const char *name = list[i];
+   const char *path;
+   char buf[100];
+   int state;
+
+   rc = kmod_module_new_from_name(*ctx, name, mod);
+   if (rc) {
+   log_err(_ctx, "%s.ko: missing\n", name);
+   break;
+   }
+
+   path = kmod_module_get_path(*mod);
+   if (!path) {
+   log_err(_ctx, "%s.ko: failed to get path\n", name);
+   break;
+   }
+
+   if (!strstr(path, "/extra/")) {
+   log_err(_ctx, "%s.ko: appears to be production 
version: %s\n",
+   name, path);
+   break;
+   }
+
+   state = kmod_module_get_initstate(*mod);
+   if (state == KMOD_MODULE_LIVE) {
+   sprintf(buf, "/sys/module/%s/taint", 

Re: [PATCH] pmem: return EIO on read_pmem() failure

2017-01-05 Thread Jeff Moyer
Stefan Hajnoczi  writes:

> The read_pmem() function uses memcpy_mcsafe() on x86 where an EFAULT
> error code indicates a failed read.  Block I/O should use EIO to
> indicate failure.  Other pmem code paths (like bad blocks) already use
> EIO so let's be consistent.

I agree that it's the right thing to return, though right now it
probably doesn't matter.  As Dan said, all paths leading back to
userspace interpret any non-zero value as EIO.

Reviewed-by: Jeff Moyer 

> Cc: Dan Williams 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  drivers/nvdimm/pmem.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 7282d74..5b536be 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -90,7 +90,9 @@ static int read_pmem(struct page *page, unsigned int off,
>  
>   rc = memcpy_from_pmem(mem + off, pmem_addr, len);
>   kunmap_atomic(mem);
> - return rc;
> + if (rc)
> + return -EIO;
> + return 0;
>  }
>  
>  static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,


Re: [ndctl PATCH] ndctl: support machines without numa

2017-01-05 Thread Jeff Moyer
Dan Williams  writes:

> From: Diego Dompe 
>
> Currently when using ndctl in machines that do not have numa, the ndctl
> command fails because it fails to find the numa_node attribute in
> systems.

s/machines that do not have numa/kernels compiled with CONFIG_NUMA=n/

I don't know that this was the right decision for the kernel, but that
ship's sailed.

> Just assume the default, zero, in these cases rather than fail.

Looks fine to me.

Reviewed-by: Jeff Moyer 

>
> Link: https://github.com/pmem/ndctl/pull/9
> [djbw: minor style fixups]
> Signed-off-by: Dan Williams 
> ---
>
> Diego, if this reworked version looks good to you I'll add your
> "Signed-off-by: Diego Dompe " to the tags. Let me know.
> Thanks for the fix!
>
>  ndctl/lib/libndctl.c |5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/ndctl/lib/libndctl.c b/ndctl/lib/libndctl.c
> index ea3111eb6c93..8240235dff5c 100644
> --- a/ndctl/lib/libndctl.c
> +++ b/ndctl/lib/libndctl.c
> @@ -2780,9 +2780,8 @@ static void *add_namespace(void *parent, int id, const 
> char *ndns_base)
>   ndns->raw_mode = strtoul(buf, NULL, 0);
>  
>   sprintf(path, "%s/numa_node", ndns_base);
> - if (sysfs_read_attr(ctx, path, buf) < 0)
> - goto err_read;
> - ndns->numa_node = strtol(buf, NULL, 0);
> + if (sysfs_read_attr(ctx, path, buf) == 0)
> + ndns->numa_node = strtol(buf, NULL, 0);
>  
>   switch (ndns->type) {
>   case ND_DEVICE_NAMESPACE_BLK:
>


Re: [PATCH v5] x86: fix kaslr and memmap collision

2017-01-05 Thread Baoquan He
On 01/04/17 at 11:29am, Dave Jiang wrote:
> CONFIG_RANDOMIZE_BASE relocates the kernel to a random base address.
> However it does not take into account the memmap= parameter passed in from
> the kernel cmdline. This results in the kernel sometimes being put in
> the middle of a memmap region. Teach kaslr not to insert the kernel in
> memmap-defined regions. We will support up to 4 memmap regions; any
> additional regions will cause kaslr to be disabled. The mem_avoid set has
> been augmented to add up to 4 unusable memmap regions provided by the
> user, to exclude those regions from the set of valid address ranges for
> placing the uncompressed kernel image. The nn@ss ranges will be skipped by
> the mem_avoid set since they indicate usable memory.
> 
> Signed-off-by: Dave Jiang 
> ---
> 
> v2:
> Addressing comments from Ingo.
> - Handle entire list of memmaps
> v3:
> Fix 32bit build issue
> v4:
> Addressing comments from Baoquan
> - Not exclude nn@ss ranges
> v5:
> Addressing additional comments from Baoquan
> - Update commit header and various coding style changes
> 
> diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
> index e5612f3..59c2075 100644
> --- a/arch/x86/boot/boot.h
> +++ b/arch/x86/boot/boot.h
> @@ -332,7 +332,10 @@ int strncmp(const char *cs, const char *ct, size_t 
> count);
>  size_t strnlen(const char *s, size_t maxlen);
>  unsigned int atou(const char *s);
>  unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int 
> base);
> +unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base);
> +long simple_strtol(const char *cp, char **endp, unsigned int base);
>  size_t strlen(const char *s);
> +char *strchr(const char *s, int c);
>  
>  /* tty.c */
>  void puts(const char *);
> diff --git a/arch/x86/boot/compressed/kaslr.c 
> b/arch/x86/boot/compressed/kaslr.c
> index a66854d..036b514 100644
> --- a/arch/x86/boot/compressed/kaslr.c
> +++ b/arch/x86/boot/compressed/kaslr.c
> @@ -11,6 +11,7 @@
>   */
>  #include "misc.h"
>  #include "error.h"
> +#include "../boot.h"
>  
>  #include 
>  #include 
> @@ -56,11 +57,16 @@ struct mem_vector {
>   unsigned long size;
>  };
>  
> +/* only supporting at most 4 unusable memmap regions with kaslr */
> +#define MAX_MEMMAP_REGIONS   4
> +
>  enum mem_avoid_index {
>   MEM_AVOID_ZO_RANGE = 0,
>   MEM_AVOID_INITRD,
>   MEM_AVOID_CMDLINE,
>   MEM_AVOID_BOOTPARAMS,
> + MEM_AVOID_MEMMAP_BEGIN,
> + MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
>   MEM_AVOID_MAX,
>  };
>  
> @@ -77,6 +83,121 @@ static bool mem_overlaps(struct mem_vector *one, struct 
> mem_vector *two)
>   return true;
>  }
>  
> +/**
> + *   _memparse - parse a string with mem suffixes into a number
> + *   @ptr: Where parse begins
> + *   @retptr: (output) Optional pointer to next char after parse completes
> + *
> + *   Parses a string into a number.  The number stored at @ptr is
> + *   potentially suffixed with K, M, G, T, P, E.
> + */
> +static unsigned long long _memparse(const char *ptr, char **retptr)
> +{
> + char *endptr;   /* local pointer to end of parsed string */
> +
> + unsigned long long ret = simple_strtoull(ptr, &endptr, 0);
> +
> + switch (*endptr) {
> + case 'E':
> + case 'e':
> + ret <<= 10;
> + case 'P':
> + case 'p':
> + ret <<= 10;
> + case 'T':
> + case 't':
> + ret <<= 10;
> + case 'G':
> + case 'g':
> + ret <<= 10;
> + case 'M':
> + case 'm':
> + ret <<= 10;
> + case 'K':
> + case 'k':
> + ret <<= 10;
> + endptr++;
> + default:
> + break;
> + }
> +
> + if (retptr)
> + *retptr = endptr;
> +
> + return ret;
> +}
> +
> +static int
> +parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
> +{
> + char *oldp;
> +
> + if (!p)
> + return -EINVAL;
> +
> + /* we don't care about this option here */
> + if (!strncmp(p, "exactmap", 8))
> + return -EINVAL;
> +
> + oldp = p;
> + *size = _memparse(p, &p);
> + if (p == oldp)
> + return -EINVAL;
> +
> + switch (*p) {
> + case '@':
> + /* skip this region, usable */
> + *start = 0;
> + *size = 0;
> + return 0;
> + case '#':
> + case '$':
> + case '!':
> + *start = _memparse(p + 1, &p);
> + return 0;
> + }
> +
> + return -EINVAL;
> +}
> +
> +static int mem_avoid_memmap(void)
> +{
> + char arg[128];
> + int rc = 0;
> +
> + /* see if we have any memmap areas */
> + if (cmdline_find_option("memmap", arg, sizeof(arg)) > 0) {
> + int i = 0;
> + char *str = arg;
> +
> + while (str && (i < MAX_MEMMAP_REGIONS)) {
> + unsigned long long start, size;
> + char *k = strchr(str, ',');
> +