[PATCH v2] dma-buf: Rename dma-ops to prevent conflict with kunmap_atomic macro

2017-04-19 Thread Logan Gunthorpe
Seeing the kunmap_atomic dma_buf_ops share the same name with a macro
in highmem.h, the former can be aliased if any dma-buf user includes
that header.

I'm personally trying to include highmem.h inside scatterlist.h and this
breaks the dma-buf code proper.

Christoph Hellwig suggested [1] renaming it and pushing this patch ASAP.

To maintain consistency I've renamed all four of kmap* and kunmap* to be
map* and unmap*. (Even though only kmap_atomic presently conflicts.)

[1] https://www.spinics.net/lists/target-devel/msg15070.html

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Sinclair Yeh <s...@vmware.com>
---

Changes since v1:

- Added the missing tegra driver (noticed by kbuild robot)
- Rebased off of drm-intel-next to get the i915 selftest that is new
- Fixed nits Sinclair pointed out.

 drivers/dma-buf/dma-buf.c  | 16 
 drivers/gpu/drm/armada/armada_gem.c|  8 
 drivers/gpu/drm/drm_prime.c|  8 
 drivers/gpu/drm/i915/i915_gem_dmabuf.c |  8 
 drivers/gpu/drm/i915/selftests/mock_dmabuf.c   |  8 
 drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c  |  8 
 drivers/gpu/drm/tegra/gem.c|  8 
 drivers/gpu/drm/udl/udl_dmabuf.c   |  8 
 drivers/gpu/drm/vmwgfx/vmwgfx_prime.c  |  8 
 drivers/media/v4l2-core/videobuf2-dma-contig.c |  4 ++--
 drivers/media/v4l2-core/videobuf2-dma-sg.c |  4 ++--
 drivers/media/v4l2-core/videobuf2-vmalloc.c|  4 ++--
 drivers/staging/android/ion/ion.c  |  8 
 include/linux/dma-buf.h| 22 +++---
 14 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index f72aaac..512bdbc 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -405,8 +405,8 @@ struct dma_buf *dma_buf_export(const struct 
dma_buf_export_info *exp_info)
  || !exp_info->ops->map_dma_buf
  || !exp_info->ops->unmap_dma_buf
  || !exp_info->ops->release
- || !exp_info->ops->kmap_atomic
- || !exp_info->ops->kmap
+ || !exp_info->ops->map_atomic
+ || !exp_info->ops->map
  || !exp_info->ops->mmap)) {
return ERR_PTR(-EINVAL);
}
@@ -872,7 +872,7 @@ void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, unsigned 
long page_num)
 {
WARN_ON(!dmabuf);

-   return dmabuf->ops->kmap_atomic(dmabuf, page_num);
+   return dmabuf->ops->map_atomic(dmabuf, page_num);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kmap_atomic);

@@ -889,8 +889,8 @@ void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, unsigned 
long page_num,
 {
WARN_ON(!dmabuf);

-   if (dmabuf->ops->kunmap_atomic)
-   dmabuf->ops->kunmap_atomic(dmabuf, page_num, vaddr);
+   if (dmabuf->ops->unmap_atomic)
+   dmabuf->ops->unmap_atomic(dmabuf, page_num, vaddr);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kunmap_atomic);

@@ -907,7 +907,7 @@ void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long 
page_num)
 {
WARN_ON(!dmabuf);

-   return dmabuf->ops->kmap(dmabuf, page_num);
+   return dmabuf->ops->map(dmabuf, page_num);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kmap);

@@ -924,8 +924,8 @@ void dma_buf_kunmap(struct dma_buf *dmabuf, unsigned long 
page_num,
 {
WARN_ON(!dmabuf);

-   if (dmabuf->ops->kunmap)
-   dmabuf->ops->kunmap(dmabuf, page_num, vaddr);
+   if (dmabuf->ops->unmap)
+   dmabuf->ops->unmap(dmabuf, page_num, vaddr);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kunmap);

diff --git a/drivers/gpu/drm/armada/armada_gem.c 
b/drivers/gpu/drm/armada/armada_gem.c
index 1597458..d6c2a5d 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -529,10 +529,10 @@ static const struct dma_buf_ops 
armada_gem_prime_dmabuf_ops = {
.map_dma_buf= armada_gem_prime_map_dma_buf,
.unmap_dma_buf  = armada_gem_prime_unmap_dma_buf,
.release= drm_gem_dmabuf_release,
-   .kmap_atomic= armada_gem_dmabuf_no_kmap,
-   .kunmap_atomic  = armada_gem_dmabuf_no_kunmap,
-   .kmap   = armada_gem_dmabuf_no_kmap,
-   .kunmap = armada_gem_dmabuf_no_kunmap,
+   .map_atomic = armada_gem_dmabuf_no_kmap,
+   .unmap_atomic   = armada_gem_dmabuf_no_kunmap,
+   .map= armada_gem_dmabuf_no_kmap,
+   .unmap  = armada_gem_dmabuf_no_kunmap,
.mmap   = armada_gem_dmabuf_mmap,
 };

diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 9fb65b7..954eb84 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/d

Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-19 Thread Logan Gunthorpe


On 19/04/17 01:31 PM, Jason Gunthorpe wrote:
> Try it with VT-D turned on. It shouldn't work or there is a notable
> security hole in your platform..

Ah, ok.

>>>  const struct dma_map_ops *comp_ops = get_dma_ops(completer);
>>>  const struct dma_map_ops *init_ops = get_dma_ops(initiator);
>>
>> So, in this case, what device does the completer point to? The PCI
>> device or a more specific GPU device? If it's the former, who's
>> responsible for setting the new dma_ops? Typically the dma_ops are arch
>> specific but now you'd be adding ones that are tied to hmm or the gpu.
> 
> Donno, that is for GPU folks to figure out :)
> 
> But.. it could point to a GPU and the GPU struct device could have a
> proxy dma_ops like Dan pointed out.

Seems a bit awkward to me that in order for the intended use case, you
have to proxy the dma_ops. I'd probably still suggest throwing a couple
ops for things like this in the dev_pagemap.

>> It appears to me like it's calculating the DMA address, and the check is
>> just a side requirement. It reads as though it's only doing the check.
> 
> pci_p2p_same_segment_get_pa() then?

Ok, I think that's a bit clearer.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-19 Thread Logan Gunthorpe


On 19/04/17 12:32 PM, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2017 at 12:01:39PM -0600, Logan Gunthorpe wrote:
> Not entirely, it would have to call through the whole process
> including the arch_p2p_cross_segment()..

Hmm, yes. Though it's still not clear what, if anything,
arch_p2p_cross_segment would be doing. In my experience, if you are
going between host bridges, the CPU address (or PCI address -- I'm not
sure which seeing they are the same on my system) would still work fine
-- it just _may_ be a bad idea because of performance.

Similarly if you are crossing via a QPI bus or similar, I'd expect the
CPU address to work fine. But here the performance is even less likely
to be any good.


>  // Try the arch specific helper
>const struct dma_map_ops *comp_ops = get_dma_ops(completer);
>const struct dma_map_ops *init_ops = get_dma_ops(initiator);

So, in this case, what device does the completer point to? The PCI
device or a more specific GPU device? If it's the former, who's
responsible for setting the new dma_ops? Typically the dma_ops are arch
specific but now you'd be adding ones that are tied to hmm or the gpu.

>> I'm not sure I like the name pci_p2p_same_segment. It reads as though
>> it's only checking if the devices are not the same segment.
> 
> Well, that is exactly what it is doing. If it succeeds then the caller
> knows the DMA will not flow outside the segment and no iommu setup/etc
> is required.

It appears to me like it's calculating the DMA address, and the check is
just a side requirement. It reads as though it's only doing the check.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-15 Thread Logan Gunthorpe
Thanks, Benjamin, for the summary of some of the issues.

On 14/04/17 04:07 PM, Benjamin Herrenschmidt wrote
> So I assume the p2p code provides a way to address that too via special
> dma_ops ? Or wrappers ?

Not at this time. We will probably need a way to ensure the iommus do
not attempt to remap these addresses. Though if it does, I'd expect
everything would still work you just wouldn't get the performance or
traffic flow you are looking for. We've been testing with the software
iommu which doesn't have this problem.

> The problem is that the latter while seemingly easier, is also slower
> and not supported by all platforms and architectures (for example,
> POWER currently won't allow it, or rather only allows a store-only
> subset of it under special circumstances).

Yes, I think situations where we have to cross host bridges will remain
unsupported by this work for a long time. There are two many cases where
it just doesn't work or it performs too poorly to be useful.

> I don't fully understand how p2pmem "solves" that by creating struct
> pages. The offset problem is one issue. But there's the iommu issue as
> well, the driver cannot just use the normal dma_map ops.

We are not using a proper iommu and we are dealing with systems that
have zero offset. This case is also easily supported. I expect fixing
the iommus to not map these addresses would also be reasonably achievable.

Logan


Re: [PATCH 03/22] libiscsi: Make use of new the sg_map helper function

2017-04-14 Thread Logan Gunthorpe


On 14/04/17 02:36 AM, Christoph Hellwig wrote:
> On Thu, Apr 13, 2017 at 04:05:16PM -0600, Logan Gunthorpe wrote:
>> Convert the kmap and kmap_atomic uses to the sg_map function. We now
>> store the flags for the kmap instead of a boolean to indicate
>> atomicitiy. We also propogate a possible kmap error down and create
>> a new ISCSI_TCP_INTERNAL_ERR error type for this.
> 
> Can you split out the new error handling into a separate prep patch
> which should go to the iscsi maintainers ASAP?
> 

Yes, I can do that. I'd just have thought they'd want to see the use
case for the new error before accepting a patch like that...

Logan


Re: [PATCH 04/22] target: Make use of the new sg_map function at 16 call sites (fwd)

2017-04-14 Thread Logan Gunthorpe
Thanks Julia. I missed that and I'll fix it in my series.

Logan

On 14/04/17 09:19 AM, Julia Lawall wrote:
> It looks like >cmdr_lock should be released at line 512 if it has
> not been released otherwise.  The lock was taken at line 438.
> 
> julia
> 
> -- Forwarded message --
> Date: Fri, 14 Apr 2017 22:21:44 +0800
> From: kbuild test robot <fengguang...@intel.com>
> To: kbu...@01.org
> Cc: Julia Lawall <julia.law...@lip6.fr>
> Subject: Re: [PATCH 04/22] target: Make use of the new sg_map function at 16
> call sites
> 
> Hi Logan,
> 
> [auto build test WARNING on scsi/for-next]
> [also build test WARNING on v4.11-rc6]
> [cannot apply to next-20170413]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Logan-Gunthorpe/Introduce-common-scatterlist-map-function/20170414-142518
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi.git for-next
> :: branch date: 8 hours ago
> :: commit date: 8 hours ago
> 
>>> drivers/target/target_core_user.c:512:2-8: preceding lock on line 438
>drivers/target/target_core_user.c:512:2-8: preceding lock on line 471
> 
> git remote add linux-review https://github.com/0day-ci/linux
> git remote update linux-review
> git checkout 78082134e7afdc59d744eb8d2def5c588e89c378
> vim +512 drivers/target/target_core_user.c
> 
> 7c9e7a6f Andy Grover  2014-10-01  432 
> sizeof(struct tcmu_cmd_entry));
> 7c9e7a6f Andy Grover  2014-10-01  433 command_size = base_command_size
> 7c9e7a6f Andy Grover  2014-10-01  434 + 
> round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE);
> 7c9e7a6f Andy Grover  2014-10-01  435
> 7c9e7a6f Andy Grover  2014-10-01  436 WARN_ON(command_size & 
> (TCMU_OP_ALIGN_SIZE-1));
> 7c9e7a6f Andy Grover  2014-10-01  437
> 7c9e7a6f Andy Grover  2014-10-01 @438 spin_lock_irq(>cmdr_lock);
> 7c9e7a6f Andy Grover  2014-10-01  439
> 7c9e7a6f Andy Grover  2014-10-01  440 mb = udev->mb_addr;
> 7c9e7a6f Andy Grover  2014-10-01  441 cmd_head = mb->cmd_head % 
> udev->cmdr_size; /* UAM */
> 26418649 Sheng Yang   2016-02-26  442 data_length = 
> se_cmd->data_length;
> 26418649 Sheng Yang   2016-02-26  443 if (se_cmd->se_cmd_flags & 
> SCF_BIDI) {
> 26418649 Sheng Yang   2016-02-26  444 
> BUG_ON(!(se_cmd->t_bidi_data_sg && se_cmd->t_bidi_data_nents));
> 26418649 Sheng Yang   2016-02-26  445 data_length += 
> se_cmd->t_bidi_data_sg->length;
> 26418649 Sheng Yang   2016-02-26  446 }
> 554617b2 Andy Grover  2016-08-25  447 if ((command_size > 
> (udev->cmdr_size / 2)) ||
> 554617b2 Andy Grover  2016-08-25  448 data_length > 
> udev->data_size) {
> 554617b2 Andy Grover  2016-08-25  449 pr_warn("TCMU: Request 
> of size %zu/%zu is too big for %u/%zu "
> 3d9b9555 Andy Grover  2016-08-25  450 "cmd ring/data 
> area\n", command_size, data_length,
> 7c9e7a6f Andy Grover  2014-10-01  451 
> udev->cmdr_size, udev->data_size);
> 554617b2 Andy Grover  2016-08-25  452 
> spin_unlock_irq(>cmdr_lock);
> 554617b2 Andy Grover  2016-08-25  453 return 
> TCM_INVALID_CDB_FIELD;
> 554617b2 Andy Grover  2016-08-25  454 }
> 7c9e7a6f Andy Grover  2014-10-01  455
> 26418649 Sheng Yang   2016-02-26  456 while 
> (!is_ring_space_avail(udev, command_size, data_length)) {
> 7c9e7a6f Andy Grover  2014-10-01  457 int ret;
> 7c9e7a6f Andy Grover  2014-10-01  458 DEFINE_WAIT(__wait);
> 7c9e7a6f Andy Grover  2014-10-01  459
> 7c9e7a6f Andy Grover  2014-10-01  460 
> prepare_to_wait(>wait_cmdr, &__wait, TASK_INTERRUPTIBLE);
> 7c9e7a6f Andy Grover  2014-10-01  461
> 7c9e7a6f Andy Grover  2014-10-01  462 pr_debug("sleeping for 
> ring space\n");
> 7c9e7a6f Andy Grover  2014-10-01  463 
> spin_unlock_irq(>cmdr_lock);
> 7c9e7a6f Andy Grover  2014-10-01  464 ret = 
> schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT));
> 7c9e7a6f Andy Grover  2014-10-01  465 
> finish_wait(>wait_cmdr, &__wait);
> 7c9e7a6f Andy Grover  2014-10-01  466 if (!ret) {
> 7c9e7a6f Andy Grover  2014-10-01  467 pr_warn("tcmu: 
> command timed out\n");
> 02eb924f Andy Grover  2016-10-06  468 

Re: [PATCH 10/22] staging: unisys: visorbus: Make use of the new sg_map helper function

2017-04-14 Thread Logan Gunthorpe
Great, thanks!

Logan

On 14/04/17 10:07 AM, Kershner, David A wrote:
> Can you add Acked-by for this patch? 
> 
> Acked-by: David Kershner 
> 
> Tested on s-Par and no problems. 
> 
> Thanks,
> David Kershner
> 
>> ---
>>  drivers/staging/unisys/visorhba/visorhba_main.c | 12 +++-
>>  1 file changed, 7 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/staging/unisys/visorhba/visorhba_main.c
>> b/drivers/staging/unisys/visorhba/visorhba_main.c
>> index 0ce92c8..2d8c8bc 100644
>> --- a/drivers/staging/unisys/visorhba/visorhba_main.c
>> +++ b/drivers/staging/unisys/visorhba/visorhba_main.c
>> @@ -842,7 +842,6 @@ do_scsi_nolinuxstat(struct uiscmdrsp *cmdrsp, struct
>> scsi_cmnd *scsicmd)
>>  struct scatterlist *sg;
>>  unsigned int i;
>>  char *this_page;
>> -char *this_page_orig;
>>  int bufind = 0;
>>  struct visordisk_info *vdisk;
>>  struct visorhba_devdata *devdata;
>> @@ -869,11 +868,14 @@ do_scsi_nolinuxstat(struct uiscmdrsp *cmdrsp,
>> struct scsi_cmnd *scsicmd)
>>
>>  sg = scsi_sglist(scsicmd);
>>  for (i = 0; i < scsi_sg_count(scsicmd); i++) {
>> -this_page_orig = kmap_atomic(sg_page(sg + i));
>> -this_page = (void *)((unsigned long)this_page_orig |
>> - sg[i].offset);
>> +this_page = sg_map(sg + i, SG_KMAP_ATOMIC);
>> +if (IS_ERR(this_page)) {
>> +scsicmd->result = DID_ERROR << 16;
>> +return;
>> +}
>> +
>>  memcpy(this_page, buf + bufind, sg[i].length);
>> -kunmap_atomic(this_page_orig);
>> +sg_unmap(sg + i, this_page, SG_KMAP_ATOMIC);
>>  }
>>  } else {
>>  devdata = (struct visorhba_devdata *)scsidev->host-
>>> hostdata;
>> --
>> 2.1.4
> 


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-15 Thread Logan Gunthorpe


On 15/04/17 09:01 PM, Benjamin Herrenschmidt wrote:
> Are ZONE_DEVICE pages identifiable based on the struct page alone ? (a
> flag ?)

Well you can't use ZONE_DEVICE as an indicator. They may be regular RAM,
(eg. pmem). It would need a separate flag indicating it is backed by iomem.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-15 Thread Logan Gunthorpe


On 15/04/17 04:17 PM, Benjamin Herrenschmidt wrote:
> You can't. If the iommu is on, everything is remapped. Or do you mean
> to have dma_map_* not do a remapping ?

Well, yes, you'd have to change the code so that iomem pages do not get
remapped and the raw BAR address is passed to the DMA engine. I said
specifically we haven't done this at this time but it really doesn't
seem like an unsolvable problem. It is something we will need to address
before a proper patch set is posted though.

> That's the problem again, same as before, for that to work, the
> dma_map_* ops would have to do something special that depends on *both*
> the source and target device.

No, I don't think you have to do things different based on the source.
Have the p2pmem device layer restrict allocating p2pmem based on the
devices in use (similar to how the RFC code works now) and when the dma
mapping code sees iomem pages it just needs to leave the address alone
so it's used directly by the dma in question.

It's much better to make the decision on which memory to use when you
allocate it. If you wait until you map it, it would be a pain to fall
back to system memory if it doesn't look like it will work. So, if when
you allocate it, you know everything will work you just need the dma
mapping layer to stay out of the way.

> The dma_ops today are architecture specific and have no way to
> differenciate between normal and those special P2P DMA pages.

Correct, unless Dan's idea works (which will need some investigation),
we'd need a flag in struct page or some other similar method to
determine that these are special iomem pages.

>> Though if it does, I'd expect
>> everything would still work you just wouldn't get the performance or
>> traffic flow you are looking for. We've been testing with the software
>> iommu which doesn't have this problem.
> 
> So first, no, it's more than "you wouldn't get the performance". On
> some systems it may also just not work. Also what do you mean by "the
> SW iommu doesn't have this problem" ? It catches the fact that
> addresses don't point to RAM and maps differently ?

I haven't tested it but I can't imagine why an iommu would not correctly
map the memory in the bar. But that's _way_ beside the point. We
_really_ want to avoid that situation anyway. If the iommu maps the
memory it defeats what we are trying to accomplish.

I believe the sotfware iommu only uses bounce buffers if the DMA engine
in use cannot address the memory. So in most cases, with modern
hardware, it just passes the BAR's address to the DMA engine and
everything works. The code posted in the RFC does in fact work without
needing to do any of this fussing.

>>> The problem is that the latter while seemingly easier, is also slower
>>> and not supported by all platforms and architectures (for example,
>>> POWER currently won't allow it, or rather only allows a store-only
>>> subset of it under special circumstances).
>>
>> Yes, I think situations where we have to cross host bridges will remain
>> unsupported by this work for a long time. There are two many cases where
>> it just doesn't work or it performs too poorly to be useful.
> 
> And the situation where you don't cross bridges is the one where you
> need to also take into account the offsets.

I think for the first incarnation we will just not support systems that
have offsets. This makes things much easier and still supports all the
use cases we are interested in.

> So you are designing something that is built from scratch to only work
> on a specific limited category of systems and is also incompatible with
> virtualization.

Yes, we are starting with support for specific use cases. Almost all
technology starts that way. Dax has been in the kernel for years and
only recently has someone submitted patches for it to support pmem on
powerpc. This is not unusual. If you had forced the pmem developers to
support all architectures in existence before allowing them upstream
they couldn't possibly be as far as they are today.

Virtualization specifically would be a _lot_ more difficult than simply
supporting offsets. The actual topology of the bus will probably be lost
on the guest OS and it would therefor have a difficult time figuring out
when it's acceptable to use p2pmem. I also have a difficult time seeing
a use case for it and thus I have a hard time with the argument that we
can't support use cases that do want it because use cases that don't
want it (perhaps yet) won't work.

> This is an interesting experiement to look at I suppose, but if you
> ever want this upstream I would like at least for you to develop a
> strategy to support the wider case, if not an actual implementation.

I think there are plenty of avenues forward to support offsets, etc.
It's just work. Nothing we'd be proposing would be incompatible with it.
We just don't want to have to do it all upfront especially when no one
really knows how well various architecture's hardware supports this or
if 

Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-16 Thread Logan Gunthorpe


On 16/04/17 09:53 AM, Dan Williams wrote:
> ZONE_DEVICE allows you to redirect via get_dev_pagemap() to retrieve
> context about the physical address in question. I'm thinking you can
> hang bus address translation data off of that structure. This seems
> vaguely similar to what HMM is doing.

Thanks! I didn't realize you had the infrastructure to look up a device
from a pfn/page. That would really come in handy for us.

Logan



Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-16 Thread Logan Gunthorpe


On 16/04/17 09:44 AM, Dan Williams wrote:
> I think we very much want the dma mapping layer to be in the way.
> It's the only sane semantic we have to communicate this translation.

Yes, I wasn't proposing bypassing that layer, per say. I just meant that
the layer would, in the end, have to return the address without any
translations.

> The difference is that there was nothing fundamental in the core
> design of pmem + DAX that prevented other archs from growing pmem
> support. THP and memory hotplug existed on other architectures and
> they just need to plug in their arch-specific enabling. p2p support
> needs the same starting point of something more than one architecture
> can plug into, and handling the bus address offset case needs to be
> incorporated into the design.

I don't think there's a difference there either. There'd have been
nothing fundamental in our core design that says offsets couldn't have
been added later.

> pmem + dax did not change the meaning of what a dma_addr_t is, p2p does.

I don't think p2p actually really changes the meaning of dma_addr_t
either. We are just putting addresses in there that weren't used
previously. Our RFC makes no changes to anything even remotely related
to dma_addr_t.

> I think you need to give other archs a chance to support this with a
> design that considers the offset case as a first class citizen rather
> than an afterthought.

I'll consider this. Given the fact I can use your existing
get_dev_pagemap infrastructure to look up the p2pmem device this
probably isn't as hard as I thought it would be anyway (we probably
don't even need a page flag). We'd just have lookup the dev_pagemap,
test if it's a p2pmem device, and if so, call a p2pmem_dma_map function
which could apply the offset or do any other arch specific logic (if
necessary).

Logan


Re: [PATCH 01/22] scatterlist: Introduce sg_map helper functions

2017-04-14 Thread Logan Gunthorpe


On 14/04/17 02:35 AM, Christoph Hellwig wrote:
>> +
>>  static inline int is_dma_buf_file(struct file *);
>>  
>>  struct dma_buf_list {
> 
> I think the right fix here is to rename the operation to unmap_atomic
> and send out a little patch for that ASAP.

Ok, I can do that next week.

> I'd rather have separate functions for kmap vs kmap_atomic instead of
> the flags parameter.  And while you're at it just always pass the 0
> offset parameter instead of adding a wrapper..
> 
> Otherwise this looks good to me.

I settled on the flags because I thought the interface could be expanded
to do more things like automatically copy iomem to a bounce buffer (with
a flag). It'd also be possible to add things like vmap and
physical_address to the interface which would cover even more sg_page
users. All the implementations would then share the common offset
calculations, and switching between them becomes a matter of changing a
couple flags.

If you're still not convinced by the above arguments  then I'll change
it but I did have reasons for choosing to do it this way.

I am fine with removing the offset versions. I will make that change.

Thanks,

Logan



Re: [PATCH 09/22] dm-crypt: Make use of the new sg_map helper in 4 call sites

2017-04-14 Thread Logan Gunthorpe


On 14/04/17 02:39 AM, Christoph Hellwig wrote:
> On Thu, Apr 13, 2017 at 04:05:22PM -0600, Logan Gunthorpe wrote:
>> Very straightforward conversion to the new function in all four spots.
> 
> I think the right fix here is to switch dm-crypt to the ahash API
> that takes a scatterlist.

Hmm, well I'm not sure I understand the code enough to make that
conversion. But I was looking at it. One tricky bit seems to be that
crypt_iv_lmk_one adds a seed, skips the first 16 bytes in the page and
then hashes another 16 bytes from other data. What would you do
construct a new sgl for it and pass it to the ahash api?

The other thing is crypt_iv_lmk_post also seems to modify the page after
the hash with a  crypto_xor so you'd still need at least one kmap in there.

Logan


Re: [PATCH 16/22] xen-blkfront: Make use of the new sg_map helper function

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 08:27 AM, Konrad Rzeszutek Wilk wrote:
> Interesting that you didn't CC any of the maintainers. Could you 
> do that in the future please?

Please read the cover letter. The distribution list for the patchset
would have been way too large to cc every maintainer (even as limited as
it was, I had mailing lists yelling at me). My plan was to get buy in
for the first patch, get it merged and resend the rest independently to
their respective maintainers. Of course, though, I'd be open to other
suggestions.

>>>
>>> Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
>>> ---
>>>  drivers/block/xen-blkfront.c | 33 +++--
>>>  1 file changed, 27 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>>> index 5067a0a..7dcf41d 100644
>>> --- a/drivers/block/xen-blkfront.c
>>> +++ b/drivers/block/xen-blkfront.c
>>> @@ -807,8 +807,19 @@ static int blkif_queue_rw_req(struct request *req, 
>>> struct blkfront_ring_info *ri
>>> BUG_ON(sg->offset + sg->length > PAGE_SIZE);
>>>
>>> if (setup.need_copy) {
>>> -   setup.bvec_off = sg->offset;
>>> -   setup.bvec_data = kmap_atomic(sg_page(sg));
>>> +   setup.bvec_off = 0;
>>> +   setup.bvec_data = sg_map(sg, SG_KMAP_ATOMIC);
>>> +   if (IS_ERR(setup.bvec_data)) {
>>> +   /*
>>> +* This should really never happen unless
>>> +* the code is changed to use memory that is
>>> +* not mappable in the sg. Seeing there is a
>>> +* questionable error path out of here,
>>> +* we WARN.
>>> +*/
>>> +   WARN(1, "Non-mappable memory used in sg!");
>>> +   return 1;
>>> +   }
>> ...
>>
>> Perhaps add a flag to mark failure as 'unexpected' and trace (and panic?)
>> inside sg_map().

Thanks, that's a good suggestion. I'll make the change for v2.

Logan


Re: [PATCH 16/22] xen-blkfront: Make use of the new sg_map helper function

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 09:50 AM, Konrad Rzeszutek Wilk wrote:
> I am not sure if you know, but you can add on each patch the respective
> maintainer via 'CC'. That way you can have certain maintainers CCed only
> on the subsystems they cover. You put it after (or before) your SoB and
> git send-email happilly picks it up.

Yes, but I've seen some maintainers complain when they receive a patch
with no context (ie. cover letter and first patch). So I chose to do it
this way. I expect in this situation, no matter what you do, someone is
going to complain about the approach chosen.

Thanks anyway for the tip.

Logan


Re: [PATCH 05/22] drm/i915: Make use of the new sg_map helper function

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 12:44 AM, Daniel Vetter wrote:
> On Thu, Apr 13, 2017 at 04:05:18PM -0600, Logan Gunthorpe wrote:
>> This is a single straightforward conversion from kmap to sg_map.
>>
>> Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
> 
> Acked-by: Daniel Vetter <daniel.vet...@ffwll.ch>
> 
> Probably makes sense to merge through some other tree, but please be aware
> of the considerable churn rate in i915 (i.e. make sure your tree is in
> linux-next before you send a pull request for this). Plane B would be to
> get the prep patch in first and then merge the i915 conversion one kernel
> release later.

Yes, as per what I said in my cover letter, I was leaning towards a
"Plan B" style approach.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-17 Thread Logan Gunthorpe


On 17/04/17 01:20 AM, Benjamin Herrenschmidt wrote:
> But is it ? For example take a GPU, does it, in your scheme, need an
> additional "p2pmem" child ? Why can't the GPU driver just use some
> helper to instantiate the necessary struct pages ? What does having an
> actual "struct device" child buys you ?

Yes, in this scheme, it needs an additional p2pmem child. Why is that an
issue? It certainly makes it a lot easier for the user to understand the
p2pmem memory in the system (through the sysfs tree) and reason about
the topology and when to use it. This is important.

> 
>> 2) In order to create the struct pages we use the ZONE_DEVICE
>> infrastructure which requires a struct device. (See
>> devm_memremap_pages.)
> 
> Yup, but you already have one in the actual pci_dev ... What is the
> benefit of adding a second one ?

But that would tie all of this very tightly to be pci only and may get
hard to differentiate if more users of ZONE_DEVICE crop up who happen to
be using a pci device. Having a specific class for this makes it very
clear how this memory would be handled. For example, although I haven't
looked into it, this could very well be a point of conflict with HMM. If
they were to use the pci device to populate the dev_pagemap then we
couldn't also use the pci device. I feel it's much better for users of
dev_pagemap to have their struct devices they own to avoid such conflicts.

> 
>>  This amazingly gets us the get_dev_pagemap
>> architecture which also uses a struct device. So by using a p2pmem
>> device we can go from struct page to struct device to p2pmem device
>> quickly and effortlessly.
> 
> Which isn't terribly useful in itself right ? What you care about is
> the "enclosing" pci_dev no ? Or am I missing something ?

Sure it is. What if we want to someday support p2pmem that's on another bus?

>> 3) You wouldn't want to use the pci's struct device because it doesn't
>> really describe what's going on. For example, there may be multiple
>> devices on the pci device in question: eg. an NVME card and some p2pmem.
> 
> What is "some p2pmem" ?
>> Or it could be a NIC with some p2pmem.
> 
> Again what is "some p2pmem" ?

Some device local memory intended for use as a DMA target from a
neighbour device or itself. On a PCI device, this would be a BAR, or a
portion of a BAR with memory behind it.

Keep in mind device classes tend to carve out common use cases and don't
have a one to one mapping with a physical pci card.

> That a device might have some memory-like buffer space is all well and
> good but does it need to be specifically distinguished at the device
> level ? It could be inherent to what the device is... for example again
> take the GPU example, why would you call the FB memory "p2pmem" ? 

Well if you are using it for p2p transactions why wouldn't you call it
p2pmem? There's no technical downside here except some vague argument
over naming. Once registered as p2pmem, that device will handle all the
dma map stuff for you and have a central obvious place to put code which
helps decide whether to use it or not based on topology.

I can certainly see an issue you'd have with the current RFC in that the
p2pmem device currently also handles memory allocation which a GPU would
 want to do itself. There are plenty of solutions to this though: we
could provide hooks for the parent device to override allocation or
something like that. However, the use cases I'm concerned with don't do
their own allocation so that is an important feature for them.

> Again I'm not sure why it needs to "instanciate a p2pmem" device. Maybe
> it's the term "p2pmem" that offputs me. If p2pmem allowed to have a
> standard way to lookup the various offsets etc... I mentioned earlier,
> then yes, it would make sense to have it as a staging point. As-is, I
> don't know. 

Well of course, at some point it would have a standard way to lookup
offsets and figure out what's necessary for a mapping. We wouldn't make
that separate from this, that would make no sense.

I also forgot:

4) We need someway in the kernel to configure drivers that use p2pmem.
That means it needs a unique name that the user can understand, lookup
and pass to other drivers. Then a way for those drivers to find it in
the system. A specific device class gets that for us in a very simple
fashion. We also don't want to have drivers like nvmet having to walk
every pci device to figure out where the p2p memory is and whether it
can use it.

IMO there are many clear benefits here and you haven't really offered an
alternative that provides the same features and potential for future use
cases.

Logan



Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 10:45 AM, Jason Gunthorpe wrote:
> From Ben's comments, I would think that the 'first class' support that
> is needed here is simply a function to return the 'struct device'
> backing a CPU address range.

Yes, and Dan's get_dev_pagemap suggestion gets us 90% of the way there.
It's just a disagreement as to what struct device is inside the pagemap.
Care needs to be taken to ensure that struct device doesn't conflict
with hmm and doesn't limit other potential future users of ZONE_DEVICE.

> If there is going to be more core support for this stuff I think it
> will be under the topic of more robustly describing the fabric to the
> core and core helpers to extract data from the description: eg compute
> the path, check if the path crosses translation, etc

Agreed, those helpers would be useful to everyone.

> I think the key agreement to get out of Logan's series is that P2P DMA
> means:
>  - The BAR will be backed by struct pages
>  - Passing the CPU __iomem address of the BAR to the DMA API is
>valid and, long term, dma ops providers are expected to fail
>or return the right DMA address

Well, yes but we have a _lot_ of work to do to make it safe to pass
around struct pages backed with __iomem. That's where our next focus
will be. I've already taken very initial steps toward this with my
scatterlist map patchset.

>  - Mapping BAR memory into userspace and back to the kernel via
>get_user_pages works transparently, and with the DMA API above

Again, we've had a lot of push back for the memory to go to userspace at
all. It does work, but people expect userspace to screw it up in a lot
of ways. Among the people pushing back on that: Christoph Hellwig has
specifically said he wants to see this stay with in-kernel users only
until the apis can be worked out. This is one of the reasons we decided
to go with enabling nvme-fabrics as everything remains in the kernel.
And with that decision we needed a common in-kernel allocation
infrastructure: this is what p2pmem really is at this point.

>  - The dma ops provider must be able to tell if source memory is bar
>mapped and recover the pci device backing the mapping.

Do you mean to say that every dma-ops provider needs to be taught about
p2p backed pages? I was hoping we could have dma_map_* just use special
p2p dma-ops if it was passed p2p pages (though there are some
complications to this too).

> At least this is what we'd like in RDMA :)
> 
> FWIW, RDMA probably wouldn't want to use a p2mem device either, we
> already have APIs that map BAR memory to user space, and would like to
> keep using them. A 'enable P2P for bar' helper function sounds better
> to me.

Well, in the end that will likely come down to just devm_memremap_pages
with some (presently undecided) struct device that can be used to get
special p2p dma-ops for the bus.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-19 Thread Logan Gunthorpe


On 19/04/17 09:55 AM, Jason Gunthorpe wrote:
> I was thinking only this one would be supported with a core code
> helper..

Pivoting slightly: I was looking at how HMM uses ZONE_DEVICE. They add a
type flag to the dev_pagemap structure which would be very useful to us.
We could add another MEMORY_DEVICE_P2P type to distinguish p2p pages.
Then, potentially, we could add a dma_map callback to the structure
(possibly unioned with an hmm field). The dev_ops providers would then
just need to do something like this (enclosed in a helper):

if (is_zone_device_page(page)) {
pgmap = get_dev_pagemap(page_to_pfn(page));
if (!pgmap || pgmap->type !=  MEMORY_DEVICE_P2P ||
!pgmap->dma_map)
return 0;

dma_addr = pgmap->dma_map(dev, pgmap->dev, page);
put_dev_pagemap(pgmap);
if (!dma_addr)
return 0;
...
}

The pci_enable_p2p_bar function would then just need to call
devm_memremap_pages with the dma_map callback set to a function that
does the segment check and the offset calculation.

Thoughts?

@Jerome: my feedback to you would be that your patch assumes all users
of devm_memremap_pages are MEMORY_DEVICE_PERSISTENT. It would be more
useful if it was generic. My suggestion would be to have the caller
allocate the dev_pagemap structure, populate it and pass it into
devm_memremap_pages. Given that pretty much everything in that structure
are already arguments to that function, I feel like this makes sense.
This should also help to unify hmm_devmem_pages_create and
devm_memremap_pages which look very similar to each other.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-19 Thread Logan Gunthorpe


On 19/04/17 11:14 AM, Jason Gunthorpe wrote:
> I don't see a use for the dma_map function pointer at this point..

Yes, it is kind of like designing for the future. I just find it a
little odd calling the pci functions in the iommu.


> It doesn't make alot of sense for the completor of the DMA to provide
> a mapping op, the mapping process is *path* specific, not specific to
> a completer/initiator.

I'm just spit balling here but if HMM wanted to use unaddressable memory
as a DMA target, it could set that function to create a window ine gpu
memory, then call the pci_p2p_same_segment and return the result as the
dma address.


> dma_addr_t pci_p2p_same_segment(struct device *initator,
> struct device *completer,
>   struct page *page)

I'm not sure I like the name pci_p2p_same_segment. It reads as though
it's only checking if the devices are not the same segment. It also may
be that, in the future, it supports devices on different segments. I'd
call it more like pci_p2p_dma_map.


> for (each sgl) {

Thanks, this code fleshes things out nicely


> To me it looks like the code duplication across the iommu stuff comes
> from just duplicating the basic iommu algorithm in every driver.

Yes, this is true.

> To clean that up I think someone would need to hoist the overall sgl
> loop and use more ops callbacks eg allocate_iommu_range,
> assign_page_to_rage, dealloc_range, etc. This is a problem p2p makes
> worse, but isn't directly causing :\

Yup.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-19 Thread Logan Gunthorpe


On 19/04/17 02:48 PM, Jason Gunthorpe wrote:
> On Wed, Apr 19, 2017 at 01:41:49PM -0600, Logan Gunthorpe wrote:
> 
>>> But.. it could point to a GPU and the GPU struct device could have a
>>> proxy dma_ops like Dan pointed out.
>>
>> Seems a bit awkward to me that in order for the intended use case, you
>> have to proxy the dma_ops. I'd probably still suggest throwing a couple
>> ops for things like this in the dev_pagemap.
> 
> Another option is adding a new 'struct completer_dma_ops *' to struct
> device for this use case.
> 
> Seems like a waste to expand dev_pagemap when we only need a unique
> value per struct device?

I feel like expanding dev_pagemap has a much lower impact than expanding
struct device... dev_pagemap is only one instance per zone device region
so expanding it shouldn't be a huge issue. Expanding struct device means
every device struct in the system gets bigger.

Logan



[PATCH] libiscsi: Add an internal error code

2017-04-21 Thread Logan Gunthorpe
This is a prep patch to add a new error code to libiscsi. We want to
rework some kmap calls to be able to fail. When we do, we'd like to
use this error code.

This patch simply introduces ISCSI_TCP_INTERNAL_ERR and prints
"Internal Error." when it gets hit.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
---

Please see these links for additional context.

https://patchwork.kernel.org/patch/9680329/
https://patchwork.kernel.org/patch/9680091/

Thanks,

Logan

 drivers/scsi/cxgbi/libcxgbi.c | 5 +
 include/scsi/libiscsi_tcp.h   | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index bd7d39e..e38d0c1 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -1556,6 +1556,11 @@ static inline int read_pdu_skb(struct iscsi_conn *conn,
 */
iscsi_conn_printk(KERN_ERR, conn, "Invalid pdu or skb.");
return -EFAULT;
+   case ISCSI_TCP_INTERNAL_ERR:
+   pr_info("skb 0x%p, off %u, %d, TCP_INTERNAL_ERR.\n",
+   skb, offset, offloaded);
+   iscsi_conn_printk(KERN_ERR, conn, "Internal error.");
+   return -EFAULT;
case ISCSI_TCP_SEGMENT_DONE:
log_debug(1 << CXGBI_DBG_PDU_RX,
"skb 0x%p, off %u, %d, TCP_SEG_DONE, rc %d.\n",
diff --git a/include/scsi/libiscsi_tcp.h b/include/scsi/libiscsi_tcp.h
index 30520d5..90691ad 100644
--- a/include/scsi/libiscsi_tcp.h
+++ b/include/scsi/libiscsi_tcp.h
@@ -92,6 +92,7 @@ enum {
ISCSI_TCP_SKB_DONE, /* skb is out of data */
ISCSI_TCP_CONN_ERR, /* iscsi layer has fired a conn err */
ISCSI_TCP_SUSPENDED,/* conn is suspended */
+   ISCSI_TCP_INTERNAL_ERR, /* an internal error occurred */
 };

 extern void iscsi_tcp_hdr_recv_prep(struct iscsi_tcp_conn *tcp_conn);
--
2.1.4


Re: [PATCH v7 0/4] New Microsemi PCI Switch Management Driver

2017-03-09 Thread Logan Gunthorpe


On 09/03/17 12:55 PM, Bjorn Helgaas wrote:
> Applied to pci/switchtec for v4.12, thanks!

Great! Thanks so much!

Logan



Re: Question Regarding ERMS memcpy

2017-03-06 Thread Logan Gunthorpe


On 06/03/17 12:28 AM, H. Peter Anvin wrote:
> On 03/05/17 23:01, Logan Gunthorpe wrote:
>>
>> On 05/03/17 12:54 PM, Borislav Petkov wrote:
>>> Logan, wanna give that a try, see if it takes care of your issue?
>>
>> Well honestly my issue was solved by fixing my kernel config. I have no
>> idea why I had optimize for size in there in the first place.
>>
> 
> Yes, to gcc "optimize for size" means exactly that... intended for cases
> where saving storage (e.g. ROM) or code download time is paramount.

I agree and understand, however placing a poorly performing _inline_
memcpy instead of a single call instruction to a performant memcopy
probably took more code space in the end. So like Linus, I just have to
scratch my head at the -Os optimization option.

Logan


Re: Question Regarding ERMS memcpy

2017-03-04 Thread Logan Gunthorpe
Hey,

On 04/03/17 05:33 PM, Borislav Petkov wrote:
> On Sat, Mar 04, 2017 at 04:23:17PM -0800, h...@zytor.com wrote:
>> What are the compilation flags? It may be that gcc still does TRT
>> depending on this call site. I'd check what gcc6 or 7 generates,
>> though.
> Hmm, I wish we were able to say, "let gcc decide for small sizes and let
> us do the patching for larger ones."

So, I've found that my kernel config had the OPTIMIZE_FOR_SIZE selected
instead of OPTIMIZE_FOR_PERFORMANCE. I'm not sure why that is but
switching to the latter option fixes my problem. A memcpy call is used
instead of the poor inline solution. (I'm not really sure how the inline
solution even makes any sense as it almost certainly makes things larger
in the grand scheme of things.)

It still might make sense to apply your patch asking gcc to never use
the broken memcpy but I'll leave that in your capable hands to decide.

Anyway, thanks for the help with this.

Logan


Question Regarding ERMS memcpy

2017-03-04 Thread Logan Gunthorpe
Hi,

I'm trying to chase down a performance issue with a driver I'm working
on that does a repeated memcpy_fromio of about 1KB from a PCI device. I
made a small change from a fixed size copy to a variable size only to be
surprised with a performance decrease of about 1/3.

I've looked through the code and discovered this is due to switching out
__builtin_memcpy with __memcpy when the length is not constant. This
makes sense and I've now been looking into the inner workings of
memcpy_64.S.

The CPU I'm testing on is a Sandy Bridge and according to /proc/cpuinfo,
it does _not_ have the erms bit and does have the rep_good bit. So I
expect to be using the "rep movsq" version of memcpy. However, I've done
some testing with my driver by hacking in specific memcpy
implementations and I've found the following results:

__builtin_memcpy w/const length: 85KB/s
memcpy_fromio: 26kB/s
__builtin_memcpy: 26kB/s
memcpy_movsq: 126kB/s
memcpy_erms: 26kB/s

Thus, based on these performance numbers, it almost seems like my
platform is using the erms version when it probably shouldn't be.

So my question is: how do I find out what version of memcpy my actual
machine is using and fix it if it is wrong?

I'm running with a 4.10.0 kernel and I've attached my cpuinfo.

Thanks for any insights,

Logan

processor   : 0
vendor_id   : GenuineIntel
cpu family  : 6
model   : 45
model name  : Intel(R) Xeon(R) CPU E5-2609 0 @ 2.40GHz
stepping: 7
microcode   : 0x710
cpu MHz : 1200.000
cache size  : 10240 KB
physical id : 0
siblings: 4
core id : 0
cpu cores   : 4
apicid  : 0
initial apicid  : 0
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb 
rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 
ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer 
aes xsave avx lahf_lm epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm 
arat pln pts
bugs:
bogomips: 4799.90
clflush size: 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual
power management:

processor   : 1
vendor_id   : GenuineIntel
cpu family  : 6
model   : 45
model name  : Intel(R) Xeon(R) CPU E5-2609 0 @ 2.40GHz
stepping: 7
microcode   : 0x710
cpu MHz : 1200.000
cache size  : 10240 KB
physical id : 0
siblings: 4
core id : 1
cpu cores   : 4
apicid  : 2
initial apicid  : 2
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb 
rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 
ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer 
aes xsave avx lahf_lm epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm 
arat pln pts
bugs:
bogomips: 4801.06
clflush size: 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual
power management:

processor   : 2
vendor_id   : GenuineIntel
cpu family  : 6
model   : 45
model name  : Intel(R) Xeon(R) CPU E5-2609 0 @ 2.40GHz
stepping: 7
microcode   : 0x710
cpu MHz : 1200.000
cache size  : 10240 KB
physical id : 0
siblings: 4
core id : 2
cpu cores   : 4
apicid  : 4
initial apicid  : 4
fpu : yes
fpu_exception   : yes
cpuid level : 13
wp  : yes
flags   : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov 
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb 
rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 
ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic popcnt tsc_deadline_timer 
aes xsave avx lahf_lm epb tpr_shadow vnmi flexpriority ept vpid xsaveopt dtherm 
arat pln pts
bugs:
bogomips: 4801.12
clflush size: 64
cache_alignment : 64
address sizes   : 46 bits physical, 48 bits virtual
power management:

processor   : 3
vendor_id   : GenuineIntel
cpu family  : 6
model   : 45
model name  : Intel(R) Xeon(R) CPU E5-2609 0 @ 2.40GHz
stepping: 7
microcode   : 0x710
cpu MHz : 1200.000
cache size  : 10240 KB
physical id : 0
siblings: 4
core id : 3
cpu cores   : 4
apicid  : 6
initial apicid  : 6
fpu : yes
fpu_exception   : yes
cpuid level : 

Re: Question Regarding ERMS memcpy

2017-03-05 Thread Logan Gunthorpe

On Sun, Mar 05, 2017 at 11:19:42AM -0800, Linus Torvalds wrote:
>> But it is *not* the right thing to use on IO memory, because the CPU
>> only does the magic cacheline access optimizations on cacheable
>> memory!

Yes, and actually this is where I started. I thought my memcpy was using
byte accesses on purpose and I needed to create a patch for a different
IO memcpy because obviously byte accesses over the PCI bus would be very
un-ideal. However, when I found my system wasn't intentionally using
that implementation that was no longer my focus.

So, I have no way to test this, but it sounds like any Ivy bridge system
using the ERMS version of memcpy could have the same slow PCI memcpy
performance I've been seeing (unless the microcode fixes it up?). So it
sounds like it would be a good idea to revert the change Linus is
talking about.

>> So I think we should re-introduce that old "__inline_memcpy()" as that
>> special "safe memcpy" thing. Not just for KMEMCHECK, and not just for
>> 64-bit.

On 05/03/17 12:54 PM, Borislav Petkov wrote:
> Logan, wanna give that a try, see if it takes care of your issue?

Well honestly my issue was solved by fixing my kernel config. I have no
idea why I had optimize for size in there in the first place.

Thanks,

Logan


Re: [PATCH v6 0/4] New Microsemi PCI Switch Management Driver

2017-03-02 Thread Logan Gunthorpe
Hey,

Ah, sorry, sorry. I noticed another minor mistake. I'll send a v7 in a
second.

Thanks,

Logan

On 02/03/17 04:07 PM, Logan Gunthorpe wrote:
> Hi,
> 
> Hopefully this is the last change necessary. The new code to handle
> unbind follows Jason's suggestions and is a lot easier to reason about.
> It's also been tested interrupting a number of operations.
> 
> Thanks,
> 
> Logan
> 
> --
> 
> Changes since v5:
> 
> * Reworked cleanup during unbind again. This time we follow the pattern
>   roughly suggested by Jason Gunthorpe: we use an alive flag
>   that's checked with a rw semaphore held by the cdev ops. Except
>   instead of using the rwsem we just use the already existing
>   mrpc_mutex. (Seeing the vast majority of locations that would use the
>   semaphore overlap with the existing mutex.)
> * Added a small performance enhancement so the code reads less io memory
>   if the user is quick in submitting their read request.
> 
> Changes since v4:
> 
> * Turns out pushing the pci release code into the device release
>   function didn't work as I would have liked. If you try to unbind the
>   device with an instance open, then you hit a kernel bug at
>   drivers/pci/msi.c:371. (This didn't occur in v3.)
> 
>   To solve this, we've moved the pci release code back into the
>   unregister function and reintroduced an alive flag. This time,
>   however, the alive flag is protected by mrpc_mutex and we're very
>   careful about what happens to devices still in use (they should
>   all be released through the timeout path and an ENODEV error
>   returned to userspace; while new commands are blocked with the
>   same error).
> 
> Changes since v3:
> 
> * Removed stdev_is_alive and moved pci deinit functions
>   into the device release function which only occurs after all
>   cdev instances are closed. (per Bjorn)
> * Reduced locking in read/write functions (per Bjorn)
> * Use pci_alloc_irq_vectors to vastly improve ISR initialization (Bjorn)
> * A few other minor nits and cleanup as noticed by Bjorn
> 
> Changes since v2:
> 
> * Collected reviewed and tested tags
> * Very minor fix to the error path in the create function
> 
> Changes since v1:
> 
> * Rebased onto 4.10-rc6 (cleanly)
> * Split the patch into a few more easily digestible patches (as
>   suggested by Greg Kroah-Hartman)
> * Folded switchtec.c into switchtec.h (per Greg)
> * Fixed a bunch of 32bit build warnings caught by the kbuild test robot
> * Fixed some issues in the documentation so it has a proper
>   reStructredText format (as noted by Jonathan Corbet)
> * Fixed padding and sizes in the IOCTL structures as noticed by Emil
>   Velikov and used pahole to verify their consistency across 32 and 64
>   bit builds
> * Reworked one of the IOCTL interfaces to be more future proof (per
>   Emil).
> 
> Changes since RFC:
> 
> * Fixed incorrect use of the drive model as pointed out by Greg
>   Kroah-Hartman
> * Used devm functions as suggested by Keith Busch
> * Added a handful of sysfs attributes to the switchtec class
> * Added a handful of IOCTLs to the switchtec device
> * A number of miscellaneous bug fixes
> 
> --
> 
> Hi,
> 
> This is a continuation of the RFC we posted lasted month [1] which
> proposes a management driver for Microsemi's Switchtec line of PCI
> switches. This hardware is still looking to be used in the Open
> Compute Platform
> 
> To make this entirely clear: the Switchtec products are compliant
> with the PCI specifications and are supported today with the standard
> in-kernel driver. However, these devices also expose a management endpoint
> on a separate PCI function address which can be used to perform some
> advanced operations. This is a driver for that function. See the patch
> for more information.
> 
> Since the RFC, we've made the changes requested by Greg Kroah-Hartman
> and Keith Busch, and we've also fleshed out a number of features. We've
> added a couple of IOCTLs and sysfs attributes which are documented in
> the patch. Significant work has also been done on the userspace tool
> which is available under a GPL license at [2]. We've also had testing
> done by some of the interested parties.
> 
> We hope to see this work included in either 4.11 or 4.12 assuming a
> smooth review process.
> 
> The patch is based off of the v4.10-rc6 release.
> 
> Thanks for your review,
> 
> Logan
> 
> [1] https://www.spinics.net/lists/linux-pci/msg56897.html
> [2] https://github.com/sbates130272/switchtec-user
> 
> Logan Gunthorpe (4):
>   MicroSemi Switchtec management interface driver
>   switchtec: Add user interface documentation
>   switchtec: Add sysfs attribu

[PATCH v6 0/4] New Microsemi PCI Switch Management Driver

2017-03-02 Thread Logan Gunthorpe
Hi,

Hopefully this is the last change necessary. The new code to handle
unbind follows Jason's suggestions and is a lot easier to reason about.
It's also been tested interrupting a number of operations.

Thanks,

Logan

--

Changes since v5:

* Reworked cleanup during unbind again. This time we follow the pattern
  roughly suggested by Jason Gunthorpe: we use an alive flag
  that's checked with a rw semaphore held by the cdev ops. Except
  instead of using the rwsem we just use the already existing
  mrpc_mutex. (Seeing the vast majority of locations that would use the
  semaphore overlap with the existing mutex.)
* Added a small performance enhancement so the code reads less io memory
  if the user is quick in submitting their read request.

Changes since v4:

* Turns out pushing the pci release code into the device release
  function didn't work as I would have liked. If you try to unbind the
  device with an instance open, then you hit a kernel bug at
  drivers/pci/msi.c:371. (This didn't occur in v3.)

  To solve this, we've moved the pci release code back into the
  unregister function and reintroduced an alive flag. This time,
  however, the alive flag is protected by mrpc_mutex and we're very
  careful about what happens to devices still in use (they should
  all be released through the timeout path and an ENODEV error
  returned to userspace; while new commands are blocked with the
  same error).

Changes since v3:

* Removed stdev_is_alive and moved pci deinit functions
  into the device release function which only occurs after all
  cdev instances are closed. (per Bjorn)
* Reduced locking in read/write functions (per Bjorn)
* Use pci_alloc_irq_vectors to vastly improve ISR initialization (Bjorn)
* A few other minor nits and cleanup as noticed by Bjorn

Changes since v2:

* Collected reviewed and tested tags
* Very minor fix to the error path in the create function

Changes since v1:

* Rebased onto 4.10-rc6 (cleanly)
* Split the patch into a few more easily digestible patches (as
  suggested by Greg Kroah-Hartman)
* Folded switchtec.c into switchtec.h (per Greg)
* Fixed a bunch of 32bit build warnings caught by the kbuild test robot
* Fixed some issues in the documentation so it has a proper
  reStructredText format (as noted by Jonathan Corbet)
* Fixed padding and sizes in the IOCTL structures as noticed by Emil
  Velikov and used pahole to verify their consistency across 32 and 64
  bit builds
* Reworked one of the IOCTL interfaces to be more future proof (per
  Emil).

Changes since RFC:

* Fixed incorrect use of the drive model as pointed out by Greg
  Kroah-Hartman
* Used devm functions as suggested by Keith Busch
* Added a handful of sysfs attributes to the switchtec class
* Added a handful of IOCTLs to the switchtec device
* A number of miscellaneous bug fixes

--

Hi,

This is a continuation of the RFC we posted lasted month [1] which
proposes a management driver for Microsemi's Switchtec line of PCI
switches. This hardware is still looking to be used in the Open
Compute Platform

To make this entirely clear: the Switchtec products are compliant
with the PCI specifications and are supported today with the standard
in-kernel driver. However, these devices also expose a management endpoint
on a separate PCI function address which can be used to perform some
advanced operations. This is a driver for that function. See the patch
for more information.

Since the RFC, we've made the changes requested by Greg Kroah-Hartman
and Keith Busch, and we've also fleshed out a number of features. We've
added a couple of IOCTLs and sysfs attributes which are documented in
the patch. Significant work has also been done on the userspace tool
which is available under a GPL license at [2]. We've also had testing
done by some of the interested parties.

We hope to see this work included in either 4.11 or 4.12 assuming a
smooth review process.

The patch is based off of the v4.10-rc6 release.

Thanks for your review,

Logan

[1] https://www.spinics.net/lists/linux-pci/msg56897.html
[2] https://github.com/sbates130272/switchtec-user

Logan Gunthorpe (4):
  MicroSemi Switchtec management interface driver
  switchtec: Add user interface documentation
  switchtec: Add sysfs attributes to the Switchtec driver
  switchtec: Add IOCTLs to the Switchtec driver

 Documentation/ABI/testing/sysfs-class-switchtec |   96 ++
 Documentation/ioctl/ioctl-number.txt|1 +
 Documentation/switchtec.txt |   80 ++
 MAINTAINERS |   11 +
 drivers/pci/Kconfig |1 +
 drivers/pci/Makefile|1 +
 drivers/pci/switch/Kconfig  |   13 +
 drivers/pci/switch/Makefile |1 +
 drivers/pci/switch/switchtec.c  | 1599 +++
 include/uapi/linux/switchtec_ioctl.h|  132 ++
 10 files changed, 1935 insertions(+)
 create mode 100644

[PATCH v7 4/4] switchtec: Add IOCTLs to the Switchtec driver

2017-03-02 Thread Logan Gunthorpe
This patch introduces a couple of special IOCTLs which are provided to:

* Inform userspace of firmware partition locations
* Pass event counts and allow userspace to wait on events
* Translate between PFF numbers used by the switch to
  port numbers.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
---
 Documentation/ioctl/ioctl-number.txt |   1 +
 Documentation/switchtec.txt  |  27 ++
 MAINTAINERS  |   1 +
 drivers/pci/switch/switchtec.c   | 481 +++
 include/uapi/linux/switchtec_ioctl.h | 132 ++
 5 files changed, 642 insertions(+)
 create mode 100644 include/uapi/linux/switchtec_ioctl.h

diff --git a/Documentation/ioctl/ioctl-number.txt 
b/Documentation/ioctl/ioctl-number.txt
index 81c7f2b..032b33c 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -191,6 +191,7 @@ Code  Seq#(hex) Include FileComments
 'W'00-1F   linux/watchdog.hconflict!
 'W'00-1F   linux/wanrouter.h   conflict!   (pre 3.9)
 'W'00-3F   sound/asound.h  conflict!
+'W'40-5F   drivers/pci/switch/switchtec.c
 'X'all fs/xfs/xfs_fs.h conflict!
and fs/xfs/linux-2.6/xfs_ioctl32.h
and include/linux/falloc.h
diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
index 4bced4c..a0a9c7b 100644
--- a/Documentation/switchtec.txt
+++ b/Documentation/switchtec.txt
@@ -51,3 +51,30 @@ The char device has the following semantics:
 
 * The poll call will also be supported for userspace applications that
   need to do other things while waiting for the command to complete.
+
+The following IOCTLs are also supported by the device:
+
+* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
+  of partitions in the device.
+
+* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
+  any specified partition in flash.
+
+* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
+  indicating all uncleared events.
+
+* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
+  for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
+  with the event_id, index and flags set (index being the partition or PFF
+  number for non-global events). It returns whether the event has
+  occurred, the number of times and any event specific data. The flags
+  can be used to clear the count or enable and disable actions to
+  happen when the event occurs.
+  By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
+  you can set an event to trigger a poll command to return with
+  POLLPRI. In this way, userspace can wait for events to occur.
+
+* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
+  between PCI Function Framework number (used by the event system)
+  and Switchtec Logic Port ID and Partition number (which is more
+  user friendly).
diff --git a/MAINTAINERS b/MAINTAINERS
index 6fed938..c1a9a30 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9515,6 +9515,7 @@ S:Maintained
 F: Documentation/switchtec.txt
 F: Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
+F: include/uapi/linux/switchtec_ioctl.h
 
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 4888e23..1f045c9 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -13,6 +13,8 @@
  *
  */
 
+#include 
+
 #include 
 #include 
 #include 
@@ -778,6 +780,431 @@ static unsigned int switchtec_dev_poll(struct file *filp, 
poll_table *wait)
return ret;
 }
 
+static int ioctl_flash_info(struct switchtec_dev *stdev,
+   struct switchtec_ioctl_flash_info __user *uinfo)
+{
+   struct switchtec_ioctl_flash_info info = {0};
+   struct flash_info_regs __iomem *fi = stdev->mmio_flash_info;
+
+   info.flash_length = ioread32(>flash_length);
+   info.num_partitions = SWITCHTEC_IOCTL_NUM_PARTITIONS;
+
+   if (copy_to_user(uinfo, , sizeof(info)))
+   return -EFAULT;
+
+   return 0;
+}
+
+static void set_fw_info_part(struct switchtec_ioctl_flash_part_info *info,
+struct partition_info __iomem *pi)
+{
+   info->address = ioread32(>address);
+   info->length = ioread32(>length);
+}
+
+static int ioctl_flash_part_info(struct switchtec_dev *stdev,
+   struct switchtec_ioctl_flash_part_info __user *uinfo)
+{
+   struct switchtec_ioctl_flash_part_info info = {0};
+   struct flash_info_regs __iomem *fi = stdev->mmio_flash_info;
+   u32 active_addr = -1;
+
+   if (copy

[PATCH v7 3/4] switchtec: Add sysfs attributes to the Switchtec driver

2017-03-02 Thread Logan Gunthorpe
This patch adds a few read-only sysfs attributes which provide
some device information that is exposed from the devices. Primarily
component and device names and versions. These are documented in
Documentation/ABI/testing/sysfs-class-switchtec.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
Reviewed-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-switchtec |  96 
 MAINTAINERS |   1 +
 drivers/pci/switch/switchtec.c  | 113 
 3 files changed, 210 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-switchtec

diff --git a/Documentation/ABI/testing/sysfs-class-switchtec 
b/Documentation/ABI/testing/sysfs-class-switchtec
new file mode 100644
index 000..48cb4c1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-switchtec
@@ -0,0 +1,96 @@
+switchtec - Microsemi Switchtec PCI Switch Management Endpoint
+
+For details on this subsystem look at Documentation/switchtec.txt.
+
+What:  /sys/class/switchtec
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   The switchtec class subsystem folder.
+   Each registered switchtec driver is represented by a switchtecX
+   subfolder (X being an integer >= 0).
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_id
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component identifier as stored in the hardware (eg. PM8543)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_revision
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component revision stored in the hardware (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_vendor
+Date:      05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component vendor as stored in the hardware (eg. MICROSEM)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/device_version
+Date:      05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Device version as stored in the hardware (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/fw_version
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Currently running firmware version (read only)
+Values:integer (in hexadecimal).
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/partition
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Partition number for this device in the switch (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/partition_count
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Total number of partitions in the switch (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_id
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product identifier as stored in the hardware (eg. PSX 48XG3)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_revision
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product revision stored in the hardware (eg. RevB)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_vendor
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product vendor as stored in the hardware (eg. MICROSEM)
+   (read only)
+Values:arbitrary string.
diff --git a/MAINTAINERS b/MAINTAINERS
index aa702b0..6fed938 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9513,6 +9513,7 @@ M:Logan Gunthorpe <log...@deltatee.com>
 L: linux-...@vger.kernel.org
 S: Maintained
 F: Documentation/switchtec.txt
+F: Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
 
 PCI DRIVER FOR NVIDIA TEGRA
diff --gi

[PATCH v6 4/4] switchtec: Add IOCTLs to the Switchtec driver

2017-03-02 Thread Logan Gunthorpe
This patch introduces a couple of special IOCTLs which are provided to:

* Inform userspace of firmware partition locations
* Pass event counts and allow userspace to wait on events
* Translate between PFF numbers used by the switch to
  port numbers.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
---
 Documentation/ioctl/ioctl-number.txt |   1 +
 Documentation/switchtec.txt  |  27 ++
 MAINTAINERS  |   1 +
 drivers/pci/switch/switchtec.c   | 481 +++
 include/uapi/linux/switchtec_ioctl.h | 132 ++
 5 files changed, 642 insertions(+)
 create mode 100644 include/uapi/linux/switchtec_ioctl.h

diff --git a/Documentation/ioctl/ioctl-number.txt 
b/Documentation/ioctl/ioctl-number.txt
index 81c7f2b..032b33c 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -191,6 +191,7 @@ Code  Seq#(hex) Include FileComments
 'W'00-1F   linux/watchdog.hconflict!
 'W'00-1F   linux/wanrouter.h   conflict!   (pre 3.9)
 'W'00-3F   sound/asound.h  conflict!
+'W'40-5F   drivers/pci/switch/switchtec.c
 'X'all fs/xfs/xfs_fs.h conflict!
and fs/xfs/linux-2.6/xfs_ioctl32.h
and include/linux/falloc.h
diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
index 4bced4c..a0a9c7b 100644
--- a/Documentation/switchtec.txt
+++ b/Documentation/switchtec.txt
@@ -51,3 +51,30 @@ The char device has the following semantics:
 
 * The poll call will also be supported for userspace applications that
   need to do other things while waiting for the command to complete.
+
+The following IOCTLs are also supported by the device:
+
+* SWITCHTEC_IOCTL_FLASH_INFO - Retrieve firmware length and number
+  of partitions in the device.
+
+* SWITCHTEC_IOCTL_FLASH_PART_INFO - Retrieve address and lengeth for
+  any specified partition in flash.
+
+* SWITCHTEC_IOCTL_EVENT_SUMMARY - Read a structure of bitmaps
+  indicating all uncleared events.
+
+* SWITCHTEC_IOCTL_EVENT_CTL - Get the current count, clear and set flags
+  for any event. This ioctl takes in a switchtec_ioctl_event_ctl struct
+  with the event_id, index and flags set (index being the partition or PFF
+  number for non-global events). It returns whether the event has
+  occurred, the number of times and any event specific data. The flags
+  can be used to clear the count or enable and disable actions to
+  happen when the event occurs.
+  By using the SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL flag,
+  you can set an event to trigger a poll command to return with
+  POLLPRI. In this way, userspace can wait for events to occur.
+
+* SWITCHTEC_IOCTL_PFF_TO_PORT and SWITCHTEC_IOCTL_PORT_TO_PFF convert
+  between PCI Function Framework number (used by the event system)
+  and Switchtec Logic Port ID and Partition number (which is more
+  user friendly).
diff --git a/MAINTAINERS b/MAINTAINERS
index 6fed938..c1a9a30 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9515,6 +9515,7 @@ S:Maintained
 F: Documentation/switchtec.txt
 F: Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
+F: include/uapi/linux/switchtec_ioctl.h
 
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index ab08aa4..27ced6f 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -13,6 +13,8 @@
  *
  */
 
+#include 
+
 #include 
 #include 
 #include 
@@ -778,6 +780,431 @@ static unsigned int switchtec_dev_poll(struct file *filp, 
poll_table *wait)
return ret;
 }
 
+static int ioctl_flash_info(struct switchtec_dev *stdev,
+   struct switchtec_ioctl_flash_info __user *uinfo)
+{
+   struct switchtec_ioctl_flash_info info = {0};
+   struct flash_info_regs __iomem *fi = stdev->mmio_flash_info;
+
+   info.flash_length = ioread32(>flash_length);
+   info.num_partitions = SWITCHTEC_IOCTL_NUM_PARTITIONS;
+
+   if (copy_to_user(uinfo, , sizeof(info)))
+   return -EFAULT;
+
+   return 0;
+}
+
+static void set_fw_info_part(struct switchtec_ioctl_flash_part_info *info,
+struct partition_info __iomem *pi)
+{
+   info->address = ioread32(>address);
+   info->length = ioread32(>length);
+}
+
+static int ioctl_flash_part_info(struct switchtec_dev *stdev,
+   struct switchtec_ioctl_flash_part_info __user *uinfo)
+{
+   struct switchtec_ioctl_flash_part_info info = {0};
+   struct flash_info_regs __iomem *fi = stdev->mmio_flash_info;
+   u32 active_addr = -1;
+
+   if (copy

[PATCH v6 3/4] switchtec: Add sysfs attributes to the Switchtec driver

2017-03-02 Thread Logan Gunthorpe
This patch adds a few read-only sysfs attributes which provide
some device information that is exposed from the devices. Primarily
component and device names and versions. These are documented in
Documentation/ABI/testing/sysfs-class-switchtec.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
Reviewed-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 Documentation/ABI/testing/sysfs-class-switchtec |  96 
 MAINTAINERS |   1 +
 drivers/pci/switch/switchtec.c  | 113 
 3 files changed, 210 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-switchtec

diff --git a/Documentation/ABI/testing/sysfs-class-switchtec 
b/Documentation/ABI/testing/sysfs-class-switchtec
new file mode 100644
index 000..48cb4c1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-switchtec
@@ -0,0 +1,96 @@
+switchtec - Microsemi Switchtec PCI Switch Management Endpoint
+
+For details on this subsystem look at Documentation/switchtec.txt.
+
+What:  /sys/class/switchtec
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   The switchtec class subsystem folder.
+   Each registered switchtec driver is represented by a switchtecX
+   subfolder (X being an integer >= 0).
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_id
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component identifier as stored in the hardware (eg. PM8543)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_revision
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component revision stored in the hardware (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/component_vendor
+Date:      05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Component vendor as stored in the hardware (eg. MICROSEM)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/device_version
+Date:      05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Device version as stored in the hardware (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/fw_version
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Currently running firmware version (read only)
+Values:integer (in hexadecimal).
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/partition
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Partition number for this device in the switch (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/partition_count
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Total number of partitions in the switch (read only)
+Values:integer.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_id
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product identifier as stored in the hardware (eg. PSX 48XG3)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_revision
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product revision stored in the hardware (eg. RevB)
+   (read only)
+Values:arbitrary string.
+
+
+What:  /sys/class/switchtec/switchtec[0-9]+/product_vendor
+Date:  05-Jan-2017
+KernelVersion: v4.11
+Contact:   Logan Gunthorpe <log...@deltatee.com>
+Description:   Product vendor as stored in the hardware (eg. MICROSEM)
+   (read only)
+Values:arbitrary string.
diff --git a/MAINTAINERS b/MAINTAINERS
index aa702b0..6fed938 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9513,6 +9513,7 @@ M:Logan Gunthorpe <log...@deltatee.com>
 L: linux-...@vger.kernel.org
 S: Maintained
 F: Documentation/switchtec.txt
+F: Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
 
 PCI DRIVER FOR NVIDIA TEGRA
diff --gi

[PATCH v7 1/4] MicroSemi Switchtec management interface driver

2017-03-02 Thread Logan Gunthorpe
Microsemi's "Switchtec" line of PCI switch devices is already well
supported by the kernel with standard PCI switch drivers. However, the
Switchtec device advertises a special management endpoint with a separate
PCI function address and class code. This endpoint enables some additional
functionality which includes:

 * Packet and Byte Counters
 * Switch Firmware Upgrades
 * Event and Error logs
 * Querying port link status
 * Custom user firmware commands

This patch introduces the switchtec kernel module which provides
PCI driver that exposes a char device. The char device provides
userspace access to this interface through read, write and (optionally)
poll calls.

A userspace tool and library which utilizes this interface is available
at [1]. This tool takes inspiration (and borrows some code) from
nvme-cli [2]. The tool is largely complete at this time but additional
features may be added in the future.

[1] https://github.com/sbates130272/switchtec-user
[2] https://github.com/linux-nvme/nvme-cli

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
Reviewed-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 MAINTAINERS|8 +
 drivers/pci/Kconfig|1 +
 drivers/pci/Makefile   |1 +
 drivers/pci/switch/Kconfig |   13 +
 drivers/pci/switch/Makefile|1 +
 drivers/pci/switch/switchtec.c | 1006 
 6 files changed, 1030 insertions(+)
 create mode 100644 drivers/pci/switch/Kconfig
 create mode 100644 drivers/pci/switch/Makefile
 create mode 100644 drivers/pci/switch/switchtec.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 527d137..a57686f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9506,6 +9506,14 @@ S:   Maintained
 F: Documentation/devicetree/bindings/pci/aardvark-pci.txt
 F: drivers/pci/host/pci-aardvark.c
 
+PCI DRIVER FOR MICROSEMI SWITCHTEC
+M: Kurt Schwemmer <kurt.schwem...@microsemi.com>
+M: Stephen Bates <stephen.ba...@microsemi.com>
+M: Logan Gunthorpe <log...@deltatee.com>
+L: linux-...@vger.kernel.org
+S: Maintained
+F: drivers/pci/switch/switchtec*
+
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
 L: linux-te...@vger.kernel.org
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 6555eb7..f72e8c5 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -133,3 +133,4 @@ config PCI_HYPERV
 
 source "drivers/pci/hotplug/Kconfig"
 source "drivers/pci/host/Kconfig"
+source "drivers/pci/switch/Kconfig"
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 8db5079..15b46dd 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -68,3 +68,4 @@ ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
 
 # PCI host controller drivers
 obj-y += host/
+obj-y += switch/
diff --git a/drivers/pci/switch/Kconfig b/drivers/pci/switch/Kconfig
new file mode 100644
index 000..4c49648
--- /dev/null
+++ b/drivers/pci/switch/Kconfig
@@ -0,0 +1,13 @@
+menu "PCI switch controller drivers"
+   depends on PCI
+
+config PCI_SW_SWITCHTEC
+   tristate "MicroSemi Switchtec PCIe Switch Management Driver"
+   help
+Enables support for the management interface for the MicroSemi
+Switchtec series of PCIe switches. Supports userspace access
+to submit MRPC commands to the switch via /dev/switchtecX
+devices. See  for more
+information.
+
+endmenu
diff --git a/drivers/pci/switch/Makefile b/drivers/pci/switch/Makefile
new file mode 100644
index 000..37d8cfb
--- /dev/null
+++ b/drivers/pci/switch/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PCI_SW_SWITCHTEC) += switchtec.o
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
new file mode 100644
index 000..084e4ed
--- /dev/null
+++ b/drivers/pci/switch/switchtec.c
@@ -0,0 +1,1006 @@
+/*
+ * Microsemi Switchtec(tm) PCIe Management Driver
+ * Copyright (c) 2017, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Microsemi Cor

[PATCH v6 1/4] MicroSemi Switchtec management interface driver

2017-03-02 Thread Logan Gunthorpe
Microsemi's "Switchtec" line of PCI switch devices is already well
supported by the kernel with standard PCI switch drivers. However, the
Switchtec device advertises a special management endpoint with a separate
PCI function address and class code. This endpoint enables some additional
functionality which includes:

 * Packet and Byte Counters
 * Switch Firmware Upgrades
 * Event and Error logs
 * Querying port link status
 * Custom user firmware commands

This patch introduces the switchtec kernel module which provides
PCI driver that exposes a char device. The char device provides
userspace access to this interface through read, write and (optionally)
poll calls.

A userspace tool and library which utilizes this interface is available
at [1]. This tool takes inspiration (and borrows some code) from
nvme-cli [2]. The tool is largely complete at this time but additional
features may be added in the future.

[1] https://github.com/sbates130272/switchtec-user
[2] https://github.com/linux-nvme/nvme-cli

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
Reviewed-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 MAINTAINERS|8 +
 drivers/pci/Kconfig|1 +
 drivers/pci/Makefile   |1 +
 drivers/pci/switch/Kconfig |   13 +
 drivers/pci/switch/Makefile|1 +
 drivers/pci/switch/switchtec.c | 1005 
 6 files changed, 1029 insertions(+)
 create mode 100644 drivers/pci/switch/Kconfig
 create mode 100644 drivers/pci/switch/Makefile
 create mode 100644 drivers/pci/switch/switchtec.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 527d137..a57686f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9506,6 +9506,14 @@ S:   Maintained
 F: Documentation/devicetree/bindings/pci/aardvark-pci.txt
 F: drivers/pci/host/pci-aardvark.c
 
+PCI DRIVER FOR MICROSEMI SWITCHTEC
+M: Kurt Schwemmer <kurt.schwem...@microsemi.com>
+M: Stephen Bates <stephen.ba...@microsemi.com>
+M: Logan Gunthorpe <log...@deltatee.com>
+L: linux-...@vger.kernel.org
+S: Maintained
+F: drivers/pci/switch/switchtec*
+
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
 L: linux-te...@vger.kernel.org
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 6555eb7..f72e8c5 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -133,3 +133,4 @@ config PCI_HYPERV
 
 source "drivers/pci/hotplug/Kconfig"
 source "drivers/pci/host/Kconfig"
+source "drivers/pci/switch/Kconfig"
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 8db5079..15b46dd 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -68,3 +68,4 @@ ccflags-$(CONFIG_PCI_DEBUG) := -DDEBUG
 
 # PCI host controller drivers
 obj-y += host/
+obj-y += switch/
diff --git a/drivers/pci/switch/Kconfig b/drivers/pci/switch/Kconfig
new file mode 100644
index 000..4c49648
--- /dev/null
+++ b/drivers/pci/switch/Kconfig
@@ -0,0 +1,13 @@
+menu "PCI switch controller drivers"
+   depends on PCI
+
+config PCI_SW_SWITCHTEC
+   tristate "MicroSemi Switchtec PCIe Switch Management Driver"
+   help
+Enables support for the management interface for the MicroSemi
+Switchtec series of PCIe switches. Supports userspace access
+to submit MRPC commands to the switch via /dev/switchtecX
+devices. See  for more
+information.
+
+endmenu
diff --git a/drivers/pci/switch/Makefile b/drivers/pci/switch/Makefile
new file mode 100644
index 000..37d8cfb
--- /dev/null
+++ b/drivers/pci/switch/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PCI_SW_SWITCHTEC) += switchtec.o
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
new file mode 100644
index 000..2993c0c
--- /dev/null
+++ b/drivers/pci/switch/switchtec.c
@@ -0,0 +1,1005 @@
+/*
+ * Microsemi Switchtec(tm) PCIe Management Driver
+ * Copyright (c) 2017, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Microsemi Cor

[PATCH v7 0/4] New Microsemi PCI Switch Management Driver

2017-03-02 Thread Logan Gunthorpe
Because getting it right correctly the first time is appearantly hard
and I caught this mistake while reading the email I had just sent :(

I'm very sorry about the extra noise.

Logan

--

Changes since v6:

* I screwed up the device_unregister path and left a potential use
  after free bug in. I'm not sure why I didn't see a failure because of
  it but it all tested fine. This version is correct though.

Changes since v5:

* Reworked cleanup during unbind again. This time we follow the pattern
  roughly suggested by Jason Gunthorpe: we use an alive flag
  that's checked with a rw semaphore held by the cdev ops. Except
  instead of using the rwsem we just use the already existing
  mrpc_mutex. (Seeing the vast majority of locations that would use the
  semaphore overlap with the existing mutex.)
* Added a small performance enhancement so the code reads less io memory
  if the user is quick in submitting their read request.

Changes since v4:

* Turns out pushing the pci release code into the device release
  function didn't work as I would have liked. If you try to unbind the
  device with an instance open, then you hit a kernel bug at
  drivers/pci/msi.c:371. (This didn't occur in v3.)

  To solve this, we've moved the pci release code back into the
  unregister function and reintroduced an alive flag. This time,
  however, the alive flag is protected by mrpc_mutex and we're very
  careful about what happens to devices still in use (they should
  all be released through the timeout path and an ENODEV error
  returned to userspace; while new commands are blocked with the
  same error).

Changes since v3:

* Removed stdev_is_alive and moved pci deinit functions
  into the device release function which only occurs after all
  cdev instances are closed. (per Bjorn)
* Reduced locking in read/write functions (per Bjorn)
* Use pci_alloc_irq_vectors to vastly improve ISR initialization (Bjorn)
* A few other minor nits and cleanup as noticed by Bjorn

Changes since v2:

* Collected reviewed and tested tags
* Very minor fix to the error path in the create function

Changes since v1:

* Rebased onto 4.10-rc6 (cleanly)
* Split the patch into a few more easily digestible patches (as
  suggested by Greg Kroah-Hartman)
* Folded switchtec.c into switchtec.h (per Greg)
* Fixed a bunch of 32bit build warnings caught by the kbuild test robot
* Fixed some issues in the documentation so it has a proper
  reStructredText format (as noted by Jonathan Corbet)
* Fixed padding and sizes in the IOCTL structures as noticed by Emil
  Velikov and used pahole to verify their consistency across 32 and 64
  bit builds
* Reworked one of the IOCTL interfaces to be more future proof (per
  Emil).

Changes since RFC:

* Fixed incorrect use of the drive model as pointed out by Greg
  Kroah-Hartman
* Used devm functions as suggested by Keith Busch
* Added a handful of sysfs attributes to the switchtec class
* Added a handful of IOCTLs to the switchtec device
* A number of miscellaneous bug fixes

--

Hi,

This is a continuation of the RFC we posted lasted month [1] which
proposes a management driver for Microsemi's Switchtec line of PCI
switches. This hardware is still looking to be used in the Open
Compute Platform

To make this entirely clear: the Switchtec products are compliant
with the PCI specifications and are supported today with the standard
in-kernel driver. However, these devices also expose a management endpoint
on a separate PCI function address which can be used to perform some
advanced operations. This is a driver for that function. See the patch
for more information.

Since the RFC, we've made the changes requested by Greg Kroah-Hartman
and Keith Busch, and we've also fleshed out a number of features. We've
added a couple of IOCTLs and sysfs attributes which are documented in
the patch. Significant work has also been done on the userspace tool
which is available under a GPL license at [2]. We've also had testing
done by some of the interested parties.

We hope to see this work included in either 4.11 or 4.12 assuming a
smooth review process.

The patch is based off of the v4.10-rc6 release.

Thanks for your review,

Logan

[1] https://www.spinics.net/lists/linux-pci/msg56897.html
[2] https://github.com/sbates130272/switchtec-user

Logan Gunthorpe (4):
  MicroSemi Switchtec management interface driver
  switchtec: Add user interface documentation
  switchtec: Add sysfs attributes to the Switchtec driver
  switchtec: Add IOCTLs to the Switchtec driver

 Documentation/ABI/testing/sysfs-class-switchtec |   96 ++
 Documentation/ioctl/ioctl-number.txt|1 +
 Documentation/switchtec.txt |   80 ++
 MAINTAINERS |   11 +
 drivers/pci/Kconfig |1 +
 drivers/pci/Makefile|1 +
 drivers/pci/switch/Kconfig  |   13 +
 drivers/pci/switch/Makefile |1 +
 drivers/pci

[PATCH v7 2/4] switchtec: Add user interface documentation

2017-03-02 Thread Logan Gunthorpe
This adds standard documentation for the sysfs switchtec attributes and
a RST formatted text file which documents the char device interface.
Jonathan Corbet has indicated he will move this to a new user-space
developer documentation book once it's created.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
---
 Documentation/switchtec.txt | 53 +
 MAINTAINERS |  1 +
 2 files changed, 54 insertions(+)
 create mode 100644 Documentation/switchtec.txt

diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
new file mode 100644
index 000..4bced4c
--- /dev/null
+++ b/Documentation/switchtec.txt
@@ -0,0 +1,53 @@
+
+Linux Switchtec Support
+
+
+Microsemi's "Switchtec" line of PCI switch devices is already
+supported by the kernel with standard PCI switch drivers. However, the
+Switchtec device advertises a special management endpoint which
+enables some additional functionality. This includes:
+
+* Packet and Byte Counters
+* Firmware Upgrades
+* Event and Error logs
+* Querying port link status
+* Custom user firmware commands
+
+The switchtec kernel module implements this functionality.
+
+
+Interface
+=
+
+The primary means of communicating with the Switchtec management firmware is
+through the Memory-mapped Remote Procedure Call (MRPC) interface.
+Commands are submitted to the interface with a 4-byte command
+identifier and up to 1KB of command specific data. The firmware will
+respond with a 4 bytes return code and up to 1KB of command specific
+data. The interface only processes a single command at a time.
+
+
+Userspace Interface
+===
+
+The MRPC interface will be exposed to userspace through a simple char
+device: /dev/switchtec#, one for each management endpoint in the system.
+
+The char device has the following semantics:
+
+* A write must consist of at least 4 bytes and no more than 1028 bytes.
+  The first four bytes will be interpreted as the command to run and
+  the remainder will be used as the input data. A write will send the
+  command to the firmware to begin processing.
+
+* Each write must be followed by exactly one read. Any double write will
+  produce an error and any read that doesn't follow a write will
+  produce an error.
+
+* A read will block until the firmware completes the command and return
+  the four bytes of status plus up to 1024 bytes of output data. (The
+  length will be specified by the size parameter of the read call --
+  reading less than 4 bytes will produce an error.
+
+* The poll call will also be supported for userspace applications that
+  need to do other things while waiting for the command to complete.
diff --git a/MAINTAINERS b/MAINTAINERS
index a57686f..aa702b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9512,6 +9512,7 @@ M:Stephen Bates <stephen.ba...@microsemi.com>
 M: Logan Gunthorpe <log...@deltatee.com>
 L: linux-...@vger.kernel.org
 S: Maintained
+F: Documentation/switchtec.txt
 F: drivers/pci/switch/switchtec*
 
 PCI DRIVER FOR NVIDIA TEGRA
-- 
2.1.4



[PATCH v6 2/4] switchtec: Add user interface documentation

2017-03-02 Thread Logan Gunthorpe
This adds standard documentation for the sysfs switchtec attributes and
a RST formatted text file which documents the char device interface.
Jonathan Corbet has indicated he will move this to a new user-space
developer documentation book once it's created.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Signed-off-by: Stephen Bates <stephen.ba...@microsemi.com>
Tested-by: Krishna Dhulipala <krish...@fb.com>
Reviewed-by: Wei Zhang <wzh...@fb.com>
Reviewed-by: Jens Axboe <ax...@fb.com>
---
 Documentation/switchtec.txt | 53 +
 MAINTAINERS |  1 +
 2 files changed, 54 insertions(+)
 create mode 100644 Documentation/switchtec.txt

diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
new file mode 100644
index 000..4bced4c
--- /dev/null
+++ b/Documentation/switchtec.txt
@@ -0,0 +1,53 @@
+
+Linux Switchtec Support
+
+
+Microsemi's "Switchtec" line of PCI switch devices is already
+supported by the kernel with standard PCI switch drivers. However, the
+Switchtec device advertises a special management endpoint which
+enables some additional functionality. This includes:
+
+* Packet and Byte Counters
+* Firmware Upgrades
+* Event and Error logs
+* Querying port link status
+* Custom user firmware commands
+
+The switchtec kernel module implements this functionality.
+
+
+Interface
+=
+
+The primary means of communicating with the Switchtec management firmware is
+through the Memory-mapped Remote Procedure Call (MRPC) interface.
+Commands are submitted to the interface with a 4-byte command
+identifier and up to 1KB of command specific data. The firmware will
+respond with a 4 bytes return code and up to 1KB of command specific
+data. The interface only processes a single command at a time.
+
+
+Userspace Interface
+===
+
+The MRPC interface will be exposed to userspace through a simple char
+device: /dev/switchtec#, one for each management endpoint in the system.
+
+The char device has the following semantics:
+
+* A write must consist of at least 4 bytes and no more than 1028 bytes.
+  The first four bytes will be interpreted as the command to run and
+  the remainder will be used as the input data. A write will send the
+  command to the firmware to begin processing.
+
+* Each write must be followed by exactly one read. Any double write will
+  produce an error and any read that doesn't follow a write will
+  produce an error.
+
+* A read will block until the firmware completes the command and return
+  the four bytes of status plus up to 1024 bytes of output data. (The
+  length will be specified by the size parameter of the read call --
+  reading less than 4 bytes will produce an error.
+
+* The poll call will also be supported for userspace applications that
+  need to do other things while waiting for the command to complete.
diff --git a/MAINTAINERS b/MAINTAINERS
index a57686f..aa702b0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9512,6 +9512,7 @@ M:Stephen Bates <stephen.ba...@microsemi.com>
 M: Logan Gunthorpe <log...@deltatee.com>
 L: linux-...@vger.kernel.org
 S: Maintained
+F: Documentation/switchtec.txt
 F: drivers/pci/switch/switchtec*
 
 PCI DRIVER FOR NVIDIA TEGRA
-- 
2.1.4



Re: [RFC 5/8] scatterlist: Modify SG copy functions to support io memory.

2017-04-03 Thread Logan Gunthorpe


On 03/04/17 03:44 PM, Dan Williams wrote:
> On Mon, Apr 3, 2017 at 2:20 PM, Logan Gunthorpe <log...@deltatee.com> wrote:
>> Hi Christoph,
>>
>> What are your thoughts on an approach like the following untested
>> draft patch.
>>
>> The patch (if fleshed out) makes it so iomem can be used in an sgl
>> and WARN_ONs will occur in places where drivers attempt to access
>> iomem directly through the sgl.
>>
>> I'd also probably create a p2pmem_alloc_sgl helper function so driver
>> writers wouldn't have to mess with sg_set_iomem_page.
>>
>> With all that in place, it should be relatively safe for drivers to
>> implement p2pmem even though we'd still technically be violating the
>> __iomem boundary in some places.
> 
> Just reacting to this mail, I still haven't had a chance to take a
> look at the rest of the series.
> 
> The pfn_t type was invented to carry extra type and page lookup
> information about the memory behind a given pfn. At first glance that
> seems a more natural place to carry an indication that this is an
> "I/O" pfn.

I agree... But what are the plans for pfn_t? Is anyone working on using
it in the scatterlist code? Currently it's not there yet and given the
assertion that we will continue to be using struct page for DMA is that
a direction we'd want to go?

Logan


Re: [RFC 1/8] Introduce Peer-to-Peer memory (p2pmem) device

2017-04-02 Thread Logan Gunthorpe


On 02/04/17 03:03 PM, Sinan Kaya wrote:
> Push the decision all the way to the user. Let them decide whether they
> want this feature to work on a root port connected port or under the
> switch.

Yes, I prefer this too. If other folks agree with that I'd be very happy
to go back to user chooses. I think Sagi was the most vocal proponent
for kernel chooses at LSF so hopefully he will read this thread and
offer some opinion.

> I thought the issue was feature didn't work at all with some root ports
> or there was some kind of memory corruption issue that you were trying to
> avoid with the existing systems.

I *think* there are some much older root ports where P2P TLPs don't even
get through. But it doesn't really change the situation: in the nvmet
case, the user would enable p2pmem and then be unable to connect and
thus choose to disable it going forward. Not a big difference from the
user seeing bad performance and not choosing to enable it.

> I think you should get rid of all pci searching business in your code.

Yes, my original proposal was when you configure the nvme target you
chose the specific p2pmem device to use. That code had no tie ins to PCI
code and could, in theory, work generically with any device and bus.

Logan




Re: [RFC 5/8] scatterlist: Modify SG copy functions to support io memory.

2017-04-03 Thread Logan Gunthorpe


On 03/04/17 04:47 PM, Dan Williams wrote:
> I wouldn't necessarily conflate supporting pfn_t in the scatterlist
> with the stalled stuct-page-less DMA effor. A pfn_t_to_page()
> conversion will still work and be required. However you're right, the
> minute we use pfn_t for this we're into the realm of special case
> drivers that understand scatterlists with special "I/O-pfn_t" entries.

Well yes, it would certainly be possible to convert the scatterlist code
from page_link to pfn_t. (The only slightly tricky thing is that
scatterlist uses extra chaining bits and pfn_t uses extra flag bits so
they'd have to be harmonized somehow). But if we aren't moving toward
struct-page-less DMA, I fail to see the point of the conversion.

I'll definitely need IO scatterlists of some form or another and I like
pfn_t but right now it just seems like extra work with unclear benefit.
(Though, if someone told me that I can't use a third bit in the
page_link field then maybe that would be a good reason to move to pfn_t.)

> However, maybe that's what we want? I think peer-to-peer DMA is not a
> general purpose feature unless/until we get it standardized in PCI. So
> maybe drivers with special case scatterlist support is exactly what we
> want for now.

Well, I think this should be completely independent from PCI code. I see
no reason why we can't have infrastructure for DMA on iomem from any
bus. Largely all the work I've done in this area is completely agnostic
to the bus in use. (Except for any kind of white/black list when it is
used.)

The "special case scatterlist" is essentially what I'm proposing in the
patch I sent upthread, it just stores the flag in the page_link instead
of in a pfn_t.

Logan


Re: [RFC 5/8] scatterlist: Modify SG copy functions to support io memory.

2017-03-31 Thread Logan Gunthorpe


On 31/03/17 01:09 AM, Christoph Hellwig wrote:
> You're calling memcpy_{to,from}_iomem on non-__iomem pointers.  This
> is a fundamental no-go as we keep I/O memory separate from kernel
> pointers.

Yes, that's true, however I don't know how we could get around that when
the iomem is referenced by struct pages inside a scatter gather list. Do
we need to now have special __iomem sgls? And even still, I'm not sure
how that could work when the nvme target code is using the same sgls to
sometimes point to iomem and sometimes point to regular memory.

I'm certainly open to suggestions, though.

Logan



Re: [RFC 5/8] scatterlist: Modify SG copy functions to support io memory.

2017-04-07 Thread Logan Gunthorpe
Hi Dan,

On 03/04/17 06:07 PM, Dan Williams wrote:
> The completely agnostic part is where I get worried, but I shouldn't
> say anymore until I actually read the patch.The worry is cases where
> this agnostic enabling allows unsuspecting code paths to do the wrong
> thing. Like bypass iomem safety.

Yup, you're right the iomem safety issue is a really difficult problem.
I think replacing struct page with pfn_t in a bunch of places is
probably going to be a requirement for my work. However, this is going
to be a very large undertaking.

I've done an audit of sg_page users and there will indeed be some
difficult cases. However, I'm going to start doing some cleanup and
semantic changes to hopefully move in that direction. The first step
I've chosen to look at is to create an sg_kmap interface which replaces
about 77 (out of ~340) sg_page users. I'm hoping the new interface can
have the semantic that sg_kmap can fail (which would happen in the case
that no suitable page exists).

Eventually, I'd want to get to a place where sg_page either doesn't
exists or can fail and is always checked. At that point swapping out
pfn_t in the sgl would be manageable.

Thoughts?

Logan


Re: linux-next: manual merge of the scsi-mkp tree with the char-misc tree

2017-04-07 Thread Logan Gunthorpe
Hi Bart,

On 07/04/17 09:49 AM, Bart Van Assche wrote:
> Sorry that I had not yet noticed Logan's patch series. Should my two
> patches that conflict with Logan's patch series be dropped and reworked
> after Logan's patches are upstream?

Yeah, Greg took my patchset around a few maintainers relatively quickly.
This is the second conflict, so sorry about that. Looks like the easiest
thing would be to just base your change off of mine. It doesn't look too
difficult. If you can do it before my patch hits upstream, I'd
appreciate some testing and/or review as no one from the scsi side
responded and that particular patch was a bit more involved than I would
have liked.

Thanks,

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-17 Thread Logan Gunthorpe


On 17/04/17 11:04 AM, Dan Williams wrote:
>> Yes, in this scheme, it needs an additional p2pmem child. Why is that an
>> issue? It certainly makes it a lot easier for the user to understand the
>> p2pmem memory in the system (through the sysfs tree) and reason about
>> the topology and when to use it. This is important.
> 
> I think you want to go the other way in the hierarchy and find a
> shared *parent* to land the p2pmem capability. Because that same agent
> is going to be responsible handling address translation for the peers.
>
> Peer-dma is always going to be a property of the bus and not the end
> devices. Requiring each bus implementation to explicitly enable
> peer-to-peer support is a feature not a bug.
> 
> We shouldn't design for some future possible use case. Solve it for
> pci and when / if another bus comes along then look at a more generic
> abstraction.

Thanks Dan, these are some good points. Wedding it closer to the PCI
code makes more sense to me now. I'd still think you'd want some struct
device though to appear in the device hierarchy and allow reasoning
about topology.

Logan



Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-17 Thread Logan Gunthorpe


On 17/04/17 03:11 PM, Benjamin Herrenschmidt wrote:
> Is it ? Again, you create a "concept" the user may have no idea about,
> "p2pmem memory". So now any kind of memory buffer on a device can could
> be use for p2p but also potentially a bunch of other things becomes
> special and called "p2pmem" ...

The user is going to have to have an idea about it if they are designing
systems to make use of it. I've said it before many times: this is an
optimization with significant trade-offs so the user does have to make
decisions regarding when to enable it.


> But what do you have in p2pmem that somebody benefits from. Again I
> don't understand what that "p2pmem" device buys you in term of
> functionality vs. having the device just instanciate the pages.

Well thanks for just taking a big shit on all of our work without even
reading the patches. Bravo.

> Now having some kind of way to override the dma_ops, yes I do get that,
> and it could be that this "p2pmem" is typically the way to do it, but
> at the moment you don't even have that. So I'm a bit at a loss here.

Yes, we've already said many times that this is something we will need
to add.

> But it doesn't *have* to be. Again, take my GPU example. The fact that
> a NIC might be able to DMA into it doesn't make it specifically "p2p
> memory".

Just because you use it for other things doesn't mean it can't also
provide the service of a "p2pmem" device.

> So now your "p2pmem" device needs to also be laid out on top of those
> MMIO registers ? It's becoming weird.

Yes, Max Gurtovoy has also expressed an interest in expanding this work
to cover things other than memory. He's suggested simply calling it a
p2p device, but until we figure out what exactly that all means we can't
really finalize a name.

> See, basically, doing peer 2 peer between devices has 3 main challenges
> today: The DMA API needing struct pages, the MMIO translation issues
> and the IOMMU translation issues.
> 
> You seem to create that added device as some kind of "owner" for the
> struct pages, solving #1, but leave #2 and #3 alone.

Well there are other challenges too. Like figuring out when it's
appropriate to use, tying together the device that provides the memory
with the driver tring to use it in DMA transactions, etc, etc. Our patch
set tackles these latter issues.

> If we go down that path, though, rather than calling it p2pmem I would
> call it something like dma_target which I find much clearer especially
> since it doesn't have to be just memory.

I'm not set on the name. My arguments have been specifically for the
existence of an independent struct device. But I'm not really interested
in getting into bike shedding arguments over what to call it at this
time when we don't even really know what it's going to end up doing in
the end.

> The memory allocation should be a completely orthogonal and separate
> thing yes. You are conflating two completely different things now into
> a single concept.

Well we need a uniform way for a driver trying to coordinate a p2p dma
to find and obtain memory from devices that supply it. We are not
dealing with GPUs that already have complicated allocators. We are
dealing with people adding memory to their devices for the _sole_
purpose of enabling p2p transfers. So having a common allocation setup
is seen as a benefit to us.

Logan




Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 17/04/17 12:04 PM, Jerome Glisse wrote:
> I disagree here. I would rather see Peer-to-Peer mapping as a form
> of helper so that device driver can opt-in for multiple mecanisms
> concurrently. Like HMM and p2p.

I'm not against moving some of the common stuff into a library. It
sounds like the problems p2pmem solves don't overlap much with the
problems of the GPU and moving the stuff we have in common somewhere
else seems sensible.

> Also it seems you are having a static vision for p2p. For GPU and
> network the use case is you move some buffer into the device memory
> and then you create mapping for some network adapter while the buffer
> is in device memory. But this is only temporary and buffer might
> move to different device memory. So usecase is highly dynamic (well
> mapping lifetime is still probably few second/minutes).

I feel like you will need to pin the memory while it's the target of a
DMA transaction. If some network peer is sending you data and you just
invalidated the memory it is headed to then you are just going to break
applications. But really this isn't our concern: the memory we are using
with this work will be static and not prone to disappearing.

> I see no reason for exposing sysfs tree to userspace for all this.
> This isn't too dynamic, either 2 devices can access each others
> memory, either they can't. This can be hidden through the device
> kernel API. Again for GPU the idea is that it is always do-able
> in the sense that when it is not you fallback to using system
> memory.

The user has to make a decision to use it or not. This is an
optimization with significant trade-offs that may differ significantly
based on system design.

> Yes this could conflict and that's why i would rather see this as a set
> of helper like HMM is doing. So device driver can opt-in HMM and p2pmem
> at the same time.

I don't understand how that addresses the conflict. We need to each be
using unique and identifiable struct devices in the ZONE_DEVICE
dev_pagemap so we don't apply p2p dma mappings to hmm memory and vice-versa.

> This seems to duplicate things that already exist in each individual
> driver. If a device has memory than device driver already have some
> form of memory management and most likely expose some API to userspace
> to allow program to use that memory.

> Peer to peer DMA mapping is orthogonal to memory management, it is
> an optimization ie you have some buffer allocated through some device
> driver specific IOCTL and now you want some other device to directly
> access it. Having to first do another allocation in a different device
> driver for that to happen seems overkill.

The devices we are working with are adding memory specifically for
enabling p2p applications. The memory is new and there are no allocators
for any of it yet.

Also note: we've gotten _significant_ push back against exposing any of
this memory to userspace. Letting the user unknowingly have to deal with
the issues of iomem is not anything anyone wants to see. Thus we are
dealing with in-kernel users only and they need a common interface to
get the memory from.

> What you really want from device driver point of view is an helper to
> first tell you if 2 device can access each other and second an helper
> that allow the second device to import the other device memory to allow
> direct access.

Well, actually it's a bit more complicated than that but essentially
correct: There can be N devices in the mix and quite likely another
driver completely separate from all N devices. (eg. for our main use
case we have N nvme cards being talked to through an RDMA NIC with it
all being coordinated by the nvme-target driver).


> Discovering possible peer is a onetime only thing and designing around
> that is wrong in my view. There is already existing hierarchy in kernel
> for that in the form of the bus hierarchy (i am thinking pci bus here).
> So there is already existing way to discover this and you are just
> duplicating informations here.

I really don't see the solution you are proposing here. Have the user
specify a pci device name and just have them guess which ones have
suitable memory? Or do they have to walk the entire pci tree to find
ones that have such memory? There was no "duplicate" information created
by our patch set.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-14 Thread Logan Gunthorpe


On 14/04/17 05:37 AM, Benjamin Herrenschmidt wrote:
> I object to designing a subsystem that by design cannot work on whole
> categories of architectures out there.

Hardly. That's extreme. We'd design a subsystem that works for the easy
cases and needs more work to support the offset cases. It would not be
designed in such a way that it could _never_ support those
architectures. It would simply be such that it only permits use by the
cases that are known to work. Then those cases could be expanded as time
goes on and people work on adding more support.

There's tons of stuff that needs to be done to get this upstream. I'd
rather not require it to work for every possible architecture from the
start. The testing alone would be impossible. Many subsystems start by
working for x86 first and then adding support in other architectures
later. (Often with that work done by the people who care about those
systems and actually have the hardware to test with.)

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 04:28 PM, Dan Williams wrote:
> Unlike the pci bus address offset case which I think is fundamental to
> support since shipping archs do this today, I think it is ok to say
> p2p is restricted to a single sgl that gets to talk to host memory or
> a single device. That said, what's wrong with a p2p aware map_sg
> implementation calling up to the host memory map_sg implementation on
> a per sgl basis?

I think Ben said they need mixed sgls and that is where this gets messy.
I think I'd prefer this too given trying to enforce all sgs in a list to
be one type or another could be quite difficult given the state of the
scatterlist code.

>> Also, what happens if p2p pages end up getting passed to a device that
>> doesn't have the injected dma_ops?
> 
> This goes back to limiting p2p to a single pci host bridge. If the p2p
> capability is coordinated with the bridge rather than between the
> individual devices then we have a central point to catch this case.

Not really relevant. If these pages get to userspace (as people seem
keen on doing) or a less than careful kernel driver they could easily
get into the dma_map calls of devices that aren't even pci related (via
an O_DIRECT operation on an incorrect file or something). The common
code must reject these and can't rely on an injected dma op.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 04:50 PM, Dan Williams wrote:
> On Tue, Apr 18, 2017 at 3:48 PM, Logan Gunthorpe <log...@deltatee.com> wrote:
>>
>>
>> On 18/04/17 04:28 PM, Dan Williams wrote:
>>> Unlike the pci bus address offset case which I think is fundamental to
>>> support since shipping archs do this today, I think it is ok to say
>>> p2p is restricted to a single sgl that gets to talk to host memory or
>>> a single device. That said, what's wrong with a p2p aware map_sg
>>> implementation calling up to the host memory map_sg implementation on
>>> a per sgl basis?
>>
>> I think Ben said they need mixed sgls and that is where this gets messy.
>> I think I'd prefer this too given trying to enforce all sgs in a list to
>> be one type or another could be quite difficult given the state of the
>> scatterlist code.
>>
>>>> Also, what happens if p2p pages end up getting passed to a device that
>>>> doesn't have the injected dma_ops?
>>>
>>> This goes back to limiting p2p to a single pci host bridge. If the p2p
>>> capability is coordinated with the bridge rather than between the
>>> individual devices then we have a central point to catch this case.
>>
>> Not really relevant. If these pages get to userspace (as people seem
>> keen on doing) or a less than careful kernel driver they could easily
>> get into the dma_map calls of devices that aren't even pci related (via
>> an O_DIRECT operation on an incorrect file or something). The common
>> code must reject these and can't rely on an injected dma op.
> 
> No, we can't do that at get_user_pages() time, it will always need to
> be up to the device driver to fail dma that it can't perform.

I'm not sure I follow -- are you agreeing with me? The dma_map_* needs
to fail for any dma it cannot perform. Which means either all dma_ops
providers need to be p2p aware or this logic has to be in dma_map_*
itself. My point being: you can't rely on an injected dma_op for some
devices to handle the fail case globally.




Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 03:36 PM, Dan Williams wrote:
> On Tue, Apr 18, 2017 at 2:22 PM, Jason Gunthorpe
>  wrote:
>> On Tue, Apr 18, 2017 at 02:11:33PM -0700, Dan Williams wrote:
 I think this opens an even bigger can of worms..
>>>
>>> No, I don't think it does. You'd only shim when the target page is
>>> backed by a device, not host memory, and you can figure this out by a
>>> is_zone_device_page()-style lookup.
>>
>> The bigger can of worms is how do you meaningfully stack dma_ops.
> 
> This goes back to my original comment to make this capability a
> function of the pci bridge itself. The kernel has an implementation of
> a dynamically created bridge device that injects its own dma_ops for
> the devices behind the bridge. See vmd_setup_dma_ops() in
> drivers/pci/host/vmd.c.

Well the issue I think Jason is pointing out is that the ops don't
stack. The map_* function in the injected dma_ops needs to be able to
call the original map_* for any page that is not p2p memory. This is
especially annoying in the map_sg function which may need to call a
different op based on the contents of the sgl. (And please correct me if
I'm not seeing how this can be done in the vmd example.)

Also, what happens if p2p pages end up getting passed to a device that
doesn't have the injected dma_ops?

However, the concept of replacing the dma_ops for all devices behind a
supporting bridge is interesting and may be a good piece of the final
solution.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 01:48 PM, Jason Gunthorpe wrote:
> I think this is why progress on this keeps getting stuck - every
> solution is a lot of work.

Yup! There's also a ton of work just to get the iomem safety issues
addressed. Let alone the dma mapping issues.

> You could try to do a dummy mapping / create a MR early on to detect
> this.

Ok, that could be a workable solution.

> FWIW, I wonder if from a RDMA perspective we have another
> problem.. Should we allow P2P memory to be used with the local DMA
> lkey? There are potential designs around virtualization that would not
> allow that. Should we mandate that P2P memory be in its own MR?

I can't say I understand these issues...

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 02:31 PM, Dan Williams wrote:
> On Tue, Apr 18, 2017 at 1:29 PM, Jerome Glisse <jgli...@redhat.com> wrote:
>>> On Tue, Apr 18, 2017 at 12:35 PM, Logan Gunthorpe <log...@deltatee.com>
>>> wrote:
>>>>
>>>>
>>>> On 18/04/17 01:01 PM, Jason Gunthorpe wrote:
>>>>> Ultimately every dma_ops will need special code to support P2P with
>>>>> the special hardware that ops is controlling, so it makes some sense
>>>>> to start by pushing the check down there in the first place. This
>>>>> advice is partially motivated by how dma_map_sg is just a small
>>>>> wrapper around the function pointer call...
>>>>
>>>> Yes, I noticed this problem too and that makes sense. It just means
>>>> every dma_ops will probably need to be modified to either support p2p
>>>> pages or fail on them. Though, the only real difficulty there is that it
>>>> will be a lot of work.
>>>
>>> I don't think you need to go touch all dma_ops, I think you can just
>>> arrange for devices that are going to do dma to get redirected to a
>>> p2p aware provider of operations that overrides the system default
>>> dma_ops. I.e. just touch get_dma_ops().
>>
>> This would not work well for everyone, for instance on GPU we usualy
>> have buffer object with a mix of device memory and regular system
>> memory but call dma sg map once for the list.
>>
> 
> ...and that dma_map goes through get_dma_ops(), so I don't see the conflict?

The main conflict is in dma_map_sg which only does get_dma_ops once but
the sg may contain memory of different types.

Logan



Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 01:01 PM, Jason Gunthorpe wrote:
> Ultimately every dma_ops will need special code to support P2P with
> the special hardware that ops is controlling, so it makes some sense
> to start by pushing the check down there in the first place. This
> advice is partially motivated by how dma_map_sg is just a small
> wrapper around the function pointer call...

Yes, I noticed this problem too and that makes sense. It just means
every dma_ops will probably need to be modified to either support p2p
pages or fail on them. Though, the only real difficulty there is that it
will be a lot of work.

> Where p2p_same_segment_map_page checks if the two devices are on the
> 'same switch' and if so returns the address translated to match the
> bus address programmed into the BAR or fails. We knows this case is
> required to work by the PCI spec, so it makes sense to use it as the
> first canned helper.

I've also suggested that this check should probably be done (or perhaps
duplicated) before we even get to the map stage. In the case of
nvme-fabrics we'd probably want to let the user know when they try to
configure it or at least fall back to allocating regular memory instead.
It would be a difficult situation to have already copied a block of data
from a NIC to p2p memory only to have it be deemed unmappable on the
NVMe device it's destined for. (Or vice-versa.) This was another issue
p2pmem was attempting to solve.

Logan


[PATCH] dma-buf: Rename dma-ops to prevent conflict with kunmap_atomic macro

2017-04-18 Thread Logan Gunthorpe
Seeing the kunmap_atomic dma_buf_op shares the same name with a macro
in higmem.h, the former can be aliased if any dma-buf user includes
that header.

I'm personally trying to include highmem.h inside scatterlist.h and this
breaks the dma-buf code proper.

Christoph Hellwig suggested [1] renaming it and pushing this patch ASAP.

To maintain consistency I've renamed all four of kmap* and kunmap* to be
map* and unmap*. (Even though only kmap_atomic presently conflicts.)

[1] https://www.spinics.net/lists/target-devel/msg15070.html

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
---
 drivers/dma-buf/dma-buf.c  | 16 
 drivers/gpu/drm/armada/armada_gem.c|  8 
 drivers/gpu/drm/drm_prime.c|  8 
 drivers/gpu/drm/i915/i915_gem_dmabuf.c |  8 
 drivers/gpu/drm/omapdrm/omap_gem_dmabuf.c  |  8 
 drivers/gpu/drm/tegra/gem.c|  4 ++--
 drivers/gpu/drm/udl/udl_dmabuf.c   |  8 
 drivers/gpu/drm/vmwgfx/vmwgfx_prime.c  |  8 
 drivers/media/v4l2-core/videobuf2-dma-contig.c |  4 ++--
 drivers/media/v4l2-core/videobuf2-dma-sg.c |  4 ++--
 drivers/media/v4l2-core/videobuf2-vmalloc.c|  4 ++--
 drivers/staging/android/ion/ion.c  |  8 
 include/linux/dma-buf.h| 22 +++---
 13 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index 0007b79..7cc2bfe 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -405,8 +405,8 @@ struct dma_buf *dma_buf_export(const struct 
dma_buf_export_info *exp_info)
  || !exp_info->ops->map_dma_buf
  || !exp_info->ops->unmap_dma_buf
  || !exp_info->ops->release
- || !exp_info->ops->kmap_atomic
- || !exp_info->ops->kmap
+ || !exp_info->ops->map_atomic
+ || !exp_info->ops->map
  || !exp_info->ops->mmap)) {
return ERR_PTR(-EINVAL);
}
@@ -872,7 +872,7 @@ void *dma_buf_kmap_atomic(struct dma_buf *dmabuf, unsigned 
long page_num)
 {
WARN_ON(!dmabuf);
 
-   return dmabuf->ops->kmap_atomic(dmabuf, page_num);
+   return dmabuf->ops->map_atomic(dmabuf, page_num);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kmap_atomic);
 
@@ -889,8 +889,8 @@ void dma_buf_kunmap_atomic(struct dma_buf *dmabuf, unsigned 
long page_num,
 {
WARN_ON(!dmabuf);
 
-   if (dmabuf->ops->kunmap_atomic)
-   dmabuf->ops->kunmap_atomic(dmabuf, page_num, vaddr);
+   if (dmabuf->ops->unmap_atomic)
+   dmabuf->ops->unmap_atomic(dmabuf, page_num, vaddr);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kunmap_atomic);
 
@@ -907,7 +907,7 @@ void *dma_buf_kmap(struct dma_buf *dmabuf, unsigned long 
page_num)
 {
WARN_ON(!dmabuf);
 
-   return dmabuf->ops->kmap(dmabuf, page_num);
+   return dmabuf->ops->map(dmabuf, page_num);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kmap);
 
@@ -924,8 +924,8 @@ void dma_buf_kunmap(struct dma_buf *dmabuf, unsigned long 
page_num,
 {
WARN_ON(!dmabuf);
 
-   if (dmabuf->ops->kunmap)
-   dmabuf->ops->kunmap(dmabuf, page_num, vaddr);
+   if (dmabuf->ops->unmap)
+   dmabuf->ops->unmap(dmabuf, page_num, vaddr);
 }
 EXPORT_SYMBOL_GPL(dma_buf_kunmap);
 
diff --git a/drivers/gpu/drm/armada/armada_gem.c 
b/drivers/gpu/drm/armada/armada_gem.c
index 1597458..d6c2a5d 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -529,10 +529,10 @@ static const struct dma_buf_ops 
armada_gem_prime_dmabuf_ops = {
.map_dma_buf= armada_gem_prime_map_dma_buf,
.unmap_dma_buf  = armada_gem_prime_unmap_dma_buf,
.release= drm_gem_dmabuf_release,
-   .kmap_atomic= armada_gem_dmabuf_no_kmap,
-   .kunmap_atomic  = armada_gem_dmabuf_no_kunmap,
-   .kmap   = armada_gem_dmabuf_no_kmap,
-   .kunmap = armada_gem_dmabuf_no_kunmap,
+   .map_atomic = armada_gem_dmabuf_no_kmap,
+   .unmap_atomic   = armada_gem_dmabuf_no_kunmap,
+   .map= armada_gem_dmabuf_no_kmap,
+   .unmap  = armada_gem_dmabuf_no_kunmap,
.mmap   = armada_gem_dmabuf_mmap,
 };
 
diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c
index 25aa455..48ffd25 100644
--- a/drivers/gpu/drm/drm_prime.c
+++ b/drivers/gpu/drm/drm_prime.c
@@ -402,10 +402,10 @@ static const struct dma_buf_ops drm_gem_prime_dmabuf_ops 
=  {
.map_dma_buf = drm_gem_map_dma_buf,
.unmap_dma_buf = drm_gem_unmap_dma_buf,
.release = drm_gem_dmabuf_release,
-   .kmap = drm_gem_d

Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 03:03 PM, Jason Gunthorpe wrote:
> What about something more incremental like this instead:
> - dma_ops will set map_sg_p2p == map_sg when they are updated to
>   support p2p, otherwise DMA on P2P pages will fail for those ops.
> - When all ops support p2p we remove the if and ops->map_sg then
>   just call map_sg_p2p
> - For now the scatterlist maintains a bit when pages are added indicating if
>   p2p memory might be present in the list.
> - Unmap for p2p and non-p2p is the same, the underlying ops driver has
>   to make it work.
> 
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 0977317c6835c2..505ed7d502053d 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -103,6 +103,9 @@ struct dma_map_ops {
>   int (*map_sg)(struct device *dev, struct scatterlist *sg,
> int nents, enum dma_data_direction dir,
> unsigned long attrs);
> + int (*map_sg_p2p)(struct device *dev, struct scatterlist *sg,
> +   int nents, enum dma_data_direction dir,
> +   unsigned long attrs);
>   void (*unmap_sg)(struct device *dev,
>struct scatterlist *sg, int nents,
>enum dma_data_direction dir,
> @@ -244,7 +247,15 @@ static inline int dma_map_sg_attrs(struct device *dev, 
> struct scatterlist *sg,
>   for_each_sg(sg, s, nents, i)
>   kmemcheck_mark_initialized(sg_virt(s), s->length);
>   BUG_ON(!valid_dma_direction(dir));
> - ents = ops->map_sg(dev, sg, nents, dir, attrs);
> +
> + if (sg_has_p2p(sg)) {
> + if (ops->map_sg_p2p)
> + ents = ops->map_sg_p2p(dev, sg, nents, dir, attrs);
> + else
> + return 0;
> + } else
> + ents = ops->map_sg(dev, sg, nents, dir, attrs);
> +
>   BUG_ON(ents < 0);
>   debug_dma_map_sg(dev, sg, nents, ents, dir);

I could get behind this. Though a couple of points:

1) It means that sg_has_p2p has to walk the entire sg and check every
page. Then map_sg_p2p/map_sg has to walk it again and repeat the check
then do some operation per page. If anyone is concerned about the
dma_map performance this could be an issue.

2) Without knowing exactly what the arch specific code may need to do
it's hard to say that this is exactly the right approach. If every
dma_ops provider has to do exactly this on every page it may lead to a
lot of duplicate code:

foreach_sg page:
   if (pci_page_is_p2p(page)) {
dma_addr = pci_p2p_map_page(page)
if (!dma_addr)
return 0;
continue
   }
   ...

The only thing I'm presently aware of is the segment check and applying
the offset to the physical address -- neither of which has much to do
with the specific dma_ops providers. It _may_ be that this needs to be
bus specific and not arch specific which I think is what Dan may be
getting at. So it may make sense to just have a pci_map_sg_p2p() which
takes a dma_ops struct it would use for any page that isn't a p2p page.

Logan


[PATCH] nvmet: convert from kmap to nvmet_copy_from_sgl

2017-04-18 Thread Logan Gunthorpe
This is safer as it doesn't rely on the data being stored in
a single page in an sgl.

It also aids our effort to start phasing out users of sg_page. See [1].

For this we kmalloc some memory, copy to it and free at the end. Note:
we can't allocate this memory on the stack as the kbuild test robot
reports some frame size overflows on i386.

[1] https://lwn.net/Articles/720053/

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Sagi Grimberg <s...@grimberg.me>
---
 drivers/nvme/target/fabrics-cmd.c | 32 +---
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/fabrics-cmd.c 
b/drivers/nvme/target/fabrics-cmd.c
index 8bd022af..2e0ab10 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -122,7 +122,15 @@ static void nvmet_execute_admin_connect(struct nvmet_req 
*req)
struct nvmet_ctrl *ctrl = NULL;
u16 status = 0;
 
-   d = kmap(sg_page(req->sg)) + req->sg->offset;
+   d = kmalloc(sizeof(*d), GFP_KERNEL);
+   if (!d) {
+   status = NVME_SC_INTERNAL;
+   goto complete;
+   }
+
+   status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+   if (status)
+   goto out;
 
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -143,7 +151,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req 
*req)
}
 
status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
-   le32_to_cpu(c->kato), );
+ le32_to_cpu(c->kato), );
if (status)
goto out;
 
@@ -158,7 +166,8 @@ static void nvmet_execute_admin_connect(struct nvmet_req 
*req)
req->rsp->result.u16 = cpu_to_le16(ctrl->cntlid);
 
 out:
-   kunmap(sg_page(req->sg));
+   kfree(d);
+complete:
nvmet_req_complete(req, status);
 }
 
@@ -170,7 +179,15 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
u16 qid = le16_to_cpu(c->qid);
u16 status = 0;
 
-   d = kmap(sg_page(req->sg)) + req->sg->offset;
+   d = kmalloc(sizeof(*d), GFP_KERNEL);
+   if (!d) {
+   status = NVME_SC_INTERNAL;
+   goto complete;
+   }
+
+   status = nvmet_copy_from_sgl(req, 0, d, sizeof(*d));
+   if (status)
+   goto out;
 
/* zero out initial completion result, assign values as needed */
req->rsp->result.u32 = 0;
@@ -183,8 +200,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
}
 
status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn,
-   le16_to_cpu(d->cntlid),
-   req, );
+le16_to_cpu(d->cntlid),
+req, );
if (status)
goto out;
 
@@ -205,7 +222,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
 
 out:
-   kunmap(sg_page(req->sg));
+   kfree(d);
+complete:
nvmet_req_complete(req, status);
return;
 
-- 
2.1.4



Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-18 Thread Logan Gunthorpe


On 18/04/17 04:24 PM, Jason Gunthorpe wrote:
> Try and write a stacked map_sg function like you describe and you will
> see how horrible it quickly becomes.

Yes, unfortunately, I have to agree with this statement completely.

> Since dma mapping is a performance path we must be careful not to
> create intrinsic inefficiencies with otherwise nice layering :)

Yeah, I'm also personally thinking your proposal is the way to go as
well. Dan's injected ops suggestion is interesting but I can't see how
it solves the issue completely. Your proposal is the only one that seems
to be complete to me. It just has a few minor pain points which I've
already described but are likely manageable and less than the pain
stacked dma_ops creates.

Logan


Re: [RFC 0/8] Copy Offload with Peer-to-Peer PCI Memory

2017-04-24 Thread Logan Gunthorpe


On 24/04/17 01:36 AM, Knut Omang wrote:
> My first reflex when reading this thread was to think that this whole domain
> lends it self excellently to testing via Qemu. Could it be that doing this in 
> the opposite direction might be a safer approach in the long run even though 
> (significant) more work up-front?

That's an interesting idea. We did do some very limited testing on qemu
with one iteration of our work. However, it's difficult because there is
no support for any RDMA devices which are a part of our primary use
case. I also imagine it would be quite difficult to develop those models
given the array of hardware that needs to be supported and the deep
functional knowledge required to figure out appropriate restrictions.

Logan


Re: [PATCH v5 3/6] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-07-31 Thread Logan Gunthorpe


On 31/07/17 10:10 AM, Andy Shevchenko wrote:
> Some drivers (hardware) would like to have non-atomic MMIO accesses
> when readq() defined

Huh? But that's the whole point of the io64-nonatomic header. If a
driver wants a specific non-atomic access they should just code two 32
bit accesses.

> In case of readq() / writeq() it's defined by the order of inclusion:
> 
> 1)
> include <...non-atomic...>
> include 
> 
> Always non-atomic will be used.

I'm afraid you're wrong about this. The io-64-nonatomic-xx header
includes linux/io.h. Thus the order of the includes doesn't matter and
it will always auto switch. In any case, making an interface do
different things depending on the order of include files is *completely*
insane.

> P.S. I have done a table of comparison between IO accessors in Linux
> kernel and it looks hell out of being consistent.

There are a few corner oddities but it's really not that bad. Most
things are done for a reason if you dig into them.

Logan


Re: [PATCH v5 3/6] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-07-31 Thread Logan Gunthorpe


On 30/07/17 10:03 AM, Andy Shevchenko wrote:
> On Thu, Jul 27, 2017 at 2:19 AM, Logan Gunthorpe <log...@deltatee.com> wrote:
>> In order to provide non-atomic functions for io{read|write}64 that will
>> use readq and writeq when appropriate. We define a number of variants
>> of these functions in the generic iomap that will do non-atomic
>> operations on pio but atomic operations on mmio.
>>
>> These functions are only defined if readq and writeq are defined. If
>> they are not, then the wrappers that always use non-atomic operations
>> from include/linux/io-64-nonatomic*.h will be used.
> 
> Don't you see here a slight problem?
> 
> In some cases we want to substitute atomic in favour of non-atomic
> when both are defined.
> So, please don't do this "smartly".

I'm not sure what you mean here. The driver should use ioread64 and
include an io-64-nonatomic header. Then there are three cases:

1) The arch has no atomic 64 bit io operations defined. In this case it
uses the non-atomic inline function in the io-64-nonatomic header.

2) The arch uses CONFIG_GENERIC_IOMAP and has readq defined, but not
ioread64 defined (likely because pio can't do atomic 64 bit operations
but mmio can). In this case we need to use the ioread64_xx functions
defined in iomap.c which do atomic mmio and non-atomic pio.

3) The arch has ioread64 defined so the atomic operation is used.

>> +u64 ioread64_lo_hi(void __iomem *addr)
>> +{
>> +   IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
>> +   return 0xLL;
>> +}
> 
> U missed u.

I'll fix this in the next revision.

Thanks,

Logan



Re: [PATCH v3 01/16] switchtec: move structure definitions into a common header

2017-07-31 Thread Logan Gunthorpe
Hey,

I've never found any consistency in the capitalization and prefixes
patch titles. I used to capitalize them all but I've seen many
developers use lower case. In any case, I've changed them as you requested.

I've also fixed all my spelling mistakes you've pointed out in the
commit messages.

I'll submit a v4 set next week with these changes and any other feedback
I receive in the mean time.

Thanks,

Logan

On 31/07/17 02:15 PM, Bjorn Helgaas wrote:
> To follow the existing changelog subject pattern in drivers/ntb/hw, I
> would change yours like this:
> 
>   -switchtec: move structure definitions into a common header
>   +NTB: switchtec: Move structure definitions into a common header
> 
> I.e., add "NTB: " prefix and capitalize the first word of the
> description.  If I were merging this, I would silently change this
> myself, but obviously this already has my ack and will go through
> Jon's tree.
> 
> On Tue, Jul 25, 2017 at 02:57:38PM -0600, Logan Gunthorpe wrote:
>> Create the switchtec.h header in include/linux with hardware defines
>> and the switchtec_dev structure. Both moved directly from switchtec.c.
>> This is a prep patch for creating an NTB driver for Switchtec.
>>
>> Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
>> Reviewed-by: Stephen Bates <sba...@raithlin.com>
>> Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
>> Acked-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
>> Acked-by: Bjorn Helgaas <bhelg...@google.com>
>> ---
>>  MAINTAINERS|   1 +
>>  drivers/pci/switch/switchtec.c | 260 +-
>>  include/linux/switchtec.h  | 279 
>> +
>>  3 files changed, 281 insertions(+), 259 deletions(-)
>>  create mode 100644 include/linux/switchtec.h
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index 205d3977ac46..4ff2ad7c1c7b 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -10146,6 +10146,7 @@ F:   Documentation/switchtec.txt
>>  F:  Documentation/ABI/testing/sysfs-class-switchtec
>>  F:  drivers/pci/switch/switchtec*
>>  F:  include/uapi/linux/switchtec_ioctl.h
>> +F:  include/linux/switchtec.h
>>  
>>  PCI DRIVER FOR NVIDIA TEGRA
>>  M:  Thierry Reding <thierry.red...@gmail.com>
>> diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
>> index af81b2dec42e..5b75d3008ff8 100644
>> --- a/drivers/pci/switch/switchtec.c
>> +++ b/drivers/pci/switch/switchtec.c
>> @@ -13,6 +13,7 @@
>>   *
>>   */
>>  
>> +#include 
>>  #include 
>>  
>>  #include 
>> @@ -20,8 +21,6 @@
>>  #include 
>>  #include 
>>  #include 
>> -#include 
>> -#include 
>>  #include 
>>  
>>  MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
>> @@ -37,263 +36,6 @@ static dev_t switchtec_devt;
>>  static struct class *switchtec_class;
>>  static DEFINE_IDA(switchtec_minor_ida);
>>  
>> -#define MICROSEMI_VENDOR_ID 0x11f8
>> -#define MICROSEMI_NTB_CLASSCODE 0x068000
>> -#define MICROSEMI_MGMT_CLASSCODE0x058000
>> -
>> -#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
>> -#define SWITCHTEC_MAX_PFF_CSR 48
>> -
>> -#define SWITCHTEC_EVENT_OCCURRED BIT(0)
>> -#define SWITCHTEC_EVENT_CLEARBIT(0)
>> -#define SWITCHTEC_EVENT_EN_LOG   BIT(1)
>> -#define SWITCHTEC_EVENT_EN_CLI   BIT(2)
>> -#define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
>> -#define SWITCHTEC_EVENT_FATALBIT(4)
>> -
>> -enum {
>> -SWITCHTEC_GAS_MRPC_OFFSET   = 0x,
>> -SWITCHTEC_GAS_TOP_CFG_OFFSET= 0x1000,
>> -SWITCHTEC_GAS_SW_EVENT_OFFSET   = 0x1800,
>> -SWITCHTEC_GAS_SYS_INFO_OFFSET   = 0x2000,
>> -SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200,
>> -SWITCHTEC_GAS_PART_CFG_OFFSET   = 0x4000,
>> -SWITCHTEC_GAS_NTB_OFFSET= 0x1,
>> -SWITCHTEC_GAS_PFF_CSR_OFFSET= 0x134000,
>> -};
>> -
>> -struct mrpc_regs {
>> -u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
>> -u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
>> -u32 cmd;
>> -u32 status;
>> -u32 ret_value;
>> -} __packed;
>> -
>> -enum mrpc_status {
>> -SWITCHTEC_MRPC_STATUS_INPROGRESS = 1,
>> -SWITCHTEC_MRPC_STATUS_DONE = 2,
>> -SWITCHTEC_MRPC_STATUS_ERROR = 0xFF,
>> -SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100,
>> -};
>> -
>> -struct sw_event_regs {
>> -u64 event_report_ctrl;
>> -u64 reserved1;
>

Re: [PATCH v5 3/6] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-07-31 Thread Logan Gunthorpe


On 31/07/17 11:58 AM, Andy Shevchenko wrote:
> On Mon, Jul 31, 2017 at 7:31 PM, Logan Gunthorpe <log...@deltatee.com> wrote:
>> On 31/07/17 10:10 AM, Andy Shevchenko wrote:
>>> Some drivers (hardware) would like to have non-atomic MMIO accesses
>>> when readq() defined
>>
>> Huh? But that's the whole point of the io64-nonatomic header. If a
>> driver wants a specific non-atomic access they should just code two 32
>> bit accesses.

> You mean to call them directly as lo_hi_XXX() or hi_lo_XXX() ?
> Yes it would work.

I suppose you could do that too but I really meant just using two io32
calls. That's the most explicit way to indicate you want a non-atomic
access.

Logan


Re: [PATCH v5 3/6] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-07-31 Thread Logan Gunthorpe


On 31/07/17 12:03 PM, Andy Shevchenko wrote:
> 
> Per commit 3a044178cccf they are exactly created for this kind of cases.
> 

Sure, ok, and my patchset provides the same set of functions to satisfy
such a use.

Logan


Re: BUG: NULL pointer dereference at ib_uverbs_comp_handler+0x20

2017-08-01 Thread Logan Gunthorpe

On 01/08/17 01:29 PM, Jason Gunthorpe wrote:
> On Tue, Aug 01, 2017 at 12:32:57PM -0600, Logan Gunthorpe wrote:
>>  Couldn't create rdma QP - Invalid argument
>> Unable to create QP.
>> Failed to create QP.
> 
> Failing to create a QP makes me wonder if you have have this patch?
> 
>  Subject: [PATCH v2 1/2] RDMA/uverbs: Fix the check for port number
> 
>  The port number is only valid if IB_QP_PORT is set in the mask.
>  So only check port number if it is valid to prevent modify_qp from
>  failing due to an invalid port number.
> 
>  Fixes: 5ecce4c9b17b("Check port number supplied by user verbs cmds")
>  Cc: <sta...@vger.kernel.org> # v2.6.14+
>  Reviewed-by: Steve Wise <sw...@opengridcomputing.com>
>  Signed-off-by: Mustafa Ismail <mustafa.ism...@intel.com>

Oh, oops, I forgot about that. I mentioned the fix for that in my
original email and it seems I wasn't testing apples to apples for my
testing today. During my testing today, the branch with the reverted
commits had the fix for that commit while the branch with Bharat's patch
didn't.

I just did a test with both Bharat's patch and 5a7a88f1b4, and
everything is working correctly again.

So that's great, we just need these patches to be picked up by the
stable kernels.

Thanks,

Logan





Re: [PATCH v3 14/16] switchtec_ntb: implement scratchpad registers

2017-08-01 Thread Logan Gunthorpe


On 01/08/17 01:10 PM, Jon Mason wrote:
> It would probaly be better if I remarked about the SPADs in the actual
> patch about the SPADS :)
> 
> The whole point of using the SPADs in the NTB driver was to workaround
> the problems establishing a connection between the two sides of the
> NTB and where everything lives.  So, using a MW to get around the
> SPADs is sort of backwards (and slightly funny).  I realize you are
> trying to use the existing transport with minimal changes to enable
> your hardware, and thus this makes logical sense to you.  However, if
> the SPADs are not really needed, then we should either remove them
> from the transport (or use them for something else).
>
> Per my comment in the other patch, I'm amenable to take this series
> as-is, assuming you are willing to address this design issue in the
> near future.  Thoughts?

Yes, I agree. I'd be willing to help but it seems the clients are
written the way they are for the other drivers, so it's their needs
(which I'm not fully aware of) that have to be considered.

I've also made all the other changes you sent as well as the file rename
Dave requested. Once I see the bug fix patch you were going to pull hit
ntb-next I'll rebase, test and resubmit.

Thanks,

Logan


[PATCH v4 15/15] NTB: switchtec_ntb: Update switchtec documentation with notes for NTB

2017-08-03 Thread Logan Gunthorpe
The switchtec_ntb driver has a couple requirements on the switchec's
hardware configuration so we add these notes to the documentation.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 Documentation/switchtec.txt | 12 
 1 file changed, 12 insertions(+)

diff --git a/Documentation/switchtec.txt b/Documentation/switchtec.txt
index a0a9c7b3d4d5..f788264921ff 100644
--- a/Documentation/switchtec.txt
+++ b/Documentation/switchtec.txt
@@ -78,3 +78,15 @@ The following IOCTLs are also supported by the device:
   between PCI Function Framework number (used by the event system)
   and Switchtec Logic Port ID and Partition number (which is more
   user friendly).
+
+
+Non-Transparent Bridge (NTB) Driver
+===
+
+An NTB driver is provided for the switchtec hardware in switchtec_ntb.
+Currently, it only supports switches configured with exactly 2
+partitions. It also requires the following configuration settings:
+
+* Both partitions must be able to access each other's GAS spaces.
+  Thus, the bits in the GAS Access Vector under Management Settings
+  must be set to support this.
-- 
2.11.0



[PATCH v4 01/15] NTB: switchtec: Move structure definitions into a common header

2017-08-03 Thread Logan Gunthorpe
Create the switchtec.h header in include/linux with hardware defines
and the switchtec_dev structure. Both moved directly from switchtec.c.
This is a prep patch for creating an NTB driver for Switchtec.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
Acked-by: Bjorn Helgaas <bhelg...@google.com>
---
 MAINTAINERS|   1 +
 drivers/pci/switch/switchtec.c | 260 +-
 include/linux/switchtec.h  | 279 +
 3 files changed, 281 insertions(+), 259 deletions(-)
 create mode 100644 include/linux/switchtec.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 205d3977ac46..4ff2ad7c1c7b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10146,6 +10146,7 @@ F:  Documentation/switchtec.txt
 F: Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
 F: include/uapi/linux/switchtec_ioctl.h
+F: include/linux/switchtec.h
 
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index af81b2dec42e..5b75d3008ff8 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -13,6 +13,7 @@
  *
  */
 
+#include 
 #include 
 
 #include 
@@ -20,8 +21,6 @@
 #include 
 #include 
 #include 
-#include 
-#include 
 #include 
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) PCIe Management Driver");
@@ -37,263 +36,6 @@ static dev_t switchtec_devt;
 static struct class *switchtec_class;
 static DEFINE_IDA(switchtec_minor_ida);
 
-#define MICROSEMI_VENDOR_ID 0x11f8
-#define MICROSEMI_NTB_CLASSCODE 0x068000
-#define MICROSEMI_MGMT_CLASSCODE0x058000
-
-#define SWITCHTEC_MRPC_PAYLOAD_SIZE 1024
-#define SWITCHTEC_MAX_PFF_CSR 48
-
-#define SWITCHTEC_EVENT_OCCURRED BIT(0)
-#define SWITCHTEC_EVENT_CLEARBIT(0)
-#define SWITCHTEC_EVENT_EN_LOG   BIT(1)
-#define SWITCHTEC_EVENT_EN_CLI   BIT(2)
-#define SWITCHTEC_EVENT_EN_IRQ   BIT(3)
-#define SWITCHTEC_EVENT_FATALBIT(4)
-
-enum {
-   SWITCHTEC_GAS_MRPC_OFFSET   = 0x,
-   SWITCHTEC_GAS_TOP_CFG_OFFSET= 0x1000,
-   SWITCHTEC_GAS_SW_EVENT_OFFSET   = 0x1800,
-   SWITCHTEC_GAS_SYS_INFO_OFFSET   = 0x2000,
-   SWITCHTEC_GAS_FLASH_INFO_OFFSET = 0x2200,
-   SWITCHTEC_GAS_PART_CFG_OFFSET   = 0x4000,
-   SWITCHTEC_GAS_NTB_OFFSET= 0x1,
-   SWITCHTEC_GAS_PFF_CSR_OFFSET= 0x134000,
-};
-
-struct mrpc_regs {
-   u8 input_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
-   u8 output_data[SWITCHTEC_MRPC_PAYLOAD_SIZE];
-   u32 cmd;
-   u32 status;
-   u32 ret_value;
-} __packed;
-
-enum mrpc_status {
-   SWITCHTEC_MRPC_STATUS_INPROGRESS = 1,
-   SWITCHTEC_MRPC_STATUS_DONE = 2,
-   SWITCHTEC_MRPC_STATUS_ERROR = 0xFF,
-   SWITCHTEC_MRPC_STATUS_INTERRUPTED = 0x100,
-};
-
-struct sw_event_regs {
-   u64 event_report_ctrl;
-   u64 reserved1;
-   u64 part_event_bitmap;
-   u64 reserved2;
-   u32 global_summary;
-   u32 reserved3[3];
-   u32 stack_error_event_hdr;
-   u32 stack_error_event_data;
-   u32 reserved4[4];
-   u32 ppu_error_event_hdr;
-   u32 ppu_error_event_data;
-   u32 reserved5[4];
-   u32 isp_error_event_hdr;
-   u32 isp_error_event_data;
-   u32 reserved6[4];
-   u32 sys_reset_event_hdr;
-   u32 reserved7[5];
-   u32 fw_exception_hdr;
-   u32 reserved8[5];
-   u32 fw_nmi_hdr;
-   u32 reserved9[5];
-   u32 fw_non_fatal_hdr;
-   u32 reserved10[5];
-   u32 fw_fatal_hdr;
-   u32 reserved11[5];
-   u32 twi_mrpc_comp_hdr;
-   u32 twi_mrpc_comp_data;
-   u32 reserved12[4];
-   u32 twi_mrpc_comp_async_hdr;
-   u32 twi_mrpc_comp_async_data;
-   u32 reserved13[4];
-   u32 cli_mrpc_comp_hdr;
-   u32 cli_mrpc_comp_data;
-   u32 reserved14[4];
-   u32 cli_mrpc_comp_async_hdr;
-   u32 cli_mrpc_comp_async_data;
-   u32 reserved15[4];
-   u32 gpio_interrupt_hdr;
-   u32 gpio_interrupt_data;
-   u32 reserved16[4];
-} __packed;
-
-enum {
-   SWITCHTEC_CFG0_RUNNING = 0x04,
-   SWITCHTEC_CFG1_RUNNING = 0x05,
-   SWITCHTEC_IMG0_RUNNING = 0x03,
-   SWITCHTEC_IMG1_RUNNING = 0x07,
-};
-
-struct sys_info_regs {
-   u32 device_id;
-   u32 device_version;
-   u32 firmware_version;
-   u32 reserved1;
-   u32 vendor_table_revision;
-   u32 table_format_version;
-   u32 partition_id;
-   u32 cfg_file_fmt_version;
-   u16 cfg_running;
-   u16 img_running;
-   u32 reserved2[57];
-   char vendor_id[8];
-   char product_id[16];
-   char product_revision[4];
-   char component_vendor[8];
-   u16 component_id;
-   u8 component_re

[PATCH v4 04/15] NTB: switchtec: Add link event notifier callback

2017-08-03 Thread Logan Gunthorpe
In order for the Switchtec NTB code to handle link change events we
create a notifier callback in the switchtec code which gets called
whenever an appropriate event interrupt occurs.

In order to preserve userspace's ability to follow these events,
we compare the event count with a stored copy from last time we
checked.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Bjorn Helgaas <bhelg...@google.com>
---
 drivers/pci/switch/switchtec.c | 51 ++
 include/linux/switchtec.h  |  4 
 2 files changed, 55 insertions(+)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 39c9218d733f..807569e62bee 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -978,6 +978,49 @@ static const struct file_operations switchtec_fops = {
.compat_ioctl = switchtec_dev_ioctl,
 };
 
+static void link_event_work(struct work_struct *work)
+{
+   struct switchtec_dev *stdev;
+
+   stdev = container_of(work, struct switchtec_dev, link_event_work);
+
+   if (stdev->link_notifier)
+   stdev->link_notifier(stdev);
+}
+
+static void check_link_state_events(struct switchtec_dev *stdev)
+{
+   int idx;
+   u32 reg;
+   int count;
+   int occurred = 0;
+
+   for (idx = 0; idx < stdev->pff_csr_count; idx++) {
+   reg = ioread32(>mmio_pff_csr[idx].link_state_hdr);
+   dev_dbg(>dev, "link_state: %d->%08x\n", idx, reg);
+   count = (reg >> 5) & 0xFF;
+
+   if (count != stdev->link_event_count[idx]) {
+   occurred = 1;
+   stdev->link_event_count[idx] = count;
+   }
+   }
+
+   if (occurred)
+   schedule_work(>link_event_work);
+}
+
+static void enable_link_state_events(struct switchtec_dev *stdev)
+{
+   int idx;
+
+   for (idx = 0; idx < stdev->pff_csr_count; idx++) {
+   iowrite32(SWITCHTEC_EVENT_CLEAR |
+ SWITCHTEC_EVENT_EN_IRQ,
+ >mmio_pff_csr[idx].link_state_hdr);
+   }
+}
+
 static void stdev_release(struct device *dev)
 {
struct switchtec_dev *stdev = to_stdev(dev);
@@ -1030,6 +1073,7 @@ static struct switchtec_dev *stdev_create(struct pci_dev 
*pdev)
stdev->mrpc_busy = 0;
INIT_WORK(>mrpc_work, mrpc_event_work);
INIT_DELAYED_WORK(>mrpc_timeout, mrpc_timeout_work);
+   INIT_WORK(>link_event_work, link_event_work);
init_waitqueue_head(>event_wq);
atomic_set(>event_cnt, 0);
 
@@ -1073,6 +1117,9 @@ static int mask_event(struct switchtec_dev *stdev, int 
eid, int idx)
if (!(hdr & SWITCHTEC_EVENT_OCCURRED && hdr & SWITCHTEC_EVENT_EN_IRQ))
return 0;
 
+   if (eid == SWITCHTEC_IOCTL_EVENT_LINK_STATE)
+   return 0;
+
dev_dbg(>dev, "%s: %d %d %x\n", __func__, eid, idx, hdr);
hdr &= ~(SWITCHTEC_EVENT_EN_IRQ | SWITCHTEC_EVENT_OCCURRED);
iowrite32(hdr, hdr_reg);
@@ -1092,6 +1139,7 @@ static int mask_all_events(struct switchtec_dev *stdev, 
int eid)
for (idx = 0; idx < stdev->pff_csr_count; idx++) {
if (!stdev->pff_local[idx])
continue;
+
count += mask_event(stdev, eid, idx);
}
} else {
@@ -1116,6 +1164,8 @@ static irqreturn_t switchtec_event_isr(int irq, void *dev)
iowrite32(reg, >mmio_part_cfg->mrpc_comp_hdr);
}
 
+   check_link_state_events(stdev);
+
for (eid = 0; eid < SWITCHTEC_IOCTL_MAX_EVENTS; eid++)
event_count += mask_all_events(stdev, eid);
 
@@ -1242,6 +1292,7 @@ static int switchtec_pci_probe(struct pci_dev *pdev,
iowrite32(SWITCHTEC_EVENT_CLEAR |
  SWITCHTEC_EVENT_EN_IRQ,
  >mmio_part_cfg->mrpc_comp_hdr);
+   enable_link_state_events(stdev);
 
rc = cdev_device_add(>cdev, >dev);
if (rc)
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 9c2be7631257..d8159944f013 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -353,6 +353,10 @@ struct switchtec_dev {
 
wait_queue_head_t event_wq;
atomic_t event_cnt;
+
+   struct work_struct link_event_work;
+   void (*link_notifier)(struct switchtec_dev *stdev);
+   u8 link_event_count[SWITCHTEC_MAX_PFF_CSR];
 };
 
 static inline struct switchtec_dev *to_stdev(struct device *dev)
-- 
2.11.0



[PATCH v4 06/15] NTB: Add check and comment for link up to mw_count() and mw_get_align()

2017-08-03 Thread Logan Gunthorpe
Adds a comment and a check to ntb_mw_get_align() so that it always fails
if the function is called before the link is up.

Also adds a comment to ntb_mw_count() to note that it may return 0 if
it is called before the link is up.

This is to prevent accidental mis-use in clients that are testing
on hardware that this doesn't matter for.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 include/linux/ntb.h | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/ntb.h b/include/linux/ntb.h
index 609e232c00da..47f2966cfd7f 100644
--- a/include/linux/ntb.h
+++ b/include/linux/ntb.h
@@ -730,7 +730,8 @@ static inline int ntb_link_disable(struct ntb_dev *ntb)
  * Hardware and topology may support a different number of memory windows.
  * Moreover different peer devices can support different number of memory
  * windows. Simply speaking this method returns the number of possible inbound
- * memory windows to share with specified peer device.
+ * memory windows to share with specified peer device. Note: this may return
+ * zero if the link is not up yet.
  *
  * Return: the number of memory windows.
  */
@@ -751,7 +752,7 @@ static inline int ntb_mw_count(struct ntb_dev *ntb, int 
pidx)
  * Get the alignments of an inbound memory window with specified index.
  * NULL may be given for any output parameter if the value is not needed.
  * The alignment and size parameters may be used for allocation of proper
- * shared memory.
+ * shared memory. Note: this must only be called when the link is up.
  *
  * Return: Zero on success, otherwise a negative error number.
  */
@@ -760,6 +761,9 @@ static inline int ntb_mw_get_align(struct ntb_dev *ntb, int 
pidx, int widx,
   resource_size_t *size_align,
   resource_size_t *size_max)
 {
+   if (!(ntb_link_is_up(ntb, NULL, NULL) & (1 << pidx)))
+   return -ENOTCONN;
+
return ntb->ops->mw_get_align(ntb, pidx, widx, addr_align, size_align,
  size_max);
 }
-- 
2.11.0



[PATCH v4 09/15] NTB: switchtec_ntb: Initialize hardware for doorbells and messages

2017-08-03 Thread Logan Gunthorpe
Set up some hardware registers and creates interrupt service routines
for the doorbells and messages.

There are 64 doorbells in the switch that are shared between all
partitions. The upper 4 doorbells are also shared with the messages
and are therefore not used. Thus, this provides 28 doorbells for each
partition.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 142 +
 1 file changed, 142 insertions(+)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 831bfdf40068..175ac0baa7a0 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
 MODULE_VERSION("0.1");
@@ -75,6 +76,9 @@ struct switchtec_ntb {
int self_partition;
int peer_partition;
 
+   int doorbell_irq;
+   int message_irq;
+
struct ntb_info_regs __iomem *mmio_ntb;
struct ntb_ctrl_regs __iomem *mmio_ctrl;
struct ntb_dbmsg_regs __iomem *mmio_dbmsg;
@@ -86,6 +90,11 @@ struct switchtec_ntb {
struct shared_mw __iomem *peer_shared;
dma_addr_t self_shared_dma;
 
+   u64 db_mask;
+   u64 db_valid_mask;
+   int db_shift;
+   int db_peer_shift;
+
int nr_direct_mw;
int nr_lut_mw;
int direct_mw_to_bar[MAX_DIRECT_MW];
@@ -215,6 +224,49 @@ static void switchtec_ntb_init_mw(struct switchtec_ntb 
*sndev)
 
 }
 
+/*
+ * There are 64 doorbells in the switch hardware but this is
+ * shared among all partitions. So we must split them in half
+ * (32 for each partition). However, the message interrupts are
+ * also shared with the top 4 doorbells so we just limit this to
+ * 28 doorbells per partition
+ */
+static void switchtec_ntb_init_db(struct switchtec_ntb *sndev)
+{
+   sndev->db_valid_mask = 0x0FFF;
+
+   if (sndev->self_partition < sndev->peer_partition) {
+   sndev->db_shift = 0;
+   sndev->db_peer_shift = 32;
+   } else {
+   sndev->db_shift = 32;
+   sndev->db_peer_shift = 0;
+   }
+
+   sndev->db_mask = 0x0FFFULL;
+   iowrite64(~sndev->db_mask, >mmio_self_dbmsg->idb_mask);
+   iowrite64(sndev->db_valid_mask << sndev->db_peer_shift,
+ >mmio_self_dbmsg->odb_mask);
+}
+
+static void switchtec_ntb_init_msgs(struct switchtec_ntb *sndev)
+{
+   int i;
+   u32 msg_map = 0;
+
+   for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++) {
+   int m = i | sndev->peer_partition << 2;
+
+   msg_map |= m << i * 8;
+   }
+
+   iowrite32(msg_map, >mmio_self_dbmsg->msg_map);
+
+   for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++)
+   iowrite64(NTB_DBMSG_IMSG_STATUS | NTB_DBMSG_IMSG_MASK,
+ >mmio_self_dbmsg->imsg[i]);
+}
+
 static int switchtec_ntb_init_req_id_table(struct switchtec_ntb *sndev)
 {
int rc = 0;
@@ -364,6 +416,87 @@ static void switchtec_ntb_deinit_shared_mw(struct 
switchtec_ntb *sndev)
  sndev->self_shared_dma);
 }
 
+static irqreturn_t switchtec_ntb_doorbell_isr(int irq, void *dev)
+{
+   struct switchtec_ntb *sndev = dev;
+
+   dev_dbg(>stdev->dev, "doorbell\n");
+
+   return IRQ_HANDLED;
+}
+
+static irqreturn_t switchtec_ntb_message_isr(int irq, void *dev)
+{
+   int i;
+   struct switchtec_ntb *sndev = dev;
+
+   for (i = 0; i < ARRAY_SIZE(sndev->mmio_self_dbmsg->imsg); i++) {
+   u64 msg = ioread64(>mmio_self_dbmsg->imsg[i]);
+
+   if (msg & NTB_DBMSG_IMSG_STATUS) {
+   dev_dbg(>stdev->dev, "message: %d %08x\n", i,
+   (u32)msg);
+   iowrite8(1, >mmio_self_dbmsg->imsg[i].status);
+   }
+   }
+
+   return IRQ_HANDLED;
+}
+
+static int switchtec_ntb_init_db_msg_irq(struct switchtec_ntb *sndev)
+{
+   int i;
+   int rc;
+   int doorbell_irq = 0;
+   int message_irq = 0;
+   int event_irq;
+   int idb_vecs = sizeof(sndev->mmio_self_dbmsg->idb_vec_map);
+
+   event_irq = ioread32(>stdev->mmio_part_cfg->vep_vector_number);
+
+   while (doorbell_irq == event_irq)
+   doorbell_irq++;
+   while (message_irq == doorbell_irq ||
+  message_irq == event_irq)
+   message_irq++;
+
+   dev_dbg(>stdev->dev, "irqs - event: %d, db: %d, msgs: %d&q

[PATCH v6 1/7] drm/tilcdc: ensure nonatomic iowrite64 is not used

2017-08-03 Thread Logan Gunthorpe
Add a check to ensure iowrite64 is only used if it is atomic.

It was decided in [1] that the tilcdc driver should not be using an
atomic operation (so it was left out of this patchset). However, it turns
out that through the drm code, a nonatomic header is actually included:

include/linux/io-64-nonatomic-lo-hi.h
is included from include/drm/drm_os_linux.h:9:0,
from include/drm/drmP.h:74,
from include/drm/drm_modeset_helper.h:26,
from include/drm/drm_atomic_helper.h:33,
from drivers/gpu/drm/tilcdc/tilcdc_crtc.c:19:

And thus, without this change, this patchset would inadvertantly
change the behaviour of the tilcdc driver.

[1] 
lkml.kernel.org/r/cak8p3a2hho_zcnstzq7hmwsz5la5thu19fwzpun16imnyyn...@mail.gmail.com

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Jyri Sarha <jsa...@ti.com>
Cc: Arnd Bergmann <a...@arndb.de>
Cc: Tomi Valkeinen <tomi.valkei...@ti.com>
Cc: David Airlie <airl...@linux.ie>
---
 drivers/gpu/drm/tilcdc/tilcdc_regs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/tilcdc/tilcdc_regs.h 
b/drivers/gpu/drm/tilcdc/tilcdc_regs.h
index 9d528c0a67a4..5048ebb86835 100644
--- a/drivers/gpu/drm/tilcdc/tilcdc_regs.h
+++ b/drivers/gpu/drm/tilcdc/tilcdc_regs.h
@@ -133,7 +133,7 @@ static inline void tilcdc_write64(struct drm_device *dev, 
u32 reg, u64 data)
struct tilcdc_drm_private *priv = dev->dev_private;
volatile void __iomem *addr = priv->mmio + reg;
 
-#ifdef iowrite64
+#if defined(iowrite64) && !defined(iowrite64_is_nonatomic)
iowrite64(data, addr);
 #else
__iowmb();
-- 
2.11.0



[PATCH v6 6/7] ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

2017-08-03 Thread Logan Gunthorpe
Now that ioread64 and iowrite64 are available in io-64-nonatomic,
we can remove the hack at the top of ntb_hw_intel.c and replace it
with an include.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Acked-by: Dave Jiang <dave.ji...@intel.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
Acked-by: Jon Mason <jdma...@kudzu.us>
---
 drivers/ntb/hw/intel/ntb_hw_intel.c | 30 +-
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c 
b/drivers/ntb/hw/intel/ntb_hw_intel.c
index 2557e2c05b90..606c90f59d4b 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ntb_hw_intel.h"
 
@@ -155,35 +156,6 @@ MODULE_PARM_DESC(xeon_b2b_dsd_bar5_addr32,
 static inline enum ntb_topo xeon_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd);
 static int xeon_init_isr(struct intel_ntb_dev *ndev);
 
-#ifndef ioread64
-#ifdef readq
-#define ioread64 readq
-#else
-#define ioread64 _ioread64
-static inline u64 _ioread64(void __iomem *mmio)
-{
-   u64 low, high;
-
-   low = ioread32(mmio);
-   high = ioread32(mmio + sizeof(u32));
-   return low | (high << 32);
-}
-#endif
-#endif
-
-#ifndef iowrite64
-#ifdef writeq
-#define iowrite64 writeq
-#else
-#define iowrite64 _iowrite64
-static inline void _iowrite64(u64 val, void __iomem *mmio)
-{
-   iowrite32(val, mmio);
-   iowrite32(val >> 32, mmio + sizeof(u32));
-}
-#endif
-#endif
-
 static inline int pdev_is_atom(struct pci_dev *pdev)
 {
switch (pdev->device) {
-- 
2.11.0



[PATCH v6 0/7] make io{read|write}64 globally usable

2017-08-03 Thread Logan Gunthorpe
Changes since v5:
- Added a fix to the tilcdc driver to ensure it doesn't use the
  non-atomic operation. (This includes adding io{read|write}64[be]_is_nonatomic
  defines).

Changes since v4:
- Add functions so the powerpc implementation of iomap.c compiles. (As
  noticed by Horia)

Changes since v3:

- I noticed powerpc didn't use the appropriate functions seeing
  readq/writeq were not defined when iomap.h was included. Thus I've
  included a patch to adjust this
- Fixed some mistakes with a couple of the defines in io-64-nonatomic*
  headers
- Fixed a typo noticed by Horia.

(earlier versions were drastically different)

Horia Geantă (1):
  crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

Logan Gunthorpe (6):
  drm/tilcdc: ensure nonatomic iowrite64 is not used
  powerpc: io.h: move iomap.h include so that it can use readq/writeq
defs
  powerpc: iomap.c: introduce io{read|write}64_{lo_hi|hi_lo}
  iomap: introduce io{read|write}64_{lo_hi|hi_lo}
  io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros
  ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

 arch/powerpc/include/asm/io.h |   6 +-
 arch/powerpc/kernel/iomap.c   |  40 +++
 drivers/crypto/caam/regs.h|  35 ++---
 drivers/gpu/drm/tilcdc/tilcdc_regs.h  |   2 +-
 drivers/ntb/hw/intel/ntb_hw_intel.c   |  30 +---
 include/asm-generic/iomap.h   |  26 +--
 include/linux/io-64-nonatomic-hi-lo.h |  64 +
 include/linux/io-64-nonatomic-lo-hi.h |  64 +
 lib/iomap.c   | 132 ++
 9 files changed, 331 insertions(+), 68 deletions(-)

--
2.11.0


[PATCH v4 05/15] NTB: Ensure ntb_mw_get_align() is only called when the link is up

2017-08-03 Thread Logan Gunthorpe
With Switchtec hardware it's impossible to get the alignment parameters
for a peer's memory window until the peer's driver has configured its
windows. Strictly speaking, the link doesn't have to be up for this,
but the link being up is the only way the client can tell that
the other side has been configured.

This patch converts ntb_transport and ntb_perf to use this function after
the link goes up. This simplifies these clients slightly because they
no longer have to store the alignment parameters. It also tweaks
ntb_tool so that peer_mw_trans will print zero if it is run before
the link goes up.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/ntb_transport.c | 20 ++--
 drivers/ntb/test/ntb_perf.c | 18 +-
 drivers/ntb/test/ntb_tool.c |  6 +++---
 3 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index f58d8e305323..045e3dd4750e 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -191,8 +191,6 @@ struct ntb_transport_qp {
 struct ntb_transport_mw {
phys_addr_t phys_addr;
resource_size_t phys_size;
-   resource_size_t xlat_align;
-   resource_size_t xlat_align_size;
void __iomem *vbase;
size_t xlat_size;
size_t buff_size;
@@ -687,13 +685,20 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int 
num_mw,
struct ntb_transport_mw *mw = >mw_vec[num_mw];
struct pci_dev *pdev = nt->ndev->pdev;
size_t xlat_size, buff_size;
+   resource_size_t xlat_align;
+   resource_size_t xlat_align_size;
int rc;
 
if (!size)
return -EINVAL;
 
-   xlat_size = round_up(size, mw->xlat_align_size);
-   buff_size = round_up(size, mw->xlat_align);
+   rc = ntb_mw_get_align(nt->ndev, PIDX, num_mw, _align,
+ _align_size, NULL);
+   if (rc)
+   return rc;
+
+   xlat_size = round_up(size, xlat_align_size);
+   buff_size = round_up(size, xlat_align);
 
/* No need to re-setup */
if (mw->xlat_size == xlat_size)
@@ -722,7 +727,7 @@ static int ntb_set_mw(struct ntb_transport_ctx *nt, int 
num_mw,
 * is a requirement of the hardware. It is recommended to setup CMA
 * for BAR sizes equal or greater than 4MB.
 */
-   if (!IS_ALIGNED(mw->dma_addr, mw->xlat_align)) {
+   if (!IS_ALIGNED(mw->dma_addr, xlat_align)) {
dev_err(>dev, "DMA memory %pad is not aligned\n",
>dma_addr);
ntb_free_mw(nt, num_mw);
@@ -1104,11 +1109,6 @@ static int ntb_transport_probe(struct ntb_client *self, 
struct ntb_dev *ndev)
for (i = 0; i < mw_count; i++) {
mw = >mw_vec[i];
 
-   rc = ntb_mw_get_align(ndev, PIDX, i, >xlat_align,
- >xlat_align_size, NULL);
-   if (rc)
-   goto err1;
-
rc = ntb_peer_mw_get_addr(ndev, i, >phys_addr,
  >phys_size);
if (rc)
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c
index 759f772fa00c..427112cf101a 100644
--- a/drivers/ntb/test/ntb_perf.c
+++ b/drivers/ntb/test/ntb_perf.c
@@ -108,8 +108,6 @@ MODULE_PARM_DESC(on_node, "Run threads only on NTB device 
node (default: true)")
 struct perf_mw {
phys_addr_t phys_addr;
resource_size_t phys_size;
-   resource_size_t xlat_align;
-   resource_size_t xlat_align_size;
void __iomem*vbase;
size_t  xlat_size;
size_t  buf_size;
@@ -472,13 +470,20 @@ static int perf_set_mw(struct perf_ctx *perf, 
resource_size_t size)
 {
struct perf_mw *mw = >mw;
size_t xlat_size, buf_size;
+   resource_size_t xlat_align;
+   resource_size_t xlat_align_size;
int rc;
 
if (!size)
return -EINVAL;
 
-   xlat_size = round_up(size, mw->xlat_align_size);
-   buf_size = round_up(size, mw->xlat_align);
+   rc = ntb_mw_get_align(perf->ntb, PIDX, 0, _align,
+ _align_size, NULL);
+   if (rc)
+   return rc;
+
+   xlat_size = round_up(size, xlat_align_size);
+   buf_size = round_up(size, xlat_align);
 
if (mw->xlat_size == xlat_size)
return 0;
@@ -567,11 +572,6 @@ static int perf_setup_mw(struct ntb_dev *ntb, struct 
perf_ctx *perf)
 
mw = >mw;
 
-   rc = ntb_mw_get_align(ntb, PIDX, 0, >xlat_align,
- >xlat_align_size, NULL);
-   if (rc)
-   return rc;
-
rc = ntb_peer_mw_get_addr(ntb, 0, >phys_addr, >phys_size);
if (rc)
return rc;
diff --git a/driv

[PATCH v4 03/15] NTB: switchtec: Add NTB hardware register definitions

2017-08-03 Thread Logan Gunthorpe
There are two additional regions: ctrl and dbmsg. The first is
for generic NTB control and memory windows. The second is for doorbells
and message registers. This patch also adds a number of related
constants for using these registers.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
---
 include/linux/switchtec.h | 84 +++
 1 file changed, 84 insertions(+)

diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 18e388101855..9c2be7631257 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -156,6 +156,12 @@ struct flash_info_regs {
struct partition_info vendor[8];
 };
 
+enum {
+   SWITCHTEC_NTB_REG_INFO_OFFSET   = 0x,
+   SWITCHTEC_NTB_REG_CTRL_OFFSET   = 0x4000,
+   SWITCHTEC_NTB_REG_DBMSG_OFFSET  = 0x64000,
+};
+
 struct ntb_info_regs {
u8  partition_count;
u8  partition_id;
@@ -191,6 +197,84 @@ struct part_cfg_regs {
 } __packed;
 
 enum {
+   NTB_CTRL_PART_OP_LOCK = 0x1,
+   NTB_CTRL_PART_OP_CFG = 0x2,
+   NTB_CTRL_PART_OP_RESET = 0x3,
+
+   NTB_CTRL_PART_STATUS_NORMAL = 0x1,
+   NTB_CTRL_PART_STATUS_LOCKED = 0x2,
+   NTB_CTRL_PART_STATUS_LOCKING = 0x3,
+   NTB_CTRL_PART_STATUS_CONFIGURING = 0x4,
+   NTB_CTRL_PART_STATUS_RESETTING = 0x5,
+
+   NTB_CTRL_BAR_VALID = 1 << 0,
+   NTB_CTRL_BAR_DIR_WIN_EN = 1 << 4,
+   NTB_CTRL_BAR_LUT_WIN_EN = 1 << 5,
+
+   NTB_CTRL_REQ_ID_EN = 1 << 0,
+
+   NTB_CTRL_LUT_EN = 1 << 0,
+
+   NTB_PART_CTRL_ID_PROT_DIS = 1 << 0,
+};
+
+struct ntb_ctrl_regs {
+   u32 partition_status;
+   u32 partition_op;
+   u32 partition_ctrl;
+   u32 bar_setup;
+   u32 bar_error;
+   u16 lut_table_entries;
+   u16 lut_table_offset;
+   u32 lut_error;
+   u16 req_id_table_size;
+   u16 req_id_table_offset;
+   u32 req_id_error;
+   u32 reserved1[7];
+   struct {
+   u32 ctl;
+   u32 win_size;
+   u64 xlate_addr;
+   } bar_entry[6];
+   u32 reserved2[216];
+   u32 req_id_table[256];
+   u32 reserved3[512];
+   u64 lut_entry[512];
+} __packed;
+
+#define NTB_DBMSG_IMSG_STATUS BIT_ULL(32)
+#define NTB_DBMSG_IMSG_MASK   BIT_ULL(40)
+
+struct ntb_dbmsg_regs {
+   u32 reserved1[1024];
+   u64 odb;
+   u64 odb_mask;
+   u64 idb;
+   u64 idb_mask;
+   u8  idb_vec_map[64];
+   u32 msg_map;
+   u32 reserved2;
+   struct {
+   u32 msg;
+   u32 status;
+   } omsg[4];
+
+   struct {
+   u32 msg;
+   u8  status;
+   u8  mask;
+   u8  src;
+   u8  reserved;
+   } imsg[4];
+
+   u8 reserved3[3928];
+   u8 msix_table[1024];
+   u8 reserved4[3072];
+   u8 pba[24];
+   u8 reserved5[4072];
+} __packed;
+
+enum {
SWITCHTEC_PART_CFG_EVENT_RESET = 1 << 0,
SWITCHTEC_PART_CFG_EVENT_MRPC_CMP = 1 << 1,
SWITCHTEC_PART_CFG_EVENT_MRPC_ASYNC_CMP = 1 << 2,
-- 
2.11.0



[PATCH v4 08/15] NTB: switchtec_ntb: Initialize hardware for memory windows

2017-08-03 Thread Logan Gunthorpe
Add the code to initialize the memory windows in the hardware.
This includes setting up the requester ID table, and figuring out
which BAR corresponds to which memory window. (Seeing the switch
can be configured with any number of BARs.)

Also, seeing the device doesn't have hardware for scratchpads or
determining the link status, we create a shared memory window that has
these features. A magic number with a version component will be used
to determine if the other side's driver is actually up.

The shared memory window also informs the other side of the
size and count of the local memory windows.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 360 +
 1 file changed, 360 insertions(+)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 253efba72275..831bfdf40068 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -15,37 +15,396 @@
 
 #include 
 #include 
+#include 
+#include 
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
 MODULE_VERSION("0.1");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Microsemi Corporation");
 
+static bool use_lut_mws;
+module_param(use_lut_mws, bool, 0644);
+MODULE_PARM_DESC(use_lut_mws,
+"Enable the use of the LUT based memory windows");
+
+#ifndef ioread64
+#ifdef readq
+#define ioread64 readq
+#else
+#define ioread64 _ioread64
+static inline u64 _ioread64(void __iomem *mmio)
+{
+   u64 low, high;
+
+   low = ioread32(mmio);
+   high = ioread32(mmio + sizeof(u32));
+   return low | (high << 32);
+}
+#endif
+#endif
+
+#ifndef iowrite64
+#ifdef writeq
+#define iowrite64 writeq
+#else
+#define iowrite64 _iowrite64
+static inline void _iowrite64(u64 val, void __iomem *mmio)
+{
+   iowrite32(val, mmio);
+   iowrite32(val >> 32, mmio + sizeof(u32));
+}
+#endif
+#endif
+
+#define SWITCHTEC_NTB_MAGIC 0x45CC0001
+#define MAX_MWS 128
+
+struct shared_mw {
+   u32 magic;
+   u32 partition_id;
+   u64 mw_sizes[MAX_MWS];
+};
+
+#define MAX_DIRECT_MW ARRAY_SIZE(((struct ntb_ctrl_regs *)(0))->bar_entry)
+#define LUT_SIZE SZ_64K
+
 struct switchtec_ntb {
struct switchtec_dev *stdev;
+
+   int self_partition;
+   int peer_partition;
+
+   struct ntb_info_regs __iomem *mmio_ntb;
+   struct ntb_ctrl_regs __iomem *mmio_ctrl;
+   struct ntb_dbmsg_regs __iomem *mmio_dbmsg;
+   struct ntb_ctrl_regs __iomem *mmio_self_ctrl;
+   struct ntb_ctrl_regs __iomem *mmio_peer_ctrl;
+   struct ntb_dbmsg_regs __iomem *mmio_self_dbmsg;
+
+   struct shared_mw *self_shared;
+   struct shared_mw __iomem *peer_shared;
+   dma_addr_t self_shared_dma;
+
+   int nr_direct_mw;
+   int nr_lut_mw;
+   int direct_mw_to_bar[MAX_DIRECT_MW];
+
+   int peer_nr_direct_mw;
+   int peer_nr_lut_mw;
+   int peer_direct_mw_to_bar[MAX_DIRECT_MW];
 };
 
+static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
+struct ntb_ctrl_regs __iomem *ctl,
+u32 op, int wait_status)
+{
+   static const char * const op_text[] = {
+   [NTB_CTRL_PART_OP_LOCK] = "lock",
+   [NTB_CTRL_PART_OP_CFG] = "configure",
+   [NTB_CTRL_PART_OP_RESET] = "reset",
+   };
+
+   int i;
+   u32 ps;
+   int status;
+
+   switch (op) {
+   case NTB_CTRL_PART_OP_LOCK:
+   status = NTB_CTRL_PART_STATUS_LOCKING;
+   break;
+   case NTB_CTRL_PART_OP_CFG:
+   status = NTB_CTRL_PART_STATUS_CONFIGURING;
+   break;
+   case NTB_CTRL_PART_OP_RESET:
+   status = NTB_CTRL_PART_STATUS_RESETTING;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   iowrite32(op, >partition_op);
+
+   for (i = 0; i < 1000; i++) {
+   if (msleep_interruptible(50) != 0) {
+   iowrite32(NTB_CTRL_PART_OP_RESET, >partition_op);
+   return -EINTR;
+   }
+
+   ps = ioread32(>partition_status) & 0x;
+
+   if (ps != status)
+   break;
+   }
+
+   if (ps == wait_status)
+   return 0;
+
+   if (ps == status) {
+   dev_err(>stdev->dev,
+   "Timed out while peforming %s (%d). (%08x)",
+   op_text[op], op,
+   ioread32(>partition_status));
+
+   return -ETIMEDOUT;
+   }
+
+   return -EIO;
+}
+
+static void switchtec_ntb_init_sndev

[PATCH v4 10/15] NTB: switchtec_ntb: Add skeleton NTB driver

2017-08-03 Thread Logan Gunthorpe
Add a skeleton NTB driver which will be filled out in subsequent patches.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 148 -
 include/linux/ntb.h|   3 +
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 175ac0baa7a0..158ed310cbaf 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
 MODULE_VERSION("0.1");
@@ -71,6 +72,7 @@ struct shared_mw {
 #define LUT_SIZE SZ_64K
 
 struct switchtec_ntb {
+   struct ntb_dev ntb;
struct switchtec_dev *stdev;
 
int self_partition;
@@ -161,10 +163,148 @@ static int switchtec_ntb_part_op(struct switchtec_ntb 
*sndev,
return -EIO;
 }
 
+static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx)
+{
+   return 0;
+}
+
+static int switchtec_ntb_mw_get_align(struct ntb_dev *ntb, int pidx,
+ int widx, resource_size_t *addr_align,
+ resource_size_t *size_align,
+ resource_size_t *size_max)
+{
+   return 0;
+}
+
+static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int pidx, int widx,
+ dma_addr_t addr, resource_size_t size)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_mw_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_mw_get_addr(struct ntb_dev *ntb, int idx,
+ phys_addr_t *base,
+ resource_size_t *size)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb,
+   enum ntb_speed *speed,
+   enum ntb_width *width)
+{
+   return 0;
+}
+
+static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
+enum ntb_speed max_speed,
+enum ntb_width max_width)
+{
+   return 0;
+}
+
+static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_read(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_spad_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+   return 0;
+}
+
+static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int pidx,
+int sidx, u32 val)
+{
+   return 0;
+}
+
+static const struct ntb_dev_ops switchtec_ntb_ops = {
+   .mw_count   = switchtec_ntb_mw_count,
+   .mw_get_align   = switchtec_ntb_mw_get_align,
+   .mw_set_trans   = switchtec_ntb_mw_set_trans,
+   .peer_mw_count  = switchtec_ntb_peer_mw_count,
+   .peer_mw_get_addr   = switchtec_ntb_peer_mw_get_addr,
+   .link_is_up = switchtec_ntb_link_is_up,
+   .link_enable= switchtec_ntb_link_enable,
+   .link_disable   = switchtec_ntb_link_disable,
+   .db_valid_mask  = switchtec_ntb_db_valid_mask,
+   .db_vector_count= switchtec_ntb_db_vector_count,
+   .db_vector_mask = switchtec_ntb_db_vector_mask,
+   .db_read= switchtec_ntb_db_read,
+   .db_clear   = switchtec_ntb_db_clear,
+   .db_set_mask= switchtec_ntb_db_set_mask,
+   .db_clear_mask  = switchtec_ntb_db_clear_mask,
+   .peer_db_set= switchtec_ntb_peer_db_set,
+   .spad_count = switchtec_ntb_spad_count,
+   .spad_read  = switchtec_ntb_spad_read,
+   .spad_write = switchtec_ntb_spad_write,
+   .peer_spad_

[PATCH v4 02/15] NTB: switchtec: Export class symbol for use in upper layer driver

2017-08-03 Thread Logan Gunthorpe
We export the class pointer symbol and add an extern define in the
Switchtec header file.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Bjorn Helgaas <bhelg...@google.com>
---
 drivers/pci/switch/switchtec.c | 4 +++-
 include/linux/switchtec.h  | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c
index 5b75d3008ff8..39c9218d733f 100644
--- a/drivers/pci/switch/switchtec.c
+++ b/drivers/pci/switch/switchtec.c
@@ -33,9 +33,11 @@ module_param(max_devices, int, 0644);
 MODULE_PARM_DESC(max_devices, "max number of switchtec device instances");
 
 static dev_t switchtec_devt;
-static struct class *switchtec_class;
 static DEFINE_IDA(switchtec_minor_ida);
 
+struct class *switchtec_class;
+EXPORT_SYMBOL_GPL(switchtec_class);
+
 enum mrpc_state {
MRPC_IDLE = 0,
MRPC_QUEUED,
diff --git a/include/linux/switchtec.h b/include/linux/switchtec.h
index 1cbd0e63b0ab..18e388101855 100644
--- a/include/linux/switchtec.h
+++ b/include/linux/switchtec.h
@@ -276,4 +276,6 @@ static inline struct switchtec_dev *to_stdev(struct device 
*dev)
return container_of(dev, struct switchtec_dev, dev);
 }
 
+extern struct class *switchtec_class;
+
 #endif
-- 
2.11.0



[PATCH v4 00/15] Switchtec NTB Support

2017-08-03 Thread Logan Gunthorpe
Changes since v3:

- Rebased onto the Jon's ntb branch that contains one of the patches
  that was in v3
- Various spelling and commit log mistakes noticed by Bjorn
- Renamed switchtec_ntb to ntb_hw_switchtec per Dave's suggestion
- Fixed a number of code nits pointed out by Jon

--

Changes since v2:

- Reordered the ntb_test link patch per Allen
- Removed an extra call to switchtec_ntb_init_mw
- Fixed a typo in the switchtec.txt documentation.

--

Changes since v1:

- Rebased onto latest ntb-next branch (with v4.13-rc1)
- Reworked ntb_mw_count() function so that it can be called all the
  time (per discussion with Allen)
- Various spelling and formatting cleanups from Bjorn
- Added request_module() call such that the NTB module is automatically
  loaded when appropriate hardware exists.

--

Changes since the rfc:

- Rebased on ntb-next
- Switched ntb_part_op to use sleep instead of delay
- Dropped a number of useless dbg __func__ prints
- Went back to the dynamic instead of the static class
- Swapped the notifier block for a simple callback
- Modified the new ntb api so that a couple functions with pidx
  now must be called after link up. Per our discussion on the list.

--

This patchset implements Non-Transparent Bridge (NTB) support for the
Microsemi Switchtec series of switches. We're looking for some
review from the community at this point but hope to get it upstreamed
for v4.14.

Switchtec NTB support is configured over the same function and bar
as the management endpoint. Thus, the new driver hooks into the
management driver which we had merged in v4.12. We use the class
interface API to register an NTB device for every switchtec device
which supports NTB (not all do).

The Switchtec hardware supports doorbells, memory windows and messages.
Seeing there is no native scratchpad support, 128 spads are emulated
through the use of a pre-setup memory window. The switch has 64
doorbells which are shared between the two partitions and a
configurable set of memory windows. While the hardware supports more
than 2 partitions, this driver only supports the first two seeing
the current NTB API only supports two hosts.

The driver has been tested with ntb_netdev and fully passes the
ntb_test script.

This patchset is based off of ntb-next and can be found in this
git repo:

https://github.com/sbates130272/linux-p2pmem.git switchtec_ntb_v4

Logan Gunthorpe (15):
  NTB: switchtec: Move structure definitions into a common header
  NTB: switchtec: Export class symbol for use in upper layer driver
  NTB: switchtec: Add NTB hardware register definitions
  NTB: switchtec: Add link event notifier callback
  NTB: Ensure ntb_mw_get_align() is only called when the link is up
  NTB: Add check and comment for link up to mw_count() and
mw_get_align()
  NTB: switchtec_ntb: Introduce initial NTB driver
  NTB: switchtec_ntb: Initialize hardware for memory windows
  NTB: switchtec_ntb: Initialize hardware for doorbells and messages
  NTB: switchtec_ntb: Add skeleton NTB driver
  NTB: switchtec_ntb: Add link management
  NTB: switchtec_ntb: Implement doorbell registers
  NTB: switchtec_ntb: Implement scratchpad registers
  NTB: switchtec_ntb: Add memory window support
  NTB: switchtec_ntb: Update switchtec documentation with notes for NTB

 Documentation/switchtec.txt|   12 +
 MAINTAINERS|2 +
 drivers/ntb/hw/Kconfig |1 +
 drivers/ntb/hw/Makefile|1 +
 drivers/ntb/hw/mscc/Kconfig|9 +
 drivers/ntb/hw/mscc/Makefile   |1 +
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 1216 
 drivers/ntb/ntb_transport.c|   20 +-
 drivers/ntb/test/ntb_perf.c|   18 +-
 drivers/ntb/test/ntb_tool.c|6 +-
 drivers/pci/switch/switchtec.c |  316 ++---
 include/linux/ntb.h|   11 +-
 include/linux/switchtec.h  |  373 ++
 13 files changed, 1703 insertions(+), 283 deletions(-)
 create mode 100644 drivers/ntb/hw/mscc/Kconfig
 create mode 100644 drivers/ntb/hw/mscc/Makefile
 create mode 100644 drivers/ntb/hw/mscc/ntb_hw_switchtec.c
 create mode 100644 include/linux/switchtec.h

--
2.11.0


[PATCH v6 7/7] crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

2017-08-03 Thread Logan Gunthorpe
From: Horia Geantă <horia.gea...@nxp.com>

We can now make use of the io-64-nonatomic-lo-hi header to always
provide 64 bit IO operations. So this patch cleans up the extra
CONFIG_64BIT ifdefs.

To be consistent with CAAM engine HW spec: in case of 64-bit registers,
irrespective of device endianness, the lower address should be read from
/ written to first, followed by the upper address. Indeed the I/O
accessors in CAAM driver currently don't follow the spec, however this
is a good opportunity to fix the code.

Signed-off-by: Horia Geantă <horia.gea...@nxp.com>
Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Horia Geantă <horia.gea...@nxp.com>
Cc: Dan Douglass <dan.dougl...@nxp.com>
Cc: Herbert Xu <herb...@gondor.apana.org.au>
Cc: "David S. Miller" <da...@davemloft.net>
---
 drivers/crypto/caam/regs.h | 35 +--
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index 84d2f838a063..0c45505458e7 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -9,7 +9,7 @@
 
 #include 
 #include 
-#include 
+#include 
 
 /*
  * Architecture-specific register access methods
@@ -134,50 +134,25 @@ static inline void clrsetbits_32(void __iomem *reg, u32 
clear, u32 set)
  *base + 0x : least-significant 32 bits
  *base + 0x0004 : most-significant 32 bits
  */
-#ifdef CONFIG_64BIT
 static inline void wr_reg64(void __iomem *reg, u64 data)
 {
+#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
iowrite64(data, reg);
else
-   iowrite64be(data, reg);
-}
-
-static inline u64 rd_reg64(void __iomem *reg)
-{
-   if (caam_little_end)
-   return ioread64(reg);
-   else
-   return ioread64be(reg);
-}
-
-#else /* CONFIG_64BIT */
-static inline void wr_reg64(void __iomem *reg, u64 data)
-{
-#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
-   if (caam_little_end) {
-   wr_reg32((u32 __iomem *)(reg) + 1, data >> 32);
-   wr_reg32((u32 __iomem *)(reg), data);
-   } else
 #endif
-   {
-   wr_reg32((u32 __iomem *)(reg), data >> 32);
-   wr_reg32((u32 __iomem *)(reg) + 1, data);
-   }
+   iowrite64be(data, reg);
 }
 
 static inline u64 rd_reg64(void __iomem *reg)
 {
 #ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
-   return ((u64)rd_reg32((u32 __iomem *)(reg) + 1) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg)));
+   return ioread64(reg);
else
 #endif
-   return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg) + 1));
+   return ioread64be(reg);
 }
-#endif /* CONFIG_64BIT  */
 
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 #ifdef CONFIG_SOC_IMX7D
-- 
2.11.0



[PATCH v4 11/15] NTB: switchtec_ntb: Add link management

2017-08-03 Thread Logan Gunthorpe
switchtec_ntb checks for a link by looking at the shared memory
window. If the magic number is correct and the other side indicates
their link is enabled then we take the link to be up.

Whenever we change our local link status we send a msg to the
other side to check whether it's up and change their status.

The current status is maintained in a flag so ntb_is_link_up
can return quickly.

We utilize Switchtec's link status notifier to also check link changes
when the switch notices a port changes state.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 130 -
 1 file changed, 129 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 158ed310cbaf..b477a8915245 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -64,6 +64,7 @@ static inline void _iowrite64(u64 val, void __iomem *mmio)
 
 struct shared_mw {
u32 magic;
+   u32 link_sta;
u32 partition_id;
u64 mw_sizes[MAX_MWS];
 };
@@ -104,8 +105,17 @@ struct switchtec_ntb {
int peer_nr_direct_mw;
int peer_nr_lut_mw;
int peer_direct_mw_to_bar[MAX_DIRECT_MW];
+
+   bool link_is_up;
+   enum ntb_speed link_speed;
+   enum ntb_width link_width;
 };
 
+static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
+{
+   return container_of(ntb, struct switchtec_ntb, ntb);
+}
+
 static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
 struct ntb_ctrl_regs __iomem *ctl,
 u32 op, int wait_status)
@@ -163,6 +173,17 @@ static int switchtec_ntb_part_op(struct switchtec_ntb 
*sndev,
return -EIO;
 }
 
+static int switchtec_ntb_send_msg(struct switchtec_ntb *sndev, int idx,
+ u32 val)
+{
+   if (idx < 0 || idx >= ARRAY_SIZE(sndev->mmio_self_dbmsg->omsg))
+   return -EINVAL;
+
+   iowrite32(val, >mmio_self_dbmsg->omsg[idx].msg);
+
+   return 0;
+}
+
 static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx)
 {
return 0;
@@ -194,22 +215,124 @@ static int switchtec_ntb_peer_mw_get_addr(struct ntb_dev 
*ntb, int idx,
return 0;
 }
 
+static void switchtec_ntb_part_link_speed(struct switchtec_ntb *sndev,
+ int partition,
+ enum ntb_speed *speed,
+ enum ntb_width *width)
+{
+   struct switchtec_dev *stdev = sndev->stdev;
+
+   u32 pff = ioread32(>mmio_part_cfg[partition].vep_pff_inst_id);
+   u32 linksta = ioread32(>mmio_pff_csr[pff].pci_cap_region[13]);
+
+   if (speed)
+   *speed = (linksta >> 16) & 0xF;
+
+   if (width)
+   *width = (linksta >> 20) & 0x3F;
+}
+
+static void switchtec_ntb_set_link_speed(struct switchtec_ntb *sndev)
+{
+   enum ntb_speed self_speed, peer_speed;
+   enum ntb_width self_width, peer_width;
+
+   if (!sndev->link_is_up) {
+   sndev->link_speed = NTB_SPEED_NONE;
+   sndev->link_width = NTB_WIDTH_NONE;
+   return;
+   }
+
+   switchtec_ntb_part_link_speed(sndev, sndev->self_partition,
+ _speed, _width);
+   switchtec_ntb_part_link_speed(sndev, sndev->peer_partition,
+ _speed, _width);
+
+   sndev->link_speed = min(self_speed, peer_speed);
+   sndev->link_width = min(self_width, peer_width);
+}
+
+enum {
+   LINK_MESSAGE = 0,
+   MSG_LINK_UP = 1,
+   MSG_LINK_DOWN = 2,
+   MSG_CHECK_LINK = 3,
+};
+
+static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
+{
+   int link_sta;
+   int old = sndev->link_is_up;
+
+   link_sta = sndev->self_shared->link_sta;
+   if (link_sta) {
+   u64 peer = ioread64(>peer_shared->magic);
+
+   if ((peer & 0x) == SWITCHTEC_NTB_MAGIC)
+   link_sta = peer >> 32;
+   else
+   link_sta = 0;
+   }
+
+   sndev->link_is_up = link_sta;
+   switchtec_ntb_set_link_speed(sndev);
+
+   if (link_sta != old) {
+   switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_CHECK_LINK);
+   ntb_link_event(>ntb);
+   dev_info(>stdev->dev, "ntb link %s",
+link_sta ? "up" : "down");
+   }
+}
+
+static void switchtec_ntb_link_notification(struct switchtec_dev *stdev)
+{
+   struct switchtec_ntb *sndev = stdev->sndev;
+

[PATCH v4 13/15] NTB: switchtec_ntb: Implement scratchpad registers

2017-08-03 Thread Logan Gunthorpe
Seeing there is no dedicated hardware for this, we simply add
these as entries in the shared memory window. Thus, we could support
any number of them but 128 seems like enough, for now.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 75 +-
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 4a6125d2f305..205bd9481122 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -67,6 +67,7 @@ struct shared_mw {
u32 link_sta;
u32 partition_id;
u64 mw_sizes[MAX_MWS];
+   u32 spad[128];
 };
 
 #define MAX_DIRECT_MW ARRAY_SIZE(((struct ntb_ctrl_regs *)(0))->bar_entry)
@@ -455,22 +456,90 @@ static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, 
u64 db_bits)
 
 static int switchtec_ntb_spad_count(struct ntb_dev *ntb)
 {
-   return 0;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   return ARRAY_SIZE(sndev->self_shared->spad);
 }
 
 static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx)
 {
-   return 0;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (idx < 0 || idx >= ARRAY_SIZE(sndev->self_shared->spad))
+   return 0;
+
+   if (!sndev->self_shared)
+   return 0;
+
+   return sndev->self_shared->spad[idx];
 }
 
 static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
 {
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (idx < 0 || idx >= ARRAY_SIZE(sndev->self_shared->spad))
+   return -EINVAL;
+
+   if (!sndev->self_shared)
+   return -EIO;
+
+   sndev->self_shared->spad[idx] = val;
+
return 0;
 }
 
+static u32 switchtec_ntb_peer_spad_read(struct ntb_dev *ntb, int pidx,
+   int sidx)
+{
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (pidx != NTB_DEF_PEER_IDX)
+   return -EINVAL;
+
+   if (sidx < 0 || sidx >= ARRAY_SIZE(sndev->peer_shared->spad))
+   return 0;
+
+   if (!sndev->peer_shared)
+   return 0;
+
+   return ioread32(>peer_shared->spad[sidx]);
+}
+
 static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int pidx,
 int sidx, u32 val)
 {
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (pidx != NTB_DEF_PEER_IDX)
+   return -EINVAL;
+
+   if (sidx < 0 || sidx >= ARRAY_SIZE(sndev->peer_shared->spad))
+   return -EINVAL;
+
+   if (!sndev->peer_shared)
+   return -EIO;
+
+   iowrite32(val, >peer_shared->spad[sidx]);
+
+   return 0;
+}
+
+static int switchtec_ntb_peer_spad_addr(struct ntb_dev *ntb, int pidx,
+   int sidx, phys_addr_t *spad_addr)
+{
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+   unsigned long offset;
+
+   if (pidx != NTB_DEF_PEER_IDX)
+   return -EINVAL;
+
+   offset = (unsigned long)>peer_shared->spad[sidx] -
+   (unsigned long)sndev->stdev->mmio;
+
+   if (spad_addr)
+   *spad_addr = pci_resource_start(ntb->pdev, 0) + offset;
+
return 0;
 }
 
@@ -496,7 +565,9 @@ static const struct ntb_dev_ops switchtec_ntb_ops = {
.spad_count = switchtec_ntb_spad_count,
.spad_read  = switchtec_ntb_spad_read,
.spad_write = switchtec_ntb_spad_write,
+   .peer_spad_read = switchtec_ntb_peer_spad_read,
.peer_spad_write= switchtec_ntb_peer_spad_write,
+   .peer_spad_addr = switchtec_ntb_peer_spad_addr,
 };
 
 static void switchtec_ntb_init_sndev(struct switchtec_ntb *sndev)
-- 
2.11.0



[PATCH v4 12/15] NTB: switchtec_ntb: Implement doorbell registers

2017-08-03 Thread Logan Gunthorpe
Pretty straightforward implementation of doorbell registers.
The shift and mask were setup in an earlier patch and this just hooks
up the appropriate portion of the IDB register as the local doorbells
and the opposite portion of ODB as the peer doorbells. The DB mask is
protected by a spinlock to avoid concurrent read-modify-write accesses.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 89 --
 1 file changed, 85 insertions(+), 4 deletions(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index b477a8915245..4a6125d2f305 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -98,6 +98,9 @@ struct switchtec_ntb {
int db_shift;
int db_peer_shift;
 
+   /* synchronize rmw access of db_mask and hw reg */
+   spinlock_t db_mask_lock;
+
int nr_direct_mw;
int nr_lut_mw;
int direct_mw_to_bar[MAX_DIRECT_MW];
@@ -338,41 +341,115 @@ static int switchtec_ntb_link_disable(struct ntb_dev 
*ntb)
 
 static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb)
 {
-   return 0;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   return sndev->db_valid_mask;
 }
 
 static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb)
 {
-   return 0;
+   return 1;
 }
 
 static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
 {
-   return 0;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (db_vector < 0 || db_vector > 1)
+   return 0;
+
+   return sndev->db_valid_mask;
 }
 
 static u64 switchtec_ntb_db_read(struct ntb_dev *ntb)
 {
-   return 0;
+   u64 ret;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   ret = ioread64(>mmio_self_dbmsg->idb) >> sndev->db_shift;
+
+   return ret & sndev->db_valid_mask;
 }
 
 static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
 {
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   iowrite64(db_bits << sndev->db_shift, >mmio_self_dbmsg->idb);
+
return 0;
 }
 
 static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
 {
+   unsigned long irqflags;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (db_bits & ~sndev->db_valid_mask)
+   return -EINVAL;
+
+   spin_lock_irqsave(>db_mask_lock, irqflags);
+
+   sndev->db_mask |= db_bits << sndev->db_shift;
+   iowrite64(~sndev->db_mask, >mmio_self_dbmsg->idb_mask);
+
+   spin_unlock_irqrestore(>db_mask_lock, irqflags);
+
return 0;
 }
 
 static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
 {
+   unsigned long irqflags;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   if (db_bits & ~sndev->db_valid_mask)
+   return -EINVAL;
+
+   spin_lock_irqsave(>db_mask_lock, irqflags);
+
+   sndev->db_mask &= ~(db_bits << sndev->db_shift);
+   iowrite64(~sndev->db_mask, >mmio_self_dbmsg->idb_mask);
+
+   spin_unlock_irqrestore(>db_mask_lock, irqflags);
+
+   return 0;
+}
+
+static u64 switchtec_ntb_db_read_mask(struct ntb_dev *ntb)
+{
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   return (sndev->db_mask >> sndev->db_shift) & sndev->db_valid_mask;
+}
+
+static int switchtec_ntb_peer_db_addr(struct ntb_dev *ntb,
+ phys_addr_t *db_addr,
+ resource_size_t *db_size)
+{
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+   unsigned long offset;
+
+   offset = (unsigned long)sndev->mmio_self_dbmsg->odb -
+   (unsigned long)sndev->stdev->mmio;
+
+   offset += sndev->db_shift / 8;
+
+   if (db_addr)
+   *db_addr = pci_resource_start(ntb->pdev, 0) + offset;
+   if (db_size)
+   *db_size = sizeof(u32);
+
return 0;
 }
 
 static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
 {
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+
+   iowrite64(db_bits << sndev->db_peer_shift,
+ >mmio_self_dbmsg->odb);
+
return 0;
 }
 
@@ -413,6 +490,8 @@ static const struct ntb_dev_ops switchtec_ntb_ops = {
.db_clear   = switchtec_ntb_db_clear,
.db_set_mask= switchtec_ntb_db_set_mask,
.db_clear_mask  = switchtec_ntb_db_clear_mask,
+   .db_read_mask   = switchtec_ntb_db_read_mask,
+   .peer_db_addr   = switchtec_ntb_peer_db_addr,
.peer_db_set= switchtec_ntb_

[PATCH v4 07/15] NTB: switchtec_ntb: Introduce initial NTB driver

2017-08-03 Thread Logan Gunthorpe
Seeing the Switchtec NTB hardware shares the same endpoint as the
management endpoint we utilize the class_interface API to register
an NTB driver for every Switchtec device in the system that has the
NTB class code.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 MAINTAINERS|  1 +
 drivers/ntb/hw/Kconfig |  1 +
 drivers/ntb/hw/Makefile|  1 +
 drivers/ntb/hw/mscc/Kconfig|  9 
 drivers/ntb/hw/mscc/Makefile   |  1 +
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 81 ++
 drivers/pci/switch/switchtec.c |  3 ++
 include/linux/switchtec.h  |  4 ++
 8 files changed, 101 insertions(+)
 create mode 100644 drivers/ntb/hw/mscc/Kconfig
 create mode 100644 drivers/ntb/hw/mscc/Makefile
 create mode 100644 drivers/ntb/hw/mscc/ntb_hw_switchtec.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 4ff2ad7c1c7b..6e491ce5e876 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10147,6 +10147,7 @@ F:  Documentation/ABI/testing/sysfs-class-switchtec
 F: drivers/pci/switch/switchtec*
 F: include/uapi/linux/switchtec_ioctl.h
 F: include/linux/switchtec.h
+F: drivers/ntb/hw/mscc/
 
 PCI DRIVER FOR NVIDIA TEGRA
 M: Thierry Reding <thierry.red...@gmail.com>
diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig
index a89243c9fdd3..e51b581fd102 100644
--- a/drivers/ntb/hw/Kconfig
+++ b/drivers/ntb/hw/Kconfig
@@ -1,3 +1,4 @@
 source "drivers/ntb/hw/amd/Kconfig"
 source "drivers/ntb/hw/idt/Kconfig"
 source "drivers/ntb/hw/intel/Kconfig"
+source "drivers/ntb/hw/mscc/Kconfig"
diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile
index 87332c3905f0..923c442db750 100644
--- a/drivers/ntb/hw/Makefile
+++ b/drivers/ntb/hw/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_NTB_AMD)  += amd/
 obj-$(CONFIG_NTB_IDT)  += idt/
 obj-$(CONFIG_NTB_INTEL)+= intel/
+obj-$(CONFIG_NTB_SWITCHTEC) += mscc/
diff --git a/drivers/ntb/hw/mscc/Kconfig b/drivers/ntb/hw/mscc/Kconfig
new file mode 100644
index ..013ed6716438
--- /dev/null
+++ b/drivers/ntb/hw/mscc/Kconfig
@@ -0,0 +1,9 @@
+config NTB_SWITCHTEC
+   tristate "MicroSemi Switchtec Non-Transparent Bridge Support"
+   select PCI_SW_SWITCHTEC
+   help
+Enables NTB support for Switchtec PCI switches. This also
+selects the Switchtec management driver as they share the same
+hardware interface.
+
+If unsure, say N.
diff --git a/drivers/ntb/hw/mscc/Makefile b/drivers/ntb/hw/mscc/Makefile
new file mode 100644
index ..064686ead1ba
--- /dev/null
+++ b/drivers/ntb/hw/mscc/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NTB_SWITCHTEC) += ntb_hw_switchtec.o
diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
new file mode 100644
index ..253efba72275
--- /dev/null
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -0,0 +1,81 @@
+/*
+ * Microsemi Switchtec(tm) PCIe Management Driver
+ * Copyright (c) 2017, Microsemi Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include 
+#include 
+
+MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
+MODULE_VERSION("0.1");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Microsemi Corporation");
+
+struct switchtec_ntb {
+   struct switchtec_dev *stdev;
+};
+
+static int switchtec_ntb_add(struct device *dev,
+struct class_interface *class_intf)
+{
+   struct switchtec_dev *stdev = to_stdev(dev);
+   struct switchtec_ntb *sndev;
+
+   stdev->sndev = NULL;
+
+   if (stdev->pdev->class != MICROSEMI_NTB_CLASSCODE)
+   return -ENODEV;
+
+   sndev = kzalloc_node(sizeof(*sndev), GFP_KERNEL, dev_to_node(dev));
+   if (!sndev)
+   return -ENOMEM;
+
+   sndev->stdev = stdev;
+
+   stdev->sndev = sndev;
+   dev_info(dev, "NTB device registered");
+
+   return 0;
+}
+
+void switchtec_ntb_remove(struct device *dev,
+ struct class_interface *class_intf)
+{
+   struct switchtec_dev *stdev = to_stdev(dev);
+   struct switchtec_ntb *sndev = stdev->sndev;
+
+   if (!sndev)
+   return;
+
+   stdev->sndev = NULL;
+   kfree(sndev);
+   dev_info(

[PATCH v4 14/15] NTB: switchtec_ntb: Add memory window support

2017-08-03 Thread Logan Gunthorpe
The Switchtec hardware has two types of memory windows: LUTs and Direct.
The first area in each BAR is for LUT windows and the remaining area is
for the direct region. The total number of LUT entries is set by a
configuration setting in hardware and they all must be the same
size. (This is fixed by switchtec_ntb to be 64K.)

switchtec_ntb enables the LUTs only for the first BAR and enables the
highest power of two possible. Seeing the LUTs are at the beginning of
the BAR, the direct memory window's alignment is affected. Therefore,
the maximum direct memory window size can not be greater than the number
of LUTs times 64K. The direct window in other BARs will not have this
restriction as the LUTs will not be enabled there. LUTs will only be
exposed through the NTB API if the use_lut_mw parameter is set.

Seeing the Switchtec hardware, by default, configures BARs to be 4G a
module parameter is given to limit the size of the advertised memory
windows. Higher layers tend to allocate the maximum BAR size and this
has a tendency to fail when they try to allocate 4GB of contiguous
memory.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
Acked-by: Allen Hubbe <allen.hu...@dell.com>
---
 drivers/ntb/hw/mscc/ntb_hw_switchtec.c | 213 -
 1 file changed, 210 insertions(+), 3 deletions(-)

diff --git a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c 
b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
index 205bd9481122..afe8ed6f3b23 100644
--- a/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
+++ b/drivers/ntb/hw/mscc/ntb_hw_switchtec.c
@@ -25,6 +25,11 @@ MODULE_VERSION("0.1");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Microsemi Corporation");
 
+static ulong max_mw_size = SZ_2M;
+module_param(max_mw_size, ulong, 0644);
+MODULE_PARM_DESC(max_mw_size,
+   "Max memory window size reported to the upper layer");
+
 static bool use_lut_mws;
 module_param(use_lut_mws, bool, 0644);
 MODULE_PARM_DESC(use_lut_mws,
@@ -190,7 +195,27 @@ static int switchtec_ntb_send_msg(struct switchtec_ntb 
*sndev, int idx,
 
 static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx)
 {
-   return 0;
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+   int nr_direct_mw = sndev->peer_nr_direct_mw;
+   int nr_lut_mw = sndev->peer_nr_lut_mw - 1;
+
+   if (pidx != NTB_DEF_PEER_IDX)
+   return -EINVAL;
+
+   if (!use_lut_mws)
+   nr_lut_mw = 0;
+
+   return nr_direct_mw + nr_lut_mw;
+}
+
+static int lut_index(struct switchtec_ntb *sndev, int mw_idx)
+{
+   return mw_idx - sndev->nr_direct_mw + 1;
+}
+
+static int peer_lut_index(struct switchtec_ntb *sndev, int mw_idx)
+{
+   return mw_idx - sndev->peer_nr_direct_mw + 1;
 }
 
 static int switchtec_ntb_mw_get_align(struct ntb_dev *ntb, int pidx,
@@ -198,17 +223,192 @@ static int switchtec_ntb_mw_get_align(struct ntb_dev 
*ntb, int pidx,
  resource_size_t *size_align,
  resource_size_t *size_max)
 {
+   struct switchtec_ntb *sndev = ntb_sndev(ntb);
+   int lut;
+   resource_size_t size;
+
+   if (pidx != NTB_DEF_PEER_IDX)
+   return -EINVAL;
+
+   lut = widx >= sndev->peer_nr_direct_mw;
+   size = ioread64(>peer_shared->mw_sizes[widx]);
+
+   if (size == 0)
+   return -EINVAL;
+
+   if (addr_align)
+   *addr_align = lut ? size : SZ_4K;
+
+   if (size_align)
+   *size_align = lut ? size : SZ_4K;
+
+   if (size_max)
+   *size_max = size;
+
return 0;
 }
 
+static void switchtec_ntb_mw_clr_direct(struct switchtec_ntb *sndev, int idx)
+{
+   struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+   int bar = sndev->peer_direct_mw_to_bar[idx];
+   u32 ctl_val;
+
+   ctl_val = ioread32(>bar_entry[bar].ctl);
+   ctl_val &= ~NTB_CTRL_BAR_DIR_WIN_EN;
+   iowrite32(ctl_val, >bar_entry[bar].ctl);
+   iowrite32(0, >bar_entry[bar].win_size);
+   iowrite64(sndev->self_partition, >bar_entry[bar].xlate_addr);
+}
+
+static void switchtec_ntb_mw_clr_lut(struct switchtec_ntb *sndev, int idx)
+{
+   struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+
+   iowrite64(0, >lut_entry[peer_lut_index(sndev, idx)]);
+}
+
+static void switchtec_ntb_mw_set_direct(struct switchtec_ntb *sndev, int idx,
+   dma_addr_t addr, resource_size_t size)
+{
+   int xlate_pos = ilog2(size);
+   int bar = sndev->peer_direct_mw_to_bar[idx];
+   struct ntb_ctrl_regs __iomem *ctl = sndev->mmio_peer_ctrl;
+   u32 ctl_val;
+
+   ctl_val = ioread32(>bar_entry[bar].ctl);
+   ctl_val |= NTB_CTRL_BAR_DIR_WIN_EN;
+
+   iowrite32(ctl_val, >

[PATCH v6 2/7] powerpc: io.h: move iomap.h include so that it can use readq/writeq defs

2017-08-03 Thread Logan Gunthorpe
Subsequent patches in this series makes use of the readq and writeq
defines in iomap.h. However, as is, they get missed on the powerpc
platform seeing the include comes before the define. This patch
moves the include down to fix this.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Acked-By: Michael Ellerman <m...@ellerman.id.au>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Nicholas Piggin <npig...@gmail.com>
Cc: Suresh Warrier <warr...@linux.vnet.ibm.com>
Cc: "Oliver O'Halloran" <ooh...@gmail.com>
---
 arch/powerpc/include/asm/io.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 422f99cf9924..af074923d598 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -33,8 +33,6 @@ extern struct pci_dev *isa_bridge_pcidev;
 #include 
 #include 
 
-#include 
-
 #ifdef CONFIG_PPC64
 #include 
 #endif
@@ -663,6 +661,8 @@ static inline void name at  
\
 #define writel_relaxed(v, addr)writel(v, addr)
 #define writeq_relaxed(v, addr)writeq(v, addr)
 
+#include 
+
 #ifdef CONFIG_PPC32
 #define mmiowb()
 #else
-- 
2.11.0



[PATCH v6 5/7] io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros

2017-08-03 Thread Logan Gunthorpe
This patch adds generic io{read|write}64[be]{_lo_hi|_hi_lo} macros if
they are not already defined by the architecture. (As they are provided
by the generic iomap library).

The patch also points io{read|write}64[be] to the variant specified by the
header name.

This is because new drivers are encouraged to use ioreadXX, et al instead
of readX[1], et al -- and mixing ioreadXX with readq is pretty ugly.

[1] ldd3: section 9.4.2

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
cc: Christoph Hellwig <h...@lst.de>
cc: Arnd Bergmann <a...@arndb.de>
cc: Alan Cox <gno...@lxorguk.ukuu.org.uk>
cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 include/linux/io-64-nonatomic-hi-lo.h | 64 +++
 include/linux/io-64-nonatomic-lo-hi.h | 64 +++
 2 files changed, 128 insertions(+)

diff --git a/include/linux/io-64-nonatomic-hi-lo.h 
b/include/linux/io-64-nonatomic-hi-lo.h
index defcc4644ce3..410c8b177080 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -54,4 +54,68 @@ static inline void hi_lo_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed hi_lo_writeq_relaxed
 #endif
 
+#ifndef ioread64_hi_lo
+#define ioread64_hi_lo ioread64_hi_lo
+static inline u64 ioread64_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32(addr + sizeof(u32));
+   low = ioread32(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_hi_lo
+#define iowrite64_hi_lo iowrite64_hi_lo
+static inline void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32(val >> 32, addr + sizeof(u32));
+   iowrite32(val, addr);
+}
+#endif
+
+#ifndef ioread64be_hi_lo
+#define ioread64be_hi_lo ioread64be_hi_lo
+static inline u64 ioread64be_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32be(addr);
+   low = ioread32be(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_hi_lo
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+static inline void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32be(val >> 32, addr);
+   iowrite32be(val, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64
+#define ioread64_is_nonatomic
+#define ioread64 ioread64_hi_lo
+#endif
+
+#ifndef iowrite64
+#define iowrite64_is_nonatomic
+#define iowrite64 iowrite64_hi_lo
+#endif
+
+#ifndef ioread64be
+#define ioread64be_is_nonatomic
+#define ioread64be ioread64be_hi_lo
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be_is_nonatomic
+#define iowrite64be iowrite64be_hi_lo
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h 
b/include/linux/io-64-nonatomic-lo-hi.h
index 084461a4e5ab..acba36812be8 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -54,4 +54,68 @@ static inline void lo_hi_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed lo_hi_writeq_relaxed
 #endif
 
+#ifndef ioread64_lo_hi
+#define ioread64_lo_hi ioread64_lo_hi
+static inline u64 ioread64_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32(addr);
+   high = ioread32(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_lo_hi
+#define iowrite64_lo_hi iowrite64_lo_hi
+static inline void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32(val, addr);
+   iowrite32(val >> 32, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64be_lo_hi
+#define ioread64be_lo_hi ioread64be_lo_hi
+static inline u64 ioread64be_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32be(addr + sizeof(u32));
+   high = ioread32be(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_lo_hi
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+static inline void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32be(val, addr + sizeof(u32));
+   iowrite32be(val >> 32, addr);
+}
+#endif
+
+#ifndef ioread64
+#define ioread64_is_nonatomic
+#define ioread64 ioread64_lo_hi
+#endif
+
+#ifndef iowrite64
+#define iowrite64_is_nonatomic
+#define iowrite64 iowrite64_lo_hi
+#endif
+
+#ifndef ioread64be
+#define ioread64be_is_nonatomic
+#define ioread64be ioread64be_lo_hi
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be_is_nonatomic
+#define iowrite64be iowrite64be_lo_hi
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
-- 
2.11.0



[PATCH v6 4/7] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-08-03 Thread Logan Gunthorpe
In order to provide non-atomic functions for io{read|write}64 that will
use readq and writeq when appropriate. We define a number of variants
of these functions in the generic iomap that will do non-atomic
operations on pio but atomic operations on mmio.

These functions are only defined if readq and writeq are defined. If
they are not, then the wrappers that always use non-atomic operations
from include/linux/io-64-nonatomic*.h will be used.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Arnd Bergmann <a...@arndb.de>
Cc: Suresh Warrier <warr...@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npig...@gmail.com>
---
 arch/powerpc/include/asm/io.h |   2 +
 include/asm-generic/iomap.h   |  26 +++--
 lib/iomap.c   | 132 ++
 3 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index af074923d598..4cc420cfaa78 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -788,8 +788,10 @@ extern void __iounmap_at(void *ea, unsigned long size);
 
 #define mmio_read16be(addr)readw_be(addr)
 #define mmio_read32be(addr)readl_be(addr)
+#define mmio_read64be(addr)readq_be(addr)
 #define mmio_write16be(val, addr)  writew_be(val, addr)
 #define mmio_write32be(val, addr)  writel_be(val, addr)
+#define mmio_write64be(val, addr)  writeq_be(val, addr)
 #define mmio_insb(addr, dst, count)readsb(addr, dst, count)
 #define mmio_insw(addr, dst, count)readsw(addr, dst, count)
 #define mmio_insl(addr, dst, count)readsl(addr, dst, count)
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 650fede33c25..30edebf627fe 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -30,9 +30,16 @@ extern unsigned int ioread16(void __iomem *);
 extern unsigned int ioread16be(void __iomem *);
 extern unsigned int ioread32(void __iomem *);
 extern unsigned int ioread32be(void __iomem *);
-#ifdef CONFIG_64BIT
-extern u64 ioread64(void __iomem *);
-extern u64 ioread64be(void __iomem *);
+
+#ifdef readq
+#define ioread64_lo_hi ioread64_lo_hi
+#define ioread64_hi_lo ioread64_hi_lo
+#define ioread64be_lo_hi ioread64be_lo_hi
+#define ioread64be_hi_lo ioread64be_hi_lo
+extern u64 ioread64_lo_hi(void __iomem *addr);
+extern u64 ioread64_hi_lo(void __iomem *addr);
+extern u64 ioread64be_lo_hi(void __iomem *addr);
+extern u64 ioread64be_hi_lo(void __iomem *addr);
 #endif
 
 extern void iowrite8(u8, void __iomem *);
@@ -40,9 +47,16 @@ extern void iowrite16(u16, void __iomem *);
 extern void iowrite16be(u16, void __iomem *);
 extern void iowrite32(u32, void __iomem *);
 extern void iowrite32be(u32, void __iomem *);
-#ifdef CONFIG_64BIT
-extern void iowrite64(u64, void __iomem *);
-extern void iowrite64be(u64, void __iomem *);
+
+#ifdef writeq
+#define iowrite64_lo_hi iowrite64_lo_hi
+#define iowrite64_hi_lo iowrite64_hi_lo
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+extern void iowrite64_lo_hi(u64 val, void __iomem *addr);
+extern void iowrite64_hi_lo(u64 val, void __iomem *addr);
+extern void iowrite64be_lo_hi(u64 val, void __iomem *addr);
+extern void iowrite64be_hi_lo(u64 val, void __iomem *addr);
 #endif
 
 /*
diff --git a/lib/iomap.c b/lib/iomap.c
index fc3dcb4b238e..845b9c41082c 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -66,6 +66,7 @@ static void bad_io_access(unsigned long port, const char 
*access)
 #ifndef mmio_read16be
 #define mmio_read16be(addr) be16_to_cpu(__raw_readw(addr))
 #define mmio_read32be(addr) be32_to_cpu(__raw_readl(addr))
+#define mmio_read64be(addr) be64_to_cpu(__raw_readq(addr))
 #endif
 
 unsigned int ioread8(void __iomem *addr)
@@ -99,6 +100,80 @@ EXPORT_SYMBOL(ioread16be);
 EXPORT_SYMBOL(ioread32);
 EXPORT_SYMBOL(ioread32be);
 
+#ifdef readq
+static u64 pio_read64_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = inl(port);
+   hi = inl(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = inl(port + sizeof(u32));
+   lo = inl(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = pio_read32be(port + sizeof(u32));
+   hi = pio_read32be(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = pio_read32be(port);
+   lo = pio_read32be(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+   IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
+   return 0xUL

[PATCH v6 3/7] powerpc: iomap.c: introduce io{read|write}64_{lo_hi|hi_lo}

2017-08-03 Thread Logan Gunthorpe
These functions will be introduced into the generic iomap.c so
they can deal with PIO accesses in hi-lo/lo-hi variants. Thus,
the powerpc version of iomap.c will need to provide the same
functions even though, in this arch, they are identical to the
regular io{read|write}64 functions.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Tested-by: Horia Geantă <horia.gea...@nxp.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
---
 arch/powerpc/kernel/iomap.c | 40 
 1 file changed, 40 insertions(+)

diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index a1854d1ded8b..b43dbadfd24f 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -44,12 +44,32 @@ u64 ioread64(void __iomem *addr)
 {
return readq(addr);
 }
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+   return readq(addr);
+}
+u64 ioread64_hi_lo(void __iomem *addr)
+{
+   return readq(addr);
+}
 u64 ioread64be(void __iomem *addr)
 {
return readq_be(addr);
 }
+u64 ioread64be_lo_hi(void __iomem *addr)
+{
+   return readq_be(addr);
+}
+u64 ioread64be_hi_lo(void __iomem *addr)
+{
+   return readq_be(addr);
+}
 EXPORT_SYMBOL(ioread64);
+EXPORT_SYMBOL(ioread64_lo_hi);
+EXPORT_SYMBOL(ioread64_hi_lo);
 EXPORT_SYMBOL(ioread64be);
+EXPORT_SYMBOL(ioread64be_lo_hi);
+EXPORT_SYMBOL(ioread64be_hi_lo);
 #endif /* __powerpc64__ */
 
 void iowrite8(u8 val, void __iomem *addr)
@@ -82,12 +102,32 @@ void iowrite64(u64 val, void __iomem *addr)
 {
writeq(val, addr);
 }
+void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+   writeq(val, addr);
+}
+void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+   writeq(val, addr);
+}
 void iowrite64be(u64 val, void __iomem *addr)
 {
writeq_be(val, addr);
 }
+void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+   writeq_be(val, addr);
+}
+void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+   writeq_be(val, addr);
+}
 EXPORT_SYMBOL(iowrite64);
+EXPORT_SYMBOL(iowrite64_lo_hi);
+EXPORT_SYMBOL(iowrite64_hi_lo);
 EXPORT_SYMBOL(iowrite64be);
+EXPORT_SYMBOL(iowrite64be_lo_hi);
+EXPORT_SYMBOL(iowrite64be_hi_lo);
 #endif /* __powerpc64__ */
 
 /*
-- 
2.11.0



Re: [PATCH] ntb: use correct mw_count function in ntb_tool and ntb_transport

2017-07-17 Thread Logan Gunthorpe
Hi John,


On 17/07/17 09:16 AM, Jon Mason wrote:
> Per your request, this has been applied to my ntb-next branch.

This isn't a huge issue but I meant for this patch to fix the mistake in
the new ntb_api before it hit 4.13.

On the other hand, seeing there's no way to actually trigger the issue
until the switchtec driver is merged I guess it's okay to leave it the
way you did it.

Thanks,

Logan


Re: [PATCH] ntb: use correct mw_count function in ntb_tool and ntb_transport

2017-07-17 Thread Logan Gunthorpe


On 17/07/17 10:46 AM, Jon Mason wrote:
> s/John/Jon

Apologies. Monday morning -- not fully awake yet.

> NP.  I'll add a "Fixes: ..." to it.  So, it'll get pulled into the
> -stable trees.

Ok, thanks, but it's not really the stable trees that need to see it
seeing 4.13 isn't released yet, it would just need to get into a
subsequent rc

Logan


[PATCH v4 1/5] powerpc: io.h: move iomap.h include so that it can use readq/writeq defs

2017-07-18 Thread Logan Gunthorpe
Subsequent patches in this series makes use of the readq and writeq
defines in iomap.h. However, as is, they get missed on the powerpc
platform seeing the include comes before the define. This patch
moves the include down to fix this.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Nicholas Piggin <npig...@gmail.com>
Cc: Suresh Warrier <warr...@linux.vnet.ibm.com>
Cc: "Oliver O'Halloran" <ooh...@gmail.com>
---
 arch/powerpc/include/asm/io.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 422f99cf9924..af074923d598 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -33,8 +33,6 @@ extern struct pci_dev *isa_bridge_pcidev;
 #include 
 #include 
 
-#include 
-
 #ifdef CONFIG_PPC64
 #include 
 #endif
@@ -663,6 +661,8 @@ static inline void name at  
\
 #define writel_relaxed(v, addr)writel(v, addr)
 #define writeq_relaxed(v, addr)writeq(v, addr)
 
+#include 
+
 #ifdef CONFIG_PPC32
 #define mmiowb()
 #else
-- 
2.11.0



[PATCH v4 2/5] iomap: introduce io{read|write}64_{lo_hi|hi_lo}

2017-07-18 Thread Logan Gunthorpe
In order to provide non-atomic functions for io{read|write}64 that will
use readq and writeq when appropriate. We define a number of variants
of these functions in the generic iomap that will do non-atomic
operations on pio but atomic operations on mmio.

These functions are only defined if readq and writeq are defined. If
they are not, then the wrappers that always use non-atomic operations
from include/linux/io-64-nonatomic*.h will be used.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Arnd Bergmann <a...@arndb.de>
Cc: Suresh Warrier <warr...@linux.vnet.ibm.com>
Cc: Nicholas Piggin <npig...@gmail.com>
---
 arch/powerpc/include/asm/io.h |   2 +
 include/asm-generic/iomap.h   |  26 +++--
 lib/iomap.c   | 132 ++
 3 files changed, 154 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index af074923d598..4cc420cfaa78 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -788,8 +788,10 @@ extern void __iounmap_at(void *ea, unsigned long size);
 
 #define mmio_read16be(addr)readw_be(addr)
 #define mmio_read32be(addr)readl_be(addr)
+#define mmio_read64be(addr)readq_be(addr)
 #define mmio_write16be(val, addr)  writew_be(val, addr)
 #define mmio_write32be(val, addr)  writel_be(val, addr)
+#define mmio_write64be(val, addr)  writeq_be(val, addr)
 #define mmio_insb(addr, dst, count)readsb(addr, dst, count)
 #define mmio_insw(addr, dst, count)readsw(addr, dst, count)
 #define mmio_insl(addr, dst, count)readsl(addr, dst, count)
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 650fede33c25..e4601455ac4a 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -30,9 +30,16 @@ extern unsigned int ioread16(void __iomem *);
 extern unsigned int ioread16be(void __iomem *);
 extern unsigned int ioread32(void __iomem *);
 extern unsigned int ioread32be(void __iomem *);
-#ifdef CONFIG_64BIT
-extern u64 ioread64(void __iomem *);
-extern u64 ioread64be(void __iomem *);
+
+#ifdef readq
+#define ioread64_lo_hi ioread64_lo_hi
+#define ioread64_hi_lo ioread64_hi_lo
+#define ioread64be_lo_hi ioread64be_lo_hi
+#define ioread64be_hi_lo ioread64be_hi_lo
+extern u64 ioread64_lo_hi(void __iomem *addr);
+extern u64 ioread64_hi_lo(void __iomem *addr);
+extern u64 ioread64be_lo_hi(void __iomem *addr);
+extern u64 ioread64be_hi_lo(void __iomem *addr);
 #endif
 
 extern void iowrite8(u8, void __iomem *);
@@ -40,9 +47,16 @@ extern void iowrite16(u16, void __iomem *);
 extern void iowrite16be(u16, void __iomem *);
 extern void iowrite32(u32, void __iomem *);
 extern void iowrite32be(u32, void __iomem *);
-#ifdef CONFIG_64BIT
-extern void iowrite64(u64, void __iomem *);
-extern void iowrite64be(u64, void __iomem *);
+
+#ifdef writeq
+#define iowrite64_lo_hi iowrite64_lo_hi
+#define iowrite64_hi_lo iowrite64_hi_lo
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+void iowrite64_lo_hi(u64 val, void __iomem *addr);
+void iowrite64_hi_lo(u64 val, void __iomem *addr);
+void iowrite64be_lo_hi(u64 val, void __iomem *addr);
+void iowrite64be_hi_lo(u64 val, void __iomem *addr);
 #endif
 
 /*
diff --git a/lib/iomap.c b/lib/iomap.c
index fc3dcb4b238e..b993400d60bd 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -66,6 +66,7 @@ static void bad_io_access(unsigned long port, const char 
*access)
 #ifndef mmio_read16be
 #define mmio_read16be(addr) be16_to_cpu(__raw_readw(addr))
 #define mmio_read32be(addr) be32_to_cpu(__raw_readl(addr))
+#define mmio_read64be(addr) be64_to_cpu(__raw_readq(addr))
 #endif
 
 unsigned int ioread8(void __iomem *addr)
@@ -99,6 +100,80 @@ EXPORT_SYMBOL(ioread16be);
 EXPORT_SYMBOL(ioread32);
 EXPORT_SYMBOL(ioread32be);
 
+#ifdef readq
+static u64 pio_read64_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = inl(port);
+   hi = inl(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = inl(port + sizeof(u32));
+   lo = inl(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_lo_hi(unsigned long port)
+{
+   u64 lo, hi;
+
+   lo = pio_read32be(port + sizeof(u32));
+   hi = pio_read32be(port);
+
+   return lo | (hi << 32);
+}
+
+static u64 pio_read64be_hi_lo(unsigned long port)
+{
+   u64 lo, hi;
+
+   hi = pio_read32be(port);
+   lo = pio_read32be(port + sizeof(u32));
+
+   return lo | (hi << 32);
+}
+
+u64 ioread64_lo_hi(void __iomem *addr)
+{
+   IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr));
+   return 0xLL;
+}
+
+u64 ioread64_hi_lo(vo

[PATCH v4 3/5] io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros

2017-07-18 Thread Logan Gunthorpe
This patch adds generic io{read|write}64[be]{_lo_hi|_hi_lo} macros if
they are not already defined by the architecture. (As they are provided
by the generic iomap library).

The patch also points io{read|write}64[be] to the variant specified by the
header name.

This is because new drivers are encouraged to use ioreadXX, et al instead
of readX[1], et al -- and mixing ioreadXX with readq is pretty ugly.

[1] ldd3: section 9.4.2

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
cc: Christoph Hellwig <h...@lst.de>
cc: Arnd Bergmann <a...@arndb.de>
cc: Alan Cox <gno...@lxorguk.ukuu.org.uk>
cc: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 include/linux/io-64-nonatomic-hi-lo.h | 60 +++
 include/linux/io-64-nonatomic-lo-hi.h | 60 +++
 2 files changed, 120 insertions(+)

diff --git a/include/linux/io-64-nonatomic-hi-lo.h 
b/include/linux/io-64-nonatomic-hi-lo.h
index defcc4644ce3..31d28e981299 100644
--- a/include/linux/io-64-nonatomic-hi-lo.h
+++ b/include/linux/io-64-nonatomic-hi-lo.h
@@ -54,4 +54,64 @@ static inline void hi_lo_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed hi_lo_writeq_relaxed
 #endif
 
+#ifndef ioread64_hi_lo
+#define ioread64_hi_lo ioread64_hi_lo
+static inline u64 ioread64_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32(addr + sizeof(u32));
+   low = ioread32(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_hi_lo
+#define iowrite64_hi_lo iowrite64_hi_lo
+static inline void iowrite64_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32(val >> 32, addr + sizeof(u32));
+   iowrite32(val, addr);
+}
+#endif
+
+#ifndef ioread64be_hi_lo
+#define ioread64be_hi_lo ioread64be_hi_lo
+static inline u64 ioread64be_hi_lo(void __iomem *addr)
+{
+   u32 low, high;
+
+   high = ioread32be(addr);
+   low = ioread32be(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_hi_lo
+#define iowrite64be_hi_lo iowrite64be_hi_lo
+static inline void iowrite64be_hi_lo(u64 val, void __iomem *addr)
+{
+   iowrite32be(val >> 32, addr);
+   iowrite32be(val, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64
+#define ioread64 ioread64_hi_lo
+#endif
+
+#ifndef iowrite64
+#define iowrite64 iowrite64_hi_lo
+#endif
+
+#ifndef ioread64be
+#define ioread64be ioread64be_hi_lo
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be iowrite64be_hi_lo
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_HI_LO_H_ */
diff --git a/include/linux/io-64-nonatomic-lo-hi.h 
b/include/linux/io-64-nonatomic-lo-hi.h
index 084461a4e5ab..437a34f20f5a 100644
--- a/include/linux/io-64-nonatomic-lo-hi.h
+++ b/include/linux/io-64-nonatomic-lo-hi.h
@@ -54,4 +54,64 @@ static inline void lo_hi_writeq_relaxed(__u64 val, volatile 
void __iomem *addr)
 #define writeq_relaxed lo_hi_writeq_relaxed
 #endif
 
+#ifndef ioread64_lo_hi
+#define ioread64_lo_hi ioread64_lo_hi
+static inline u64 ioread64_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32(addr);
+   high = ioread32(addr + sizeof(u32));
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64_lo_hi
+#define iowrite64_lo_hi iowrite64_lo_hi
+static inline void iowrite64_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32(val, addr);
+   iowrite32(val >> 32, addr + sizeof(u32));
+}
+#endif
+
+#ifndef ioread64be_lo_hi
+#define ioread64be_lo_hi ioread64be_lo_hi
+static inline u64 ioread64be_lo_hi(void __iomem *addr)
+{
+   u32 low, high;
+
+   low = ioread32be(addr + sizeof(u32));
+   high = ioread32be(addr);
+
+   return low + ((u64)high << 32);
+}
+#endif
+
+#ifndef iowrite64be_lo_hi
+#define iowrite64be_lo_hi iowrite64be_lo_hi
+static inline void iowrite64be_lo_hi(u64 val, void __iomem *addr)
+{
+   iowrite32be(val, addr + sizeof(u32));
+   iowrite32be(val >> 32, addr);
+}
+#endif
+
+#ifndef ioread64
+#define ioread64 ioread64_lo_hi
+#endif
+
+#ifndef iowrite64
+#define iowrite64 iowrite64_lo_hi
+#endif
+
+#ifndef ioread64be
+#define ioread64be ioread64be_lo_hi
+#endif
+
+#ifndef iowrite64be
+#define iowrite64be iowrite64be_lo_hi
+#endif
+
 #endif /* _LINUX_IO_64_NONATOMIC_LO_HI_H_ */
-- 
2.11.0



[PATCH v4 4/5] ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

2017-07-18 Thread Logan Gunthorpe
Now that ioread64 and iowrite64 are available in io-64-nonatomic,
we can remove the hack at the top of ntb_hw_intel.c and replace it
with an include.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Jon Mason <jdma...@kudzu.us>
Cc: Allen Hubbe <allen.hu...@emc.com>
Acked-by: Dave Jiang <dave.ji...@intel.com>
---
 drivers/ntb/hw/intel/ntb_hw_intel.c | 30 +-
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c 
b/drivers/ntb/hw/intel/ntb_hw_intel.c
index 2557e2c05b90..606c90f59d4b 100644
--- a/drivers/ntb/hw/intel/ntb_hw_intel.c
+++ b/drivers/ntb/hw/intel/ntb_hw_intel.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ntb_hw_intel.h"
 
@@ -155,35 +156,6 @@ MODULE_PARM_DESC(xeon_b2b_dsd_bar5_addr32,
 static inline enum ntb_topo xeon_ppd_topo(struct intel_ntb_dev *ndev, u8 ppd);
 static int xeon_init_isr(struct intel_ntb_dev *ndev);
 
-#ifndef ioread64
-#ifdef readq
-#define ioread64 readq
-#else
-#define ioread64 _ioread64
-static inline u64 _ioread64(void __iomem *mmio)
-{
-   u64 low, high;
-
-   low = ioread32(mmio);
-   high = ioread32(mmio + sizeof(u32));
-   return low | (high << 32);
-}
-#endif
-#endif
-
-#ifndef iowrite64
-#ifdef writeq
-#define iowrite64 writeq
-#else
-#define iowrite64 _iowrite64
-static inline void _iowrite64(u64 val, void __iomem *mmio)
-{
-   iowrite32(val, mmio);
-   iowrite32(val >> 32, mmio + sizeof(u32));
-}
-#endif
-#endif
-
 static inline int pdev_is_atom(struct pci_dev *pdev)
 {
switch (pdev->device) {
-- 
2.11.0



[PATCH v4 5/5] crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

2017-07-18 Thread Logan Gunthorpe
From: Horia Geantă <horia.gea...@nxp.com>

We can now make use of the io-64-nonatomic-lo-hi header to always
provide 64 bit IO operations. So this patch cleans up the extra
CONFIG_64BIT ifdefs.

To be consistent with CAAM engine HW spec: in case of 64-bit registers,
irrespective of device endianness, the lower address should be read from
/ written to first, followed by the upper address. Indeed the I/O
accessors in CAAM driver currently don't follow the spec, however this
is a good opportunity to fix the code.

Signed-off-by: Horia Geantă <horia.gea...@nxp.com>
Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Cc: Horia Geantă <horia.gea...@nxp.com>
Cc: Dan Douglass <dan.dougl...@nxp.com>
Cc: Herbert Xu <herb...@gondor.apana.org.au>
Cc: "David S. Miller" <da...@davemloft.net>
---
 drivers/crypto/caam/regs.h | 35 +--
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/drivers/crypto/caam/regs.h b/drivers/crypto/caam/regs.h
index 84d2f838a063..0c45505458e7 100644
--- a/drivers/crypto/caam/regs.h
+++ b/drivers/crypto/caam/regs.h
@@ -9,7 +9,7 @@
 
 #include 
 #include 
-#include 
+#include 
 
 /*
  * Architecture-specific register access methods
@@ -134,50 +134,25 @@ static inline void clrsetbits_32(void __iomem *reg, u32 
clear, u32 set)
  *base + 0x : least-significant 32 bits
  *base + 0x0004 : most-significant 32 bits
  */
-#ifdef CONFIG_64BIT
 static inline void wr_reg64(void __iomem *reg, u64 data)
 {
+#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
iowrite64(data, reg);
else
-   iowrite64be(data, reg);
-}
-
-static inline u64 rd_reg64(void __iomem *reg)
-{
-   if (caam_little_end)
-   return ioread64(reg);
-   else
-   return ioread64be(reg);
-}
-
-#else /* CONFIG_64BIT */
-static inline void wr_reg64(void __iomem *reg, u64 data)
-{
-#ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
-   if (caam_little_end) {
-   wr_reg32((u32 __iomem *)(reg) + 1, data >> 32);
-   wr_reg32((u32 __iomem *)(reg), data);
-   } else
 #endif
-   {
-   wr_reg32((u32 __iomem *)(reg), data >> 32);
-   wr_reg32((u32 __iomem *)(reg) + 1, data);
-   }
+   iowrite64be(data, reg);
 }
 
 static inline u64 rd_reg64(void __iomem *reg)
 {
 #ifndef CONFIG_CRYPTO_DEV_FSL_CAAM_IMX
if (caam_little_end)
-   return ((u64)rd_reg32((u32 __iomem *)(reg) + 1) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg)));
+   return ioread64(reg);
else
 #endif
-   return ((u64)rd_reg32((u32 __iomem *)(reg)) << 32 |
-   (u64)rd_reg32((u32 __iomem *)(reg) + 1));
+   return ioread64be(reg);
 }
-#endif /* CONFIG_64BIT  */
 
 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 #ifdef CONFIG_SOC_IMX7D
-- 
2.11.0



[PATCH v4 0/5] make io{read|write}64 globally usable

2017-07-18 Thread Logan Gunthorpe
This is version four of my patchset to enable drivers to use
io{read|write}64 on all arches.

Changes since v3:

- I noticed powerpc didn't use the appropriate functions seeing
readq/writeq were not defined when iomap.h was included. Thus I've
included a patch to adjust this
- Fixed some mistakes with a couple of the defines in io-64-nonatomic*
headers
- Fixed a typo noticed by Horia.


Horia Geantă (1):
  crypto: caam: cleanup CONFIG_64BIT ifdefs when using io{read|write}64

Logan Gunthorpe (4):
  powerpc: io.h: move iomap.h include so that it can use readq/writeq
defs
  iomap: introduce io{read|write}64_{lo_hi|hi_lo}
  io-64-nonatomic: add io{read|write}64[be]{_lo_hi|_hi_lo} macros
  ntb: ntb_hw_intel: use io-64-nonatomic instead of in-driver hacks

 arch/powerpc/include/asm/io.h |   6 +-
 drivers/crypto/caam/regs.h|  35 ++---
 drivers/ntb/hw/intel/ntb_hw_intel.c   |  30 +---
 include/asm-generic/iomap.h   |  26 +--
 include/linux/io-64-nonatomic-hi-lo.h |  60 
 include/linux/io-64-nonatomic-lo-hi.h |  60 
 lib/iomap.c   | 132 ++
 7 files changed, 282 insertions(+), 67 deletions(-)

--
2.11.0


[PATCH v2 12/16] switchtec_ntb: add link management

2017-07-18 Thread Logan Gunthorpe
switchtec_ntb checks for a link by looking at the shared memory
window. If the magic number is correct and the otherside indicates
their link is enabled then we take the link to be up.

Whenever we change our local link status we send a msg to the
otherside to check whether it's up and change their status.

The current status is maintained in a flag so ntb_is_link_up
can return quickly.

We utilize Switchtec's link status notifier to also check link changes
when the switch notices a port changes state.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
---
 drivers/ntb/hw/mscc/switchtec_ntb.c | 130 +++-
 1 file changed, 129 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c 
b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 20f2fdda4644..dba14188ba95 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -64,6 +64,7 @@ static inline void _iowrite64(u64 val, void __iomem *mmio)
 
 struct shared_mw {
u32 magic;
+   u32 link_sta;
u32 partition_id;
u64 mw_sizes[MAX_MWS];
 };
@@ -104,8 +105,17 @@ struct switchtec_ntb {
int peer_nr_direct_mw;
int peer_nr_lut_mw;
int peer_direct_mw_to_bar[MAX_DIRECT_MW];
+
+   bool link_is_up;
+   enum ntb_speed link_speed;
+   enum ntb_width link_width;
 };
 
+static struct switchtec_ntb *ntb_sndev(struct ntb_dev *ntb)
+{
+   return container_of(ntb, struct switchtec_ntb, ntb);
+}
+
 static int switchtec_ntb_part_op(struct switchtec_ntb *sndev,
 struct ntb_ctrl_regs __iomem *ctl,
 u32 op, int wait_status)
@@ -163,6 +173,17 @@ static int switchtec_ntb_part_op(struct switchtec_ntb 
*sndev,
return -EIO;
 }
 
+static int switchtec_ntb_send_msg(struct switchtec_ntb *sndev, int idx,
+ u32 val)
+{
+   if (idx < 0 || idx >= ARRAY_SIZE(sndev->mmio_self_dbmsg->omsg))
+   return -EINVAL;
+
+   iowrite32(val, >mmio_self_dbmsg->omsg[idx].msg);
+
+   return 0;
+}
+
 static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx)
 {
return 0;
@@ -194,22 +215,124 @@ static int switchtec_ntb_peer_mw_get_addr(struct ntb_dev 
*ntb, int idx,
return 0;
 }
 
+static void switchtec_ntb_part_link_speed(struct switchtec_ntb *sndev,
+ int partition,
+ enum ntb_speed *speed,
+ enum ntb_width *width)
+{
+   struct switchtec_dev *stdev = sndev->stdev;
+
+   u32 pff = ioread32(>mmio_part_cfg[partition].vep_pff_inst_id);
+   u32 linksta = ioread32(>mmio_pff_csr[pff].pci_cap_region[13]);
+
+   if (speed)
+   *speed = (linksta >> 16) & 0xF;
+
+   if (width)
+   *width = (linksta >> 20) & 0x3F;
+}
+
+static void switchtec_ntb_set_link_speed(struct switchtec_ntb *sndev)
+{
+   enum ntb_speed self_speed, peer_speed;
+   enum ntb_width self_width, peer_width;
+
+   if (!sndev->link_is_up) {
+   sndev->link_speed = NTB_SPEED_NONE;
+   sndev->link_width = NTB_WIDTH_NONE;
+   return;
+   }
+
+   switchtec_ntb_part_link_speed(sndev, sndev->self_partition,
+ _speed, _width);
+   switchtec_ntb_part_link_speed(sndev, sndev->peer_partition,
+ _speed, _width);
+
+   sndev->link_speed = min(self_speed, peer_speed);
+   sndev->link_width = min(self_width, peer_width);
+}
+
+enum {
+   LINK_MESSAGE = 0,
+   MSG_LINK_UP = 1,
+   MSG_LINK_DOWN = 2,
+   MSG_CHECK_LINK = 3,
+};
+
+static void switchtec_ntb_check_link(struct switchtec_ntb *sndev)
+{
+   int link_sta;
+   int old = sndev->link_is_up;
+
+   link_sta = sndev->self_shared->link_sta;
+   if (link_sta) {
+   u64 peer = ioread64(>peer_shared->magic);
+
+   if ((peer & 0x) == SWITCHTEC_NTB_MAGIC)
+   link_sta = peer >> 32;
+   else
+   link_sta = 0;
+   }
+
+   sndev->link_is_up = link_sta;
+   switchtec_ntb_set_link_speed(sndev);
+
+   if (link_sta != old) {
+   switchtec_ntb_send_msg(sndev, LINK_MESSAGE, MSG_CHECK_LINK);
+   ntb_link_event(>ntb);
+   dev_info(>stdev->dev, "ntb link %s",
+link_sta ? "up" : "down");
+   }
+}
+
+static void switchtec_ntb_link_notification(struct switchtec_dev *stdev)
+{
+   struct switchtec_ntb *sndev = stdev->sndev;
+
+   switchtec_ntb_check_link(sndev);
+}
+
 static u64 switch

[PATCH v2 11/16] switchtec_ntb: add skeleton NTB driver

2017-07-18 Thread Logan Gunthorpe
Add a skeleton NTB driver which will be filled out in subsequent patches.

Signed-off-by: Logan Gunthorpe <log...@deltatee.com>
Reviewed-by: Stephen Bates <sba...@raithlin.com>
Reviewed-by: Kurt Schwemmer <kurt.schwem...@microsemi.com>
---
 drivers/ntb/hw/mscc/switchtec_ntb.c | 148 +++-
 include/linux/ntb.h |   3 +
 2 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/drivers/ntb/hw/mscc/switchtec_ntb.c 
b/drivers/ntb/hw/mscc/switchtec_ntb.c
index 852dc9869a3e..20f2fdda4644 100644
--- a/drivers/ntb/hw/mscc/switchtec_ntb.c
+++ b/drivers/ntb/hw/mscc/switchtec_ntb.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 MODULE_DESCRIPTION("Microsemi Switchtec(tm) NTB Driver");
 MODULE_VERSION("0.1");
@@ -71,6 +72,7 @@ struct shared_mw {
 #define LUT_SIZE SZ_64K
 
 struct switchtec_ntb {
+   struct ntb_dev ntb;
struct switchtec_dev *stdev;
 
int self_partition;
@@ -161,10 +163,148 @@ static int switchtec_ntb_part_op(struct switchtec_ntb 
*sndev,
return -EIO;
 }
 
+static int switchtec_ntb_mw_count(struct ntb_dev *ntb, int pidx)
+{
+   return 0;
+}
+
+static int switchtec_ntb_mw_get_align(struct ntb_dev *ntb, int pidx,
+ int widx, resource_size_t *addr_align,
+ resource_size_t *size_align,
+ resource_size_t *size_max)
+{
+   return 0;
+}
+
+static int switchtec_ntb_mw_set_trans(struct ntb_dev *ntb, int pidx, int widx,
+ dma_addr_t addr, resource_size_t size)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_mw_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_mw_get_addr(struct ntb_dev *ntb, int idx,
+ phys_addr_t *base,
+ resource_size_t *size)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_link_is_up(struct ntb_dev *ntb,
+   enum ntb_speed *speed,
+   enum ntb_width *width)
+{
+   return 0;
+}
+
+static int switchtec_ntb_link_enable(struct ntb_dev *ntb,
+enum ntb_speed max_speed,
+enum ntb_width max_width)
+{
+   return 0;
+}
+
+static int switchtec_ntb_link_disable(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_valid_mask(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_vector_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector)
+{
+   return 0;
+}
+
+static u64 switchtec_ntb_db_read(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits)
+{
+   return 0;
+}
+
+static int switchtec_ntb_spad_count(struct ntb_dev *ntb)
+{
+   return 0;
+}
+
+static u32 switchtec_ntb_spad_read(struct ntb_dev *ntb, int idx)
+{
+   return 0;
+}
+
+static int switchtec_ntb_spad_write(struct ntb_dev *ntb, int idx, u32 val)
+{
+   return 0;
+}
+
+static int switchtec_ntb_peer_spad_write(struct ntb_dev *ntb, int pidx,
+int sidx, u32 val)
+{
+   return 0;
+}
+
+static const struct ntb_dev_ops switchtec_ntb_ops = {
+   .mw_count   = switchtec_ntb_mw_count,
+   .mw_get_align   = switchtec_ntb_mw_get_align,
+   .mw_set_trans   = switchtec_ntb_mw_set_trans,
+   .peer_mw_count  = switchtec_ntb_peer_mw_count,
+   .peer_mw_get_addr   = switchtec_ntb_peer_mw_get_addr,
+   .link_is_up = switchtec_ntb_link_is_up,
+   .link_enable= switchtec_ntb_link_enable,
+   .link_disable   = switchtec_ntb_link_disable,
+   .db_valid_mask  = switchtec_ntb_db_valid_mask,
+   .db_vector_count= switchtec_ntb_db_vector_count,
+   .db_vector_mask = switchtec_ntb_db_vector_mask,
+   .db_read= switchtec_ntb_db_read,
+   .db_clear   = switchtec_ntb_db_clear,
+   .db_set_mask= switchtec_ntb_db_set_mask,
+   .db_clear_mask  = switchtec_ntb_db_clear_mask,
+   .peer_db_set= switchtec_ntb_peer_db_set,
+   .spad_count = switchtec_ntb_spad_count,
+   .spad_read  = switchtec_ntb_spad_read,
+   .spad_write = switchtec_ntb_spad_write,
+   .peer_spad_write= switchtec_ntb_peer_spad_write,
+};
+
 stati

<    1   2   3   4   5   6   7   8   9   10   >