Thanks for implementing this.  You saved me a lot of work and Libor
will be very happy when he gets back next week.

Some comments from my first read through:

 > This patch implements FMR support. I also rolled into it two fixes
 > for regular mrs that I posted previously, let me know if it's a problem.

No problem although I'll apply them separately.

 > This seems to be working fine for me, although I only did relatively basic
 > tests. Both Tavor and Arbel Native modes are supported. I made some tradeoffs
 > for simplicity, let me know what you think:
 > - for tavor, I keep two pages mapped for each fmr: one for mpt and one for
 >   mtt access. This spends more kernel virtual memory than is necessary,
 >   since many mpts could share one page. Alternatives are:
 >   map/unmap io memory on each fmr map/unmap request, or
 >   keep an intermediate table to map each page only once.

I don't think this is acceptable.  Each ioremap has to map at least
one page plus a guard page.  With two ioremaps per FMR, every FMR is
using 16K (or more) of vmalloc space.  On 64 bit archs, this doesn't
matter, but on a large memory i386 machine, there's less than 128 MB
of vmalloc space available (possibly a lot less if someone is using a
video card with a big frame buffer or something).  That means we're
limited to a few thousand FMRs, which isn't enough.

What if we just reserve something like 64K MPTs and MTTs for FMRs and
ioremap everything at driver startup?  That would only use a few MB of
vmalloc space and probably simplify the code too.
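
Something like this at init time (rough sketch; MTHCA_NUM_FMR_MPTS and
the tavor_fmr fields are invented names):

        /* Tavor only: in mthca_init_mr_table(), map the FMR region of
         * the MPT and MTT tables once at startup. */
        dev->mr_table.tavor_fmr.mpt_base =
                ioremap(dev->mr_table.mpt_base,
                        MTHCA_NUM_FMR_MPTS * sizeof (struct mthca_mpt_entry));
        if (!dev->mr_table.tavor_fmr.mpt_base)
                return -ENOMEM;

        dev->mr_table.tavor_fmr.mtt_base =
                ioremap(dev->mr_table.mtt_base,
                        MTHCA_NUM_FMR_MPTS * dev->limits.mtt_seg_size);

Then mthca_fmr_alloc() just computes pointers into these two mappings
instead of calling ioremap() for every FMR.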

 > - icm that has the mpts/mtts is linearly scanned and this is repeated
 >   for each mtt on each fmr map. This may be improved somewhat
 >   with some kind of iterator, but to really speed things up
 >   the icm data structure (list of arrays) would have to
 >   be replaced by some kind of tree.

I don't understand this.  I'm probably missing something but the
addresses don't change after we allocate the FMR, right?  It seems we
could just store the MPT/MTT address in the FMR structure the same way
we do for Tavor mode.
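
I.e. look things up once in mthca_fmr_alloc() and cache the result
(sketch; the field names are made up, and this assumes the MTT range
for one FMR doesn't cross an ICM chunk boundary):

        /* Arbel: resolve the ICM addresses at allocation time */
        fmr->arbel_mpt  = mthca_table_find(dev, dev->mr_table.mpt_table, key);
        fmr->arbel_mtts = mthca_table_find(dev, dev->mr_table.mtt_table,
                                           fmr->first_seg);

so that mthca_arbel_fmr_map() only dereferences cached pointers on the
data path.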

Some more nitpicky comments below...

 > 
 > Signed-off-by: Michael S. Tsirkin <[EMAIL PROTECTED]>
 > 
 > Index: hw/mthca/mthca_dev.h
 > ===================================================================
 > --- hw/mthca/mthca_dev.h     (revision 2012)
 > +++ hw/mthca/mthca_dev.h     (working copy)
 > @@ -163,6 +163,7 @@ struct mthca_mr_table {
 >      int                     max_mtt_order;
 >      unsigned long         **mtt_buddy;
 >      u64                     mtt_base;
 > +    u64                     mpt_base; /* Tavor only */
 >      struct mthca_icm_table *mtt_table;
 >      struct mthca_icm_table *mpt_table;
 >  };
 > @@ -363,7 +364,17 @@ int mthca_mr_alloc_phys(struct mthca_dev
 >                      u64 *buffer_list, int buffer_size_shift,
 >                      int list_len, u64 iova, u64 total_size,
 >                      u32 access, struct mthca_mr *mr);
 > -void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
 > +void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
 > +
 > +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 > +                    u32 access, struct mthca_fmr *fmr);
 > +
 > +int mthca_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +              u64 *page_list, int list_len, u64 iova);
 > +
 > +void mthca_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
 > +
 > +void mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
 >  
 >  int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
 >  void mthca_unmap_eq_icm(struct mthca_dev *dev);
 > Index: hw/mthca/mthca_memfree.h
 > ===================================================================
 > --- hw/mthca/mthca_memfree.h (revision 2012)
 > +++ hw/mthca/mthca_memfree.h (working copy)
 > @@ -90,6 +90,11 @@ int mthca_table_get_range(struct mthca_d
 >  void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 >                         int start, int end);
 >  
 > +/* Nonblocking. Callers must make sure the object exists by serializing against
 > + * callers of get/put. */
 > +void *mthca_table_find(struct mthca_dev *dev, struct mthca_icm_table *table,
 > +                   int obj);

Can we just make this use the table mutex and only call it when
allocating an FMR?
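
I.e. something like (sketch):

        void *mthca_table_find(struct mthca_dev *dev,
                               struct mthca_icm_table *table, int obj)
        {
                void *addr = NULL;

                if (!table->lowmem)
                        return NULL;

                down(&table->mutex);
                /* ... walk the chunks as below ... */
                up(&table->mutex);

                return addr;
        }

Then the result gets cached in the FMR at allocation time and the map
path never races with get/put.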

 > +
 >  static inline void mthca_icm_first(struct mthca_icm *icm,
 >                                 struct mthca_icm_iter *iter)
 >  {
 > Index: hw/mthca/mthca_provider.c
 > ===================================================================
 > --- hw/mthca/mthca_provider.c        (revision 2012)
 > +++ hw/mthca/mthca_provider.c        (working copy)
 > @@ -568,8 +568,101 @@ static struct ib_mr *mthca_reg_phys_mr(s
 >  
 >  static int mthca_dereg_mr(struct ib_mr *mr)
 >  {
 > -    mthca_free_mr(to_mdev(mr->device), to_mmr(mr));
 > -    kfree(mr);
 > +    struct mthca_mr *mmr = to_mmr(mr);
 > +    mthca_free_mr(to_mdev(mr->device), mmr);
 > +    kfree(mmr);
 > +    return 0;
 > +}
 > +
 > +static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
 > +                       struct ib_fmr_attr *fmr_attr)
 > +{
 > +    struct mthca_fmr *fmr;
 > +    int err;
 > +    fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
 > +    if (!fmr)
 > +            return ERR_PTR(-ENOMEM);
 > +
 > +    memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
 > +    err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
 > +                          convert_access(mr_access_flags), fmr);
 > +
 > +    if (err) {
 > +            kfree(fmr);
 > +            return ERR_PTR(err);
 > +    }
 > +    return &fmr->ibmr;
 > +}
 > +
 > +static int mthca_dealloc_fmr(struct ib_fmr *fmr)
 > +{
 > +    struct mthca_fmr *mfmr = to_mfmr(fmr);
 > +    mthca_free_fmr(to_mdev(fmr->device), mfmr);
 > +    kfree(mfmr);
 > +    return 0;
 > +}
 > +
 > +static int mthca_map_phys_fmr(struct ib_fmr *fmr, u64 *page_list, int list_len,
 > +                          u64 iova)
 > +{
 > +    int i, page_mask;
 > +    struct mthca_fmr *mfmr;
 > +
 > +    mfmr = to_mfmr(fmr);
 > +
 > +    if (list_len > mfmr->attr.max_pages) {
 > +            mthca_warn(to_mdev(fmr->device), "Attempt to map list length "
 > +                       "%d to fmr with max %d pages\n",
 > +                       list_len, mfmr->attr.max_pages);
 > +            return -EINVAL;
 > +    }
 > +
 > +    page_mask = (1 << mfmr->attr.page_size) - 1;
 > +
 > +    /* We are getting page lists, so va must be page aligned. */
 > +    if (iova & page_mask) {
 > +            mthca_warn(to_mdev(fmr->device), "Attempt to map fmr with page "
 > +                       "shift %d at misaligned virtual address %016llx\n",
 > +                       mfmr->attr.page_size, iova);
 > +            return -EINVAL;
 > +    }
 > +
 > +    /* Trust the user not to pass misaligned data in page_list */
 > +    if (0)
 > +            for (i = 0; i < list_len; ++i) {
 > +                    if (page_list[i] & page_mask) {
 > +                            mthca_warn(to_mdev(fmr->device), "Attempt to "
 > +                                       "map fmr with page shift %d at "
 > +                                       "address %016llx\n",
 > +                                       mfmr->attr.page_size, page_list[i]);
 > +                            return -EINVAL;
 > +                    }
 > +            }
 > +
 > +    return mthca_fmr_map(to_mdev(fmr->device), to_mfmr(fmr), page_list,
 > +                         list_len, iova);
 > +}
 > +
 > +static int mthca_unmap_fmr(struct list_head *fmr_list)
 > +{
 > +    struct mthca_dev* mdev = NULL;
 > +    struct ib_fmr *fmr;
 > +    int err;
 > +    u8 status;
 > +
 > +    list_for_each_entry(fmr, fmr_list, list) {
 > +            mdev = to_mdev(fmr->device);
 > +            mthca_fmr_unmap(mdev, to_mfmr(fmr));
 > +    }
 > +
 > +    if (!mdev)
 > +            return 0;
 > +
 > +    err = mthca_SYNC_TPT(mdev, &status);
 > +    if (err)
 > +            return err;
 > +    if (status)
 > +            return -EINVAL;
 >      return 0;
 >  }
 >  
 > @@ -636,6 +729,15 @@ int mthca_register_device(struct mthca_d
 >      dev->ib_dev.get_dma_mr           = mthca_get_dma_mr;
 >      dev->ib_dev.reg_phys_mr          = mthca_reg_phys_mr;
 >      dev->ib_dev.dereg_mr             = mthca_dereg_mr;
 > +
 > +    if (dev->hca_type == ARBEL_NATIVE ||
 > +        !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
 > +            dev->ib_dev.alloc_fmr            = mthca_alloc_fmr;
 > +            dev->ib_dev.map_phys_fmr         = mthca_map_phys_fmr;
 > +            dev->ib_dev.unmap_fmr            = mthca_unmap_fmr;
 > +            dev->ib_dev.dealloc_fmr          = mthca_dealloc_fmr;
 > +    }
 > +
 >      dev->ib_dev.attach_mcast         = mthca_multicast_attach;
 >      dev->ib_dev.detach_mcast         = mthca_multicast_detach;
 >      dev->ib_dev.process_mad          = mthca_process_mad;
 > Index: hw/mthca/mthca_provider.h
 > ===================================================================
 > --- hw/mthca/mthca_provider.h        (revision 2012)
 > +++ hw/mthca/mthca_provider.h        (working copy)
 > @@ -60,6 +60,16 @@ struct mthca_mr {
 >      u32 first_seg;
 >  };
 >  
 > +struct mthca_fmr {
 > +    struct ib_fmr ibmr;
 > +    struct ib_fmr_attr attr;
 > +    int order;
 > +    u32 first_seg;
 > +    int maps;
 > +    struct mthca_mpt_entry __iomem *mpt;  /* Tavor only */
 > +    u64 __iomem *mtts; /* Tavor only */
 > +};
 > +
 >  struct mthca_pd {
 >      struct ib_pd    ibpd;
 >      u32             pd_num;
 > @@ -218,6 +228,11 @@ struct mthca_sqp {
 >      dma_addr_t      header_dma;
 >  };
 >  
 > +static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
 > +{
 > +    return container_of(ibmr, struct mthca_fmr, ibmr);
 > +}
 > +
 >  static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
 >  {
 >      return container_of(ibmr, struct mthca_mr, ibmr);
 > Index: hw/mthca/mthca_profile.c
 > ===================================================================
 > --- hw/mthca/mthca_profile.c (revision 2012)
 > +++ hw/mthca/mthca_profile.c (working copy)
 > @@ -223,9 +223,10 @@ u64 mthca_make_profile(struct mthca_dev 
 >                      init_hca->mc_hash_sz      = 1 << (profile[i].log_num - 1);
 >                      break;
 >              case MTHCA_RES_MPT:
 > -                    dev->limits.num_mpts = profile[i].num;
 > -                    init_hca->mpt_base   = profile[i].start;
 > -                    init_hca->log_mpt_sz = profile[i].log_num;
 > +                    dev->limits.num_mpts   = profile[i].num;
 > +                    dev->mr_table.mpt_base = profile[i].start;
 > +                    init_hca->mpt_base     = profile[i].start;
 > +                    init_hca->log_mpt_sz   = profile[i].log_num;
 >                      break;
 >              case MTHCA_RES_MTT:
 >                      dev->limits.num_mtt_segs = profile[i].num;
 > Index: hw/mthca/mthca_cmd.c
 > ===================================================================
 > --- hw/mthca/mthca_cmd.c     (revision 2012)
 > +++ hw/mthca/mthca_cmd.c     (working copy)
 > @@ -1406,6 +1406,11 @@ int mthca_WRITE_MTT(struct mthca_dev *de
 >      return err;
 >  }
 >  
 > +int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status)
 > +{
 > +    return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B, status);
 > +}
 > +
 >  int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
 >               int eq_num, u8 *status)
 >  {
 > Index: hw/mthca/mthca_cmd.h
 > ===================================================================
 > --- hw/mthca/mthca_cmd.h     (revision 2012)
 > +++ hw/mthca/mthca_cmd.h     (working copy)
 > @@ -277,6 +277,7 @@ int mthca_HW2SW_MPT(struct mthca_dev *de
 >                  int mpt_index, u8 *status);
 >  int mthca_WRITE_MTT(struct mthca_dev *dev, u64 *mtt_entry,
 >                  int num_mtt, u8 *status);
 > +int mthca_SYNC_TPT(struct mthca_dev *dev, u8 *status);
 >  int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
 >               int eq_num, u8 *status);
 >  int mthca_SW2HW_EQ(struct mthca_dev *dev, void *eq_context,
 > Index: hw/mthca/mthca_mr.c
 > ===================================================================
 > --- hw/mthca/mthca_mr.c      (revision 2012)
 > +++ hw/mthca/mthca_mr.c      (working copy)
 > @@ -368,11 +368,13 @@ err_out_table:
 >              mthca_table_put(dev, dev->mr_table.mpt_table, key);
 >  
 >  err_out_mpt_free:
 > -    mthca_free(&dev->mr_table.mpt_alloc, mr->ibmr.lkey);
 > +    mthca_free(&dev->mr_table.mpt_alloc, key);
 >      return err;
 >  }
 >  
 > -void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
 > +/* Free mr or fmr */
 > +static void mthca_free_region(struct mthca_dev *dev, u32 lkey, int order,
 > +                          u32 first_seg)
 >  {
 >      int err;
 >      u8 status;
 > @@ -380,7 +382,7 @@ void mthca_free_mr(struct mthca_dev *dev
 >      might_sleep();
 >  
 >      err = mthca_HW2SW_MPT(dev, NULL,
 > -                          key_to_hw_index(dev, mr->ibmr.lkey) &
 > +                          key_to_hw_index(dev, lkey) &
 >                            (dev->limits.num_mpts - 1),
 >                            &status);
 >      if (err)
 > @@ -389,15 +391,276 @@ void mthca_free_mr(struct mthca_dev *dev
 >              mthca_warn(dev, "HW2SW_MPT returned status 0x%02x\n",
 >                         status);
 >  
 > -    if (mr->order >= 0)
 > -            mthca_free_mtt(dev, mr->first_seg, mr->order);
 > +    if (order >= 0)
 > +            mthca_free_mtt(dev, first_seg, order);
 >  
 >      if (dev->hca_type == ARBEL_NATIVE)
 >              mthca_table_put(dev, dev->mr_table.mpt_table,
 > -                            key_to_hw_index(dev, mr->ibmr.lkey));
 > -    mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, mr->ibmr.lkey));
 > +                            key_to_hw_index(dev, lkey));
 > +    mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
 > +}
 > +
 > +void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
 > +{
 > +    mthca_free_region(dev, mr->ibmr.lkey, mr->order, mr->first_seg);
 > +}
 > +
 > +int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
 > +                    u32 access, struct mthca_fmr *mr)
 > +{
 > +    void *mailbox;
 > +    u64 mtt_seg;
 > +    struct mthca_mpt_entry *mpt_entry;
 > +    u32 key, idx;
 > +    int err = -ENOMEM;
 > +    u8 status;
 > +    int i;
 > +    int list_len = mr->attr.max_pages;
 > +
 > +    might_sleep();
 > +
 > +    BUG_ON(mr->attr.page_size < 12);
 > +    WARN_ON(mr->attr.page_size >= 32);

Why is one a BUG and one a WARN?  Why not just return an error in both
cases?  Something like:

        if (mr->attr.page_size < 12 || mr->attr.page_size >= 32)
                return -EINVAL;

 > +
 > +    mr->maps = 0;
 > +
 > +    key = mthca_alloc(&dev->mr_table.mpt_alloc);
 > +    if (key == -1)
 > +            return -ENOMEM;
 > +    idx = key & (dev->limits.num_mpts - 1);
 > +    mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
 > +
 > +    if (dev->hca_type == ARBEL_NATIVE) {
 > +            err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
 > +            if (err)
 > +                    goto err_out_mpt_free;
 > +    }
 > +    for (i = dev->limits.mtt_seg_size / 8, mr->order = 0;
 > +         i < list_len;
 > +         i <<= 1, ++mr->order)
 > +            ; /* nothing */
 > +
 > +    mr->first_seg = mthca_alloc_mtt(dev, mr->order);
 > +    if (mr->first_seg == -1)
 > +            goto err_out_table;
 > +
 > +    mtt_seg = dev->mr_table.mtt_base +
 > +            mr->first_seg * dev->limits.mtt_seg_size;
 > +
 > +    mr->mpt = NULL;
 > +    mr->mtts = NULL;
 > +
 > +    if (dev->hca_type != ARBEL_NATIVE) {
 > +            mr->mpt = ioremap(dev->mr_table.mpt_base +
 > +                              sizeof *(mr->mpt) * idx, sizeof *(mr->mpt));
 > +            if (!mr->mpt) {
 > +                    mthca_dbg(dev, "Couldn't map MPT entry for fmr %08x.\n",
 > +                              mr->ibmr.lkey);
 > +                    goto err_out_free_mtt;
 > +            }
 > +            mr->mtts = ioremap(mtt_seg, list_len * sizeof *(mr->mtts));
 > +            if (!mr->mtts) {
 > +                    mthca_dbg(dev, "Couldn't map MTT entry %016llx "
 > +                              "(size %x) for fmr %08x.\n", mtt_seg,
 > +                              list_len * sizeof u64, mr->ibmr.lkey);
 > +                    goto err_out_free_mtt;
 > +            }
 > +    }
 > +
 > +    mailbox = kmalloc(sizeof *mpt_entry + MTHCA_CMD_MAILBOX_EXTRA,
 > +                      GFP_KERNEL);
 > +    if (!mailbox)
 > +            goto err_out_free_mtt;
 > +
 > +    mpt_entry = MAILBOX_ALIGN(mailbox);
 > +
 > +    mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
 > +                                   MTHCA_MPT_FLAG_MIO         |
 > +                                   MTHCA_MPT_FLAG_REGION      |
 > +                                   access);
 > +
 > +    mpt_entry->page_size = cpu_to_be32(mr->attr.page_size - 12);
 > +    mpt_entry->key       = cpu_to_be32(key);
 > +    mpt_entry->pd        = cpu_to_be32(pd);
 > +    memset(&mpt_entry->start, 0,
 > +           sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
 > +    mpt_entry->mtt_seg   = cpu_to_be64(mtt_seg);
 > +
 > +    if (0) {
 > +            mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
 > +            for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
 > +                    if (i % 4 == 0)
 > +                            printk("[%02x] ", i * 4);
 > +                    printk(" %08x", be32_to_cpu(((u32 *) mpt_entry)[i]));
 > +                    if ((i + 1) % 4 == 0)
 > +                            printk("\n");
 > +            }
 > +    }
 > +
 > +    err = mthca_SW2HW_MPT(dev, mpt_entry,
 > +                          key & (dev->limits.num_mpts - 1),
 > +                          &status);
 > +    if (err)
 > +            mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
 > +    else if (status) {
 > +            mthca_warn(dev, "SW2HW_MPT returned status 0x%02x\n",
 > +                       status);
 > +            err = -EINVAL;
 > +    }
 > +
 > +    kfree(mailbox);
 > +    return err;
 > +
 > +err_out_free_mtt:
 > +    if (mr->mtts)
 > +            iounmap(mr->mtts);
 > +    if (mr->mpt)
 > +            iounmap(mr->mpt);
 > +    mthca_free_mtt(dev, mr->first_seg, mr->order);
 > +
 > +err_out_table:
 > +    if (dev->hca_type == ARBEL_NATIVE)
 > +            mthca_table_put(dev, dev->mr_table.mpt_table, key);
 > +
 > +err_out_mpt_free:
 > +    mthca_free(&dev->mr_table.mpt_alloc, key);
 > +    return err;
 > +}
 > +
 > +void mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
 > +{
 > +    mthca_free_region(dev, fmr->ibmr.lkey, fmr->order, fmr->first_seg);
 > +}
 > +
 > +#define MTHCA_MPT_STATUS_SW 0xF
 > +#define MTHCA_MPT_STATUS_HW 0x0
 > +
 > +static void mthca_tavor_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +              u64 *page_list, int list_len, u64 iova, u32 key)
 > +{
 > +    struct mthca_mpt_entry mpt_entry;
 > +    int i;
 > +
 > +    writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);

It's a minor nitpick but put a space after the "," please.  (This
appears a few more times below too)

 > +
 > +    wmb();

Are the wmb()s required in this function?  writeX() is already ordered.

 > +
 > +    for (i = 0; i < list_len; ++i) {
 > +            u64 mtt_entry = cpu_to_be64(page_list[i] |
 > +                                        MTHCA_MTT_FLAG_PRESENT);
 > +            writeq(mtt_entry, fmr->mtts + i);

Don't use writeq() unconditionally.  32 bit archs don't have it.
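
Since mtt_entry is already big-endian in memory, something like this
should work (untested sketch):

        #if BITS_PER_LONG == 64
                writeq(mtt_entry, fmr->mtts + i);
        #else
                /* copy the two halves in memory order, no byte swap */
                __raw_writel(((u32 *) &mtt_entry)[0], fmr->mtts + i);
                __raw_writel(((u32 *) &mtt_entry)[1],
                             (void __iomem *) (fmr->mtts + i) + 4);
        #endif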

 > +    }
 > +    mpt_entry.lkey = cpu_to_be32(key);
 > +    mpt_entry.length = cpu_to_be64(((u64)list_len) *
 > +                                   (1 << fmr->attr.page_size));
 > +    mpt_entry.start = cpu_to_be64(iova);
 > +
 > +    writel(mpt_entry.lkey, &fmr->mpt->key);
 > +    memcpy_toio(&fmr->mpt->start, &mpt_entry.start, 
 > +                offsetof(struct mthca_mpt_entry, window_count) -
 > +                offsetof(struct mthca_mpt_entry, start));
 > +
 > +    wmb();
 > +
 > +    writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);

Should this be STATUS_HW here?  It seems we exit the function with the
MPT marked invalid.
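
I would have expected

        writeb(MTHCA_MPT_STATUS_HW, fmr->mpt);

to hand the entry back to the hardware.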

 > +
 > +    wmb();
 > +}
 > +
 > +static int mthca_arbel_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +              u64 *page_list, int list_len, u64 iova, u32 key)
 > +{
 > +    void* mpt;

Write this as "void *mpt" instead please.

 > +    struct mthca_mpt_entry *mpt_entry;
 > +    u8 *mpt_status;
 > +    int i;
 > +
 > +    mpt = mthca_table_find(dev, dev->mr_table.mpt_table, key);
 > +    if (!mpt)
 > +            return -EINVAL;
 > +
 > +    mpt_status = mpt;
 > +    *mpt_status = MTHCA_MPT_STATUS_SW;
 > +
 > +    wmb();
 > +
 > +    /* This is really dumb. We are rescanning the ICM on
 > +     * each mpt entry. We want some kind of iterator here.
 > +     * May be fine meanwhile, while things are small. */
 > +    for (i = 0; i < list_len; ++i) {
 > +            u64 *mtt_entry = mthca_table_find(dev, dev->mr_table.mtt_table,
 > +                                              fmr->first_seg + i);
 > +            if (!mtt_entry)
 > +                    return -EINVAL;
 > +
 > +            *mtt_entry = cpu_to_be64(page_list[i] | MTHCA_MTT_FLAG_PRESENT);
 > +    }
 > +
 > +
 > +    mpt_entry = mpt;
 > +    mpt_entry->lkey = mpt_entry->key = cpu_to_be32(key);
 > +    mpt_entry->length = cpu_to_be64(((u64)list_len) *
 > +                                    (1 << fmr->attr.page_size));
 > +    mpt_entry->start = cpu_to_be64(iova);
 > +
 > +    wmb();
 > +
 > +    *mpt_status = MTHCA_MPT_STATUS_HW;
 > +
 > +    wmb();
 > +    return 0;
 > +}
 > +
 > +
 > +int mthca_fmr_map(struct mthca_dev *dev, struct mthca_fmr *fmr,
 > +              u64 *page_list, int list_len, u64 iova)
 > +{
 > +    u32 key;
 > +
 > +    if (fmr->maps >= fmr->attr.max_maps) {
 > +            mthca_warn(dev, "Attempt to map fmr %d times, max_maps is %d\n",
 > +                       fmr->maps, fmr->attr.max_maps);
 > +            return -EINVAL;
 > +    }
 > +
 > +    key = key_to_hw_index(dev, fmr->ibmr.lkey) + dev->limits.num_mpts;
 > +    fmr->ibmr.lkey = fmr->ibmr.rkey = hw_index_to_key(dev, key);
 > +    fmr->maps++;

Maybe put this common part in a static function and have
mthca_{arbel,tavor}_fmr_map() call it.  Then we could just set the
map_phys_fmr method in the device struct to the right function at
initialization time.
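
Something like (sketch, name invented):

        static int mthca_fmr_prep(struct mthca_dev *dev,
                                  struct mthca_fmr *fmr, u32 *key)
        {
                if (fmr->maps >= fmr->attr.max_maps)
                        return -EINVAL;

                *key = key_to_hw_index(dev, fmr->ibmr.lkey) +
                        dev->limits.num_mpts;
                fmr->ibmr.lkey = fmr->ibmr.rkey = hw_index_to_key(dev, *key);
                ++fmr->maps;

                return 0;
        }

Then mthca_register_device() sets dev->ib_dev.map_phys_fmr to the Tavor
or Arbel version directly and the hca_type test disappears from the
fast path.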

 > +
 > +    if (dev->hca_type == ARBEL_NATIVE) {
 > +            return mthca_arbel_fmr_map(dev, fmr, page_list, list_len,
 > +                                       iova, key);
 > +    } else {
 > +            mthca_tavor_fmr_map(dev, fmr, page_list, list_len,
 > +                                       iova, key);
 > +            return 0;
 > +    }
 > +}
 > +
 > +void mthca_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
 > +{
 > +    if (!fmr->maps) {
 > +            return;
 > +    }

Nitpick -- no { } needed here.

 > +
 > +    fmr->maps = 0;
 > +
 > +    if (dev->hca_type == ARBEL_NATIVE) {
 > +            u32 key = key_to_hw_index(dev, fmr->ibmr.lkey);
 > +            u8 *mpt_status = mthca_table_find(dev, dev->mr_table.mpt_table,
 > +                                              key);
 > +            if (!mpt_status)
 > +                    return;
 > +
 > +            *mpt_status = MTHCA_MPT_STATUS_SW;
 > +            wmb();
 > +    } else {
 > +            writeb(MTHCA_MPT_STATUS_SW,fmr->mpt);
 > +            wmb();
 > +    }
 >  }
 >  
 > +
 >  int __devinit mthca_init_mr_table(struct mthca_dev *dev)
 >  {
 >      int err;
 > Index: hw/mthca/mthca_memfree.c
 > ===================================================================
 > --- hw/mthca/mthca_memfree.c (revision 2012)
 > +++ hw/mthca/mthca_memfree.c (working copy)
 > @@ -192,6 +192,47 @@ void mthca_table_put(struct mthca_dev *d
 >      up(&table->mutex);
 >  }
 >  
 > +/* Nonblocking. Callers must make sure the object exists by serializing against
 > + * callers of get/put. */
 > +void *mthca_table_find(struct mthca_dev *dev, struct mthca_icm_table *table,
 > +                   int obj)
 > +{
 > +    int idx, offset, i;
 > +    struct mthca_icm_chunk *chunk;
 > +    struct mthca_icm *icm;
 > +    struct page *page = NULL;
 > +
 > +    /* Supported only for low mem tables for now. */
 > +    if (!table->lowmem)
 > +            return NULL;
 > +
 > +            idx = (obj & (table->num_obj - 1)) * table->obj_size;
 > +            icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
 > +    offset = idx % MTHCA_TABLE_CHUNK_SIZE;

What happened to the indentation here?

 > +
 > +    if(!icm)
 > +            return NULL;
 > +
 > +    /* Linear scan of ICM on each access. Since this is called on fmr
 > +     * registration which is on data path, eventually we may want to
 > +     * rearrange things to use some kind of tree. */
 > +
 > +    list_for_each_entry(chunk, &icm->chunk_list, list) {
 > +            for (i = 0; i < chunk->npages; ++i) {
 > +                    if (chunk->mem[i].length >= offset) {
 > +                            page = chunk->mem[i].page;
 > +                            break;
 > +                    }
 > +                    offset -= chunk->mem[i].length;
 > +            }
 > +    }
 > +
 > +    if (!page)
 > +            return NULL;
 > +
 > +    return lowmem_page_address(page) + offset;
 > +}
 > +
 >  int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
 >                        int start, int end)
 >  {