Re: [Intel-gfx] [PATCH 12/26] drm/i915/guc: Implement multi-lrc submission

2021-10-13 Thread Matthew Brost
On Fri, Oct 08, 2021 at 10:20:24AM -0700, John Harrison wrote:
> On 10/4/2021 15:06, Matthew Brost wrote:
> > Implement multi-lrc submission via a single workqueue entry and single
> > H2G. The workqueue entry contains an updated tail value for each
> > request, of all the contexts in the multi-lrc submission, and updates
> > these values simultaneously. As such, the tasklet and bypass path have
> > been updated to coalesce requests into a single submission.
> > 
> > v2:
> >   (John Harrison)
> >- s/wqe/wqi
> >- Use FIELD_PREP macros
> >- Add GEM_BUG_ONs ensures length fits within field
> >- Add comment / white space to intel_guc_write_barrier
> >   (Kernel test robot)
> >- Make need_tasklet a static function
> > 
> > Signed-off-by: Matthew Brost 
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.c|  26 ++
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h|   8 +
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  24 +-
> >   drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 --
> >   drivers/gpu/drm/i915/i915_request.h   |   8 +
> >   6 files changed, 335 insertions(+), 73 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > index 8f8182bf7c11..7191e8439290 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
> > @@ -756,3 +756,29 @@ void intel_guc_load_status(struct intel_guc *guc, 
> > struct drm_printer *p)
> > }
> > }
> >   }
> > +
> > +void intel_guc_write_barrier(struct intel_guc *guc)
> > +{
> > +   struct intel_gt *gt = guc_to_gt(guc);
> > +
> > +   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > +   /*
> > +* Ensure intel_uncore_write_fw can be used rather than
> > +* intel_uncore_write.
> > +*/
> > +   GEM_BUG_ON(guc->send_regs.fw_domains);
> > +
> > +   /*
> > +* This register is used by the i915 and GuC for MMIO based
> > +* communication. Once we are in this code CTBs are the only
> > +* method the i915 uses to communicate with the GuC so it is
> > +* safe to write to this register (a value of 0 is NOP for MMIO
> > +* communication). If we ever start mixing CTBs and MMIOs a new
> > +* register will have to be chosen.
> > +*/
> Hmm, missed it before but this comment is very CTB centric and the barrier
> function is now being used for parallel submission work queues. Seems like
> an extra comment should be added to cover that case. Just something simple
> about WQ usage is also guaranteed to be post CTB switch over.
> 

Sure.

> > +   intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
> > +   } else {
> > +   /* wmb() sufficient for a barrier if in smem */
> > +   wmb();
> > +   }
> > +}
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index a9f4ec972bfb..147f39cc0f2f 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -46,6 +46,12 @@ struct intel_guc {
> >  * submitted until the stalled request is processed.
> >  */
> > struct i915_request *stalled_request;
> > +   enum {
> > +   STALL_NONE,
> > +   STALL_REGISTER_CONTEXT,
> > +   STALL_MOVE_LRC_TAIL,
> > +   STALL_ADD_REQUEST,
> > +   } submission_stall_reason;
> > /* intel_guc_recv interrupt related state */
> > /** @irq_lock: protects GuC irq state */
> > @@ -361,4 +367,6 @@ void intel_guc_submission_cancel_requests(struct 
> > intel_guc *guc);
> >   void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
> > +void intel_guc_write_barrier(struct intel_guc *guc);
> > +
> >   #endif
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
> > b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > index 20c710a74498..10d1878d2826 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
> > @@ -377,28 +377,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
> > return ++ct->requests.last_fence;
> >   }
> > -static void write_barrier(struct intel_guc_ct *ct)
> > -{
> > -   struct intel_guc *guc = ct_to_guc(ct);
> > -   struct intel_gt *gt = guc_to_gt(guc);
> > -
> > -   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
> > -   GEM_BUG_ON(guc->send_regs.fw_domains);
> > -   /*
> > -* This register is used by the i915 and GuC for MMIO based
> > -* communication. Once we are in this code CTBs are the only
> > -* method the i915 uses to communicate with the GuC so it is
> > -* safe to write to this register (a value of 0 is NOP for MMIO
> > -* communication). If we ever start mixing CTBs and MMIOs 

Re: [Intel-gfx] [PATCH 12/26] drm/i915/guc: Implement multi-lrc submission

2021-10-08 Thread John Harrison

On 10/4/2021 15:06, Matthew Brost wrote:

Implement multi-lrc submission via a single workqueue entry and single
H2G. The workqueue entry contains an updated tail value for each
request, of all the contexts in the multi-lrc submission, and updates
these values simultaneously. As such, the tasklet and bypass path have
been updated to coalesce requests into a single submission.

v2:
  (John Harrison)
   - s/wqe/wqi
   - Use FIELD_PREP macros
   - Add GEM_BUG_ONs ensures length fits within field
   - Add comment / white space to intel_guc_write_barrier
  (Kernel test robot)
   - Make need_tasklet a static function

Signed-off-by: Matthew Brost 
---
  drivers/gpu/drm/i915/gt/uc/intel_guc.c|  26 ++
  drivers/gpu/drm/i915/gt/uc/intel_guc.h|   8 +
  drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  24 +-
  drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
  .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 --
  drivers/gpu/drm/i915/i915_request.h   |   8 +
  6 files changed, 335 insertions(+), 73 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 8f8182bf7c11..7191e8439290 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -756,3 +756,29 @@ void intel_guc_load_status(struct intel_guc *guc, struct 
drm_printer *p)
}
}
  }
+
+void intel_guc_write_barrier(struct intel_guc *guc)
+{
+   struct intel_gt *gt = guc_to_gt(guc);
+
+   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
+   /*
+* Ensure intel_uncore_write_fw can be used rather than
+* intel_uncore_write.
+*/
+   GEM_BUG_ON(guc->send_regs.fw_domains);
+
+   /*
+* This register is used by the i915 and GuC for MMIO based
+* communication. Once we are in this code CTBs are the only
+* method the i915 uses to communicate with the GuC so it is
+* safe to write to this register (a value of 0 is NOP for MMIO
+* communication). If we ever start mixing CTBs and MMIOs a new
+* register will have to be chosen.
+*/
Hmm, missed it before but this comment is very CTB centric and the 
barrier function is now being used for parallel submission work queues. 
Seems like an extra comment should be added to cover that case. Just 
something simple about WQ usage is also guaranteed to be post CTB switch 
over.



+   intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
+   } else {
+   /* wmb() sufficient for a barrier if in smem */
+   wmb();
+   }
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index a9f4ec972bfb..147f39cc0f2f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -46,6 +46,12 @@ struct intel_guc {
 * submitted until the stalled request is processed.
 */
struct i915_request *stalled_request;
+   enum {
+   STALL_NONE,
+   STALL_REGISTER_CONTEXT,
+   STALL_MOVE_LRC_TAIL,
+   STALL_ADD_REQUEST,
+   } submission_stall_reason;
  
  	/* intel_guc_recv interrupt related state */

/** @irq_lock: protects GuC irq state */
@@ -361,4 +367,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc 
*guc);
  
  void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
  
+void intel_guc_write_barrier(struct intel_guc *guc);

+
  #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 20c710a74498..10d1878d2826 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -377,28 +377,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
return ++ct->requests.last_fence;
  }
  
-static void write_barrier(struct intel_guc_ct *ct)

-{
-   struct intel_guc *guc = ct_to_guc(ct);
-   struct intel_gt *gt = guc_to_gt(guc);
-
-   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
-   GEM_BUG_ON(guc->send_regs.fw_domains);
-   /*
-* This register is used by the i915 and GuC for MMIO based
-* communication. Once we are in this code CTBs are the only
-* method the i915 uses to communicate with the GuC so it is
-* safe to write to this register (a value of 0 is NOP for MMIO
-* communication). If we ever start mixing CTBs and MMIOs a new
-* register will have to be chosen.
-*/
-   intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
-   } else {
-   /* wmb() sufficient for a barrier if in smem */
-   wmb();
-   }
-}
-
  static int ct_write(struct 

Re: [Intel-gfx] [PATCH 12/26] drm/i915/guc: Implement multi-lrc submission

2021-10-05 Thread kernel test robot
Hi Matthew,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next 
tegra-drm/drm/tegra/for-next linus/master airlied/drm-next v5.15-rc3 
next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: x86_64-randconfig-a003-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 
c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
git checkout 3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 
ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:7: error: variable 
>> 'ret' is used uninitialized whenever 'if' condition is false 
>> [-Werror,-Wsometimes-uninitialized]
   if (multi_lrc_submit(rq)) {
   ^~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1499:9: note: 
uninitialized use occurs here
   return ret;
  ^~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:3: note: remove the 
'if' if its condition is always true
   if (multi_lrc_submit(rq)) {
   ^~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1479:9: note: initialize 
the variable 'ret' to silence this warning
   int ret;
  ^
   = 0
   1 error generated.


vim +1486 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

  1475  
  1476  static int guc_bypass_tasklet_submit(struct intel_guc *guc,
  1477   struct i915_request *rq)
  1478  {
  1479  int ret;
  1480  
  1481  __i915_request_submit(rq);
  1482  
  1483  trace_i915_request_in(rq, 0);
  1484  
  1485  if (is_multi_lrc_rq(rq)) {
> 1486  if (multi_lrc_submit(rq)) {
  1487  ret = guc_wq_item_append(guc, rq);
  1488  if (!ret)
  1489  ret = guc_add_request(guc, rq);
  1490  }
  1491  } else {
  1492  guc_set_lrc_tail(rq);
  1493  ret = guc_add_request(guc, rq);
  1494  }
  1495  
  1496  if (unlikely(ret == -EPIPE))
  1497  disable_submission(guc);
  1498  
  1499  return ret;
  1500  }
  1501  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [Intel-gfx] [PATCH 12/26] drm/i915/guc: Implement multi-lrc submission

2021-10-05 Thread kernel test robot
Hi Matthew,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-tip/drm-tip]
[cannot apply to drm-intel/for-linux-next drm-exynos/exynos-drm-next 
tegra-drm/drm/tegra/for-next linus/master airlied/drm-next v5.15-rc3 
next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
base:   git://anongit.freedesktop.org/drm/drm-tip drm-tip
config: i386-randconfig-a004-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 
c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Matthew-Brost/Parallel-submission-aka-multi-bb-execbuf/20211005-061424
git checkout 3bea3cc438df1105d0d8c1bcc01b96559d4bb78c
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 
ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:7: warning: variable 
>> 'ret' is used uninitialized whenever 'if' condition is false 
>> [-Wsometimes-uninitialized]
   if (multi_lrc_submit(rq)) {
   ^~~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1499:9: note: 
uninitialized use occurs here
   return ret;
  ^~~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1486:3: note: remove the 
'if' if its condition is always true
   if (multi_lrc_submit(rq)) {
   ^~
   drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c:1479:9: note: initialize 
the variable 'ret' to silence this warning
   int ret;
  ^
   = 0
   1 warning generated.


vim +1486 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c

  1475  
  1476  static int guc_bypass_tasklet_submit(struct intel_guc *guc,
  1477   struct i915_request *rq)
  1478  {
  1479  int ret;
  1480  
  1481  __i915_request_submit(rq);
  1482  
  1483  trace_i915_request_in(rq, 0);
  1484  
  1485  if (is_multi_lrc_rq(rq)) {
> 1486  if (multi_lrc_submit(rq)) {
  1487  ret = guc_wq_item_append(guc, rq);
  1488  if (!ret)
  1489  ret = guc_add_request(guc, rq);
  1490  }
  1491  } else {
  1492  guc_set_lrc_tail(rq);
  1493  ret = guc_add_request(guc, rq);
  1494  }
  1495  
  1496  if (unlikely(ret == -EPIPE))
  1497  disable_submission(guc);
  1498  
  1499  return ret;
  1500  }
  1501  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


[Intel-gfx] [PATCH 12/26] drm/i915/guc: Implement multi-lrc submission

2021-10-04 Thread Matthew Brost
Implement multi-lrc submission via a single workqueue entry and single
H2G. The workqueue entry contains an updated tail value for each
request, of all the contexts in the multi-lrc submission, and updates
these values simultaneously. As such, the tasklet and bypass path have
been updated to coalesce requests into a single submission.

v2:
 (John Harrison)
  - s/wqe/wqi
  - Use FIELD_PREP macros
  - Add GEM_BUG_ONs ensures length fits within field
  - Add comment / white space to intel_guc_write_barrier
 (Kernel test robot)
  - Make need_tasklet a static function

Signed-off-by: Matthew Brost 
---
 drivers/gpu/drm/i915/gt/uc/intel_guc.c|  26 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|   8 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c |  24 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |  23 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 319 --
 drivers/gpu/drm/i915/i915_request.h   |   8 +
 6 files changed, 335 insertions(+), 73 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
index 8f8182bf7c11..7191e8439290 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c
@@ -756,3 +756,29 @@ void intel_guc_load_status(struct intel_guc *guc, struct 
drm_printer *p)
}
}
 }
+
+void intel_guc_write_barrier(struct intel_guc *guc)
+{
+   struct intel_gt *gt = guc_to_gt(guc);
+
+   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
+   /*
+* Ensure intel_uncore_write_fw can be used rather than
+* intel_uncore_write.
+*/
+   GEM_BUG_ON(guc->send_regs.fw_domains);
+
+   /*
+* This register is used by the i915 and GuC for MMIO based
+* communication. Once we are in this code CTBs are the only
+* method the i915 uses to communicate with the GuC so it is
+* safe to write to this register (a value of 0 is NOP for MMIO
+* communication). If we ever start mixing CTBs and MMIOs a new
+* register will have to be chosen.
+*/
+   intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
+   } else {
+   /* wmb() sufficient for a barrier if in smem */
+   wmb();
+   }
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index a9f4ec972bfb..147f39cc0f2f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -46,6 +46,12 @@ struct intel_guc {
 * submitted until the stalled request is processed.
 */
struct i915_request *stalled_request;
+   enum {
+   STALL_NONE,
+   STALL_REGISTER_CONTEXT,
+   STALL_MOVE_LRC_TAIL,
+   STALL_ADD_REQUEST,
+   } submission_stall_reason;
 
/* intel_guc_recv interrupt related state */
/** @irq_lock: protects GuC irq state */
@@ -361,4 +367,6 @@ void intel_guc_submission_cancel_requests(struct intel_guc 
*guc);
 
 void intel_guc_load_status(struct intel_guc *guc, struct drm_printer *p);
 
+void intel_guc_write_barrier(struct intel_guc *guc);
+
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
index 20c710a74498..10d1878d2826 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ct.c
@@ -377,28 +377,6 @@ static u32 ct_get_next_fence(struct intel_guc_ct *ct)
return ++ct->requests.last_fence;
 }
 
-static void write_barrier(struct intel_guc_ct *ct)
-{
-   struct intel_guc *guc = ct_to_guc(ct);
-   struct intel_gt *gt = guc_to_gt(guc);
-
-   if (i915_gem_object_is_lmem(guc->ct.vma->obj)) {
-   GEM_BUG_ON(guc->send_regs.fw_domains);
-   /*
-* This register is used by the i915 and GuC for MMIO based
-* communication. Once we are in this code CTBs are the only
-* method the i915 uses to communicate with the GuC so it is
-* safe to write to this register (a value of 0 is NOP for MMIO
-* communication). If we ever start mixing CTBs and MMIOs a new
-* register will have to be chosen.
-*/
-   intel_uncore_write_fw(gt->uncore, GEN11_SOFT_SCRATCH(0), 0);
-   } else {
-   /* wmb() sufficient for a barrier if in smem */
-   wmb();
-   }
-}
-
 static int ct_write(struct intel_guc_ct *ct,
const u32 *action,
u32 len /* in dwords */,
@@ -468,7 +446,7 @@ static int ct_write(struct intel_guc_ct *ct,
 * make sure H2G buffer update and LRC tail update (if this triggering a
 * submission) are visible before updating the descriptor tail
 */
-   write_barrier(ct);
+