[PATCH v2 net-next 05/22] cxgb4: Add T5 write combining support

2013-03-14 Thread Vipul Pandya
From: Santosh Rastapur sant...@chelsio.com

This patch implements a low latency Write Combining (aka Write Coalescing) work
request path. PCIE maps User Space Doorbell BAR2 region writes to the new
interface to SGE. SGE pulls a new message from PCIE new interface and if it's a
coalesced write work request then pushes it for processing. This patch copies
coalesced work request to memory mapped BAR2 space.

Signed-off-by: Santosh Rastapur sant...@chelsio.com
Signed-off-by: Vipul Pandya vi...@chelsio.com
---
v2: Replaced #ifdef with portable interface wmb in ring_tx_db

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h  |2 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   53 ++-
 drivers/net/ethernet/chelsio/cxgb4/sge.c|   52 +-
 3 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index a91dea6..f8ff30e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -439,6 +439,7 @@ struct sge_txq {
spinlock_t db_lock;
int db_disabled;
unsigned short db_pidx;
+   u64 udb;
 };
 
 struct sge_eth_txq {/* state for an SGE Ethernet Tx queue */
@@ -543,6 +544,7 @@ enum chip_type {
 
 struct adapter {
void __iomem *regs;
+   void __iomem *bar2;
struct pci_dev *pdev;
struct device *pdev_dev;
unsigned int mbox;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 3d6d23a..ce1451c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -1327,6 +1327,8 @@ static char stats_strings[][ETH_GSTRING_LEN] = {
 	"VLANinsertions     ",
 	"GROpackets         ",
 	"GROmerged          ",
+	"WriteCoalSuccess   ",
+	"WriteCoalFail      ",
 };
 
 static int get_sset_count(struct net_device *dev, int sset)
@@ -1422,11 +1424,25 @@ static void get_stats(struct net_device *dev, struct 
ethtool_stats *stats,
 {
struct port_info *pi = netdev_priv(dev);
 	struct adapter *adapter = pi->adapter;
+   u32 val1, val2;
 
 	t4_get_port_stats(adapter, pi->tx_chan, (struct port_stats *)data);
 
data += sizeof(struct port_stats) / sizeof(u64);
collect_sge_port_stats(adapter, pi, (struct queue_port_stats *)data);
+   data += sizeof(struct queue_port_stats) / sizeof(u64);
+	if (!is_t4(adapter->chip)) {
+   t4_write_reg(adapter, SGE_STAT_CFG, STATSOURCE_T5(7));
+   val1 = t4_read_reg(adapter, SGE_STAT_TOTAL);
+   val2 = t4_read_reg(adapter, SGE_STAT_MATCH);
+   *data = val1 - val2;
+   data++;
+   *data = val2;
+   data++;
+   } else {
+   memset(data, 0, 2 * sizeof(u64));
+   *data += 2;
+   }
 }
 
 /*
@@ -5337,10 +5353,11 @@ static void free_some_resources(struct adapter *adapter)
 #define TSO_FLAGS (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_TSO_ECN)
 #define VLAN_FEAT (NETIF_F_SG | NETIF_F_IP_CSUM | TSO_FLAGS | \
   NETIF_F_IPV6_CSUM | NETIF_F_HIGHDMA)
+#define SEGMENT_SIZE 128
 
 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
-   int func, i, err;
+   int func, i, err, s_qpp, qpp, num_seg;
struct port_info *pi;
bool highdma = false;
struct adapter *adapter = NULL;
@@ -5420,7 +5437,34 @@ static int init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
 
err = t4_prep_adapter(adapter);
if (err)
-   goto out_unmap_bar;
+   goto out_unmap_bar0;
+
+   if (!is_t4(adapter-chip)) {
+		s_qpp = QUEUESPERPAGEPF1 * adapter->fn;
+		qpp = 1 << QUEUESPERPAGEPF0_GET(t4_read_reg(adapter,
+		      SGE_EGRESS_QUEUES_PER_PAGE_PF) >> s_qpp);
+   num_seg = PAGE_SIZE / SEGMENT_SIZE;
+
+   /* Each segment size is 128B. Write coalescing is enabled only
+* when SGE_EGRESS_QUEUES_PER_PAGE_PF reg value for the
+* queue is less no of segments that can be accommodated in
+* a page size.
+*/
+		if (qpp > num_seg) {
+			dev_err(&pdev->dev,
+				"Incorrect number of egress queues per page\n");
+   err = -EINVAL;
+   goto out_unmap_bar0;
+   }
+		adapter->bar2 = ioremap_wc(pci_resource_start(pdev, 2),
+					   pci_resource_len(pdev, 2));
+		if (!adapter->bar2) {
+			dev_err(&pdev->dev, "cannot map device bar2 region\n");
+   err = -ENOMEM;
+   goto out_unmap_bar0;
+   }
+   }
+
setup_memwin(adapter);
err = adap_init0(adapter);

RE: [PATCH v2 net-next 05/22] cxgb4: Add T5 write combining support

2013-03-14 Thread David Laight
 This patch implements a low latency Write Combining (aka Write Coalescing) 
 work
 request path. PCIE maps User Space Doorbell BAR2 region writes to the new
 interface to SGE. SGE pulls a new message from PCIE new interface and if its a
 coalesced write work request then pushes it for processing. This patch copies
 coalesced work request to memory mapped BAR2 space.
...
 + } else {
 + memset(data, 0, 2 * sizeof(u64));
 + *data += 2;
 + }

Using memset is overkill (or rather a big overhead if it isn't
detected by the compiler). Nothing wrong with:
(*data)[0] = 0;
(*data)[1] = 0;
*data += 2;
Actually, typing that, I realise that you should probably have
read *data into a local variable, and then updated it when finished.
Otherwise some of the accesses might be ones which force the
compiler to reload the value.

  static inline void ring_tx_db(struct adapter *adap, struct sge_txq *q, int n)
  {
 + unsigned int *wr, index;
 +
   wmb();/* write descriptors before telling HW */
   spin_lock(&q->db_lock);
   if (!q->db_disabled) {
 - t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
 -  QID(q->cntxt_id) | PIDX(n));
 + if (is_t4(adap->chip)) {
 + t4_write_reg(adap, MYPF_REG(SGE_PF_KDOORBELL),
 +  QID(q->cntxt_id) | PIDX(n));
 + } else {
 + if (n == 1) {
 + index = q->pidx ? (q->pidx - 1) : (q->size - 1);
 + wr = (unsigned int *)&q->desc[index];
 + cxgb_pio_copy((u64 __iomem *)
 +   (adap->bar2 + q->udb + 64),
 +   (u64 *)wr);

Why all the casts on 'wr' ?

 + } else
 + writel(n, adap->bar2 + q->udb + 8);
 + wmb();

Since you actually need memory barriers here on x86 you definitely
need a comment saying so, and it would (IMHO) better to use a
different define in the source (even if it is currently converted
to wmb() in a local header file).

Thinking further, for portability you might want to find some way
of abstracting the multi-word writes somehow.
For example, some of the ppc have a dma engine associated with the
PCIe master interface that can be used to generate large TLP.
The code would still want to spin waiting for the dma to complete
(since the transfer would be far faster than any interrupt path).

David



--
To unsubscribe from this list: send the line unsubscribe linux-scsi in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html