[PATCH] iscsi: respond to netlink with unicast when appropriate

2018-04-09 Thread Chris Leech
Instead of always multicasting responses, send a unicast netlink message
directed at the correct pid.  This will be needed if we ever want to
support multiple userspace processes interacting with the kernel over
iSCSI netlink simultaneously.  Limitations can currently be seen if you
attempt to run multiple iscsistart commands in parallel.

We've fixed up the userspace issues in iscsistart that prevented
multiple instances from running, so now attempts to speed up booting by
bringing up multiple iscsi sessions at once in the initramfs are just
running into misrouted responses that this fixes.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 29 ++---
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index f4b52b44b966..65f6c94f2e9b 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2322,6 +2322,12 @@ iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, 
gfp_t gfp)
return nlmsg_multicast(nls, skb, 0, group, gfp);
 }
 
+static int
+iscsi_unicast_skb(struct sk_buff *skb, u32 portid)
+{
+   return nlmsg_unicast(nls, skb, portid);
+}
+
 int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr,
   char *data, uint32_t data_size)
 {
@@ -2524,14 +2530,11 @@ void iscsi_ping_comp_event(uint32_t host_no, struct 
iscsi_transport *transport,
 EXPORT_SYMBOL_GPL(iscsi_ping_comp_event);
 
 static int
-iscsi_if_send_reply(uint32_t group, int seq, int type, int done, int multi,
-   void *payload, int size)
+iscsi_if_send_reply(u32 portid, int type, void *payload, int size)
 {
struct sk_buff  *skb;
struct nlmsghdr *nlh;
int len = nlmsg_total_size(size);
-   int flags = multi ? NLM_F_MULTI : 0;
-   int t = done ? NLMSG_DONE : type;
 
skb = alloc_skb(len, GFP_ATOMIC);
if (!skb) {
@@ -2539,10 +2542,9 @@ iscsi_if_send_reply(uint32_t group, int seq, int type, 
int done, int multi,
return -ENOMEM;
}
 
-   nlh = __nlmsg_put(skb, 0, 0, t, (len - sizeof(*nlh)), 0);
-   nlh->nlmsg_flags = flags;
+   nlh = __nlmsg_put(skb, 0, 0, type, (len - sizeof(*nlh)), 0);
memcpy(nlmsg_data(nlh), payload, size);
-   return iscsi_multicast_skb(skb, group, GFP_ATOMIC);
+   return iscsi_unicast_skb(skb, portid);
 }
 
 static int
@@ -3470,6 +3472,7 @@ static int
 iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group)
 {
int err = 0;
+   u32 portid;
struct iscsi_uevent *ev = nlmsg_data(nlh);
struct iscsi_transport *transport = NULL;
struct iscsi_internal *priv;
@@ -3490,10 +3493,12 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr 
*nlh, uint32_t *group)
if (!try_module_get(transport->owner))
return -EINVAL;
 
+   portid = NETLINK_CB(skb).portid;
+
switch (nlh->nlmsg_type) {
case ISCSI_UEVENT_CREATE_SESSION:
err = iscsi_if_create_session(priv, ep, ev,
- NETLINK_CB(skb).portid,
+ portid,
  ev->u.c_session.initial_cmdsn,
  ev->u.c_session.cmds_max,
  ev->u.c_session.queue_depth);
@@ -3506,7 +3511,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr 
*nlh, uint32_t *group)
}
 
err = iscsi_if_create_session(priv, ep, ev,
-   NETLINK_CB(skb).portid,
+   portid,
ev->u.c_bound_session.initial_cmdsn,
ev->u.c_bound_session.cmds_max,
ev->u.c_bound_session.queue_depth);
@@ -3664,6 +3669,8 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr 
*nlh, uint32_t *group)
 static void
 iscsi_if_rx(struct sk_buff *skb)
 {
+   u32 portid = NETLINK_CB(skb).portid;
+
	mutex_lock(&rx_queue_mutex);
while (skb->len >= NLMSG_HDRLEN) {
int err;
@@ -3699,8 +3706,8 @@ iscsi_if_rx(struct sk_buff *skb)
break;
if (ev->type == ISCSI_UEVENT_GET_CHAP && !err)
break;
-   err = iscsi_if_send_reply(group, nlh->nlmsg_seq,
-   nlh->nlmsg_type, 0, 0, ev, sizeof(*ev));
+   err = iscsi_if_send_reply(portid, nlh->nlmsg_type,
+ ev, sizeof(*ev));
} while (err < 0 && err != -ECONNREFUSED && err != -ESRCH);
skb_pull(skb, rlen);
}
-- 
2.14.3



Re: [PATCH 0/9] use network namespace for iSCSI control interfaces

2017-11-21 Thread Chris Leech
On Tue, Nov 21, 2017 at 11:26:09AM +, David Laight wrote:
> From: Chris Leech
> > Sent: 15 November 2017 00:25
> > To: David Laight
> > Cc: netdev@vger.kernel.org; contain...@lists.linux-foundation.org
> > Subject: Re: [PATCH 0/9] use network namespace for iSCSI control interfaces
> > 
> > On Wed, Nov 08, 2017 at 10:31:04AM +, David Laight wrote:
> > > From: Chris Leech
> > > > Sent: 07 November 2017 22:45
> > > >
> > > > I've posted these changes to allow iSCSI management within a container
> > > > using a network namespace to the SCSI and Open-iSCSI lists, but seeing
> > > > as it's not really SCSI/block related I'm casting a wider net looking
> > > > for reviews.
> > >
> > > I didn't spot you acquiring and releasing references to the namespace.
> > > (I might have missed it, the relevant patch is difficult to read).
> > >
> > > If the sockets are created in the context of the process whose namespace
> > > you are using you don't need it, but given the hooks and callbacks
> > > I'm not at all sure that is obviously true.
> > 
> > Thanks David,
> > 
> > Looking at it again, you're right and I think I need to hold a reference
> > for the iSCSI host and handle namespace deletion.  Even for iscsi_tcp
> > the socket gets handed off from the creating process to the transport
> > and can outlive iscsid.
> 
> It isn't that simple
> IIRC:
> 
> The namespace delete callback isn't made until the reference count is zero.
> Sockets created with sock_create_kern() don't hold a reference to the
> namespace
> 
> This is all fine for sockets used for admin purposes, but rather hopeless
> if you really need the namespace to continue to exist while the connections
> are open - if only for long enough to close the connection.

Yeah, I'm catching up on a lot of the details as I attempt to sort out
what a sane behavior for iscsi_tcp should be here.

With these patches as is, iscsi_tcp will hold a reference to a TCP
socket created by iscsid and keep the net namespace from exiting.
That's good for keeping iSCSI sessions alive.  Bad in that all processes
attached to the namespace can terminate, and if filesystem references
(like bind mounts from ip-netns) are unlinked then I don't see any way
to get back into the namespace to shut down iSCSI.

I've been trying to sort out a way to shut down and clean up in that
case, but the other approach might be to look at having a kernel thread
to reference the namespace so that the ns inode could be recovered from
/proc?
 
> To make matters even more annoying the functions for holding and
> releasing a namespace are GPL_ONLY :-(

I have no problem with that.

Thanks,
Chris Leech



Re: [PATCH 0/9] use network namespace for iSCSI control interfaces

2017-11-14 Thread Chris Leech
On Wed, Nov 08, 2017 at 10:31:04AM +, David Laight wrote:
> From: Chris Leech
> > Sent: 07 November 2017 22:45
> > 
> > I've posted these changes to allow iSCSI management within a container
> > using a network namespace to the SCSI and Open-iSCSI lists, but seeing
> > as it's not really SCSI/block related I'm casting a wider net looking
> > for reviews.
> 
> I didn't spot you acquiring and releasing references to the namespace.
> (I might have missed it, the relevant patch is difficult to read).
> 
> If the sockets are created in the context of the process whose namespace
> you are using you don't need it, but given the hooks and callbacks
> I'm not at all sure that is obviously true.

Thanks David,

Looking at it again, you're right and I think I need to hold a reference
for the iSCSI host and handle namespace deletion.  Even for iscsi_tcp
the socket gets handed off from the creating process to the transport
and can outlive iscsid.

I'm looking at migration or destruction now rather than later.

Chris



[PATCH 4/9] iscsi: make all iSCSI netlink multicast namespace aware

2017-11-07 Thread Chris Leech
Make use of the per-net netlink sockets. Responses are sent back on the
same socket/namespace the request was received on.  Async events are
reported on the socket/namespace stored in the iscsi_cls_host associated
with the event.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 92 -
 1 file changed, 61 insertions(+), 31 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index d29c095ccc7d..1fc5878b1a8c 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -2418,8 +2418,8 @@ iscsi_if_transport_lookup(struct iscsi_transport *tt)
 }
 
 static int
-iscsi_multicast_netns(struct net *net, struct sk_buff *skb,
- uint32_t group, gfp_t gfp)
+iscsi_multicast_skb(struct net *net, struct sk_buff *skb,
+   uint32_t group, gfp_t gfp)
 {
struct sock *nls;
struct iscsi_net *isn;
@@ -2429,12 +2429,6 @@ iscsi_multicast_netns(struct net *net, struct sk_buff 
*skb,
return nlmsg_multicast(nls, skb, 0, group, gfp);
 }
 
-static int
-iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp)
-{
-   return iscsi_multicast_netns(&init_net, skb, group, gfp);
-}
-
 int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr,
   char *data, uint32_t data_size)
 {
@@ -2443,6 +2437,7 @@ int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct 
iscsi_hdr *hdr,
struct iscsi_uevent *ev;
char *pdu;
struct iscsi_internal *priv;
+   struct net *net;
int len = nlmsg_total_size(sizeof(*ev) + sizeof(struct iscsi_hdr) +
   data_size);
 
@@ -2469,7 +2464,8 @@ int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct 
iscsi_hdr *hdr,
memcpy(pdu, hdr, sizeof(struct iscsi_hdr));
memcpy(pdu + sizeof(struct iscsi_hdr), data, data_size);
 
-   return iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
+   net = iscsi_conn_net(conn);
+   return iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
 }
 EXPORT_SYMBOL_GPL(iscsi_recv_pdu);
 
@@ -2480,6 +2476,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
struct nlmsghdr *nlh;
struct sk_buff *skb;
struct iscsi_uevent *ev;
+   struct net *net;
int len = nlmsg_total_size(sizeof(*ev) + data_size);
 
skb = alloc_skb(len, GFP_ATOMIC);
@@ -2504,7 +2501,8 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
 
memcpy((char *)ev + sizeof(*ev), data, data_size);
 
-   return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC);
+   net = iscsi_host_net(shost->shost_data);
+   return iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC);
 }
 EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
 
@@ -2514,6 +2512,7 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, 
enum iscsi_err error)
struct sk_buff  *skb;
struct iscsi_uevent *ev;
struct iscsi_internal *priv;
+   struct net *net;
int len = nlmsg_total_size(sizeof(*ev));
 
priv = iscsi_if_transport_lookup(conn->transport);
@@ -2535,7 +2534,8 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, 
enum iscsi_err error)
ev->r.connerror.cid = conn->cid;
ev->r.connerror.sid = iscsi_conn_get_sid(conn);
 
-   iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
+   net = iscsi_conn_net(conn);
+   iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
 
iscsi_cls_conn_printk(KERN_INFO, conn, "detected conn error (%d)\n",
  error);
@@ -2549,6 +2549,7 @@ void iscsi_conn_login_event(struct iscsi_cls_conn *conn,
struct sk_buff  *skb;
struct iscsi_uevent *ev;
struct iscsi_internal *priv;
+   struct net *net;
int len = nlmsg_total_size(sizeof(*ev));
 
priv = iscsi_if_transport_lookup(conn->transport);
@@ -2569,7 +2570,9 @@ void iscsi_conn_login_event(struct iscsi_cls_conn *conn,
ev->r.conn_login.state = state;
ev->r.conn_login.cid = conn->cid;
ev->r.conn_login.sid = iscsi_conn_get_sid(conn);
-   iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
+
+   net = iscsi_conn_net(conn);
+   iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC);
 
iscsi_cls_conn_printk(KERN_INFO, conn, "detected conn login (%d)\n",
  state);
@@ -2580,11 +2583,17 @@ void iscsi_post_host_event(uint32_t host_no, struct 
iscsi_transport *transport,
   enum iscsi_host_event_code code, uint32_t data_size,
   uint8_t *data)
 {
+   struct Scsi_Host *shost;
+   struct net *net;
struct nlmsghdr *nlh;
struct sk_buff *skb;
struct iscsi_uevent *ev;
int len = nlmsg_total_size(sizeof(*ev) 

[PATCH 0/9] use network namespace for iSCSI control interfaces

2017-11-07 Thread Chris Leech
Hello,

I've posted these changes to allow iSCSI management within a container
using a network namespace to the SCSI and Open-iSCSI lists, but seeing
as it's not really SCSI/block related I'm casting a wider net looking
for reviews.

These patches apply network namespace to the iSCSI netlink family and
sysfs objects from the iSCSI transport class.

Thank you,

Chris Leech

---

This series of changes makes the iSCSI netlink and sysfs control
interfaces filtered by network namespace.  This is required to run
iscsid in any network namespace other than the initial default one.

Currently the netlink communication will fail if iscsid is started in a
non-default network namespace, as there is no kernel side socket.  After
fixing that, the rest of these changes are to filter visibility of the
iSCSI transport objects by netns.  This allows for multiple iscsid
instances to be run, one per netns, each controlling its own set of
iSCSI sessions.

The iSCSI transport objects are filtered, but not the SCSI or block
layer devices.  So while iSCSI hosts and sessions become limited to a
network namespace, any attached devices remain visible system wide.

This currently only supports iscsi_tcp running in a new namespace, as it
creates a virtual host per session.  Support could be added later to
allow assignment of iSCSI HBAs to network namespace, much as is done for
network interfaces.

Chris Leech (9):
  iscsi: create per-net iscsi netlink kernel sockets
  iscsi: associate endpoints with a host
  iscsi: sysfs filtering by network namespace
  iscsi: make all iSCSI netlink multicast namespace aware
  iscsi: set netns for iscsi_tcp hosts
  iscsi: check net namespace for all iscsi lookups
  iscsi: convert flashnode devices from bus to class
  iscsi: rename iscsi_bus_flash_* to iscsi_flash_*
  iscsi: filter flashnode sysfs by net namespace

 drivers/infiniband/ulp/iser/iscsi_iser.c |   7 +-
 drivers/scsi/be2iscsi/be_iscsi.c |   6 +-
 drivers/scsi/bnx2i/bnx2i_iscsi.c |   6 +-
 drivers/scsi/cxgbi/libcxgbi.c|   6 +-
 drivers/scsi/iscsi_tcp.c |   7 +
 drivers/scsi/qedi/qedi_iscsi.c   |   6 +-
 drivers/scsi/qla4xxx/ql4_os.c|  62 +--
 drivers/scsi/scsi_transport_iscsi.c  | 625 ++-
 include/scsi/scsi_transport_iscsi.h  |  63 ++--
 9 files changed, 538 insertions(+), 250 deletions(-)

-- 
2.9.5



[PATCH 5/9] iscsi: set netns for iscsi_tcp hosts

2017-11-07 Thread Chris Leech
This lets iscsi_tcp operate in multiple namespaces.  It uses current
during session creation to find the net namespace, but it might be
better to manage to pass it along from the iscsi netlink socket.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/iscsi_tcp.c| 7 +++
 drivers/scsi/scsi_transport_iscsi.c | 7 ++-
 include/scsi/scsi_transport_iscsi.h | 1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 4d934d6c3e13..b368c94c884b 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -957,6 +957,11 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device 
*sdev)
return 0;
 }
 
+static struct net *iscsi_sw_tcp_netns(struct Scsi_Host *shost)
+{
+   return current->nsproxy->net_ns;
+}
+
 static struct scsi_host_template iscsi_sw_tcp_sht = {
.module = THIS_MODULE,
.name   = "iSCSI Initiator over TCP/IP",
@@ -1013,6 +1018,8 @@ static struct iscsi_transport iscsi_sw_tcp_transport = {
.alloc_pdu  = iscsi_sw_tcp_pdu_alloc,
/* recovery */
.session_recovery_timedout = iscsi_session_recovery_timedout,
+   /* net namespace */
+   .get_netns  = iscsi_sw_tcp_netns,
 };
 
 static int __init iscsi_sw_tcp_init(void)
diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 1fc5878b1a8c..2ec10f6ac3a2 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1600,11 +1600,16 @@ static int iscsi_setup_host(struct transport_container 
*tc, struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct iscsi_cls_host *ihost = shost->shost_data;
+   struct iscsi_internal *priv = to_iscsi_internal(shost->transportt);
+   struct iscsi_transport *transport = priv->iscsi_transport;
 
memset(ihost, 0, sizeof(*ihost));
	atomic_set(&ihost->nr_scans, 0);
	mutex_init(&ihost->mutex);
-   ihost->netns = &init_net;
+   if (transport->get_netns)
+   ihost->netns = transport->get_netns(shost);
+   else
+   ihost->netns = &init_net;
 
iscsi_bsg_host_add(shost, ihost);
/* ignore any bsg add error - we just can't do sgio */
diff --git a/include/scsi/scsi_transport_iscsi.h 
b/include/scsi/scsi_transport_iscsi.h
index 8c8191dfdc21..3c4cd4779f72 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -168,6 +168,7 @@ struct iscsi_transport {
int (*logout_flashnode_sid) (struct iscsi_cls_session *cls_sess);
int (*get_host_stats) (struct Scsi_Host *shost, char *buf, int len);
u8 (*check_protection)(struct iscsi_task *task, sector_t *sector);
+   struct net *(*get_netns)(struct Scsi_Host *shost);
 };
 
 /*
-- 
2.9.5



[PATCH 3/9] iscsi: sysfs filtering by network namespace

2017-11-07 Thread Chris Leech
This makes the iscsi_host, iscsi_session, iscsi_connection, iscsi_iface,
and iscsi_endpoint transport class devices only visible in sysfs under a
matching network namespace.  The network namespace for all of these
objects is tracked in the iscsi_cls_host structure.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 128 +++-
 include/scsi/scsi_transport_iscsi.h |   1 +
 2 files changed, 112 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 6ab7ca82b121..d29c095ccc7d 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -161,9 +161,31 @@ static void iscsi_endpoint_release(struct device *dev)
kfree(ep);
 }
 
+static struct net *iscsi_host_net(struct iscsi_cls_host *ihost)
+{
+   return ihost->netns;
+}
+
+static struct net *iscsi_endpoint_net(struct iscsi_endpoint *ep)
+{
+   struct Scsi_Host *shost = iscsi_endpoint_to_shost(ep);
+   struct iscsi_cls_host *ihost = shost->shost_data;
+
+   return iscsi_host_net(ihost);
+}
+
+static const void *iscsi_endpoint_namespace(struct device *dev)
+{
+   struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev);
+
+   return iscsi_endpoint_net(ep);
+}
+
 static struct class iscsi_endpoint_class = {
.name = "iscsi_endpoint",
.dev_release = iscsi_endpoint_release,
+   .ns_type = &net_ns_type_operations,
+   .namespace = iscsi_endpoint_namespace,
 };
 
 static ssize_t
@@ -285,10 +307,26 @@ static void iscsi_iface_release(struct device *dev)
put_device(parent);
 }
 
+static struct net *iscsi_iface_net(struct iscsi_iface *iface)
+{
+   struct Scsi_Host *shost = iscsi_iface_to_shost(iface);
+   struct iscsi_cls_host *ihost = shost->shost_data;
+
+   return iscsi_host_net(ihost);
+}
+
+static const void *iscsi_iface_namespace(struct device *dev)
+{
+   struct iscsi_iface *iface = iscsi_dev_to_iface(dev);
+
+   return iscsi_iface_net(iface);
+}
 
 static struct class iscsi_iface_class = {
.name = "iscsi_iface",
.dev_release = iscsi_iface_release,
+   .ns_type = &net_ns_type_operations,
+   .namespace = iscsi_iface_namespace,
 };
 
 #define ISCSI_IFACE_ATTR(_prefix, _name, _mode, _show, _store) \
@@ -1566,6 +1604,7 @@ static int iscsi_setup_host(struct transport_container 
*tc, struct device *dev,
memset(ihost, 0, sizeof(*ihost));
	atomic_set(&ihost->nr_scans, 0);
	mutex_init(&ihost->mutex);
+   ihost->netns = &init_net;
 
iscsi_bsg_host_add(shost, ihost);
/* ignore any bsg add error - we just can't do sgio */
@@ -1586,23 +1625,78 @@ static int iscsi_remove_host(struct transport_container 
*tc,
return 0;
 }
 
-static DECLARE_TRANSPORT_CLASS(iscsi_host_class,
-  "iscsi_host",
-  iscsi_setup_host,
-  iscsi_remove_host,
-  NULL);
-
-static DECLARE_TRANSPORT_CLASS(iscsi_session_class,
-  "iscsi_session",
-  NULL,
-  NULL,
-  NULL);
-
-static DECLARE_TRANSPORT_CLASS(iscsi_connection_class,
-  "iscsi_connection",
-  NULL,
-  NULL,
-  NULL);
+#define DECLARE_TRANSPORT_CLASS_NS(cls, nm, su, rm, cfg, ns, nslookup) \
+struct transport_class cls = { \
+   .class = {  \
+   .name = nm, \
+   .ns_type = ns,  \
+   .namespace = nslookup,  \
+   },  \
+   .setup = su,\
+   .remove = rm,   \
+   .configure = cfg,   \
+}
+
+static const void *iscsi_host_namespace(struct device *dev)
+{
+   struct Scsi_Host *shost = transport_class_to_shost(dev);
+   struct iscsi_cls_host *ihost = shost->shost_data;
+
+   return iscsi_host_net(ihost);
+}
+
+static DECLARE_TRANSPORT_CLASS_NS(iscsi_host_class,
+ "iscsi_host",
+ iscsi_setup_host,
+ iscsi_remove_host,
+ NULL,
+ &net_ns_type_operations,
+ iscsi_host_namespace);
+
+static struct net *iscsi_sess_net(struct iscsi_cls_session *cls_session)
+{
+   struct Scsi_Host *sh

[PATCH 8/9] iscsi: rename iscsi_bus_flash_* to iscsi_flash_*

2017-11-07 Thread Chris Leech
cleanups after the bus to class conversion

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/qla4xxx/ql4_os.c   |  52 +-
 drivers/scsi/scsi_transport_iscsi.c | 102 ++--
 include/scsi/scsi_transport_iscsi.h |  48 +
 3 files changed, 102 insertions(+), 100 deletions(-)

diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 55a729568873..9c80688d0681 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -169,20 +169,20 @@ static int qla4xxx_host_reset(struct Scsi_Host *shost, 
int reset_type);
  * iSCSI Flash DDB sysfs entry points
  */
 static int
-qla4xxx_sysfs_ddb_set_param(struct iscsi_bus_flash_session *fnode_sess,
-   struct iscsi_bus_flash_conn *fnode_conn,
+qla4xxx_sysfs_ddb_set_param(struct iscsi_flash_session *fnode_sess,
+   struct iscsi_flash_conn *fnode_conn,
void *data, int len);
 static int
-qla4xxx_sysfs_ddb_get_param(struct iscsi_bus_flash_session *fnode_sess,
+qla4xxx_sysfs_ddb_get_param(struct iscsi_flash_session *fnode_sess,
int param, char *buf);
 static int qla4xxx_sysfs_ddb_add(struct Scsi_Host *shost, const char *buf,
 int len);
 static int
-qla4xxx_sysfs_ddb_delete(struct iscsi_bus_flash_session *fnode_sess);
-static int qla4xxx_sysfs_ddb_login(struct iscsi_bus_flash_session *fnode_sess,
-  struct iscsi_bus_flash_conn *fnode_conn);
-static int qla4xxx_sysfs_ddb_logout(struct iscsi_bus_flash_session *fnode_sess,
-   struct iscsi_bus_flash_conn *fnode_conn);
+qla4xxx_sysfs_ddb_delete(struct iscsi_flash_session *fnode_sess);
+static int qla4xxx_sysfs_ddb_login(struct iscsi_flash_session *fnode_sess,
+  struct iscsi_flash_conn *fnode_conn);
+static int qla4xxx_sysfs_ddb_logout(struct iscsi_flash_session *fnode_sess,
+   struct iscsi_flash_conn *fnode_conn);
 static int qla4xxx_sysfs_ddb_logout_sid(struct iscsi_cls_session *cls_sess);
 
 static struct qla4_8xxx_legacy_intr_set legacy_intr[] =
@@ -3454,8 +3454,8 @@ static int qla4xxx_task_xmit(struct iscsi_task *task)
return -ENOSYS;
 }
 
-static int qla4xxx_copy_from_fwddb_param(struct iscsi_bus_flash_session *sess,
-struct iscsi_bus_flash_conn *conn,
+static int qla4xxx_copy_from_fwddb_param(struct iscsi_flash_session *sess,
+struct iscsi_flash_conn *conn,
 struct dev_db_entry *fw_ddb_entry)
 {
unsigned long options = 0;
@@ -3596,8 +3596,8 @@ static int qla4xxx_copy_from_fwddb_param(struct 
iscsi_bus_flash_session *sess,
return rc;
 }
 
-static int qla4xxx_copy_to_fwddb_param(struct iscsi_bus_flash_session *sess,
-  struct iscsi_bus_flash_conn *conn,
+static int qla4xxx_copy_to_fwddb_param(struct iscsi_flash_session *sess,
+  struct iscsi_flash_conn *conn,
   struct dev_db_entry *fw_ddb_entry)
 {
uint16_t options;
@@ -7162,7 +7162,7 @@ static void qla4xxx_build_new_nt_list(struct 
scsi_qla_host *ha,
  **/
 static int qla4xxx_sysfs_ddb_is_non_persistent(struct device *dev, void *data)
 {
-   struct iscsi_bus_flash_session *fnode_sess;
+   struct iscsi_flash_session *fnode_sess;
 
if (!iscsi_is_flashnode_session_dev(dev))
return 0;
@@ -7192,8 +7192,8 @@ static int qla4xxx_sysfs_ddb_tgt_create(struct 
scsi_qla_host *ha,
struct dev_db_entry *fw_ddb_entry,
uint16_t *idx, int user)
 {
-   struct iscsi_bus_flash_session *fnode_sess = NULL;
-   struct iscsi_bus_flash_conn *fnode_conn = NULL;
+   struct iscsi_flash_session *fnode_sess = NULL;
+   struct iscsi_flash_conn *fnode_conn = NULL;
int rc = QLA_ERROR;
 
fnode_sess = iscsi_create_flashnode_sess(ha->host, *idx,
@@ -7330,8 +7330,8 @@ static int qla4xxx_sysfs_ddb_add(struct Scsi_Host *shost, 
const char *buf,
  * This writes the contents of target ddb buffer to Flash with a valid cookie
  * value in order to make the ddb entry persistent.
  **/
-static int  qla4xxx_sysfs_ddb_apply(struct iscsi_bus_flash_session *fnode_sess,
-   struct iscsi_bus_flash_conn *fnode_conn)
+static int  qla4xxx_sysfs_ddb_apply(struct iscsi_flash_session *fnode_sess,
+   struct iscsi_flash_conn *fnode_conn)
 {
struct Scsi_Host *shost = iscsi_flash_session_to_shost(fnode_sess);
struct scsi_qla_host *ha = to_qla_host(shost);
@@ -7520,8 +7520,8 @@ static int qla4xxx_ddb_login_nt(struct scsi_qla_host *ha,
  *
  * This logs in to the specifi

[PATCH 6/9] iscsi: check net namespace for all iscsi lookups

2017-11-07 Thread Chris Leech
All internal lookups of iSCSI transport objects need to be filtered by
net namespace.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c |   5 +-
 drivers/scsi/be2iscsi/be_iscsi.c |   4 +-
 drivers/scsi/bnx2i/bnx2i_iscsi.c |   4 +-
 drivers/scsi/cxgbi/libcxgbi.c|   4 +-
 drivers/scsi/qedi/qedi_iscsi.c   |   4 +-
 drivers/scsi/qla4xxx/ql4_os.c|   6 +-
 drivers/scsi/scsi_transport_iscsi.c  | 201 +++
 include/scsi/scsi_transport_iscsi.h  |   5 +-
 8 files changed, 150 insertions(+), 83 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c 
b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 0a4214be4877..6d088634a806 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -464,15 +464,18 @@ iscsi_iser_conn_bind(struct iscsi_cls_session 
*cls_session,
struct iscsi_conn *conn = cls_conn->dd_data;
struct iser_conn *iser_conn;
struct iscsi_endpoint *ep;
+   struct net *net;
int error;
 
error = iscsi_conn_bind(cls_session, cls_conn, is_leading);
if (error)
return error;
 
+
/* the transport ep handle comes from user space so it must be
 * verified against the global ib connections list */
-   ep = iscsi_lookup_endpoint(transport_eph);
+   net = iscsi_sess_net(cls_session);
+   ep = iscsi_lookup_endpoint(net, transport_eph);
if (!ep) {
iser_err("can't bind eph %llx\n",
 (unsigned long long)transport_eph);
diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c
index 33f79f385660..1f4b1b98b4e6 100644
--- a/drivers/scsi/be2iscsi/be_iscsi.c
+++ b/drivers/scsi/be2iscsi/be_iscsi.c
@@ -181,8 +181,10 @@ int beiscsi_conn_bind(struct iscsi_cls_session 
*cls_session,
struct beiscsi_endpoint *beiscsi_ep;
struct iscsi_endpoint *ep;
uint16_t cri_index;
+   struct net *net;
 
-   ep = iscsi_lookup_endpoint(transport_fd);
+   net = iscsi_sess_net(cls_session);
+   ep = iscsi_lookup_endpoint(net, transport_fd);
if (!ep)
return -EINVAL;
 
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 19fadb5d3b3c..58dca20f0ba0 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -1414,9 +1414,11 @@ static int bnx2i_conn_bind(struct iscsi_cls_session 
*cls_session,
struct bnx2i_hba *hba = iscsi_host_priv(shost);
struct bnx2i_endpoint *bnx2i_ep;
struct iscsi_endpoint *ep;
+   struct net *net;
int ret_code;
 
-   ep = iscsi_lookup_endpoint(transport_fd);
+   net = iscsi_sess_net(cls_session);
+   ep = iscsi_lookup_endpoint(net, transport_fd);
if (!ep)
return -EINVAL;
/*
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 558484f72738..e768fe285e85 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2373,9 +2373,11 @@ int cxgbi_bind_conn(struct iscsi_cls_session 
*cls_session,
struct iscsi_endpoint *ep;
struct cxgbi_endpoint *cep;
struct cxgbi_sock *csk;
+   struct net *net;
int err;
 
-   ep = iscsi_lookup_endpoint(transport_eph);
+   net = iscsi_sess_net(cls_session);
+   ep = iscsi_lookup_endpoint(net, transport_eph);
if (!ep)
return -EINVAL;
 
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index 5ae589ea1dd2..5cd267a457f4 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -381,8 +381,10 @@ static int qedi_conn_bind(struct iscsi_cls_session 
*cls_session,
struct qedi_ctx *qedi = iscsi_host_priv(shost);
struct qedi_endpoint *qedi_ep;
struct iscsi_endpoint *ep;
+   struct net *net;
 
-   ep = iscsi_lookup_endpoint(transport_fd);
+   net = iscsi_sess_net(cls_session);
+   ep = iscsi_lookup_endpoint(net, transport_fd);
if (!ep)
return -EINVAL;
 
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 5785bf6c3ec0..770313d0b986 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -3178,6 +3178,7 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session 
*cls_session,
struct ddb_entry *ddb_entry;
struct scsi_qla_host *ha;
struct iscsi_session *sess;
+   struct net *net;
 
sess = cls_session->dd_data;
ddb_entry = sess->dd_data;
@@ -3186,9 +3187,12 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session 
*cls_session,
DEBUG2(ql4_printk(KERN_INFO, ha, "%s: sid = %d, cid = %d\n", __func__,
  cls_session->sid, cls_conn->cid));
 
+   net = iscsi_sess_net(cls_session

[PATCH 7/9] iscsi: convert flashnode devices from bus to class

2017-11-07 Thread Chris Leech
The flashnode session and connection devices should be filtered by net
namespace along with the iscsi_host, but we can't do that with a bus
device.  As these don't use any of the bus matching functionality, they
make more sense as a class device anyway.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/qla4xxx/ql4_os.c   |  2 +-
 drivers/scsi/scsi_transport_iscsi.c | 36 +++-
 include/scsi/scsi_transport_iscsi.h |  2 ++
 3 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 770313d0b986..55a729568873 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -7164,7 +7164,7 @@ static int qla4xxx_sysfs_ddb_is_non_persistent(struct 
device *dev, void *data)
 {
struct iscsi_bus_flash_session *fnode_sess;
 
-   if (!iscsi_flashnode_bus_match(dev, NULL))
+   if (!iscsi_is_flashnode_session_dev(dev))
return 0;
 
fnode_sess = iscsi_dev_to_flash_session(dev);
diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index fbec3a019f00..b053d57a482d 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1060,6 +1060,12 @@ static const struct device_type 
iscsi_flashnode_sess_dev_type = {
.release = iscsi_flashnode_sess_release,
 };
 
+bool iscsi_is_flashnode_session_dev(struct device *dev)
+{
+   return dev->type == &iscsi_flashnode_sess_dev_type;
+}
+EXPORT_SYMBOL_GPL(iscsi_is_flashnode_session_dev);
+
 /* flash node connection attrs show */
 #define iscsi_flashnode_conn_attr_show(type, name, param)  \
 static ssize_t \
@@ -1246,20 +1252,8 @@ static const struct device_type 
iscsi_flashnode_conn_dev_type = {
.release = iscsi_flashnode_conn_release,
 };
 
-static struct bus_type iscsi_flashnode_bus;
-
-int iscsi_flashnode_bus_match(struct device *dev,
-struct device_driver *drv)
-{
-   if (dev->bus == &iscsi_flashnode_bus)
-   return 1;
-   return 0;
-}
-EXPORT_SYMBOL_GPL(iscsi_flashnode_bus_match);
-
-static struct bus_type iscsi_flashnode_bus = {
+static struct class iscsi_flashnode_bus = {
.name = "iscsi_flashnode",
-   .match = &iscsi_flashnode_bus_match,
 };
 
 /**
@@ -1290,7 +1284,7 @@ iscsi_create_flashnode_sess(struct Scsi_Host *shost, int 
index,
fnode_sess->transport = transport;
fnode_sess->target_id = index;
fnode_sess->dev.type = &iscsi_flashnode_sess_dev_type;
-   fnode_sess->dev.bus = &iscsi_flashnode_bus;
+   fnode_sess->dev.class = &iscsi_flashnode_bus;
fnode_sess->dev.parent = &shost->shost_gendev;
dev_set_name(&fnode_sess->dev, "flashnode_sess-%u:%u",
 shost->host_no, index);
@@ -1338,7 +1332,7 @@ iscsi_create_flashnode_conn(struct Scsi_Host *shost,
 
fnode_conn->transport = transport;
fnode_conn->dev.type = &iscsi_flashnode_conn_dev_type;
-   fnode_conn->dev.bus = &iscsi_flashnode_bus;
+   fnode_conn->dev.class = &iscsi_flashnode_bus;
fnode_conn->dev.parent = &fnode_sess->dev;
dev_set_name(&fnode_conn->dev, "flashnode_conn-%u:%u:0",
 shost->host_no, fnode_sess->target_id);
@@ -1371,7 +1365,7 @@ EXPORT_SYMBOL_GPL(iscsi_create_flashnode_conn);
  */
 static int iscsi_is_flashnode_conn_dev(struct device *dev, void *data)
 {
-   return dev->bus == &iscsi_flashnode_bus;
+   return dev->type == &iscsi_flashnode_conn_dev_type;
 }
 
 static int iscsi_destroy_flashnode_conn(struct iscsi_bus_flash_conn 
*fnode_conn)
@@ -1385,7 +1379,7 @@ static int flashnode_match_index(struct device *dev, void 
*data)
struct iscsi_bus_flash_session *fnode_sess = NULL;
int ret = 0;
 
-   if (!iscsi_flashnode_bus_match(dev, NULL))
+   if (dev->type != &iscsi_flashnode_sess_dev_type)
goto exit_match_index;
 
fnode_sess = iscsi_dev_to_flash_session(dev);
@@ -1491,7 +1485,7 @@ EXPORT_SYMBOL_GPL(iscsi_destroy_flashnode_sess);
 
 static int iscsi_iter_destroy_flashnode_fn(struct device *dev, void *data)
 {
-   if (!iscsi_flashnode_bus_match(dev, NULL))
+   if (dev->type != &iscsi_flashnode_sess_dev_type)
return 0;
 
iscsi_destroy_flashnode_sess(iscsi_dev_to_flash_session(dev));
@@ -4752,7 +4746,7 @@ static __init int iscsi_transport_init(void)
if (err)
goto unregister_conn_class;
 
-   err = bus_register(&iscsi_flashnode_bus);
+   err = class_register(&iscsi_flashnode_bus);
if (err)
goto unregister_session_class;
 
@@ -4773,7 +4767,7 @@ static __init int iscsi_transport_init(void)
 unregister_pernet_subsys:
unregister_pernet_subsys(_net_ops);
 unregister_flashnode_bus:
-   bus_unregister(&iscsi_flashnode_bus);
+   class_unregister(_flashnode

[PATCH 1/9] iscsi: create per-net iscsi netlink kernel sockets

2017-11-07 Thread Chris Leech
Prepare iSCSI netlink to operate in multiple namespaces.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 67 +++--
 1 file changed, 57 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 7404d26895f5..0b23ba346cbe 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -1601,7 +1603,11 @@ static DECLARE_TRANSPORT_CLASS(iscsi_connection_class,
   NULL,
   NULL);
 
-static struct sock *nls;
+struct iscsi_net {
+   struct sock *nls;
+};
+
+static int iscsi_net_id __read_mostly;
 static DEFINE_MUTEX(rx_queue_mutex);
 
 static LIST_HEAD(sesslist);
@@ -2317,11 +2323,23 @@ iscsi_if_transport_lookup(struct iscsi_transport *tt)
 }
 
 static int
-iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp)
+iscsi_multicast_netns(struct net *net, struct sk_buff *skb,
+ uint32_t group, gfp_t gfp)
 {
+   struct sock *nls;
+   struct iscsi_net *isn;
+
+   isn = net_generic(net, iscsi_net_id);
+   nls = isn->nls;
return nlmsg_multicast(nls, skb, 0, group, gfp);
 }
 
+static int
+iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp)
+{
+   return iscsi_multicast_netns(&init_net, skb, group, gfp);
+}
+
 int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr,
   char *data, uint32_t data_size)
 {
@@ -4490,13 +4508,42 @@ int iscsi_unregister_transport(struct iscsi_transport 
*tt)
 }
 EXPORT_SYMBOL_GPL(iscsi_unregister_transport);
 
-static __init int iscsi_transport_init(void)
+static int __net_init iscsi_net_init(struct net *net)
 {
-   int err;
+   struct sock *nls;
+   struct iscsi_net *isn;
struct netlink_kernel_cfg cfg = {
.groups = 1,
.input  = iscsi_if_rx,
};
+
+   nls = netlink_kernel_create(net, NETLINK_ISCSI, &cfg);
+   if (!nls)
+   return -ENOMEM;
+   isn = net_generic(net, iscsi_net_id);
+   isn->nls = nls;
+   return 0;
+}
+
+static void __net_exit iscsi_net_exit(struct net *net)
+{
+   struct iscsi_net *isn;
+
+   isn = net_generic(net, iscsi_net_id);
+   netlink_kernel_release(isn->nls);
+   isn->nls = NULL;
+}
+
+static struct pernet_operations iscsi_net_ops = {
+   .init = iscsi_net_init,
+   .exit = iscsi_net_exit,
+   .id   = &iscsi_net_id,
+   .size = sizeof(struct iscsi_net),
+};
+
+static __init int iscsi_transport_init(void)
+{
+   int err;
printk(KERN_INFO "Loading iSCSI transport class v%s.\n",
ISCSI_TRANSPORT_VERSION);
 
@@ -4530,8 +4577,8 @@ static __init int iscsi_transport_init(void)
if (err)
goto unregister_session_class;
 
-   nls = netlink_kernel_create(&init_net, NETLINK_ISCSI, &cfg);
-   if (!nls) {
+   err = register_pernet_subsys(&iscsi_net_ops);
+   if (err) {
err = -ENOBUFS;
goto unregister_flashnode_bus;
}
@@ -4539,13 +4586,13 @@ static __init int iscsi_transport_init(void)
iscsi_eh_timer_workq = create_singlethread_workqueue("iscsi_eh");
if (!iscsi_eh_timer_workq) {
err = -ENOMEM;
-   goto release_nls;
+   goto unregister_pernet_subsys;
}
 
return 0;
 
-release_nls:
-   netlink_kernel_release(nls);
+unregister_pernet_subsys:
+   unregister_pernet_subsys(&iscsi_net_ops);
 unregister_flashnode_bus:
bus_unregister(&iscsi_flashnode_bus);
 unregister_session_class:
@@ -4566,7 +4613,7 @@ static __init int iscsi_transport_init(void)
 static void __exit iscsi_transport_exit(void)
 {
destroy_workqueue(iscsi_eh_timer_workq);
-   netlink_kernel_release(nls);
+   unregister_pernet_subsys(&iscsi_net_ops);
bus_unregister(&iscsi_flashnode_bus);
transport_class_unregister(_connection_class);
transport_class_unregister(_session_class);
-- 
2.9.5



[PATCH 9/9] iscsi: filter flashnode sysfs by net namespace

2017-11-07 Thread Chris Leech
Finished the net namespace support for flashnode sysfs devices

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/scsi/scsi_transport_iscsi.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 5ffda170ac9d..783971d72c4c 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1268,8 +1268,41 @@ static int iscsi_is_flashnode_conn_dev(struct device 
*dev, void *data)
return dev->type == _flashnode_conn_dev_type;
 }
 
+static struct net *iscsi_flashnode_sess_net(struct iscsi_flash_session *f_sess)
+{
+   struct Scsi_Host *shost = iscsi_flash_session_to_shost(f_sess);
+   struct iscsi_cls_host *ihost = shost->shost_data;
+
+   return iscsi_host_net(ihost);
+}
+
+static struct net *iscsi_flashnode_conn_net(struct iscsi_flash_conn *f_conn)
+{
+   struct iscsi_flash_session *f_sess =
+   iscsi_flash_conn_to_flash_session(f_conn);
+
+   return iscsi_flashnode_sess_net(f_sess);
+}
+
+static const void *iscsi_flashnode_namespace(struct device *dev)
+{
+   struct iscsi_flash_conn *f_conn;
+   struct iscsi_flash_session *f_sess;
+
+   if (iscsi_is_flashnode_conn_dev(dev, NULL)) {
+   f_conn = iscsi_dev_to_flash_conn(dev);
+   return iscsi_flashnode_conn_net(f_conn);
+   } else if (iscsi_is_flashnode_session_dev(dev)) {
+   f_sess = iscsi_dev_to_flash_session(dev);
+   return iscsi_flashnode_sess_net(f_sess);
+   }
+   return NULL;
+}
+
 static struct class iscsi_flashnode = {
.name = "iscsi_flashnode",
+   .ns_type = &net_ns_type_operations,
+   .namespace = iscsi_flashnode_namespace,
 };
 
 /**
-- 
2.9.5



[PATCH 2/9] iscsi: associate endpoints with a host

2017-11-07 Thread Chris Leech
Right now the iscsi_endpoint is only linked to a connection once that
connection has been established.  For net namespace filtering of the
sysfs objects, associate an endpoint with the host that it was
allocated for when it is created.

Signed-off-by: Chris Leech <cle...@redhat.com>
---
 drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +-
 drivers/scsi/be2iscsi/be_iscsi.c | 2 +-
 drivers/scsi/bnx2i/bnx2i_iscsi.c | 2 +-
 drivers/scsi/cxgbi/libcxgbi.c| 2 +-
 drivers/scsi/qedi/qedi_iscsi.c   | 2 +-
 drivers/scsi/qla4xxx/ql4_os.c| 2 +-
 drivers/scsi/scsi_transport_iscsi.c  | 3 ++-
 include/scsi/scsi_transport_iscsi.h  | 6 +-
 8 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c 
b/drivers/infiniband/ulp/iser/iscsi_iser.c
index 19624e023ebd..0a4214be4877 100644
--- a/drivers/infiniband/ulp/iser/iscsi_iser.c
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -817,7 +817,7 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct 
sockaddr *dst_addr,
struct iser_conn *iser_conn;
struct iscsi_endpoint *ep;
 
-   ep = iscsi_create_endpoint(0);
+   ep = iscsi_create_endpoint(shost, 0);
if (!ep)
return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c
index a398c54139aa..33f79f385660 100644
--- a/drivers/scsi/be2iscsi/be_iscsi.c
+++ b/drivers/scsi/be2iscsi/be_iscsi.c
@@ -1157,7 +1157,7 @@ beiscsi_ep_connect(struct Scsi_Host *shost, struct 
sockaddr *dst_addr,
return ERR_PTR(ret);
}
 
-   ep = iscsi_create_endpoint(sizeof(struct beiscsi_endpoint));
+   ep = iscsi_create_endpoint(shost, sizeof(struct beiscsi_endpoint));
if (!ep) {
ret = -ENOMEM;
return ERR_PTR(ret);
diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c
index 03c104b47f31..19fadb5d3b3c 100644
--- a/drivers/scsi/bnx2i/bnx2i_iscsi.c
+++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c
@@ -384,7 +384,7 @@ static struct iscsi_endpoint *bnx2i_alloc_ep(struct 
bnx2i_hba *hba)
struct bnx2i_endpoint *bnx2i_ep;
u32 ec_div;
 
-   ep = iscsi_create_endpoint(sizeof(*bnx2i_ep));
+   ep = iscsi_create_endpoint(hba->shost, sizeof(*bnx2i_ep));
if (!ep) {
printk(KERN_ERR "bnx2i: Could not allocate ep\n");
return NULL;
diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 858e32e8ad2d..558484f72738 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -2616,7 +2616,7 @@ struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host 
*shost,
goto release_conn;
}
 
-   ep = iscsi_create_endpoint(sizeof(*cep));
+   ep = iscsi_create_endpoint(shost, sizeof(*cep));
if (!ep) {
err = -ENOMEM;
pr_info("iscsi alloc ep, OOM.\n");
diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
index a02b34ea5cab..5ae589ea1dd2 100644
--- a/drivers/scsi/qedi/qedi_iscsi.c
+++ b/drivers/scsi/qedi/qedi_iscsi.c
@@ -847,7 +847,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr 
*dst_addr,
return ERR_PTR(ret);
}
 
-   ep = iscsi_create_endpoint(sizeof(struct qedi_endpoint));
+   ep = iscsi_create_endpoint(shost, sizeof(struct qedi_endpoint));
if (!ep) {
QEDI_ERR(&qedi->dbg_ctx, "endpoint create fail\n");
ret = -ENOMEM;
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 64c6fa563fdb..5785bf6c3ec0 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -1673,7 +1673,7 @@ qla4xxx_ep_connect(struct Scsi_Host *shost, struct 
sockaddr *dst_addr,
}
 
ha = iscsi_host_priv(shost);
-   ep = iscsi_create_endpoint(sizeof(struct qla_endpoint));
+   ep = iscsi_create_endpoint(shost, sizeof(struct qla_endpoint));
if (!ep) {
ret = -ENOMEM;
return ERR_PTR(ret);
diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 0b23ba346cbe..6ab7ca82b121 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -194,7 +194,7 @@ static int iscsi_match_epid(struct device *dev, const void 
*data)
 }
 
 struct iscsi_endpoint *
-iscsi_create_endpoint(int dd_size)
+iscsi_create_endpoint(struct Scsi_Host *shost, int dd_size)
 {
struct device *dev;
struct iscsi_endpoint *ep;
@@ -221,6 +221,7 @@ iscsi_create_endpoint(int dd_size)
 
ep->id = id;
ep->dev.class = &iscsi_endpoint_class;
+   ep->dev.parent = &shost->shost_gendev;
dev_set_name(&ep->dev, "ep-%llu", (unsigned long long) id);
err = device_register(&ep->dev);
 if (err)
diff --git a/include/scsi/scsi_trans

Re: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t

2017-03-08 Thread Chris Leech
On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
> 
> Signed-off-by: Elena Reshetova <elena.reshet...@intel.com>
> Signed-off-by: Hans Liljestrand <ishkam...@gmail.com>
> Signed-off-by: Kees Cook <keesc...@chromium.org>
> Signed-off-by: David Windsor <dwind...@gmail.com>

This looks OK to me.

Acked-by: Chris Leech <cle...@redhat.com>

> ---
>  drivers/scsi/libiscsi.c| 8 
>  drivers/scsi/qedi/qedi_iscsi.c | 2 +-
>  include/scsi/libiscsi.h| 3 ++-
>  3 files changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
> index 834d121..7eb1d2c 100644
> --- a/drivers/scsi/libiscsi.c
> +++ b/drivers/scsi/libiscsi.c
> @@ -516,13 +516,13 @@ static void iscsi_free_task(struct iscsi_task *task)
>  
>  void __iscsi_get_task(struct iscsi_task *task)
>  {
> - atomic_inc(&task->refcount);
> + refcount_inc(&task->refcount);
>  }
>  EXPORT_SYMBOL_GPL(__iscsi_get_task);
>  
>  void __iscsi_put_task(struct iscsi_task *task)
>  {
> - if (atomic_dec_and_test(&task->refcount))
> + if (refcount_dec_and_test(&task->refcount))
>   iscsi_free_task(task);
>  }
>  EXPORT_SYMBOL_GPL(__iscsi_put_task);
> @@ -744,7 +744,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct 
> iscsi_hdr *hdr,
>* released by the lld when it has transmitted the task for
>* pdus we do not expect a response for.
>*/
> - atomic_set(&task->refcount, 1);
> + refcount_set(&task->refcount, 1);
>   task->conn = conn;
>   task->sc = NULL;
>   INIT_LIST_HEAD(>running);
> @@ -1616,7 +1616,7 @@ static inline struct iscsi_task 
> *iscsi_alloc_task(struct iscsi_conn *conn,
>   sc->SCp.phase = conn->session->age;
>   sc->SCp.ptr = (char *) task;
>  
> - atomic_set(&task->refcount, 1);
> + refcount_set(&task->refcount, 1);
>   task->state = ISCSI_TASK_PENDING;
>   task->conn = conn;
>   task->sc = sc;
> diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c
> index b9f79d3..3895bd5 100644
> --- a/drivers/scsi/qedi/qedi_iscsi.c
> +++ b/drivers/scsi/qedi/qedi_iscsi.c
> @@ -1372,7 +1372,7 @@ static void qedi_cleanup_task(struct iscsi_task *task)
>  {
>   if (!task->sc || task->state == ISCSI_TASK_PENDING) {
>   QEDI_INFO(NULL, QEDI_LOG_IO, "Returning ref_cnt=%d\n",
> -   atomic_read(&task->refcount));
> +   refcount_read(&task->refcount));
>   return;
>   }
>  
> diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h
> index b0e275d..24d74b5 100644
> --- a/include/scsi/libiscsi.h
> +++ b/include/scsi/libiscsi.h
> @@ -29,6 +29,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -139,7 +140,7 @@ struct iscsi_task {
>  
>   /* state set/tested under session->lock */
>   int state;
> - atomic_trefcount;
> + refcount_t  refcount;
>   struct list_headrunning;/* running cmd list */
>   void*dd_data;   /* driver/transport data */
>  };
> -- 
> 2.7.4
> 


Re: [RFC PATCH 0/4] Make iSCSI network namespace aware

2015-05-21 Thread Chris Leech
On Wed, May 20, 2015 at 11:45:43AM -0700, Andy Grover wrote:
 On 05/13/2015 03:12 PM, Chris Leech wrote:
 This is only about the structures and functionality involved in maintaining 
 the
 iSCSI session, the SCSI host along with it's discovered targets and devices 
 has
 no association with network namespaces.
 
 These patches are functional, but not complete.  There's no isolation 
 enforced
 in the kernel just yet, so it relies on well behaved userspace.  I plan on
 fixing that, but wanted some feedback on the idea and approach so far.
 
 Seems like a good direction, to me.
 
 What would be the extent of the userspace (open-iscsi) changes needed to go
 along with this?

There's no core changes needed in the open-iscsi tools, it's more a
matter of how iscsid is packaged and executed.

The control socket between iscsid and iscsiadm binds to an abstract unix
domain path, so that works fine as long as you run iscsiadm from within
the same net ns as the iscsid instance you want to talk to.

The pid file checks clash if /var/run is common between instances.
Putting iscsid in a container could provide separate config files and
configuration databases, but there may be something that could improve
handling there.

I've been testing using 'ip netns exec' to run iscsid in a new network
namespace (it actually creates a new mount namespace as well, to remount
/sys with the new namespace filtered view).

My test setup so far has been the following:

  A VM with two virtio network interfaces on different virtual networks.
  I have an iSCSI target configured with two portals, one on each
  virtual network.

  I create two new network namespaces with 'ip netns add' and then move
  the nics into them with 'ip link set <dev> netns <ns>' and bring them
  online.

  Using 'ip netns exec' I start up an iscsid instance in each namespace,
  using the --foreground option to avoid the PID file clash.

  From within each namespace I can run iscsiadm to manage sessions
  through one of the iscsid instances.  With this setup they share the
  persistent configuration database, so I specifically select which
  records to start/stop.

- Chris

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 0/4] Make iSCSI network namespace aware

2015-05-13 Thread Chris Leech
I've had a few reports of people trying to run iscsid in a container, which
doesn't work at all when using network namespaces.  This is the start of me
looking at what it would take to make that work, and if it makes sense at all.

The first issue is that the kernel side of the iSCSI netlink control protocol
only operates in the initial network namespace.  But beyond that, if we allow
iSCSI to be managed within a namespace we need to decide what that means.  I
think it makes the most sense to isolate the iSCSI host, along with it's
associated endpoints, connections, and sessions, to a network namespace and
allow multiple instances of the userspace tools to exist in separate namespaces
managing separate hosts.

It works well for iscsi_tcp, which creates a host per session.  There's no
attempt to manage sessions on offloading hosts independently, although future
work could include the ability to move an entire host to a new namespace like
is supported for network devices.

This is only about the structures and functionality involved in maintaining the
iSCSI session, the SCSI host along with it's discovered targets and devices has
no association with network namespaces.

These patches are functional, but not complete.  There's no isolation enforced
in the kernel just yet, so it relies on well behaved userspace.  I plan on
fixing that, but wanted some feedback on the idea and approach so far.

Thanks,
Chris

Chris Leech (4):
  iscsi: create per-net iscsi nl kernel sockets
  iscsi: sysfs filtering by network namespace
  iscsi: make all netlink multicast namespace aware
  iscsi: set netns for iscsi_tcp hosts

 drivers/scsi/iscsi_tcp.c|   7 +
 drivers/scsi/scsi_transport_iscsi.c | 264 +---
 include/scsi/scsi_transport_iscsi.h |   2 +
 3 files changed, 222 insertions(+), 51 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 4/4] iscsi: set netns for iscsi_tcp hosts

2015-05-13 Thread Chris Leech
This lets iscsi_tcp operate in multiple namespaces.  It uses current
during session creation to find the net namespace, but it might be
better to manage to pass it along from the iscsi netlink socket.
---
 drivers/scsi/iscsi_tcp.c| 7 +++
 drivers/scsi/scsi_transport_iscsi.c | 7 ++-
 include/scsi/scsi_transport_iscsi.h | 1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 0b8af18..ebe99da 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -948,6 +948,11 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device 
*sdev)
return 0;
 }
 
+static struct net *iscsi_sw_tcp_netns(struct Scsi_Host *shost)
+{
+   return current->nsproxy->net_ns;
+}
+
 static struct scsi_host_template iscsi_sw_tcp_sht = {
.module = THIS_MODULE,
.name   = iSCSI Initiator over TCP/IP,
@@ -1003,6 +1008,8 @@ static struct iscsi_transport iscsi_sw_tcp_transport = {
.alloc_pdu  = iscsi_sw_tcp_pdu_alloc,
/* recovery */
.session_recovery_timedout = iscsi_session_recovery_timedout,
+   /* net namespace */
+   .get_netns  = iscsi_sw_tcp_netns,
 };
 
 static int __init iscsi_sw_tcp_init(void)
diff --git a/drivers/scsi/scsi_transport_iscsi.c 
b/drivers/scsi/scsi_transport_iscsi.c
index 4fdd4bf..791aacd 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1590,11 +1590,16 @@ static int iscsi_setup_host(struct transport_container 
*tc, struct device *dev,
 {
struct Scsi_Host *shost = dev_to_shost(dev);
struct iscsi_cls_host *ihost = shost->shost_data;
+   struct iscsi_internal *priv = to_iscsi_internal(shost->transportt);
+   struct iscsi_transport *transport = priv->iscsi_transport;
 
memset(ihost, 0, sizeof(*ihost));
atomic_set(&ihost->nr_scans, 0);
mutex_init(&ihost->mutex);
-   ihost->netns = &init_net;
+   if (transport->get_netns)
+   ihost->netns = transport->get_netns(shost);
+   else
+   ihost->netns = &init_net;
 
iscsi_bsg_host_add(shost, ihost);
/* ignore any bsg add error - we just can't do sgio */
diff --git a/include/scsi/scsi_transport_iscsi.h 
b/include/scsi/scsi_transport_iscsi.h
index 860ac0c..878bcf2 100644
--- a/include/scsi/scsi_transport_iscsi.h
+++ b/include/scsi/scsi_transport_iscsi.h
@@ -168,6 +168,7 @@ struct iscsi_transport {
int (*logout_flashnode_sid) (struct iscsi_cls_session *cls_sess);
int (*get_host_stats) (struct Scsi_Host *shost, char *buf, int len);
u8 (*check_protection)(struct iscsi_task *task, sector_t *sector);
+   struct net *(*get_netns)(struct Scsi_Host *shost);
 };
 
 /*
-- 
2.1.0

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[VLAN] set_rx_mode support for unicast address list

2008-01-24 Thread Chris Leech
Reuse the existing logic for multicast list synchronization for the unicast
address list. The core of dev_mc_sync/unsync are split out as
__dev_addr_sync/unsync and moved from dev_mcast.c to dev.c.  These are then
used to implement dev_unicast_sync/unsync as well.

I'm working on cleaning up Intel's FCoE stack, which generates new MAC
addresses from the fibre channel device id assigned by the fabric as per the
current draft specification in T11.  When using such a protocol in a VLAN
environment it would be nice to not always be forced into promiscuous mode,
assuming the underlying Ethernet driver supports multiple unicast addresses as
well.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/netdevice.h |4 ++
 net/8021q/vlan_dev.c  |7 ++-
 net/core/dev.c|   96 +
 net/core/dev_mcast.c  |   39 ++
 4 files changed, 110 insertions(+), 36 deletions(-)


diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b0813c3..047d432 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1414,12 +1414,16 @@ extern void dev_set_rx_mode(struct 
net_device *dev);
 extern void__dev_set_rx_mode(struct net_device *dev);
 extern int dev_unicast_delete(struct net_device *dev, void *addr, 
int alen);
 extern int dev_unicast_add(struct net_device *dev, void *addr, int 
alen);
+extern int dev_unicast_sync(struct net_device *to, struct 
net_device *from);
+extern voiddev_unicast_unsync(struct net_device *to, struct 
net_device *from);
 extern int dev_mc_delete(struct net_device *dev, void *addr, int 
alen, int all);
 extern int dev_mc_add(struct net_device *dev, void *addr, int 
alen, int newonly);
 extern int dev_mc_sync(struct net_device *to, struct net_device 
*from);
 extern voiddev_mc_unsync(struct net_device *to, struct net_device 
*from);
 extern int __dev_addr_delete(struct dev_addr_list **list, int 
*count, void *addr, int alen, int all);
 extern int __dev_addr_add(struct dev_addr_list **list, int *count, 
void *addr, int alen, int newonly);
+extern int __dev_addr_sync(struct dev_addr_list **to, int 
*to_count, struct dev_addr_list **from, int *from_count);
+extern void__dev_addr_unsync(struct dev_addr_list **to, int 
*to_count, struct dev_addr_list **from, int *from_count);
 extern voiddev_set_promiscuity(struct net_device *dev, int inc);
 extern voiddev_set_allmulti(struct net_device *dev, int inc);
 extern voidnetdev_state_change(struct net_device *dev);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 8059fa4..77f04e4 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -563,6 +563,7 @@ static int vlan_dev_stop(struct net_device *dev)
struct net_device *real_dev = vlan_dev_info(dev)->real_dev;
 
dev_mc_unsync(real_dev, dev);
+   dev_unicast_unsync(real_dev, dev);
if (dev->flags & IFF_ALLMULTI)
dev_set_allmulti(real_dev, -1);
if (dev->flags & IFF_PROMISC)
@@ -634,9 +635,10 @@ static void vlan_dev_change_rx_flags(struct net_device 
*dev, int change)
dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : 
-1);
 }
 
-static void vlan_dev_set_multicast_list(struct net_device *vlan_dev)
+static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
 {
dev_mc_sync(vlan_dev_info(vlan_dev)->real_dev, vlan_dev);
+   dev_unicast_sync(vlan_dev_info(vlan_dev)->real_dev, vlan_dev);
 }
 
 /*
@@ -702,7 +704,8 @@ void vlan_setup(struct net_device *dev)
dev->open   = vlan_dev_open;
dev->stop   = vlan_dev_stop;
dev->set_mac_address= vlan_dev_set_mac_address;
-   dev->set_multicast_list = vlan_dev_set_multicast_list;
+   dev->set_rx_mode= vlan_dev_set_rx_mode;
+   dev->set_multicast_list = vlan_dev_set_rx_mode;
dev->change_rx_flags= vlan_dev_change_rx_flags;
dev->do_ioctl   = vlan_dev_ioctl;
dev->destructor = free_netdev;
diff --git a/net/core/dev.c b/net/core/dev.c
index c9c593e..edaff27 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2962,6 +2962,102 @@ int dev_unicast_add(struct net_device *dev, void *addr, 
int alen)
 }
 EXPORT_SYMBOL(dev_unicast_add);
 
+int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
+   struct dev_addr_list **from, int *from_count)
+{
+   struct dev_addr_list *da, *next;
+   int err = 0;
+
+   da = *from;
+   while (da != NULL) {
+   next = da->next;
+   if (!da->da_synced) {
+   err = __dev_addr_add(to, to_count,
+da->da_addr, da->da_addrlen, 0);
+   if (err < 0)
+   break

Re: [ANNOUNCE] Open-FCoE - Fibre Channel over Ethernet Project

2007-11-28 Thread Chris Leech

Christoph Hellwig wrote:

I just did a very quick glance over the tree.  Some extremly highlevel
comments to start with before actually starting the source review:


Thanks for taking a look Christoph


 - why do you need your own libcrc?  lib/crc32.c has a crc32_le


We shouldn't, but we may want to add a CRC and copy routine.


 - libsa should go.  Much of it is just wrappers of kernel functions
   that should be used directly.  Other like that hash, even or state
   helpers might either be opencoded in the caller or made completely
   generic in lib/.  Probably the former but we'll have to see.


Yes, and along with it the last use of the BSD TAILQ macros.  Just 
before Rob set up the open repos I finished converting most of those to 
list_head, the only one left is in the sa_event mechanism.  Rather than 
convert it I'd like to replace the use of sa_event with notifier call 
chains.  I just need to finish auditing the use to make sure the 
differences won't cause unexpected problems.


After than and unwrapping kernel functions, I think the only thing left 
before completly removing libsa is to open code the state machines.


Similarly I think net_types.h need to go.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: change the way e1000 is handling short VLAN frames

2007-09-21 Thread Chris Leech
On 9/21/07, jamal [EMAIL PROTECTED] wrote:
 On Fri, 2007-21-09 at 08:43 -0700, Ben Greear wrote:

  I just re-read the spec, and a bridge *may* pad up to 68, but it is not
  required.
  On page 166, it says equipment must be able to handle 64 byte minimums.
 
  See page 22 (section 7.2) of this document:
 
  http://standards.ieee.org/getieee802/download/802.1Q-1998.pdf
 
  Also, page 63, 165, 166

 Thanks for the enlightnment.
 Do we need an ethtool interface to turn off hardware accelerated vlans?
 Jesse is indicating that the intel hardware can only handle the MUST but
 not the SHOULD of the spec.
 Actually a more basic question: Can you select one or the other mode in
 the software based vlans?

Inserting the VLAN tag in software will not change the behavior in the
way you want anyway, short frames will still be padded to 64 bytes.
You'd have to do short packet padding in software to 68 bytes.  Or do
software padding to 64 bytes and let the hardware insert the VLAN tag
after.

Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: change the way e1000 is handling short VLAN frames

2007-09-21 Thread Chris Leech
On 9/21/07, jamal [EMAIL PROTECTED] wrote:
 On Fri, 2007-21-09 at 14:34 -0700, Kok, Auke wrote:

  I never saw any bugreports about e1000 not being able to accept vlan packets
  because of this, so I'm quite certain it works OK, feel free to find me a 
  case
  where this isn't so :)

 If you tell me it can be done on the rx, i will take your word for it;-
 Emil can certainly verify it.
 The tx you certainly have issues - Look at one of the suggestions from
 Chris, i think it is resolvable.

I'd say that devices that can't receive 64 bytes VLAN tagged frames
have an issue, but for the sake of interoperability and solving Emil's
problem I'm willing to discuss how a change to e1000 would work  ;-)

The simplest option is to add software small frame padding all the
time.  It won't catch software tagged frames if they were generated
somehow, but should fix the hardware tagged ones to be 68 bytes on the
wire.  If you were worried about software tagged frames then replacing
ETH_ZLEN with VLAN_ETH_ZLEN would pad all frames, VLAN or not, to 68
bytes.

Emil, this patch will probably do what you want.

diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 4a22595..34e3d18 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -3284,6 +3284,9 @@ e1000_xmit_frame(struct sk_buff *skb, struct
net_device *netdev)
return NETDEV_TX_OK;
}

+   if (skb_padto(skb, ETH_ZLEN))
+   return NETDEV_TX_OK;
+
/* 82571 and newer doesn't need the workaround that limited descriptor
 * length to 4kB */
if (adapter-hw.mac_type = e1000_82571)
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [NET 00/02]: MACVLAN driver

2007-06-19 Thread Chris Leech

On 6/19/07, Stephen Hemminger [EMAIL PROTECTED] wrote:

Looks good. I have some changes to allow devices with multiple MAC addresses
(never finished).  This device could use that.


Stephen,

Is this patch available somewhere?  I'd be interested in taking a look at it.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [GIT PULL] I/OAT updates for 2.6.22

2007-05-02 Thread Chris Leech
On Wed, 2007-05-02 at 15:44 -0700, David Miller wrote:
 
 Chrstopher, I really really would like you to post these patches early
 and often to netdev@vger.kernel.org especially because you are
 touching the TCP code.

You're right, I should have sent this to netdev as well.  I'm sorry.

As for early and often, I have posted all of these patches to netdev,
and made suggested changes, and re-posted.

And when I have other networking changes, you can bet they'll get sent
to netdev for review first before I think about asking that they be
included.

 You aren't doing this, for several rounds, and just submitting your
 stuff directly to Linus, Andrew, and lkml, and it's starting to annoy
 me greatly.

For several rounds, I've been posting patches that go nowhere.  I
honestly don't care if they go straight to Linus, through you, or
through Andrew.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AF_PACKET how to get the original netdev from a packet received from a bonded master

2007-04-18 Thread Chris Leech

On 4/18/07, David Miller [EMAIL PROTECTED] wrote:

Ok, it will give you one level of decapsulation.

What do we tell people who want 2 devices previous? :-)


I can tell you that the intent of PJs patch was to provide the ifindex
of the physical interface that a packet entered the system on,
regardless of how many layers of encapsulation are involved.

Of course it may not actually do that ...
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AF_PACKET how to get the original netdev from a packet received from a bonded master

2007-04-18 Thread Chris Leech

On 4/18/07, David Miller [EMAIL PROTECTED] wrote:


Ok, I'll try to remember to high-priority reviewing PJ's patch
on my next rebase of the net-2.6.22 tree which should be
tonight or tomorrow sometime.


Thanks Dave, PJ is offline this week so I'm trying to keep an eye out
for discussions related to his various patches :-)

Just to give you an idea of our motivation around this, we're looking
at layer 2 configuration protocols implemented from user space.  As an
example Link Layer Discovery Protocol could be used to detect trunking
misconfiguration, but only if you can track that information for the
underlying interfaces of a bond.  Things like 802.1x authenticated
links in a bond would have a similar issue of needing to configure
each underlying interface before bringing up the bond, but with LLDP
there's the added fun of being able to receive updated notifications
of configuration changes from the link partner at any time.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/9] ioatdma: Remove the use of writeq from the ioatdma driver

2007-03-02 Thread Chris Leech
There's only one now anyway, and it's not in a performance path,
so make it behave the same on 32-bit and 64-bit CPUs.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |   10 --
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index ec11131..cbf93ca 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct ioat_dma_chan 
*ioat_chan)
list_add_tail(desc-node, ioat_chan-used_desc);
spin_unlock_bh(ioat_chan-desc_lock);
 
-#if (BITS_PER_LONG == 64)
-   writeq(desc-phys, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET);
-#else
-   writel((u32) desc-phys,
+   writel(((u64) desc-phys)  0x,
   ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_LOW);
-   writel(0, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
-#endif
+   writel(((u64) desc-phys)  32,
+  ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
+
writeb(IOAT_CHANCMD_START, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 }
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/9] I/OAT fixes

2007-03-02 Thread Chris Leech
Andrew Morton (1):
 I/OAT: warning fix

Chris Leech (6):
 ioatdma: Push pending transactions to hardware more frequently
 ioatdma: Remove the wrappers around read(bwl)/write(bwl) in ioatdma
 ioatdma: Remove the use of writeq from the ioatdma driver
 I/OAT: Add documentation for the tcp_dma_copybreak sysctl
 I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma
 I/OAT: Only offload copies for TCP when there will be a context switch

Dan Aloni (1):
 I/OAT: fix I/OAT for kexec

Jeff Garzik (1):
 drivers/dma: handle sysfs errors

 Documentation/networking/ip-sysctl.txt |6 +
 MAINTAINERS|   12 +++
 drivers/dma/dmaengine.c|   22 +-
 drivers/dma/ioatdma.c  |   81 --
 drivers/dma/ioatdma_io.h   |  118 -
 net/ipv4/tcp.c |   26 +--
 6 files changed, 100 insertions(+), 165 deletions(-)

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] drivers/dma: handle sysfs errors

2007-03-02 Thread Chris Leech
From: Jeff Garzik [EMAIL PROTECTED]

Signed-off-by: Jeff Garzik [EMAIL PROTECTED]
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/dmaengine.c |   22 --
 1 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..dc65773 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -312,7 +312,7 @@ void dma_async_client_chan_request(struct dma_client 
*client,
 int dma_async_device_register(struct dma_device *device)
 {
static int id;
-   int chancnt = 0;
+   int chancnt = 0, rc;
struct dma_chan* chan;
 
if (!device)
@@ -334,8 +334,15 @@ int dma_async_device_register(struct dma_device *device)
snprintf(chan-class_dev.class_id, BUS_ID_SIZE, dma%dchan%d,
 device-dev_id, chan-chan_id);
 
+   rc = class_device_register(chan-class_dev);
+   if (rc) {
+   chancnt--;
+   free_percpu(chan-local);
+   chan-local = NULL;
+   goto err_out;
+   }
+
kref_get(device-refcount);
-   class_device_register(chan-class_dev);
}
 
mutex_lock(dma_list_mutex);
@@ -345,6 +352,17 @@ int dma_async_device_register(struct dma_device *device)
dma_chans_rebalance();
 
return 0;
+
+err_out:
+   list_for_each_entry(chan, device-channels, device_node) {
+   if (chan-local == NULL)
+   continue;
+   kref_put(device-refcount, dma_async_device_cleanup);
+   class_device_unregister(chan-class_dev);
+   chancnt--;
+   free_percpu(chan-local);
+   }
+   return rc;
 }
 
 /**

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 9/9] I/OAT: fix I/OAT for kexec

2007-03-02 Thread Chris Leech
Under kexec, I/OAT initialization breaks over busy resources because the
previous kernel did not release them.

I'm not sure this fix can be considered a complete one but it works for me.
 I guess something similar to the *_remove method should occur there..

Signed-off-by: Dan Aloni [EMAIL PROTECTED]
Signed-off-by: Chris Leech [EMAIL PROTECTED]
Signed-off-by: Andrew Morton [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index cbf93ca..1d259e5 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -41,6 +41,7 @@
 
 /* internal functions */
 static int __devinit ioat_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent);
+static void ioat_shutdown(struct pci_dev *pdev);
 static void __devexit ioat_remove(struct pci_dev *pdev);
 
 static int enumerate_dma_channels(struct ioat_device *device)
@@ -557,6 +558,7 @@ static struct pci_driver ioat_pci_drv = {
.name   = ioatdma,
.id_table = ioat_pci_tbl,
.probe  = ioat_probe,
+   .shutdown = ioat_shutdown,
.remove = __devexit_p(ioat_remove),
 };
 
@@ -781,9 +783,20 @@ err_request_regions:
 err_set_dma_mask:
pci_disable_device(pdev);
 err_enable_device:
+
+   printk(KERN_ERR Intel(R) I/OAT DMA Engine initialization failed\n);
+
return err;
 }
 
+static void ioat_shutdown(struct pci_dev *pdev)
+{
+   struct ioat_device *device;
+   device = pci_get_drvdata(pdev);
+
+   dma_async_device_unregister(device-common);
+}
+
 static void __devexit ioat_remove(struct pci_dev *pdev)
 {
struct ioat_device *device;

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/9] ioatdma: Remove the wrappers around read(bwl)/write(bwl) in ioatdma

2007-03-02 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c|   60 +++
 drivers/dma/ioatdma_io.h |  118 --
 2 files changed, 28 insertions(+), 150 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0f77a9d..ec11131 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -32,7 +32,6 @@
 #include linux/delay.h
 #include linux/dma-mapping.h
 #include ioatdma.h
-#include ioatdma_io.h
 #include ioatdma_registers.h
 #include ioatdma_hw.h
 
@@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct ioat_device *device)
int i;
struct ioat_dma_chan *ioat_chan;
 
-   device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
-   xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+   device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET);
+   xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET);
xfercap = (xfercap_scale == 0 ? -1 : (1UL  xfercap_scale));
 
for (i = 0; i  device-common.chancnt; i++) {
@@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan 
*chan)
 * In-use bit automatically set by reading chanctrl
 * If 0, we got it, if 1, someone else did
 */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
if (chanctrl  IOAT_CHANCTRL_CHANNEL_IN_USE)
return -EBUSY;
 
@@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan 
*chan)
IOAT_CHANCTRL_ERR_INT_EN |
IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
IOAT_CHANCTRL_ERR_COMPLETION_EN;
-ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 
-   chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET);
+   chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
if (chanerr) {
printk(IOAT: CHANERR = %x, clearing\n, chanerr);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr);
+   writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
}
 
/* Allocate descriptors */
@@ -161,10 +160,10 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan 
*chan)
   ioat_chan-completion_addr);
memset(ioat_chan-completion_virt, 0,
   sizeof(*ioat_chan-completion_virt));
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW,
-  ((u64) ioat_chan-completion_addr)  0x);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH,
-  ((u64) ioat_chan-completion_addr)  32);
+   writel(((u64) ioat_chan-completion_addr)  0x,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW);
+   writel(((u64) ioat_chan-completion_addr)  32,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH);
 
ioat_start_null_desc(ioat_chan);
return i;
@@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources(struct dma_chan 
*chan)
 
ioat_dma_memcpy_cleanup(ioat_chan);
 
-   ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET);
+   writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 
spin_lock_bh(ioat_chan-desc_lock);
list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) {
@@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources(struct dma_chan 
*chan)
ioat_chan-last_completion = ioat_chan-completion_addr = 0;
 
/* Tell hw the chan is free */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE;
-   ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+   writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
 /**
@@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan 
*ioat_chan,
spin_unlock_bh(ioat_chan-desc_lock);
 
if (append)
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
return cookie;
 }
 
@@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pending(struct dma_chan 
*chan)
 
if (ioat_chan-pending != 0) {
ioat_chan-pending = 0;
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base

[PATCH 1/9] ioatdma: Push pending transactions to hardware more frequently

2007-03-02 Thread Chris Leech
Every 20 descriptors turns out to be too few append commands with
newer/faster CPUs.  Pushing every 4 still cuts down on MMIO writes to an
acceptable level without letting the DMA engine run out of work.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 8e87261..0f77a9d 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan 
*ioat_chan,
list_splice_init(new_chain, ioat_chan-used_desc.prev);
 
ioat_chan-pending += desc_count;
-   if (ioat_chan-pending = 20) {
+   if (ioat_chan-pending = 4) {
append = 1;
ioat_chan-pending = 0;
}
@@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev)
 }
 
 /* MODULE API */
-MODULE_VERSION(1.7);
+MODULE_VERSION(1.9);
 MODULE_LICENSE(GPL);
 MODULE_AUTHOR(Intel Corporation);
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/9] I/OAT: Only offload copies for TCP when there will be a context switch

2007-03-02 Thread Chris Leech
The performance wins come with having the DMA copy engine doing the copies
in parallel with the context switch.  If there is enough data ready on the
socket at recv time just use a regular copy.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c |   10 +++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 74c4d10..5ccd5e1 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1110,6 +1110,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,
long timeo;
struct task_struct *user_recv = NULL;
int copied_early = 0;
+   int available = 0;
+   struct sk_buff *skb;
 
lock_sock(sk);
 
@@ -1136,7 +1138,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,
 #ifdef CONFIG_NET_DMA
tp-ucopy.dma_chan = NULL;
preempt_disable();
-   if ((len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
+   skb = skb_peek_tail(sk-sk_receive_queue);
+   if (skb)
+   available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
+   if ((available  target) 
+   (len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
!sysctl_tcp_low_latency  __get_cpu_var(softnet_data).net_dma) {
preempt_enable_no_resched();
tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len);
@@ -1145,7 +1151,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,
 #endif
 
do {
-   struct sk_buff *skb;
u32 offset;
 
/* Are we at urgent data? Stop if we have read anything or have 
SIGURG pending. */
@@ -1433,7 +1438,6 @@ skip_copy:
 
 #ifdef CONFIG_NET_DMA
if (tp-ucopy.dma_chan) {
-   struct sk_buff *skb;
dma_cookie_t done, used;
 
dma_async_memcpy_issue_pending(tp-ucopy.dma_chan);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/9] I/OAT: Add documentation for the tcp_dma_copybreak sysctl

2007-03-02 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 Documentation/networking/ip-sysctl.txt |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index d3aae1f..9541691 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -413,6 +413,12 @@ tcp_workaround_signed_windows - BOOLEAN
not receive a window scaling option from them.
Default: 0
 
+tcp_dma_copybreak - INTEGER
+   Lower limit, in bytes, of the size of socket reads that will be
+   offloaded to a DMA copy engine, if one is present in the system
+   and CONFIG_NET_DMA is enabled.
+   Default: 4096
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/9] I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma

2007-03-02 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 MAINTAINERS |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1dfba85..2dd5d23 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1156,6 +1156,12 @@ M:   [EMAIL PROTECTED]
 L: netdev@vger.kernel.org
 S: Maintained
 
+DMA GENERIC MEMCPY SUBSYSTEM
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Maintained
+
 DOCBOOK FOR DOCUMENTATION
 P: Randy Dunlap
 M: [EMAIL PROTECTED]
@@ -1777,6 +1783,12 @@ P:   Tigran Aivazian
 M: [EMAIL PROTECTED]
 S: Maintained
 
+INTEL I/OAT DMA DRIVER
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Supported
+
 INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT
 P: Deepak Saxena
 M: [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/9] I/OAT: warning fix

2007-03-02 Thread Chris Leech
net/ipv4/tcp.c: In function 'tcp_recvmsg':
net/ipv4/tcp.c:: warning: unused variable 'available'

Signed-off-by: Andrew Morton [EMAIL PROTECTED]
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c |   26 --
 1 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5ccd5e1..69c525d 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1110,7 +1110,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,
long timeo;
struct task_struct *user_recv = NULL;
int copied_early = 0;
-   int available = 0;
struct sk_buff *skb;
 
lock_sock(sk);
@@ -1139,15 +1138,22 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, 
struct msghdr *msg,
tp-ucopy.dma_chan = NULL;
preempt_disable();
skb = skb_peek_tail(sk-sk_receive_queue);
-   if (skb)
-   available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
-   if ((available  target) 
-   (len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
-   !sysctl_tcp_low_latency  __get_cpu_var(softnet_data).net_dma) {
-   preempt_enable_no_resched();
-   tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len);
-   } else
-   preempt_enable_no_resched();
+   {
+   int available = 0;
+
+   if (skb)
+   available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
+   if ((available  target) 
+   (len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
+   !sysctl_tcp_low_latency 
+   __get_cpu_var(softnet_data).net_dma) {
+   preempt_enable_no_resched();
+   tp-ucopy.pinned_list =
+   dma_pin_iovec_pages(msg-msg_iov, len);
+   } else {
+   preempt_enable_no_resched();
+   }
+   }
 #endif
 
do {

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/9] ioatdma: Push pending transactions to hardware more frequently

2007-03-02 Thread Chris Leech

 This sounds like something that will always be wrong -- or in other
 words, always be right for only the latest CPUs.  Can this be made
 dynamic, based on some timing factor?

In fact I think this has been tweaked twice in the vanilla tree
already.


This is actually just the same tweak you remember me posting before
and I never pushed to get it in mainline, but Jeff's right.  The
problem isn't so much in the driver itself, as in how it's used by
I/OAT in the TCP receive code, there are inherent assumptions about
how long a context switch takes compared to how long an offloaded
memcpy takes.

I'm working on using completion interrupts for the device so as not to
end up polling when the CPUs are faster than the code was tuned for,
and doing it in a way that doesn't introduce extra context switches.
I'm hoping to have something ready for 2.6.22, or at least ready for
MM in that time frame.

As for this change in the short term, we did go back and make sure
that it didn't perform worse with the older CPUs supported on
these platforms.  We should have tested more intermediate values
instead of just jumping from 1 to 20 for that threshold.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Question on IOAT

2007-02-05 Thread Chris Leech

On 2/5/07, Olaf Kirch [EMAIL PROTECTED] wrote:


Nowhere in the dma_async_*complete functions can I see any code
that would sleep if the DMA is not yet complete. Am I missing something,
or are we really busy-waiting on the DMA engine? Wouldn't this kind of
defeat the purpose of freeing up the CPU from the chores of memcpying?


It is busy waiting, but only because the TCP socket use initiates the
DMA copies from the softirq and they have time to complete during the
switch back to application context.  Going back to sleep and creating
more context switching made things worse.  I'm working on seeing if
completion interrupts could be used with a better thought out
implementation, the performance implications aren't fully clear to me
yet.

For other uses, interrupts are probably desired.


I also checked the code in ioatdma.c - I would have expected there to
be some kind of interrupt handler that kicks the upper layers when a
DMA operation completes. But the interrupt handler seems to be for
error reporting exclusively...


It's just not there now, but it can be added easily, it's one bit in
the descriptor and a register read in the interrupt handler to see
which channel(s) need attention.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] I/OAT: Push pending transactions to hardware more frequently

2006-10-18 Thread Chris Leech
Every 20 descriptors turns out to be too few append commands with
newer/faster CPUs.  Pushing every 4 still cuts down on MMIO writes to an
acceptable level without letting the DMA engine run out of work.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0358419..f3b34b5 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(s
list_splice_init(new_chain, ioat_chan-used_desc.prev);
 
ioat_chan-pending += desc_count;
-   if (ioat_chan-pending = 20) {
+   if (ioat_chan-pending = 4) {
append = 1;
ioat_chan-pending = 0;
}
@@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct
 }
 
 /* MODULE API */
-MODULE_VERSION(1.7);
+MODULE_VERSION(1.9);
 MODULE_LICENSE(GPL);
 MODULE_AUTHOR(Intel Corporation);
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/7] I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma

2006-10-18 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 MAINTAINERS |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 5305dd6..533adbe 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -915,6 +915,12 @@ M: [EMAIL PROTECTED]
 L: linux-kernel@vger.kernel.org
 S: Maintained
 
+DMA GENERIC MEMCPY SUBSYSTEM
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Maintained
+
 DOCBOOK FOR DOCUMENTATION
 P: Martin Waitz
 M: [EMAIL PROTECTED]
@@ -1516,6 +1522,12 @@ P:   Tigran Aivazian
 M: [EMAIL PROTECTED]
 S: Maintained
 
+INTEL I/OAT DMA DRIVER
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Supported
+
 INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT
 P: Deepak Saxena
 M: [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] I/OAT: Add documentation for the tcp_dma_copybreak sysctl

2006-10-18 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 Documentation/networking/ip-sysctl.txt |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index fd3c0c0..e9ee102 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -375,6 +375,12 @@ tcp_slow_start_after_idle - BOOLEAN
be timed out after an idle period.
Default: 1
 
+tcp_dma_copybreak - INTEGER
+   Lower limit, in bytes, of the size of socket reads that will be
+   offloaded to a DMA copy engine, if one is present in the system
+   and CONFIG_NET_DMA is enabled.
+   Default: 4096
+
 CIPSOv4 Variables:
 
 cipso_cache_enable - BOOLEAN

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/7] drivers/dma I/OAT fixes

2006-10-18 Thread Chris Leech
Various fixes for the hardware memcpy engine code and ioatdma

Most of these I've posted before, except for the patch to handle sysfs
errors from Jeff Garzik.  I've dropped the controversial change to not
offload loopback traffic.

These changes can be pulled from
git://lost.foo-projects.org/~cleech/linux-2.6 master

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] I/OAT: Only offload copies for TCP when there will be a context switch

2006-10-18 Thread Chris Leech
The performance wins come with having the DMA copy engine doing the copies
in parallel with the context switch.  If there is enough data ready on the
socket at recv time just use a regular copy.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c |   10 +++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 66e9a72..ef0a6cd 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1108,6 +1108,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru
long timeo;
struct task_struct *user_recv = NULL;
int copied_early = 0;
+   int available = 0;
+   struct sk_buff *skb;
 
lock_sock(sk);
 
@@ -1134,7 +1136,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 #ifdef CONFIG_NET_DMA
tp-ucopy.dma_chan = NULL;
preempt_disable();
-   if ((len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
+   skb = skb_peek_tail(sk-sk_receive_queue);
+   if (skb)
+   available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
+   if ((available  target) 
+   (len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
!sysctl_tcp_low_latency  __get_cpu_var(softnet_data).net_dma) {
preempt_enable_no_resched();
tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len);
@@ -1143,7 +1149,6 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 #endif
 
do {
-   struct sk_buff *skb;
u32 offset;
 
/* Are we at urgent data? Stop if we have read anything or have 
SIGURG pending. */
@@ -1431,7 +1436,6 @@ skip_copy:
 
 #ifdef CONFIG_NET_DMA
if (tp-ucopy.dma_chan) {
-   struct sk_buff *skb;
dma_cookie_t done, used;
 
dma_async_memcpy_issue_pending(tp-ucopy.dma_chan);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] I/OAT: Remove the wrappers around read(bwl)/write(bwl) in ioatdma

2006-10-18 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c|   60 +++
 drivers/dma/ioatdma_io.h |  118 --
 2 files changed, 28 insertions(+), 150 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index f3b34b5..ceb03ee 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -32,7 +32,6 @@
 #include linux/delay.h
 #include linux/dma-mapping.h
 #include ioatdma.h
-#include ioatdma_io.h
 #include ioatdma_registers.h
 #include ioatdma_hw.h
 
@@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct
int i;
struct ioat_dma_chan *ioat_chan;
 
-   device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
-   xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+   device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET);
+   xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET);
xfercap = (xfercap_scale == 0 ? -1 : (1UL  xfercap_scale));
 
for (i = 0; i  device-common.chancnt; i++) {
@@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources
 * In-use bit automatically set by reading chanctrl
 * If 0, we got it, if 1, someone else did
 */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
if (chanctrl  IOAT_CHANCTRL_CHANNEL_IN_USE)
return -EBUSY;
 
@@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources
IOAT_CHANCTRL_ERR_INT_EN |
IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
IOAT_CHANCTRL_ERR_COMPLETION_EN;
-ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 
-   chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET);
+   chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
if (chanerr) {
printk(IOAT: CHANERR = %x, clearing\n, chanerr);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr);
+   writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
}
 
/* Allocate descriptors */
@@ -161,10 +160,10 @@ static int ioat_dma_alloc_chan_resources
   ioat_chan-completion_addr);
memset(ioat_chan-completion_virt, 0,
   sizeof(*ioat_chan-completion_virt));
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW,
-  ((u64) ioat_chan-completion_addr)  0x);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH,
-  ((u64) ioat_chan-completion_addr)  32);
+   writel(((u64) ioat_chan-completion_addr)  0x,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW);
+   writel(((u64) ioat_chan-completion_addr)  32,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH);
 
ioat_start_null_desc(ioat_chan);
return i;
@@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources
 
ioat_dma_memcpy_cleanup(ioat_chan);
 
-   ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET);
+   writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 
spin_lock_bh(ioat_chan-desc_lock);
list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) {
@@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources
ioat_chan-last_completion = ioat_chan-completion_addr = 0;
 
/* Tell hw the chan is free */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE;
-   ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+   writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
 /**
@@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(s
spin_unlock_bh(ioat_chan-desc_lock);
 
if (append)
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
return cookie;
 }
 
@@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pendin
 
if (ioat_chan-pending != 0) {
ioat_chan-pending = 0;
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
}
 }
 
@@ -449,7 +446,7 @@ static void ioat_dma_memcpy_cleanup(stru
if ((chan-completion_virt-full  IOAT_CHANSTS_DMA_TRANSFER_STATUS

[PATCH 4/7] I/OAT: Remove the use of writeq from the ioatdma driver

2006-10-18 Thread Chris Leech
There's only one now anyway, and it's not in a performance path,
so make it behave the same on 32-bit and 64-bit CPUs.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |   10 --
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index ceb03ee..2800c19 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct 
list_add_tail(desc-node, ioat_chan-used_desc);
spin_unlock_bh(ioat_chan-desc_lock);
 
-#if (BITS_PER_LONG == 64)
-   writeq(desc-phys, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET);
-#else
-   writel((u32) desc-phys,
+   writel(((u64) desc-phys)  0x,
   ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_LOW);
-   writel(0, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
-#endif
+   writel(((u64) desc-phys)  32,
+  ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
+
writeb(IOAT_CHANCMD_START, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 }
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/7] drivers/dma: handle sysfs errors

2006-10-18 Thread Chris Leech
From: Jeff Garzik [EMAIL PROTECTED]

Signed-off-by: Jeff Garzik [EMAIL PROTECTED]
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/dmaengine.c |   22 --
 1 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..dc65773 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -312,7 +312,7 @@ void dma_async_client_chan_request(struc
 int dma_async_device_register(struct dma_device *device)
 {
static int id;
-   int chancnt = 0;
+   int chancnt = 0, rc;
struct dma_chan* chan;
 
if (!device)
@@ -334,8 +334,15 @@ int dma_async_device_register(struct dma
snprintf(chan-class_dev.class_id, BUS_ID_SIZE, dma%dchan%d,
 device-dev_id, chan-chan_id);
 
+   rc = class_device_register(chan-class_dev);
+   if (rc) {
+   chancnt--;
+   free_percpu(chan-local);
+   chan-local = NULL;
+   goto err_out;
+   }
+
kref_get(device-refcount);
-   class_device_register(chan-class_dev);
}
 
mutex_lock(dma_list_mutex);
@@ -345,6 +352,17 @@ int dma_async_device_register(struct dma
dma_chans_rebalance();
 
return 0;
+
+err_out:
+   list_for_each_entry(chan, device-channels, device_node) {
+   if (chan-local == NULL)
+   continue;
+   kref_put(device-refcount, dma_async_device_cleanup);
+   class_device_unregister(chan-class_dev);
+   chancnt--;
+   free_percpu(chan-local);
+   }
+   return rc;
 }
 
 /**

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/7] [I/OAT] Push pending transactions to hardware more frequently

2006-08-18 Thread Chris Leech

On 8/18/06, Pavel Machek [EMAIL PROTECTED] wrote:


Huh, two version bumps for... ONE ONE-LINER :-).

Could we get rid of embedded version? It helps no one.


Version numbers for drivers that can be built as modules are very
helpful for anyone wanting to upgrade a driver on top of a
distribution supported kernel.  If you always just use the latest
kernel source, you're right it doesn't help you.  But that's not
everyone.

This one skips two versions because I'm trying to sync up a 1.8
version tested internally with the 1.7+ upstream changes that's in the
kernel now.

I'll accept that the official policy is to not version modules when
MODULE_VERSION is removed :-)

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: New driver questions: Attansic L1 gigabit NIC

2006-08-16 Thread Chris Leech

I thought that support statement sounded familiar, large portions of
the source code and documentation are modified from an older release
of e1000.  Nothing wrong with that as it's released under the GPL,
except that the copyright statements have mostly just been switched
from Intel to Attansic.  It's interesting to see a company that was
founded in 2000 claiming copyright back to 1999.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7 v2] [I/OAT] Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma

2006-08-16 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 MAINTAINERS |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 21116cc..2d484aa 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -881,6 +881,12 @@ M: [EMAIL PROTECTED]
 L: linux-kernel@vger.kernel.org
 S: Maintained
 
+DMA GENERIC MEMCPY SUBSYSTEM
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Maintained
+
 DOCBOOK FOR DOCUMENTATION
 P: Martin Waitz
 M: [EMAIL PROTECTED]
@@ -1469,6 +1475,12 @@ P:   Tigran Aivazian
 M: [EMAIL PROTECTED]
 S: Maintained
 
+INTEL I/OAT DMA DRIVER
+P: Chris Leech
+M: [EMAIL PROTECTED]
+L: linux-kernel@vger.kernel.org
+S: Supported
+
 INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT
 P: Deepak Saxena
 M: [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/7] [I/OAT] Only offload copies for TCP when there will be a context switch

2006-08-15 Thread Chris Leech
The performance wins come with having the DMA copy engine doing the copies
in parallel with the context switch.  If there is enough data ready on the
socket at recv time just use a regular copy.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c |   10 +++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 934396b..36f6b64 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1105,6 +1105,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru
long timeo;
struct task_struct *user_recv = NULL;
int copied_early = 0;
+   int available = 0;
+   struct sk_buff *skb;
 
lock_sock(sk);
 
@@ -1131,7 +1133,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 #ifdef CONFIG_NET_DMA
tp-ucopy.dma_chan = NULL;
preempt_disable();
-   if ((len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
+   skb = skb_peek_tail(sk-sk_receive_queue);
+   if (skb)
+   available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
+   if ((available  target) 
+   (len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
!sysctl_tcp_low_latency  __get_cpu_var(softnet_data).net_dma) {
preempt_enable_no_resched();
tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len);
@@ -1140,7 +1146,6 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 #endif
 
do {
-   struct sk_buff *skb;
u32 offset;
 
/* Are we at urgent data? Stop if we have read anything or have 
SIGURG pending. */
@@ -1428,7 +1433,6 @@ skip_copy:
 
 #ifdef CONFIG_NET_DMA
if (tp-ucopy.dma_chan) {
-   struct sk_buff *skb;
dma_cookie_t done, used;
 
dma_async_memcpy_issue_pending(tp-ucopy.dma_chan);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] [I/OAT] Don't offload copies for loopback traffic

2006-08-15 Thread Chris Leech
Local traffic (loopback) is generally in cache anyway, and the overhead
cost of offloading the copy is worse than just doing it with the CPU.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 36f6b64..7971e73 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1107,6 +1107,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int copied_early = 0;
int available = 0;
struct sk_buff *skb;
+   struct dst_entry *dst;
 
lock_sock(sk);
 
@@ -1136,7 +1137,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru
skb = skb_peek_tail(sk-sk_receive_queue);
if (skb)
available = TCP_SKB_CB(skb)-seq + skb-len - (*seq);
-   if ((available  target) 
+   dst = __sk_dst_get(sk);
+   if ((available  target)  (!dst || (dst-dev != loopback_dev)) 
(len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
!sysctl_tcp_low_latency  __get_cpu_var(softnet_data).net_dma) {
preempt_enable_no_resched();

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] [I/OAT] Push pending transactions to hardware more frequently

2006-08-15 Thread Chris Leech
Every 20 descriptors turns out to be too few append commands with
newer/faster CPUs.  Pushing every 4 still cuts down on MMIO writes to an
acceptable level without letting the DMA engine run out of work.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index dbd4d6c..be4fdd7 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(s
list_splice_init(new_chain, ioat_chan-used_desc.prev);
 
ioat_chan-pending += desc_count;
-   if (ioat_chan-pending = 20) {
+   if (ioat_chan-pending = 4) {
append = 1;
ioat_chan-pending = 0;
}
@@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct
 }
 
 /* MODULE API */
-MODULE_VERSION(1.7);
+MODULE_VERSION(1.9);
 MODULE_LICENSE(GPL);
 MODULE_AUTHOR(Intel Corporation);
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] [I/OAT] Remove the use of writeq from the ioatdma driver

2006-08-15 Thread Chris Leech
There's only one now anyway, and it's not in a performance path,
so make it behave the same on 32-bit and 64-bit CPUs.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c |   10 --
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0be426f..d6d817c 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct 
list_add_tail(desc-node, ioat_chan-used_desc);
spin_unlock_bh(ioat_chan-desc_lock);
 
-#if (BITS_PER_LONG == 64)
-   writeq(desc-phys, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET);
-#else
-   writel((u32) desc-phys,
+   writel(((u64) desc-phys)  0x,
   ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_LOW);
-   writel(0, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
-#endif
+   writel(((u64) desc-phys)  32,
+  ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH);
+
writeb(IOAT_CHANCMD_START, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 }
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] [I/OAT] Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma

2006-08-15 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 MAINTAINERS |   10 ++
 1 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 21116cc..9ae73c9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -881,6 +881,11 @@ M: [EMAIL PROTECTED]
 L: linux-kernel@vger.kernel.org
 S: Maintained
 
+DMA GENERIC MEMCPY SUBSYSTEM
+P: Chris Leech
+M: [EMAIL PROTECTED]
+S: Maintained
+
 DOCBOOK FOR DOCUMENTATION
 P: Martin Waitz
 M: [EMAIL PROTECTED]
@@ -1469,6 +1474,11 @@ P:   Tigran Aivazian
 M: [EMAIL PROTECTED]
 S: Maintained
 
+INTEL I/OAT DMA DRIVER
+P: Chris Leech
+M: [EMAIL PROTECTED]
+S: Supported
+
 INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT
 P: Deepak Saxena
 M: [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/7] [I/OAT] Remove the wrappers around read(bwl)/write(bwl) in ioatdma

2006-08-15 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/ioatdma.c|   60 +++
 drivers/dma/ioatdma_io.h |  118 --
 2 files changed, 28 insertions(+), 150 deletions(-)

diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index be4fdd7..0be426f 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -32,7 +32,6 @@
 #include linux/delay.h
 #include linux/dma-mapping.h
 #include ioatdma.h
-#include ioatdma_io.h
 #include ioatdma_registers.h
 #include ioatdma_hw.h
 
@@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct
int i;
struct ioat_dma_chan *ioat_chan;
 
-   device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
-   xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+   device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET);
+   xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET);
xfercap = (xfercap_scale == 0 ? -1 : (1UL  xfercap_scale));
 
for (i = 0; i  device-common.chancnt; i++) {
@@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources
 * In-use bit automatically set by reading chanctrl
 * If 0, we got it, if 1, someone else did
 */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
if (chanctrl  IOAT_CHANCTRL_CHANNEL_IN_USE)
return -EBUSY;
 
@@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources
IOAT_CHANCTRL_ERR_INT_EN |
IOAT_CHANCTRL_ANY_ERR_ABORT_EN |
IOAT_CHANCTRL_ERR_COMPLETION_EN;
-ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 
-   chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET);
+   chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
if (chanerr) {
printk(IOAT: CHANERR = %x, clearing\n, chanerr);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr);
+   writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET);
}
 
/* Allocate descriptors */
@@ -161,10 +160,10 @@ static int ioat_dma_alloc_chan_resources
   ioat_chan-completion_addr);
memset(ioat_chan-completion_virt, 0,
   sizeof(*ioat_chan-completion_virt));
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW,
-  ((u64) ioat_chan-completion_addr)  0x);
-   ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH,
-  ((u64) ioat_chan-completion_addr)  32);
+   writel(((u64) ioat_chan-completion_addr)  0x,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW);
+   writel(((u64) ioat_chan-completion_addr)  32,
+  ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH);
 
ioat_start_null_desc(ioat_chan);
return i;
@@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources
 
ioat_dma_memcpy_cleanup(ioat_chan);
 
-   ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET);
+   writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
 
spin_lock_bh(ioat_chan-desc_lock);
list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) {
@@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources
ioat_chan-last_completion = ioat_chan-completion_addr = 0;
 
/* Tell hw the chan is free */
-   chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET);
+   chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE;
-   ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl);
+   writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET);
 }
 
 /**
@@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(s
spin_unlock_bh(ioat_chan-desc_lock);
 
if (append)
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
return cookie;
 }
 
@@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pendin
 
if (ioat_chan-pending != 0) {
ioat_chan-pending = 0;
-   ioatdma_chan_write8(ioat_chan,
-   IOAT_CHANCMD_OFFSET,
-   IOAT_CHANCMD_APPEND);
+   writeb(IOAT_CHANCMD_APPEND,
+  ioat_chan-reg_base + IOAT_CHANCMD_OFFSET);
}
 }
 
@@ -449,7 +446,7 @@ static void ioat_dma_memcpy_cleanup(stru
if ((chan-completion_virt-full  IOAT_CHANSTS_DMA_TRANSFER_STATUS

[PATCH 6/7] [I/OAT] Add documentation for the tcp_dma_copybreak sysctl

2006-08-15 Thread Chris Leech
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 Documentation/networking/ip-sysctl.txt |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index d46338a..841d61e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -369,6 +369,12 @@ tcp_slow_start_after_idle - BOOLEAN
be timed out after an idle period.
Default: 1
 
+tcp_dma_copybreak - INTEGER
+   Lower limit, in bytes, of the size of socket reads that will be
+   offloaded to a DMA copy engine, if one is present in the system
+   and CONFIG_NET_DMA is enabled.
+   Default: 4096
+
 IP Variables:
 
 ip_local_port_range - 2 INTEGERS

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with e1000 and jumboframes

2006-08-04 Thread Chris Leech

On 8/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:

 You're changing the size of the buffer without telling the hardware.
 In the interrupt context e1000 knows the size of what was DMAed into
 the skb, but that's after the fact.  So e1000 could detect that memory
 was corrupted, but not prevent it if you don't give it power of 2
 buffers.  Actually, the power of 2 thing doesn't hold true for all
 e1000 devices.  Some have 1k granularity, but not Arnd's 82540.

I can not change it - code checks if requested mtu and additional size
is less than allocated aligned buffer it tricks allocator.
Or do you mean that even after 9k mtu was setup it is possible that card
can receive packets up to 16k?


Yes, that's exactly what I mean.  For anything above the standard 1500
bytes the e1000 _hardware_ has no concept of MTU, only buffer length.
So even if the driver is set to an MTU of 9000, the NIC will still
receive 16k frames.  Otherwise the driver would simply allocate MTU
sized buffers.

-Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with e1000 and jumboframes

2006-08-03 Thread Chris Leech

On 8/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote:


 Strange, why this skb_shared_info cannot be added before first alignment?
 And what about smaller frames like 1500, does this driver behave similar
 (first align then add)?

It can be.
Could attached  (completely untested) patch help?


Note that e1000 uses power of two buffers because that's what the
hardware supports.  Also, there's no programmable MTU - only a single
bit for long packet enable that disables frame length checks when
using jumbo frames.  That means that if you tell the e1000 it has a
16k buffer, and a 16k frame shows up on the wire, it's going to write
to the entire 16k regardless of your 9k MTU setting.  If a 32k frame
shows up, two full 16k buffers get written to (OK, assuming the frame
can fit into the receive FIFO)

That's why I've always been against trying to optimize the allocation
sizes in the driver, even with your small change the skb_shinfo area
can get corrupted.  It may be unlikely, because the frame still has to
be valid, but some switches aren't real picky about what sized frame
they'll forward on if you enable jumbo support either.  So any box on
the LAN could send you larger than MTU frames in an attempt to corrupt
memory.

I believe that if you tell a hardware device it has a buffer of a
certain size, you need to be prepared for that entire buffer to get
written to.  Unfortunately that means wasteful allocations for e1000
if a single buffer per frame is going to be used.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with e1000 and jumboframes

2006-08-03 Thread Chris Leech

Maximum e1000 frame is 16128 bytes, which is enough before being rounded
to 16k to have a space for shared info.
My patch just tricks refilling logic to request to allocate slightly less
than was setup when mtu was changed.


The maximum supported MTU size differs between e1000 devices due to
differences in FIFO size.  For performance reasons the driver won't
enable a MTU that doesn't allow for at least two frames in the Tx FIFO
at once - you really want e1000 to be able to DMA the next frame into
Tx FIFO while the current one is going out on the wire.  This doesn't
change the fact that with LPE set, anything that can fit into the Rx
FIFO and has a valid CRC will be DMAed into buffers regardless of
length.


Hardware is not affected, second patch just checks if there is enough
space (e1000 stores real mtu). I can not believe that such modern NIC
like e1000 can not know in receive interrupt size of the received
packet, if it is true, than in generel you are right and some more
clever mechanisms shoud be used (at least turn hack off for small
packets and only enable it for less than 16 jumbo frames wheere place
always is), if size of the received packet is known, then it is enough
to compare aligned size and size of the packet to make a decision for
allocation.


You're changing the size of the buffer without telling the hardware.
In the interrupt context e1000 knows the size of what was DMAed into
the skb, but that's after the fact.  So e1000 could detect that memory
was corrupted, but not prevent it if you don't give it power of 2
buffers.  Actually, the power of 2 thing doesn't hold true for all
e1000 devices.  Some have 1k granularity, but not Arnd's 82540.

You can't know the size of a received packet before it's DMAed into
host memory, no high performance network controller works that way.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: problems with e1000 and jumboframes

2006-08-03 Thread Chris Leech

On 8/3/06, Arnd Hannemann [EMAIL PROTECTED] wrote:

Well you say if a single buffer per frame is going to be used. Well,
if I understood you correctly i could set the MTU to, lets say 4000.
Then the driver would enable the jumbo frame bit of the hardware, and
allocate only a 4k rx buffer, right? (and allocate 16k, because of
skb_shinfo)
Now if a new 9k frame arrives the hardware will accept it regardless of
the 2k MTU and will split it into 3x 4k rx buffers?
Does the current driver work in this way? That would be great.

Perhaps then one should change the driver in a way that the MTU can
changed independently of the buffer size?


Yes, e1000 devices will spill over and use multiple buffers for a
single frame.  We've been trying to find a good way to use multiple
buffers to take care of these allocation problems.  The structure of
the sk_buff does not make it easy.  Or should I say that it's the
limitation that drivers are not allowed to chain together multiple
sk_buffs to represent a single frame that does not make it easy.

PCI-Express e1000 devices support a feature called header split, where
the protocol headers go into a different buffer from the payload.  We
use that today to put headers into the kmalloc() allocated skb-data
area, and payload into one or more skb-frags[] pages.  You don't ever
have multiple page allocations from the driver in this mode.

We could try and only use page allocations for older e1000 devices,
putting headers and payload into skb-frags and copying the headers
out into the skb-data area as needed for processing.  That would do
away with large allocations, but in Jesse's experiments calling
alloc_page() is slower than kmalloc(), so there can actually be a
performance hit from trying to use page allocations all the time.

It's an interesting problem.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/9] [I/OAT] DMA memcpy subsystem

2006-05-23 Thread Chris Leech
Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/Kconfig   |2 
 drivers/Makefile  |1 
 drivers/dma/Kconfig   |   13 +
 drivers/dma/Makefile  |1 
 drivers/dma/dmaengine.c   |  408 +
 include/linux/dmaengine.h |  337 +
 6 files changed, 762 insertions(+), 0 deletions(-)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index aeb5ab2..8b11ceb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source drivers/edac/Kconfig
 
 source drivers/rtc/Kconfig
 
+source drivers/dma/Kconfig
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 447d8e6..3c51703 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN)  += sn/
 obj-y  += firmware/
 obj-$(CONFIG_CRYPTO)   += crypto/
 obj-$(CONFIG_SUPERH)   += sh/
+obj-$(CONFIG_DMA_ENGINE)   += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu DMA Engine support
+
+config DMA_ENGINE
+   bool Support for DMA engines
+   ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 000..473c47b
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client-lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered.  When the
+ * class_device is released, the coresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs.  When the channel is freed, the coresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's bigref,
+ * with a kref and a per_cpu local_t.  A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
+ * extra reference per outstanding transaction.  The relase function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include linux/init.h
+#include linux/module.h
+#include linux/device.h
+#include linux/dmaengine.h
+#include linux/hardirq.h
+#include linux/spinlock.h
+#include linux/percpu.h
+#include linux/rcupdate.h
+#include linux/mutex.h
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs

[PATCH 0/9] I/OAT repost

2006-05-23 Thread Chris Leech
This is a repost of the I/OAT patches, the only changes from last time
are refreshing the patches and removing an unused macro that was causing
the vger spam filters to drop patch 2/9.

This patch series is a full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.

These changes apply to Linus' tree as of commit
387e2b0439026aa738a9edca15a57e5c0bcb4dfc
[BRIDGE]: need to ref count the LLC sap

They are available to pull from
git://63.64.152.142/~cleech/linux-2.6 ioat-2.6.18

There are 9 patches in the series:
1) The memcpy offload APIs and class code
2) The Intel I/OAT DMA driver (ioatdma)
3) Core networking code to setup networking as a DMA memcpy client
4) Utility functions for sk_buff to iovec offloaded copy
5) Structure changes needed for TCP receive offload
6) Rename cleanup_rbuf to tcp_cleanup_rbuf
7) Make sk_eat_skb aware of early copied packets
8) Add a sysctl to tune the minimum offloaded I/O size for TCP
9) The main TCP receive offload changes

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client

2006-05-23 Thread Chris Leech
Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig   |   12 +
 include/linux/netdevice.h |4 ++
 include/net/netdma.h  |   38 
 net/core/dev.c|  104 +
 4 files changed, 158 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
  DMA engines offload copy operations from the CPU to dedicated
  hardware, allowing the copies to happen asynchronously.
 
+comment DMA Clients
+
+config NET_DMA
+   bool Network: TCP receive copy offload
+   depends on DMA_ENGINE  NET
+   default y
+   ---help---
+ This enables the use of DMA engines in the network stack to
+ offload receive copy-to-user operations, freeing CPU cycles.
+ Since this is the main user of the DMA engine, it should be enabled;
+ say Y here.
+
 comment DMA Devices
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f4169bb..b5760c6 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
 #include linux/config.h
 #include linux/device.h
 #include linux/percpu.h
+#include linux/dmaengine.h
 
 struct divert_blk;
 struct vlan_group;
@@ -593,6 +594,9 @@ struct softnet_data
struct sk_buff  *completion_queue;
 
struct net_device   backlog_dev;/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+   struct dma_chan *net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000..cbfe89d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include linux/config.h
+#ifdef CONFIG_NET_DMA
+#include linux/dmaengine.h
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+   struct dma_chan *chan;
+   rcu_read_lock();
+   chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+   if (chan)
+   dma_chan_get(chan);
+   rcu_read_unlock();
+   return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 2dce673..6e78798 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include net/iw_handler.h
 #include asm/current.h
 #include linux/audit.h
+#include linux/dmaengine.h
 
 /*
  * The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];/* 16 way hashed list */
 static struct list_head ptype_all; /* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -1844,6 +1851,19 @@ static void net_rx_action(struct softirq
}
}
 out:
+#ifdef CONFIG_NET_DMA
+   /*
+* There may not be any more sk_buffs coming right now, so push
+* any pending DMA copies to hardware
+*/
+   if (net_dma_client) {
+   struct dma_chan *chan;
+   rcu_read_lock();
+   list_for_each_entry_rcu(chan, net_dma_client-channels, 
client_node)
+   dma_async_memcpy_issue_pending(chan);
+   rcu_read_unlock();
+   }
+#endif
local_irq_enable();
return;
 
@@ -3298,6 +3318,88 @@ static int dev_cpu_callback(struct notif
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+   unsigned int cpu, i, n

[PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies

2006-05-23 Thread Chris Leech
Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Makefile  |3 
 drivers/dma/iovlock.c |  301 +
 include/linux/dmaengine.h |   22 +++
 include/net/netdma.h  |6 +
 net/core/Makefile |1 
 net/core/user_dma.c   |  127 +++
 6 files changed, 459 insertions(+), 1 deletions(-)

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f56..bdcfdbd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 000..5ed327e
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include linux/dmaengine.h
+#include linux/pagemap.h
+#include net/tcp.h /* for memcpy_toiovec */
+#include asm/io.h
+#include asm/uaccess.h
+
+int num_pages_spanned(struct iovec *iov)
+{
+   return
+   ((PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) -
+   ((unsigned long)iov-iov_base  PAGE_MASK))  PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+   struct dma_pinned_list *local_list;
+   struct page **pages;
+   int i;
+   int ret;
+   int nr_iovecs = 0;
+   int iovec_len_used = 0;
+   int iovec_pages_used = 0;
+   long err;
+
+   /* don't pin down non-user-based iovecs */
+   if (segment_eq(get_fs(), KERNEL_DS))
+   return NULL;
+
+   /* determine how many iovecs/pages there are, up front */
+   do {
+   iovec_len_used += iov[nr_iovecs].iov_len;
+   iovec_pages_used += num_pages_spanned(iov[nr_iovecs]);
+   nr_iovecs++;
+   } while (iovec_len_used  len);
+
+   /* single kmalloc for pinned list, page_list[], and the page arrays */
+   local_list = kmalloc(sizeof(*local_list)
+   + (nr_iovecs * sizeof (struct dma_page_list))
+   + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+   if (!local_list) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   /* list of pages starts right after the page list array */
+   pages = (struct page **) local_list-page_list[nr_iovecs];
+
+   for (i = 0; i  nr_iovecs; i++) {
+   struct dma_page_list *page_list = local_list-page_list[i];
+
+   len -= iov[i].iov_len;
+
+   if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+   err = -EFAULT;
+   goto unpin;
+   }
+
+   page_list-nr_pages = num_pages_spanned(iov[i]);
+   page_list-base_address = iov[i].iov_base;
+
+   page_list-pages = pages;
+   pages += page_list-nr_pages;
+
+   /* pin pages down */
+   down_read(current-mm-mmap_sem);
+   ret = get_user_pages(
+   current,
+   current-mm,
+   (unsigned long) iov[i].iov_base,
+   page_list-nr_pages,
+   1,  /* write */
+   0,  /* force */
+   page_list-pages,
+   NULL);
+   up_read(current-mm-mmap_sem

[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT

2006-05-23 Thread Chris Leech
Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/skbuff.h |4 
 include/linux/tcp.h|8 
 include/net/sock.h |2 ++
 include/net/tcp.h  |7 +++
 net/core/sock.c|6 ++
 5 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f2347..23bad3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include linux/net.h
 #include linux/textsearch.h
 #include net/checksum.h
+#include linux/dmaengine.h
 
 #define HAVE_ALLOC_SKB /* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/
@@ -285,6 +286,9 @@ struct sk_buff {
__u16   tc_verd;/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+   dma_cookie_tdma_cookie;
+#endif
 
 
/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d395..c90daa5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include linux/types.h
+#include linux/dmaengine.h
 #include asm/byteorder.h
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
struct iovec*iov;
int memory;
int len;
+#ifdef CONFIG_NET_DMA
+   /* members for async copy */
+   struct dma_chan *dma_chan;
+   int wakeup;
+   struct dma_pinned_list  *pinned_list;
+   dma_cookie_tdma_cookie;
+#endif
} ucopy;
 
__u32   snd_wl1;/* Sequence for window update   */
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6f..90c65cb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *@sk_receive_queue: incoming packets
   *@sk_wmem_alloc: transmit queue bytes committed
   *@sk_write_queue: Packet sending queue
+  *@sk_async_wait_queue: DMA copied packets
   *@sk_omem_alloc: o is option or other
   *@sk_wmem_queued: persistent queue size
   *@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
atomic_tsk_omem_alloc;
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
+   struct sk_buff_head sk_async_wait_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t   sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db..d0c2c2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include linux/cache.h
 #include linux/percpu.h
 #include linux/skbuff.h
+#include linux/dmaengine.h
 
 #include net/inet_connection_sock.h
 #include net/inet_timewait_sock.h
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(str
tp-ucopy.len = 0;
tp-ucopy.memory = 0;
skb_queue_head_init(tp-ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+   tp-ucopy.dma_chan = NULL;
+   tp-ucopy.wakeup = 0;
+   tp-ucopy.pinned_list = NULL;
+   tp-ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb..5d820c3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock 
atomic_set(newsk-sk_omem_alloc, 0);
skb_queue_head_init(newsk-sk_receive_queue);
skb_queue_head_init(newsk-sk_write_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(newsk-sk_async_wait_queue);
+#endif
 
rwlock_init(newsk-sk_dst_lock);
rwlock_init(newsk-sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock,
skb_queue_head_init(sk-sk_receive_queue);
skb_queue_head_init(sk-sk_write_queue);
skb_queue_head_init(sk-sk_error_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(sk-sk_async_wait_queue);
+#endif
 
sk-sk_send_head=   NULL;
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

2006-05-23 Thread Chris Leech
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/tcp.h |2 ++
 net/ipv4/tcp.c|   10 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0c2c2f..578cccf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int  tcp_rcv_established(struct 
 
 extern voidtcp_rcv_space_adjust(struct sock *sk);
 
+extern voidtcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b80..1c0cfd7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
struct tcp_sock *tp = tcp_sk(sk);
int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_
 
/* Clean up data we have read: This will do ACK frames. */
if (copied)
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
return copied;
 }
 
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
}
 
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
if (!sysctl_tcp_low_latency  tp-ucopy.task == user_recv) {
/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
 */
 
/* Clean up data we have read: This will do ACK frames. */
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) 
inet_csk_ack_scheduled(sk)) {
icsk-icsk_ack.pending |= ICSK_ACK_PUSHED;
-   cleanup_rbuf(sk, 1);
+   tcp_cleanup_rbuf(sk, 1);
if (!(val  1))
icsk-icsk_ack.pingpong = 1;
}

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold

2006-05-23 Thread Chris Leech
Any socket recv of less than this amount will not be offloaded

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/sysctl.h |1 +
 include/net/tcp.h  |1 +
 net/core/user_dma.c|4 
 net/ipv4/sysctl_net_ipv4.c |   10 ++
 4 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff..cd9e7c0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
NET_TCP_MTU_PROBING=113,
NET_TCP_BASE_MSS=114,
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+   NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 578cccf..f1f4727 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 9eee91b..b7c98db 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -30,6 +30,10 @@
 #include linux/rtnetlink.h /* for BUG_TRAP */
 #include net/tcp.h
 
+#define NET_DMA_DEFAULT_COPYBREAK 4096
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  * @skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3ad..6a6aa53 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
+#ifdef CONFIG_NET_DMA
+   {
+   .ctl_name   = NET_TCP_DMA_COPYBREAK,
+   .procname   = tcp_dma_copybreak,
+   .data   = sysctl_tcp_dma_copybreak,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec
+   },
+#endif
{ .ctl_name = 0 }
 };
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware

2006-05-23 Thread Chris Leech
Add an extra argument to sk_eat_skb, and make it move early copied packets
to the async_wait_queue instead of freeing them.
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/sock.h |   13 -
 net/dccp/proto.c   |4 ++--
 net/ipv4/tcp.c |8 
 net/llc/af_llc.c   |2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 90c65cb..75b0e97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, 
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
+{
+   __skb_unlink(skb, sk-sk_receive_queue);
+   if (!copied_early)
+   __kfree_skb(skb);
+   else
+   __skb_queue_tail(sk-sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
 {
__skb_unlink(skb, sk-sk_receive_queue);
__kfree_skb(skb);
 }
+#endif
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee83..5317fd3 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str
}
dccp_pr_debug(packet_type=%s\n,
  dccp_packet_name(dh-dccph_type));
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
 verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
}
found_fin_ok:
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (1);
 out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c0cfd7..4e067d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
break;
}
if (skb-h.th-fin) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
++seq;
break;
}
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
if (!desc-count)
break;
}
@@ -1356,14 +1356,14 @@ skip_copy:
if (skb-h.th-fin)
goto found_fin_ok;
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
continue;
 
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (len  0);
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db7..7465170 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *
continue;
 
if (!(flags  MSG_PEEK)) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
*seq = 0;
}
} while (len  0);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine

2006-05-23 Thread Chris Leech
Adds a new ioatdma driver

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig |9 
 drivers/dma/Makefile|1 
 drivers/dma/ioatdma.c   |  839 +++
 drivers/dma/ioatdma.h   |  126 ++
 drivers/dma/ioatdma_hw.h|   52 ++
 drivers/dma/ioatdma_io.h|  118 +
 drivers/dma/ioatdma_registers.h |  126 ++
 7 files changed, 1271 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index f9ac4bc..0f15e76 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,4 +10,13 @@ config DMA_ENGINE
  DMA engines offload copy operations from the CPU to dedicated
  hardware, allowing the copies to happen asynchronously.
 
+comment DMA Devices
+
+config INTEL_IOATDMA
+   tristate Intel I/OAT DMA support
+   depends on DMA_ENGINE  PCI
+   default m
+   ---help---
+ Enable support for the Intel(R) I/OAT DMA engine.
+
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 10b7391..c8a5f56 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1 +1,2 @@
 obj-y += dmaengine.o
+obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
new file mode 100644
index 000..11d48b9
--- /dev/null
+++ b/drivers/dma/ioatdma.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports an Intel I/OAT DMA engine, which does asynchronous
+ * copy operations.
+ */
+
+#include linux/init.h
+#include linux/module.h
+#include linux/pci.h
+#include linux/interrupt.h
+#include linux/dmaengine.h
+#include linux/delay.h
+#include ioatdma.h
+#include ioatdma_io.h
+#include ioatdma_registers.h
+#include ioatdma_hw.h
+
+#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common)
+#define to_ioat_device(dev) container_of(dev, struct ioat_device, common)
+#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node)
+
+/* internal functions */
+static int __devinit ioat_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent);
+static void __devexit ioat_remove(struct pci_dev *pdev);
+
+static int enumerate_dma_channels(struct ioat_device *device)
+{
+   u8 xfercap_scale;
+   u32 xfercap;
+   int i;
+   struct ioat_dma_chan *ioat_chan;
+
+   device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET);
+   xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET);
+   xfercap = (xfercap_scale == 0 ? -1 : (1UL  xfercap_scale));
+
+   for (i = 0; i  device-common.chancnt; i++) {
+   ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL);
+   if (!ioat_chan) {
+   device-common.chancnt = i;
+   break;
+   }
+
+   ioat_chan-device = device;
+   ioat_chan-reg_base = device-reg_base + (0x80 * (i + 1));
+   ioat_chan-xfercap = xfercap;
+   spin_lock_init(ioat_chan-cleanup_lock);
+   spin_lock_init(ioat_chan-desc_lock);
+   INIT_LIST_HEAD(ioat_chan-free_desc);
+   INIT_LIST_HEAD(ioat_chan-used_desc);
+   /* This should be made common somewhere in dmaengine.c */
+   ioat_chan-common.device = device-common;
+   ioat_chan-common.client = NULL;
+   list_add_tail(ioat_chan-common.device_node,
+ device-common.channels);
+   }
+   return device-common.chancnt;
+}
+
+static struct ioat_desc_sw *ioat_dma_alloc_descriptor(
+   struct ioat_dma_chan *ioat_chan,
+   int flags)
+{
+   struct ioat_dma_descriptor *desc;
+   struct ioat_desc_sw *desc_sw;
+   struct ioat_device *ioat_device;
+   dma_addr_t phys;
+
+   ioat_device = to_ioat_device(ioat_chan-common.device);
+   desc = pci_pool_alloc(ioat_device-dma_pool, flags, phys);
+   if (unlikely(!desc))
+   return NULL;
+
+   desc_sw = kzalloc(sizeof(*desc_sw), flags);
+   if (unlikely(!desc_sw

Re: 2.6.16.13 e1000 reports incorrect PCI-X bus speed?

2006-05-13 Thread Chris Leech

Any idea why 120Mhz is used instead of 133?  It doesn't
seem to matter in my performance tests, but I am curious...


I think Rick is right, the bus between the bridge on the card and the
e1000s is running at 120Mhz.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/9] I/OAT network recv copy offload

2006-05-08 Thread Chris Leech
A few changes after going over all the memory allocations, but mostly just
keeping the patches up to date.

This patch series is a full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.

Changes from last posting:
Fixed a struct ioat_dma_chan memory leak on driver unload.
Changed a lock that was never held in atomic contexts to a mutex
as part of avoiding unneeded GFP_ATOMIC allocations.

These changes apply to Linus' tree as of commit
6810b548b25114607e0814612d84125abccc0a4f
[PATCH] x86_64: Move ondemand timer into own work queue

They are available to pull from
git://63.64.152.142/~cleech/linux-2.6 ioat-2.6.17

There are 9 patches in the series:
1) The memcpy offload APIs and class code
2) The Intel I/OAT DMA driver (ioatdma)
3) Core networking code to setup networking as a DMA memcpy client
4) Utility functions for sk_buff to iovec offloaded copy
5) Structure changes needed for TCP receive offload
6) Rename cleanup_rbuf to tcp_cleanup_rbuf
7) Make sk_eat_skb aware of early copied packets
8) Add a sysctl to tune the minimum offloaded I/O size for TCP
9) The main TCP receive offload changes

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] I/OAT network recv copy offload

2006-05-08 Thread Chris Leech

[I/OAT] Driver for the Intel(R) I/OAT DMA engine

From: Chris Leech [EMAIL PROTECTED]

Adds a new ioatdma driver

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

drivers/dma/Kconfig |9
drivers/dma/Makefile|1
drivers/dma/ioatdma.c   |  839 +++
drivers/dma/ioatdma.h   |  126 ++
drivers/dma/ioatdma_hw.h|   52 ++
drivers/dma/ioatdma_io.h|  118 +
drivers/dma/ioatdma_registers.h |  128 ++
7 files changed, 1273 insertions(+), 0 deletions(-)


ioatdma_driver.gz
Description: GNU Zip compressed data


[PATCH 9/9] [I/OAT] TCP recv offload to I/OAT

2006-05-08 Thread Chris Leech
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 net/ipv4/tcp.c   |  103 --
 net/ipv4/tcp_input.c |   74 +---
 net/ipv4/tcp_ipv4.c  |   18 -
 net/ipv6/tcp_ipv6.c  |   12 +-
 4 files changed, 185 insertions(+), 22 deletions(-)

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4e067d2..ff6ccda 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -263,7 +263,7 @@
 #include net/tcp.h
 #include net/xfrm.h
 #include net/ip.h
-
+#include net/netdma.h
 
 #include asm/uaccess.h
 #include asm/ioctls.h
@@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
+   int copied_early = 0;
 
lock_sock(sk);
 
@@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
target = sock_rcvlowat(sk, flags  MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+   tp-ucopy.dma_chan = NULL;
+   preempt_disable();
+   if ((len  sysctl_tcp_dma_copybreak)  !(flags  MSG_PEEK) 
+   !sysctl_tcp_low_latency  __get_cpu_var(softnet_data.net_dma)) {
+   preempt_enable_no_resched();
+   tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len);
+   } else
+   preempt_enable_no_resched();
+#endif
+
do {
struct sk_buff *skb;
u32 offset;
@@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
} else
sk_wait_data(sk, timeo);
 
+#ifdef CONFIG_NET_DMA
+   tp-ucopy.wakeup = 0;
+#endif
+
if (user_recv) {
int chunk;
 
@@ -1329,13 +1345,39 @@ do_prequeue:
}
 
if (!(flags  MSG_TRUNC)) {
-   err = skb_copy_datagram_iovec(skb, offset,
- msg-msg_iov, used);
-   if (err) {
-   /* Exception. Bailout! */
-   if (!copied)
-   copied = -EFAULT;
-   break;
+#ifdef CONFIG_NET_DMA
+   if (!tp-ucopy.dma_chan  tp-ucopy.pinned_list)
+   tp-ucopy.dma_chan = get_softnet_dma();
+
+   if (tp-ucopy.dma_chan) {
+   tp-ucopy.dma_cookie = 
dma_skb_copy_datagram_iovec(
+   tp-ucopy.dma_chan, skb, offset,
+   msg-msg_iov, used,
+   tp-ucopy.pinned_list);
+
+   if (tp-ucopy.dma_cookie  0) {
+
+   printk(KERN_ALERT dma_cookie  0\n);
+
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
+   if ((offset + used) == skb-len)
+   copied_early = 1;
+
+   } else
+#endif
+   {
+   err = skb_copy_datagram_iovec(skb, offset,
+   msg-msg_iov, used);
+   if (err) {
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
}
}
 
@@ -1355,15 +1397,19 @@ skip_copy:
 
if (skb-h.th-fin)
goto found_fin_ok;
-   if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb, 0);
+   if (!(flags  MSG_PEEK)) {
+   sk_eat_skb(sk, skb, copied_early);
+   copied_early = 0;
+   }
continue;
 
found_fin_ok:
/* Process the FIN. */
++*seq;
-   if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb, 0);
+   if (!(flags  MSG_PEEK)) {
+   sk_eat_skb(sk, skb, copied_early);
+   copied_early = 0;
+   }
break;
} while (len  0);
 
@@ -1386,6 +1432,36 @@ skip_copy:
tp-ucopy.len = 0;
}
 
+#ifdef CONFIG_NET_DMA
+   if (tp-ucopy.dma_chan) {
+   struct sk_buff *skb;
+   dma_cookie_t done, used;
+
+   dma_async_memcpy_issue_pending(tp-ucopy.dma_chan

[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client

2006-05-08 Thread Chris Leech
Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig   |   12 +
 include/linux/netdevice.h |4 ++
 include/net/netdma.h  |   38 
 net/core/dev.c|  104 +
 4 files changed, 158 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
  DMA engines offload copy operations from the CPU to dedicated
  hardware, allowing the copies to happen asynchronously.
 
+comment DMA Clients
+
+config NET_DMA
+   bool Network: TCP receive copy offload
+   depends on DMA_ENGINE  NET
+   default y
+   ---help---
+ This enables the use of DMA engines in the network stack to
+ offload receive copy-to-user operations, freeing CPU cycles.
+ Since this is the main user of the DMA engine, it should be enabled;
+ say Y here.
+
 comment DMA Devices
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 309f919..06bcabc 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
 #include linux/config.h
 #include linux/device.h
 #include linux/percpu.h
+#include linux/dmaengine.h
 
 struct divert_blk;
 struct vlan_group;
@@ -594,6 +595,9 @@ struct softnet_data
struct sk_buff  *completion_queue;
 
struct net_device   backlog_dev;/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+   struct dma_chan *net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000..cbfe89d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include linux/config.h
+#ifdef CONFIG_NET_DMA
+#include linux/dmaengine.h
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+   struct dma_chan *chan;
+   rcu_read_lock();
+   chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+   if (chan)
+   dma_chan_get(chan);
+   rcu_read_unlock();
+   return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index 9ab3cfa..ab34006 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include net/iw_handler.h
 #include asm/current.h
 #include linux/audit.h
+#include linux/dmaengine.h
 
 /*
  * The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];/* 16 way hashed list */
 static struct list_head ptype_all; /* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
 /*
 * The @dev_base list is protected by @dev_base_lock and the rtnl
  * semaphore.
@@ -1844,6 +1851,19 @@ static void net_rx_action(struct softirq
}
}
 out:
+#ifdef CONFIG_NET_DMA
+   /*
+* There may not be any more sk_buffs coming right now, so push
+* any pending DMA copies to hardware
+*/
+   if (net_dma_client) {
+   struct dma_chan *chan;
+   rcu_read_lock();
+   list_for_each_entry_rcu(chan, net_dma_client-channels, 
client_node)
+   dma_async_memcpy_issue_pending(chan);
+   rcu_read_unlock();
+   }
+#endif
local_irq_enable();
return;
 
@@ -3307,6 +3327,88 @@ static int dev_cpu_callback(struct notif
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+   unsigned int cpu, i, n

[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

2006-05-08 Thread Chris Leech
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/tcp.h |2 ++
 net/ipv4/tcp.c|   10 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d0c2c2f..578cccf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int  tcp_rcv_established(struct 
 
 extern voidtcp_rcv_space_adjust(struct sock *sk);
 
+extern voidtcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e2b7b80..1c0cfd7 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
struct tcp_sock *tp = tcp_sk(sk);
int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_
 
/* Clean up data we have read: This will do ACK frames. */
if (copied)
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
return copied;
 }
 
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
}
 
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
if (!sysctl_tcp_low_latency  tp-ucopy.task == user_recv) {
/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
 */
 
/* Clean up data we have read: This will do ACK frames. */
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) 
inet_csk_ack_scheduled(sk)) {
icsk-icsk_ack.pending |= ICSK_ACK_PUSHED;
-   cleanup_rbuf(sk, 1);
+   tcp_cleanup_rbuf(sk, 1);
if (!(val  1))
icsk-icsk_ack.pingpong = 1;
}

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware

2006-05-08 Thread Chris Leech
Add an extra argument to sk_eat_skb, and make it move early copied packets
to the async_wait_queue instead of freeing them.
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/sock.h |   13 -
 net/dccp/proto.c   |4 ++--
 net/ipv4/tcp.c |8 
 net/llc/af_llc.c   |2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 90c65cb..75b0e97 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, 
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
+{
+   __skb_unlink(skb, sk-sk_receive_queue);
+   if (!copied_early)
+   __kfree_skb(skb);
+   else
+   __skb_queue_tail(sk-sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
 {
__skb_unlink(skb, sk-sk_receive_queue);
__kfree_skb(skb);
 }
+#endif
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 2e0ee83..5317fd3 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str
}
dccp_pr_debug(packet_type=%s\n,
  dccp_packet_name(dh-dccph_type));
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
 verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
}
found_fin_ok:
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (1);
 out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1c0cfd7..4e067d2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
break;
}
if (skb-h.th-fin) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
++seq;
break;
}
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
if (!desc-count)
break;
}
@@ -1356,14 +1356,14 @@ skip_copy:
if (skb-h.th-fin)
goto found_fin_ok;
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
continue;
 
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (len  0);
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db7..7465170 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *
continue;
 
if (!(flags  MSG_PEEK)) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
*seq = 0;
}
} while (len  0);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies

2006-05-08 Thread Chris Leech
Provides for pinning user space pages in memory, copying to iovecs,
and copying from sk_buffs including fragmented and chained sk_buffs.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Makefile  |3 
 drivers/dma/iovlock.c |  301 +
 include/linux/dmaengine.h |   22 +++
 include/net/netdma.h  |6 +
 net/core/Makefile |1 
 net/core/user_dma.c   |  127 +++
 6 files changed, 459 insertions(+), 1 deletions(-)

diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index c8a5f56..bdcfdbd 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,2 +1,3 @@
-obj-y += dmaengine.o
+obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
+obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c
new file mode 100644
index 000..5ed327e
--- /dev/null
+++ b/drivers/dma/iovlock.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ * Portions based on net/core/datagram.c and copyrighted by their authors.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code allows the net stack to make use of a DMA engine for
+ * skb to iovec copies.
+ */
+
+#include linux/dmaengine.h
+#include linux/pagemap.h
+#include net/tcp.h /* for memcpy_toiovec */
+#include asm/io.h
+#include asm/uaccess.h
+
+int num_pages_spanned(struct iovec *iov)
+{
+   return
+   ((PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) -
+   ((unsigned long)iov-iov_base  PAGE_MASK))  PAGE_SHIFT);
+}
+
+/*
+ * Pin down all the iovec pages needed for len bytes.
+ * Return a struct dma_pinned_list to keep track of pages pinned down.
+ *
+ * We are allocating a single chunk of memory, and then carving it up into
+ * 3 sections, the latter 2 whose size depends on the number of iovecs and the
+ * total number of pages, respectively.
+ */
+struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len)
+{
+   struct dma_pinned_list *local_list;
+   struct page **pages;
+   int i;
+   int ret;
+   int nr_iovecs = 0;
+   int iovec_len_used = 0;
+   int iovec_pages_used = 0;
+   long err;
+
+   /* don't pin down non-user-based iovecs */
+   if (segment_eq(get_fs(), KERNEL_DS))
+   return NULL;
+
+   /* determine how many iovecs/pages there are, up front */
+   do {
+   iovec_len_used += iov[nr_iovecs].iov_len;
+   iovec_pages_used += num_pages_spanned(iov[nr_iovecs]);
+   nr_iovecs++;
+   } while (iovec_len_used  len);
+
+   /* single kmalloc for pinned list, page_list[], and the page arrays */
+   local_list = kmalloc(sizeof(*local_list)
+   + (nr_iovecs * sizeof (struct dma_page_list))
+   + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL);
+   if (!local_list) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   /* list of pages starts right after the page list array */
+   pages = (struct page **) local_list-page_list[nr_iovecs];
+
+   for (i = 0; i  nr_iovecs; i++) {
+   struct dma_page_list *page_list = local_list-page_list[i];
+
+   len -= iov[i].iov_len;
+
+   if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) {
+   err = -EFAULT;
+   goto unpin;
+   }
+
+   page_list-nr_pages = num_pages_spanned(iov[i]);
+   page_list-base_address = iov[i].iov_base;
+
+   page_list-pages = pages;
+   pages += page_list-nr_pages;
+
+   /* pin pages down */
+   down_read(current-mm-mmap_sem);
+   ret = get_user_pages(
+   current,
+   current-mm,
+   (unsigned long) iov[i].iov_base,
+   page_list-nr_pages,
+   1,  /* write */
+   0,  /* force */
+   page_list-pages,
+   NULL);
+   up_read(current-mm-mmap_sem

[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT

2006-05-08 Thread Chris Leech
Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/skbuff.h |4 
 include/linux/tcp.h|8 
 include/net/sock.h |2 ++
 include/net/tcp.h  |7 +++
 net/core/sock.c|6 ++
 5 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f2347..23bad3b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include linux/net.h
 #include linux/textsearch.h
 #include net/checksum.h
+#include linux/dmaengine.h
 
 #define HAVE_ALLOC_SKB /* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/
@@ -285,6 +286,9 @@ struct sk_buff {
__u16   tc_verd;/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+   dma_cookie_tdma_cookie;
+#endif
 
 
/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d395..c90daa5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include linux/types.h
+#include linux/dmaengine.h
 #include asm/byteorder.h
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
struct iovec*iov;
int memory;
int len;
+#ifdef CONFIG_NET_DMA
+   /* members for async copy */
+   struct dma_chan *dma_chan;
+   int wakeup;
+   struct dma_pinned_list  *pinned_list;
+   dma_cookie_tdma_cookie;
+#endif
} ucopy;
 
__u32   snd_wl1;/* Sequence for window update   */
diff --git a/include/net/sock.h b/include/net/sock.h
index c9fad6f..90c65cb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *@sk_receive_queue: incoming packets
   *@sk_wmem_alloc: transmit queue bytes committed
   *@sk_write_queue: Packet sending queue
+  *@sk_async_wait_queue: DMA copied packets
   *@sk_omem_alloc: o is option or other
   *@sk_wmem_queued: persistent queue size
   *@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
atomic_tsk_omem_alloc;
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
+   struct sk_buff_head sk_async_wait_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t   sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 3c989db..d0c2c2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include linux/cache.h
 #include linux/percpu.h
 #include linux/skbuff.h
+#include linux/dmaengine.h
 
 #include net/inet_connection_sock.h
 #include net/inet_timewait_sock.h
@@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(str
tp-ucopy.len = 0;
tp-ucopy.memory = 0;
skb_queue_head_init(tp-ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+   tp-ucopy.dma_chan = NULL;
+   tp-ucopy.wakeup = 0;
+   tp-ucopy.pinned_list = NULL;
+   tp-ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index ed2afdb..5d820c3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock 
atomic_set(newsk-sk_omem_alloc, 0);
skb_queue_head_init(newsk-sk_receive_queue);
skb_queue_head_init(newsk-sk_write_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(newsk-sk_async_wait_queue);
+#endif
 
rwlock_init(newsk-sk_dst_lock);
rwlock_init(newsk-sk_callback_lock);
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock,
skb_queue_head_init(sk-sk_receive_queue);
skb_queue_head_init(sk-sk_write_queue);
skb_queue_head_init(sk-sk_error_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(sk-sk_async_wait_queue);
+#endif
 
sk-sk_send_head=   NULL;
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/9] [I/OAT] DMA memcpy subsystem

2006-05-08 Thread Chris Leech
Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/Kconfig   |2 
 drivers/Makefile  |1 
 drivers/dma/Kconfig   |   13 +
 drivers/dma/Makefile  |1 
 drivers/dma/dmaengine.c   |  408 +
 include/linux/dmaengine.h |  337 +
 6 files changed, 762 insertions(+), 0 deletions(-)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index aeb5ab2..8b11ceb 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source drivers/edac/Kconfig
 
 source drivers/rtc/Kconfig
 
+source drivers/dma/Kconfig
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 447d8e6..3c51703 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN)  += sn/
 obj-y  += firmware/
 obj-$(CONFIG_CRYPTO)   += crypto/
 obj-$(CONFIG_SUPERH)   += sh/
+obj-$(CONFIG_DMA_ENGINE)   += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu DMA Engine support
+
+config DMA_ENGINE
+   bool Support for DMA engines
+   ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 000..473c47b
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a mutex, dma_list_mutex.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client-lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered.  When the
+ * class_device is released, the corresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs.  When the channel is freed, the corresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's bigref,
+ * with a kref and a per_cpu local_t.  A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
+ * extra reference per outstanding transaction.  The release function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include linux/init.h
+#include linux/module.h
+#include linux/device.h
+#include linux/dmaengine.h
+#include linux/hardirq.h
+#include linux/spinlock.h
+#include linux/percpu.h
+#include linux/rcupdate.h
+#include linux/mutex.h
+
+static DEFINE_MUTEX(dma_list_mutex);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs

Re: Question on e1000 patch, rx-copy-break related.

2006-05-03 Thread Chris Leech

On 5/3/06, Ben Greear [EMAIL PROTECTED] wrote:

So, as of 2.6.16.13, is the hardware stripping (SERC) enabled?  Could
you also let me know where this bit is defined in case I want to twiddle
it myself (a quick grep for SERC in 2.6.16.13 yields nothing.)


You missed a C, it's SECRC (Strip Ethernet CRC) in the RCTL register
or E1000_RCTL_SECRC.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/10] [IOAT] I/OAT patches repost

2006-04-27 Thread Chris Leech
 Netperf2 TOT now accesses the buffer that was just recv()'d rather than
 the one that is about to be recv()'d.

We've posted netperf2 results with I/OAT enabled/disabled and the data
access option on/off at
http://kernel.org/pub/linux/kernel/people/grover/ioat/netperf-icb-1.5-postscaling-both.pdf

This link has also been added to the I/OAT page on the LinuxNet wiki.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/9] I/OAT

2006-03-31 Thread Chris Leech
 Could you please describe how struct ioat_dma_chan channels are freed?

Sorry, I got distracted by other issues and never ended up following
up on this.  You're right, and it's just sloppiness on my part for
missing it, those structs are being leaked on module unload.  I'll fix
it.  Thanks.

-Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold

2006-03-29 Thread Chris Leech
Any socket recv of less than this amount will not be offloaded

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/sysctl.h |1 +
 include/net/tcp.h  |1 +
 net/core/user_dma.c|4 
 net/ipv4/sysctl_net_ipv4.c |   10 ++
 4 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff..cd9e7c0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
NET_TCP_MTU_PROBING=113,
NET_TCP_BASE_MSS=114,
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+   NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ca5bdaf..2e6fdef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index ec177ef..642a3f3 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -33,6 +33,10 @@
 
 #ifdef CONFIG_NET_DMA
 
+#define NET_DMA_DEFAULT_COPYBREAK 1024
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  * @skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3ad..6a6aa53 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
+#ifdef CONFIG_NET_DMA
+   {
+   .ctl_name   = NET_TCP_DMA_COPYBREAK,
+   .procname   = tcp_dma_copybreak,
+   .data   = sysctl_tcp_dma_copybreak,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec
+   },
+#endif
{ .ctl_name = 0 }
 };
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware

2006-03-29 Thread Chris Leech
Add an extra argument to sk_eat_skb, and make it move early copied packets
to the async_wait_queue instead of freeing them.
Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/sock.h |   13 -
 net/dccp/proto.c   |4 ++--
 net/ipv4/tcp.c |8 
 net/llc/af_llc.c   |2 +-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 190809c..e3723b6 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1272,11 +1272,22 @@ sock_recv_timestamp(struct msghdr *msg, 
  * This routine must be called with interrupts disabled or with the socket
  * locked so that the sk_buff queue operation is ok.
 */
-static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_NET_DMA
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
+{
+   __skb_unlink(skb, sk-sk_receive_queue);
+   if (!copied_early)
+   __kfree_skb(skb);
+   else
+   __skb_queue_tail(sk-sk_async_wait_queue, skb);
+}
+#else
+static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int 
copied_early)
 {
__skb_unlink(skb, sk-sk_receive_queue);
__kfree_skb(skb);
 }
+#endif
 
 extern void sock_enable_timestamp(struct sock *sk);
 extern int sock_get_timestamp(struct sock *, struct timeval __user *);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 1ff7328..35d7dfd 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str
}
dccp_pr_debug(packet_type=%s\n,
  dccp_packet_name(dh-dccph_type));
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
 verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = 0;
@@ -773,7 +773,7 @@ verify_sock_status:
}
found_fin_ok:
if (!(flags  MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (1);
 out:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b10f78c..2346539 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_
break;
}
if (skb->h.th->fin) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
++seq;
break;
}
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
if (!desc-count)
break;
}
@@ -1356,14 +1356,14 @@ skip_copy:
if (skb->h.th->fin)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
continue;
 
found_fin_ok:
/* Process the FIN. */
++*seq;
if (!(flags & MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
break;
} while (len > 0);
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5a04db7..7465170 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *
continue;
 
if (!(flags & MSG_PEEK)) {
-   sk_eat_skb(sk, skb);
+   sk_eat_skb(sk, skb, 0);
*seq = 0;
}
} while (len > 0);

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/9] I/OAT

2006-03-29 Thread Chris Leech
[I/OAT] Driver for the Intel(R) I/OAT DMA engine

From: Chris Leech [EMAIL PROTECTED]

Adds a new ioatdma driver

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig |9 
 drivers/dma/Makefile|1 
 drivers/dma/ioatdma.c   |  805
+++
 drivers/dma/ioatdma.h   |  126 ++
 drivers/dma/ioatdma_hw.h|   52 +++
 drivers/dma/ioatdma_io.h|  118 ++
 drivers/dma/ioatdma_registers.h |  128 ++
 7 files changed, 1239 insertions(+), 0 deletions(-)



ioatdma_driver.gz
Description: GNU Zip compressed data


[PATCH 1/9] [I/OAT] DMA memcpy subsystem

2006-03-29 Thread Chris Leech
Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/Kconfig   |2 
 drivers/Makefile  |1 
 drivers/dma/Kconfig   |   13 +
 drivers/dma/Makefile  |1 
 drivers/dma/dmaengine.c   |  405 +
 include/linux/dmaengine.h |  337 +
 6 files changed, 759 insertions(+), 0 deletions(-)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index 9f5c0da..f89ac05 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -72,4 +72,6 @@ source "drivers/edac/Kconfig"
 
 source "drivers/rtc/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 4249552..9b808a6 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN)  += sn/
 obj-y  += firmware/
 obj-$(CONFIG_CRYPTO)   += crypto/
 obj-$(CONFIG_SUPERH)   += sh/
+obj-$(CONFIG_DMA_ENGINE)   += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+   bool "Support for DMA engines"
+   ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 000..683456a
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This code implements the DMA subsystem. It provides a HW-neutral interface
+ * for other kernel code to use asynchronous memory copy capabilities,
+ * if present, and allows different HW DMA drivers to register as providing
+ * this capability.
+ *
+ * Due to the fact we are accelerating what is already a relatively fast
+ * operation, the code goes to great lengths to avoid additional overhead,
+ * such as locking.
+ *
+ * LOCKING:
+ *
+ * The subsystem keeps two global lists, dma_device_list and dma_client_list.
+ * Both of these are protected by a spinlock, dma_list_lock.
+ *
+ * Each device has a channels list, which runs unlocked but is never modified
+ * once the device is registered, it's just setup by the driver.
+ *
+ * Each client has a channels list, it's only modified under the client-lock
+ * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ *
+ * Each device has a kref, which is initialized to 1 when the device is
+ * registered. A kref_put is done for each class_device registered.  When the
+ * class_device is released, the corresponding kref_put is done in the release
+ * method. Every time one of the device's channels is allocated to a client,
+ * a kref_get occurs.  When the channel is freed, the corresponding kref_put
+ * happens. The device's release function does a completion, so
+ * unregister_device does a remove event, class_device_unregister, a kref_put
+ * for the first reference, then waits on the completion for all other
+ * references to finish.
+ *
+ * Each channel has an open-coded implementation of Rusty Russell's bigref,
+ * with a kref and a per_cpu local_t.  A single reference is set when on an
+ * ADDED event, and removed with a REMOVE event.  Net DMA client takes an
+ * extra reference per outstanding transaction.  The release function does a
+ * kref_put on the device. -ChrisL
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+static DEFINE_SPINLOCK(dma_list_lock);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static

[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client

2006-03-29 Thread Chris Leech
Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig   |   12 +
 include/linux/netdevice.h |4 ++
 include/net/netdma.h  |   38 
 net/core/dev.c|  104 +
 4 files changed, 158 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
  DMA engines offload copy operations from the CPU to dedicated
  hardware, allowing the copies to happen asynchronously.
 
+comment "DMA Clients"
+
+config NET_DMA
+   bool "Network: TCP receive copy offload"
+   depends on DMA_ENGINE && NET
+   default y
+   ---help---
+ This enables the use of DMA engines in the network stack to
+ offload receive copy-to-user operations, freeing CPU cycles.
+ Since this is the main user of the DMA engine, it should be enabled;
+ say Y here.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 950dc55..7fda35f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,7 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#include <linux/dmaengine.h>
 
 struct divert_blk;
 struct vlan_group;
@@ -592,6 +593,9 @@ struct softnet_data
struct sk_buff  *completion_queue;
 
struct net_device   backlog_dev;/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+   struct dma_chan *net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000..cbfe89d
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#ifndef NETDMA_H
+#define NETDMA_H
+#include <linux/config.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+   struct dma_chan *chan;
+   rcu_read_lock();
+   chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+   if (chan)
+   dma_chan_get(chan);
+   rcu_read_unlock();
+   return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index a3ab11f..ffd3d6d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <net/iw_handler.h>
 #include <asm/current.h>
 #include <linux/audit.h>
+#include <linux/dmaengine.h>
 
 /*
  * The list of packet types we will receive (as opposed to discard)
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];/* 16 way hashed list */
 static struct list_head ptype_all; /* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+static spinlock_t net_dma_event_lock;
+#endif
+
 /*
 * The @dev_base list is protected by @dev_base_lock and the rtnl
  * semaphore.
@@ -1780,6 +1787,19 @@ static void net_rx_action(struct softirq
}
}
 out:
+#ifdef CONFIG_NET_DMA
+   /*
+* There may not be any more sk_buffs coming right now, so push
+* any pending DMA copies to hardware
+*/
+   if (net_dma_client) {
+   struct dma_chan *chan;
+   rcu_read_lock();
+   list_for_each_entry_rcu(chan, &net_dma_client->channels, 
client_node)
+   dma_async_memcpy_issue_pending(chan);
+   rcu_read_unlock();
+   }
+#endif
local_irq_enable();
return;
 
@@ -3243,6 +3263,88 @@ static int dev_cpu_callback(struct notif
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU.
+ */
+static void net_dma_rebalance(void)
+{
+   unsigned int cpu, i, n

[PATCH 0/9] I/OAT

2006-03-29 Thread Chris Leech
This patch series is a full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.

Changes from last posting:
Fixed a page reference leak that happened when offloaded copies were 
set up but never used for a recv.
Fixed the ioatdma self test to handle failures correctly.
Serialized DMA ADD and REMOVE events in the networking core with a lock.
Added a long comment in dmaengine.c to describe the locking and 
reference counting being used.
Disabled preempt around a use of get_cpu_var.
Made tcp_dma_try_early_copy static, it is only used in one file.
Made some GFP_ATOMIC allocations GFP_KERNEL where safe to sleep.
Made changes to sk_eat_skb, removing some ifdefs in the TCP code.


These changes apply to DaveM's net-2.6.17 tree as of commit
68907dad58cd7ef11536e1db6baeb98b20af91b2 ([DCCP]: Use NULL for pointers, 
comfort sparse.)

They are available to pull from
git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17

There are 9 patches in the series:
1) The memcpy offload APIs and class code
2) The Intel I/OAT DMA driver (ioatdma)
3) Core networking code to setup networking as a DMA memcpy client
4) Utility functions for sk_buff to iovec offloaded copy
5) Structure changes needed for TCP receive offload
6) Rename cleanup_rbuf to tcp_cleanup_rbuf
7) Make sk_eat_skb aware of early copied packets
8) Add a sysctl to tune the minimum offloaded I/O size for TCP
9) The main TCP receive offload changes

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT

2006-03-29 Thread Chris Leech
Adds an async_wait_queue and some additional fields to tcp_sock, and a
dma_cookie_t to sk_buff.

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/skbuff.h |4 
 include/linux/tcp.h|8 
 include/net/sock.h |2 ++
 include/net/tcp.h  |7 +++
 net/core/sock.c|6 ++
 5 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 613b951..76861a8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include <linux/net.h>
 #include <linux/textsearch.h>
 #include <net/checksum.h>
+#include <linux/dmaengine.h>
 
 #define HAVE_ALLOC_SKB /* For the drivers to know */
 #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/
@@ -285,6 +286,9 @@ struct sk_buff {
__u16   tc_verd;/* traffic control verdict */
 #endif
 #endif
+#ifdef CONFIG_NET_DMA
+   dma_cookie_tdma_cookie;
+#endif
 
 
/* These elements must be at the end, see alloc_skb() for details.  */
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 542d395..c90daa5 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -18,6 +18,7 @@
 #define _LINUX_TCP_H
 
 #include <linux/types.h>
+#include <linux/dmaengine.h>
 #include <asm/byteorder.h>
 
 struct tcphdr {
@@ -233,6 +234,13 @@ struct tcp_sock {
struct iovec*iov;
int memory;
int len;
+#ifdef CONFIG_NET_DMA
+   /* members for async copy */
+   struct dma_chan *dma_chan;
+   int wakeup;
+   struct dma_pinned_list  *pinned_list;
+   dma_cookie_tdma_cookie;
+#endif
} ucopy;
 
__u32   snd_wl1;/* Sequence for window update   */
diff --git a/include/net/sock.h b/include/net/sock.h
index af2b054..190809c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -132,6 +132,7 @@ struct sock_common {
   *@sk_receive_queue: incoming packets
   *@sk_wmem_alloc: transmit queue bytes committed
   *@sk_write_queue: Packet sending queue
+  *@sk_async_wait_queue: DMA copied packets
   *@sk_omem_alloc: o is option or other
   *@sk_wmem_queued: persistent queue size
   *@sk_forward_alloc: space allocated forward
@@ -205,6 +206,7 @@ struct sock {
atomic_tsk_omem_alloc;
struct sk_buff_head sk_receive_queue;
struct sk_buff_head sk_write_queue;
+   struct sk_buff_head sk_async_wait_queue;
int sk_wmem_queued;
int sk_forward_alloc;
gfp_t   sk_allocation;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9418f4d..54e4367 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -28,6 +28,7 @@
 #include <linux/cache.h>
 #include <linux/percpu.h>
 #include <linux/skbuff.h>
+#include <linux/dmaengine.h>
 
 #include net/inet_connection_sock.h
 #include net/inet_timewait_sock.h
@@ -820,6 +821,12 @@ static inline void tcp_prequeue_init(str
tp-ucopy.len = 0;
tp-ucopy.memory = 0;
skb_queue_head_init(tp-ucopy.prequeue);
+#ifdef CONFIG_NET_DMA
+   tp-ucopy.dma_chan = NULL;
+   tp-ucopy.wakeup = 0;
+   tp-ucopy.pinned_list = NULL;
+   tp-ucopy.dma_cookie = 0;
+#endif
 }
 
 /* Packet is added to VJ-style prequeue for processing in process
diff --git a/net/core/sock.c b/net/core/sock.c
index a96ea7d..d2acd35 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -818,6 +818,9 @@ struct sock *sk_clone(const struct sock 
atomic_set(newsk-sk_omem_alloc, 0);
skb_queue_head_init(newsk-sk_receive_queue);
skb_queue_head_init(newsk-sk_write_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(newsk-sk_async_wait_queue);
+#endif
 
rwlock_init(newsk-sk_dst_lock);
rwlock_init(newsk-sk_callback_lock);
@@ -1369,6 +1372,9 @@ void sock_init_data(struct socket *sock,
skb_queue_head_init(sk-sk_receive_queue);
skb_queue_head_init(sk-sk_write_queue);
skb_queue_head_init(sk-sk_error_queue);
+#ifdef CONFIG_NET_DMA
+   skb_queue_head_init(sk-sk_async_wait_queue);
+#endif
 
sk-sk_send_head=   NULL;
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

2006-03-29 Thread Chris Leech
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/tcp.h |2 ++
 net/ipv4/tcp.c|   10 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 54e4367..ca5bdaf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -294,6 +294,8 @@ extern int  tcp_rcv_established(struct 
 
 extern voidtcp_rcv_space_adjust(struct sock *sk);
 
+extern voidtcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 87f68e7..b10f78c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
struct tcp_sock *tp = tcp_sk(sk);
int time_to_ack = 0;
@@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_
 
/* Clean up data we have read: This will do ACK frames. */
if (copied)
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
return copied;
 }
 
@@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
}
 
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
if (!sysctl_tcp_low_latency  tp-ucopy.task == user_recv) {
/* Install new reader */
@@ -1391,7 +1391,7 @@ skip_copy:
 */
 
/* Clean up data we have read: This will do ACK frames. */
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -1853,7 +1853,7 @@ static int do_tcp_setsockopt(struct sock
(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) 
inet_csk_ack_scheduled(sk)) {
icsk-icsk_ack.pending |= ICSK_ACK_PUSHED;
-   cleanup_rbuf(sk, 1);
+   tcp_cleanup_rbuf(sk, 1);
if (!(val  1))
icsk-icsk_ack.pingpong = 1;
}

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fwd: I/OAT performance data

2006-03-16 Thread Chris Leech
should have kept this on list

-- Forwarded message --
From: Chris Leech [EMAIL PROTECTED]
Date: Mar 16, 2006 11:13 AM
Subject: Re: I/OAT performance data
To: Rick Jones [EMAIL PROTECTED]


 I must be missing something - if the MTU was 1500 bytes, how did the
 receiver's offloaded copies get to the 2k level?  Were several arriving
 TCP segments aggregated?

Most of the overhead (get_user_pages) is per recv, not on a per packet
basis.  Regardless of packet size, we offload the copies if the total
requested data amount is 2k or greater.  So while there's no
aggregation of TCP segments before the socket level, we are talking
about copying multiple packets per I/O.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fwd: I/OAT performance data

2006-03-16 Thread Chris Leech
oops, should have kept this on list

-- Forwarded message --
From: Chris Leech [EMAIL PROTECTED]
Date: Mar 16, 2006 10:56 AM
Subject: Re: I/OAT performance data
To: Rick Jones [EMAIL PROTECTED]


 When it says buffer size for the Chariot stuff, is that the socket
 buffer size, or the size of the buffer(s) being passed to the transport?

That's the I/O size for the application, being passed to the transport.

 Was the MTU 1500 or 9000 bytes?

1500 byte MTU

 Can the Chariot do small packet latency tests and/or aggregate small
 packet performance?

Yes, but for small I/O the overhead of pinning pages and initiating
the offloaded copy overtakes the benefits.  We currently see that
cutoff at about 2k.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I/OAT performance data

2006-03-16 Thread Chris Leech
On 3/16/06, Leonid Grossman [EMAIL PROTECTED] wrote:
 Hi Chris,
 Do you know what part of the performance delta is contributed by the
 offload for copy operations, and what part comes from other I/OAT
 features like header separation, etc. ?

This is showing the offloaded copy as the only difference.  Header
separation is being used in the e1000 driver in both test runs.

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I/OAT performance data

2006-03-16 Thread Chris Leech
 Thanks, that clarifies things.  So, if I've understood correctly, the
 benefit kicks in when:

 1) I/OAT is enabled :)
 2) The user posts a recv() (or the like) of = 2K
 3) There is = 2K of data available to give them

 yes?

Yes
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I/OAT performance data

2006-03-16 Thread Chris Leech
On 3/16/06, Scott Feldman [EMAIL PROTECTED] wrote:
 Do you have any data to share on header split?  Also, can other non-
 Intel nics use I/OAT copy, and if so, is header-split a requirement
 for the copy?

I don't have any header-split data.  The I/OAT copy offload will work
for any TCP traffic, regardless of what kind of NIC it was received on
(of course you need a system with the additional memcpy engine in the
chipset)

- Chris
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)

2006-03-10 Thread Chris Leech
This patch series is a full release of the Intel(R) I/O
Acceleration Technology (I/OAT) for Linux.  It includes an in kernel API
for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy
engine, and changes to the TCP stack to offload copies of received
networking data to application space.

Changes from last weeks posting:
  fixed return value from sysfs show functions as suggested by Joe Perches
  code style fixes suggested by Andrew Morton, David Miller, and others
  renamed anything related to pinning pages from lock/locked to pin/pinned
  renamed ioatdma register read/write functions with less generic names
  return a pinned list from dma_pin_iovec_pages instead of passing in a 
**dma_pinned_list
  replaced all cb/CB symbol prefixes in ioatdma with ioat/IOAT,
CB was an abbreviation of an early code name
  use set_page_dirty_lock instead of SetPageDirty pointed out by Andrew Morton
  rename dma_async_try_early_copy to tcp_dma_try_early_copy and stop exporting

I'll be focusing on reducing ifdefs and adding much needed comments, with
another release early next week. 

These changes apply to DaveM's net-2.6.17 tree as of commit
32639ad6b7e3da27f233c0516471f0747f1178f5 ([SPARC]: Fixup SO_*SEC values on 
32-bit sparc.)

They are available to pull from
git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17

There are 8 patches in the series:
1) The memcpy offload APIs and class code
2) The Intel I/OAT DMA driver (ioatdma)
3) Core networking code to setup networking as a DMA memcpy client
4) Utility functions for sk_buff to iovec offloaded copy
5) Structure changes needed for TCP receive offload
6) Rename cleanup_rbuf to tcp_cleanup_rbuf
7) Add a sysctl to tune the minimum offloaded I/O size for TCP
8) The main TCP receive offload changes

--
Chris Leech [EMAIL PROTECTED]
I/O Acceleration Technology Software Development
LAN Access Division / Digital Enterprise Group 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] [I/OAT] DMA memcpy subsystem

2006-03-10 Thread Chris Leech
Provides an API for offloading memory copies to DMA devices

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/Kconfig   |2 
 drivers/Makefile  |1 
 drivers/dma/Kconfig   |   13 ++
 drivers/dma/Makefile  |1 
 drivers/dma/dmaengine.c   |  360 +
 include/linux/dmaengine.h |  323 
 6 files changed, 700 insertions(+), 0 deletions(-)

diff --git a/drivers/Kconfig b/drivers/Kconfig
index bddf431..ce7ffa7 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -70,4 +70,6 @@ source "drivers/sn/Kconfig"
 
 source "drivers/edac/Kconfig"
 
+source "drivers/dma/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index 5c69b86..516ba5e 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -73,3 +73,4 @@ obj-$(CONFIG_SGI_SN)  += sn/
 obj-y  += firmware/
 obj-$(CONFIG_CRYPTO)   += crypto/
 obj-$(CONFIG_SUPERH)   += sh/
+obj-$(CONFIG_DMA_ENGINE)   += dma/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
new file mode 100644
index 000..f9ac4bc
--- /dev/null
+++ b/drivers/dma/Kconfig
@@ -0,0 +1,13 @@
+#
+# DMA engine configuration
+#
+
+menu "DMA Engine support"
+
+config DMA_ENGINE
+   bool "Support for DMA engines"
+   ---help---
+ DMA engines offload copy operations from the CPU to dedicated
+ hardware, allowing the copies to happen asynchronously.
+
+endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
new file mode 100644
index 000..10b7391
--- /dev/null
+++ b/drivers/dma/Makefile
@@ -0,0 +1 @@
+obj-y += dmaengine.o
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
new file mode 100644
index 000..35a63d8
--- /dev/null
+++ b/drivers/dma/dmaengine.c
@@ -0,0 +1,360 @@
+/*
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*/
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/dmaengine.h>
+#include <linux/hardirq.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+static DEFINE_SPINLOCK(dma_list_lock);
+static LIST_HEAD(dma_device_list);
+static LIST_HEAD(dma_client_list);
+
+/* --- sysfs implementation --- */
+
+static ssize_t show_memcpy_count(struct class_device *cd, char *buf)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+   unsigned long count = 0;
+   int i;
+
+   for_each_cpu(i)
+   count += per_cpu_ptr(chan->local, i)->memcpy_count;
+
+   return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+   unsigned long count = 0;
+   int i;
+
+   for_each_cpu(i)
+   count += per_cpu_ptr(chan->local, i)->bytes_transferred;
+
+   return sprintf(buf, "%lu\n", count);
+}
+
+static ssize_t show_in_use(struct class_device *cd, char *buf)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+
+   return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+}
+
+static struct class_device_attribute dma_class_attrs[] = {
+   __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+   __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+   __ATTR(in_use, S_IRUGO, show_in_use, NULL),
+   __ATTR_NULL
+};
+
+static void dma_async_device_cleanup(struct kref *kref);
+
+static void dma_class_dev_release(struct class_device *cd)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+   kref_put(&chan->device->refcount, dma_async_device_cleanup);
+}
+
+static struct class dma_devclass = {
+   .name= "dma",
+   .class_dev_attrs = dma_class_attrs,
+   .release = dma_class_dev_release,
+};
+
+/* --- client and device registration --- */
+
+/**
+ * dma_client_chan_alloc - try to allocate a channel to a client
+ * @client: dma_client
+ *
+ * Called

[PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client

2006-03-10 Thread Chris Leech
Attempts to allocate per-CPU DMA channels

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 drivers/dma/Kconfig   |   12 +
 include/linux/netdevice.h |6 +++
 include/net/netdma.h  |   37 +
 net/core/dev.c|  100 +
 4 files changed, 155 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 0f15e76..30d021d 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -10,6 +10,18 @@ config DMA_ENGINE
  DMA engines offload copy operations from the CPU to dedicated
  hardware, allowing the copies to happen asynchronously.
 
+comment "DMA Clients"
+
+config NET_DMA
+   bool "Network: TCP receive copy offload"
+   depends on DMA_ENGINE && NET
+   default y
+   ---help---
+ This enables the use of DMA engines in the network stack to
+ offload receive copy-to-user operations, freeing CPU cycles.
+ Since this is the main user of the DMA engine, it should be enabled;
+ say Y here.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 950dc55..25d8610 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -37,6 +37,9 @@
 #include <linux/config.h>
 #include <linux/device.h>
 #include <linux/percpu.h>
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+#endif
 
 struct divert_blk;
 struct vlan_group;
@@ -592,6 +595,9 @@ struct softnet_data
struct sk_buff  *completion_queue;
 
struct net_device   backlog_dev;/* Sorry. 8) */
+#ifdef CONFIG_NET_DMA
+   struct dma_chan *net_dma;
+#endif
 };
 
 DECLARE_PER_CPU(struct softnet_data,softnet_data);
diff --git a/include/net/netdma.h b/include/net/netdma.h
new file mode 100644
index 000..6435aef
--- /dev/null
+++ b/include/net/netdma.h
@@ -0,0 +1,37 @@
+/*
+Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2 of the License, or (at your option)
+any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59
+Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+
+The full GNU General Public License is included in this distribution in the
+file called LICENSE.
+*/
+#ifndef NETDMA_H
+#define NETDMA_H
+#ifdef CONFIG_NET_DMA
+#include <linux/dmaengine.h>
+
+static inline struct dma_chan *get_softnet_dma(void)
+{
+   struct dma_chan *chan;
+   rcu_read_lock();
+   chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma));
+   if (chan)
+   dma_chan_get(chan);
+   rcu_read_unlock();
+   return chan;
+}
+#endif /* CONFIG_NET_DMA */
+#endif /* NETDMA_H */
diff --git a/net/core/dev.c b/net/core/dev.c
index f7f6f99..d7e61b4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,6 +115,7 @@
 #include <linux/wireless.h>		/* Note : will define WIRELESS_EXT */
 #include <net/iw_handler.h>
 #endif	/* CONFIG_NET_RADIO */
+#include <linux/dmaengine.h>
 #include <asm/current.h>
 
 /*
@@ -149,6 +150,11 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];/* 16 way hashed list */
 static struct list_head ptype_all; /* Taps */
 
+#ifdef CONFIG_NET_DMA
+static struct dma_client *net_dma_client;
+static unsigned int net_dma_count;
+#endif
+
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -1750,6 +1756,19 @@ static void net_rx_action(struct softirq
}
}
 out:
+#ifdef CONFIG_NET_DMA
+   /*
+* There may not be any more sk_buffs coming right now, so push
+* any pending DMA copies to hardware
+*/
+   if (net_dma_client) {
+   struct dma_chan *chan;
+   rcu_read_lock();
+		list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node)
+   dma_async_memcpy_issue_pending(chan);
+   rcu_read_unlock();
+   }
+#endif
local_irq_enable();
return;
 
@@ -3205,6 +3224,85 @@ static int dev_cpu_callback(struct notif
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+#ifdef CONFIG_NET_DMA
+/**
+ * net_dma_rebalance -
+ * This is called when the number of channels allocated to the net_dma_client
+ * changes.  The net_dma_client tries to have one DMA channel per CPU

[PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static

2006-03-10 Thread Chris Leech
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/tcp.h |2 ++
 net/ipv4/tcp.c|   10 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 610f66b..afc4b8a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -296,6 +296,8 @@ extern int  tcp_rcv_established(struct 
 
 extern voidtcp_rcv_space_adjust(struct sock *sk);
 
+extern voidtcp_cleanup_rbuf(struct sock *sk, int copied);
+
 extern int tcp_twsk_unique(struct sock *sk,
struct sock *sktw, void *twp);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 4b0272c..9122520 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -936,7 +936,7 @@ static int tcp_recv_urg(struct sock *sk,
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void cleanup_rbuf(struct sock *sk, int copied)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
struct tcp_sock *tp = tcp_sk(sk);
int time_to_ack = 0;
@@ -1085,7 +1085,7 @@ int tcp_read_sock(struct sock *sk, read_
 
/* Clean up data we have read: This will do ACK frames. */
if (copied)
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
return copied;
 }
 
@@ -1219,7 +1219,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
}
}
 
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
 	if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
/* Install new reader */
@@ -1390,7 +1390,7 @@ skip_copy:
 */
 
/* Clean up data we have read: This will do ACK frames. */
-   cleanup_rbuf(sk, copied);
+   tcp_cleanup_rbuf(sk, copied);
 
TCP_CHECK_TIMER(sk);
release_sock(sk);
@@ -1852,7 +1852,7 @@ static int do_tcp_setsockopt(struct sock
 		    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
 		    inet_csk_ack_scheduled(sk)) {
 			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
-			cleanup_rbuf(sk, 1);
+			tcp_cleanup_rbuf(sk, 1);
 			if (!(val & 1))
 				icsk->icsk_ack.pingpong = 1;
}

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-10 Thread Chris Leech
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/net/netdma.h |1 
 net/ipv4/tcp.c   |  110 +-
 net/ipv4/tcp_input.c |   74 ++
 net/ipv4/tcp_ipv4.c  |   18 
 net/ipv6/tcp_ipv6.c  |   12 +
 5 files changed, 193 insertions(+), 22 deletions(-)

diff --git a/include/net/netdma.h b/include/net/netdma.h
index feb499f..3d9c222 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -38,6 +38,7 @@ static inline struct dma_chan *get_softn
 int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
const struct sk_buff *skb, int offset, struct iovec *to,
size_t len, struct dma_pinned_list *pinned_list);
+int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen);
 
 #endif /* CONFIG_NET_DMA */
 #endif /* NETDMA_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9122520..a277398 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -262,7 +262,7 @@
 #include <net/tcp.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
-
+#include <net/netdma.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -1109,6 +1109,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
+   int copied_early = 0;
 
lock_sock(sk);
 
@@ -1132,6 +1133,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
target = sock_rcvlowat(sk, flags  MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+	tp->ucopy.dma_chan = NULL;
+	if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
+	    !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
+		tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
+#endif
+
do {
struct sk_buff *skb;
u32 offset;
@@ -1273,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
} else
sk_wait_data(sk, timeo);
 
+#ifdef CONFIG_NET_DMA
+		tp->ucopy.wakeup = 0;
+#endif
+
if (user_recv) {
int chunk;
 
@@ -1328,13 +1339,39 @@ do_prequeue:
}
 
 			if (!(flags & MSG_TRUNC)) {
-				err = skb_copy_datagram_iovec(skb, offset,
-					      msg->msg_iov, used);
-				if (err) {
-					/* Exception. Bailout! */
-					if (!copied)
-						copied = -EFAULT;
-					break;
+#ifdef CONFIG_NET_DMA
+				if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+					tp->ucopy.dma_chan = get_softnet_dma();
+
+				if (tp->ucopy.dma_chan) {
+					tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+						tp->ucopy.dma_chan, skb, offset,
+						msg->msg_iov, used,
+						tp->ucopy.pinned_list);
+
+					if (tp->ucopy.dma_cookie < 0) {
+
+						printk(KERN_ALERT "dma_cookie < 0\n");
+
+						/* Exception. Bailout! */
+						if (!copied)
+							copied = -EFAULT;
+						break;
+					}
+					if ((offset + used) == skb->len)
+						copied_early = 1;
+
+				} else
+#endif
+				{
+					err = skb_copy_datagram_iovec(skb, offset,
+							msg->msg_iov, used);
+					if (err) {
+						/* Exception. Bailout! */
+						if (!copied)
+							copied = -EFAULT;
+						break;
+					}
 				}
 			}
}
 
@@ -1354,15 +1391,33 @@ skip_copy:
 
 		if (skb->h.th->fin)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			if (!copied_early)
+				sk_eat_skb(sk, skb);
+#ifdef CONFIG_NET_DMA
+			else {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
+				copied_early = 0;
+			}
+#endif

[PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold

2006-03-10 Thread Chris Leech
Any socket recv of less than this ammount will not be offloaded

Signed-off-by: Chris Leech [EMAIL PROTECTED]
---

 include/linux/sysctl.h |1 +
 include/net/tcp.h  |1 +
 net/core/user_dma.c|4 
 net/ipv4/sysctl_net_ipv4.c |   10 ++
 4 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 76eaeff..cd9e7c0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -403,6 +403,7 @@ enum
NET_TCP_MTU_PROBING=113,
NET_TCP_BASE_MSS=114,
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115,
+   NET_TCP_DMA_COPYBREAK=116,
 };
 
 enum {
diff --git a/include/net/tcp.h b/include/net/tcp.h
index afc4b8a..f319368 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -221,6 +221,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
+extern int sysctl_tcp_dma_copybreak;
 extern int sysctl_tcp_nometrics_save;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 24e51eb..a85d1f1 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -33,6 +33,10 @@ file called LICENSE.
 
 #ifdef CONFIG_NET_DMA
 
+#define NET_DMA_DEFAULT_COPYBREAK 1024
+
+int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK;
+
 /**
  * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec.
  * @skb - buffer to copy
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 6b6c3ad..6a6aa53 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -688,6 +688,16 @@ ctl_table ipv4_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec
},
+#ifdef CONFIG_NET_DMA
+	{
+		.ctl_name	= NET_TCP_DMA_COPYBREAK,
+		.procname	= "tcp_dma_copybreak",
+		.data		= &sysctl_tcp_dma_copybreak,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+#endif
{ .ctl_name = 0 }
 };
 

-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >