[PATCH] iscsi: respond to netlink with unicast when appropriate
Instead of always multicasting responses, send a unicast netlink message directed at the correct pid. This will be needed if we ever want to support multiple userspace processes interacting with the kernel over iSCSI netlink simultaneously. Limitations can currently be seen if you attempt to run multiple iscsistart commands in parallel. We've fixed up the userspace issues in iscsistart that prevented multiple instances from running, so now attempts to speed up booting by bringing up multiple iscsi sessions at once in the initramfs are just running into misrouted responses that this fixes. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/scsi_transport_iscsi.c | 29 ++--- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index f4b52b44b966..65f6c94f2e9b 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2322,6 +2322,12 @@ iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp) return nlmsg_multicast(nls, skb, 0, group, gfp); } +static int +iscsi_unicast_skb(struct sk_buff *skb, u32 portid) +{ + return nlmsg_unicast(nls, skb, portid); +} + int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, char *data, uint32_t data_size) { @@ -2524,14 +2530,11 @@ void iscsi_ping_comp_event(uint32_t host_no, struct iscsi_transport *transport, EXPORT_SYMBOL_GPL(iscsi_ping_comp_event); static int -iscsi_if_send_reply(uint32_t group, int seq, int type, int done, int multi, - void *payload, int size) +iscsi_if_send_reply(u32 portid, int type, void *payload, int size) { struct sk_buff *skb; struct nlmsghdr *nlh; int len = nlmsg_total_size(size); - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? 
NLMSG_DONE : type; skb = alloc_skb(len, GFP_ATOMIC); if (!skb) { @@ -2539,10 +2542,9 @@ iscsi_if_send_reply(uint32_t group, int seq, int type, int done, int multi, return -ENOMEM; } - nlh = __nlmsg_put(skb, 0, 0, t, (len - sizeof(*nlh)), 0); - nlh->nlmsg_flags = flags; + nlh = __nlmsg_put(skb, 0, 0, type, (len - sizeof(*nlh)), 0); memcpy(nlmsg_data(nlh), payload, size); - return iscsi_multicast_skb(skb, group, GFP_ATOMIC); + return iscsi_unicast_skb(skb, portid); } static int @@ -3470,6 +3472,7 @@ static int iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) { int err = 0; + u32 portid; struct iscsi_uevent *ev = nlmsg_data(nlh); struct iscsi_transport *transport = NULL; struct iscsi_internal *priv; @@ -3490,10 +3493,12 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) if (!try_module_get(transport->owner)) return -EINVAL; + portid = NETLINK_CB(skb).portid; + switch (nlh->nlmsg_type) { case ISCSI_UEVENT_CREATE_SESSION: err = iscsi_if_create_session(priv, ep, ev, - NETLINK_CB(skb).portid, + portid, ev->u.c_session.initial_cmdsn, ev->u.c_session.cmds_max, ev->u.c_session.queue_depth); @@ -3506,7 +3511,7 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) } err = iscsi_if_create_session(priv, ep, ev, - NETLINK_CB(skb).portid, + portid, ev->u.c_bound_session.initial_cmdsn, ev->u.c_bound_session.cmds_max, ev->u.c_bound_session.queue_depth); @@ -3664,6 +3669,8 @@ iscsi_if_recv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, uint32_t *group) static void iscsi_if_rx(struct sk_buff *skb) { + u32 portid = NETLINK_CB(skb).portid; + mutex_lock(_queue_mutex); while (skb->len >= NLMSG_HDRLEN) { int err; @@ -3699,8 +3706,8 @@ iscsi_if_rx(struct sk_buff *skb) break; if (ev->type == ISCSI_UEVENT_GET_CHAP && !err) break; - err = iscsi_if_send_reply(group, nlh->nlmsg_seq, - nlh->nlmsg_type, 0, 0, ev, sizeof(*ev)); + err = iscsi_if_send_reply(portid, nlh->nlmsg_type, + ev, sizeof(*ev)); } while (err < 
0 && err != -ECONNREFUSED && err != -ESRCH); skb_pull(skb, rlen); } -- 2.14.3
Re: [PATCH 0/9] use network namespace for iSCSI control interfaces
On Tue, Nov 21, 2017 at 11:26:09AM +, David Laight wrote: > From: Chris Leech > > Sent: 15 November 2017 00:25 > > To: David Laight > > Cc: netdev@vger.kernel.org; contain...@lists.linux-foundation.org > > Subject: Re: [PATCH 0/9] use network namespace for iSCSI control interfaces > > > > On Wed, Nov 08, 2017 at 10:31:04AM +, David Laight wrote: > > > From: Chris Leech > > > > Sent: 07 November 2017 22:45 > > > > > > > > I've posted these changes to allow iSCSI management within a container > > > > using a network namespace to the SCSI and Open-iSCSI lists, but seeing > > > > as it's not really SCSI/block related I'm casting a wider net looking > > > > for reviews. > > > > > > I didn't spot you acquiring and releasing references to the namespace. > > > (I might have missed it, the relevant patch is difficult to read). > > > > > > If the sockets are created in the context of the process whose namespace > > > you are using you don't need it, but given the hooks and callbacks > > > I'm not at all sure that is obviously true. > > > > Thanks David, > > > > Looking at it again, you're right and I think I need to hold a reference > > for the iSCSI host and handle namespace deletion. Even for iscsi_tcp > > the socket gets handed off from the creating process to the transport > > and can outlive iscsid. > > It isn't that simple > IIRC: > > The namespace delete callback isn't made until the reference count is zero. > Sockets created with sock_create_kern() don't hold a reference to the > namespace > > This is all fine for sockets used for admin purposes, but rather hopeless > if you really need the namespace to continue to exist while the connections > are open - if only for long enough to close the connection. Yeah, I'm catching up on a lot of the details as I attempt to sort out what a sane behavior for iscsi_tcp should be here. With these patches as is, iscsi_tcp will hold a reference to a TCP socket created by iscsid and keep the net namespace from exiting. 
That's good for keeping iSCSI sessions alive. Bad in that all processes attached to the namespace can terminate, and if filesystem references (like bind mounts from ip-netns) are unlinked then I don't see any way to get back into the namespace to shut down iSCSI. I've been trying to sort out a way to shut down and clean up in that case, but the other approach might be to look at having a kernel thread to reference the namespace so that the ns inode could be recovered from /proc? > To make matters even more annoying the functions for holding and > releasing a namespace are GPL_ONLY :-( I have no problem with that. Thanks, Chris Leech
Re: [PATCH 0/9] use network namespace for iSCSI control interfaces
On Wed, Nov 08, 2017 at 10:31:04AM +, David Laight wrote: > From: Chris Leech > > Sent: 07 November 2017 22:45 > > > > I've posted these changes to allow iSCSI management within a container > > using a network namespace to the SCSI and Open-iSCSI lists, but seeing > > as it's not really SCSI/block related I'm casting a wider net looking > > for reviews. > > I didn't spot you acquiring and releasing references to the namespace. > (I might have missed it, the relevant patch is difficult to read). > > If the sockets are created in the context of the process whose namespace > you are using you don't need it, but given the hooks and callbacks > I'm not at all sure that is obviously true. Thanks David, Looking at it again, you're right and I think I need to hold a reference for the iSCSI host and handle namespace deletion. Even for iscsi_tcp the socket gets handed off from the creating process to the transport and can outlive iscsid. I'm looking at migration or destruction now rather than later. Chris
[PATCH 4/9] iscsi: make all iSCSI netlink multicast namespace aware
Make use of the per-net netlink sockets. Responses are sent back on the same socket/namespace the request was received on. Async events are reported on the socket/namespace stored in the iscsi_cls_host associated with the event. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/scsi_transport_iscsi.c | 92 - 1 file changed, 61 insertions(+), 31 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index d29c095ccc7d..1fc5878b1a8c 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2418,8 +2418,8 @@ iscsi_if_transport_lookup(struct iscsi_transport *tt) } static int -iscsi_multicast_netns(struct net *net, struct sk_buff *skb, - uint32_t group, gfp_t gfp) +iscsi_multicast_skb(struct net *net, struct sk_buff *skb, + uint32_t group, gfp_t gfp) { struct sock *nls; struct iscsi_net *isn; @@ -2429,12 +2429,6 @@ iscsi_multicast_netns(struct net *net, struct sk_buff *skb, return nlmsg_multicast(nls, skb, 0, group, gfp); } -static int -iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp) -{ - return iscsi_multicast_netns(_net, skb, group, gfp); -} - int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, char *data, uint32_t data_size) { @@ -2443,6 +2437,7 @@ int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, struct iscsi_uevent *ev; char *pdu; struct iscsi_internal *priv; + struct net *net; int len = nlmsg_total_size(sizeof(*ev) + sizeof(struct iscsi_hdr) + data_size); @@ -2469,7 +2464,8 @@ int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, memcpy(pdu, hdr, sizeof(struct iscsi_hdr)); memcpy(pdu + sizeof(struct iscsi_hdr), data, data_size); - return iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); + net = iscsi_conn_net(conn); + return iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iscsi_recv_pdu); @@ -2480,6 +2476,7 @@ int iscsi_offload_mesg(struct Scsi_Host 
*shost, struct nlmsghdr *nlh; struct sk_buff *skb; struct iscsi_uevent *ev; + struct net *net; int len = nlmsg_total_size(sizeof(*ev) + data_size); skb = alloc_skb(len, GFP_ATOMIC); @@ -2504,7 +2501,8 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, memcpy((char *)ev + sizeof(*ev), data, data_size); - return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC); + net = iscsi_host_net(shost->shost_data); + return iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iscsi_offload_mesg); @@ -2514,6 +2512,7 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) struct sk_buff *skb; struct iscsi_uevent *ev; struct iscsi_internal *priv; + struct net *net; int len = nlmsg_total_size(sizeof(*ev)); priv = iscsi_if_transport_lookup(conn->transport); @@ -2535,7 +2534,8 @@ void iscsi_conn_error_event(struct iscsi_cls_conn *conn, enum iscsi_err error) ev->r.connerror.cid = conn->cid; ev->r.connerror.sid = iscsi_conn_get_sid(conn); - iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); + net = iscsi_conn_net(conn); + iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); iscsi_cls_conn_printk(KERN_INFO, conn, "detected conn error (%d)\n", error); @@ -2549,6 +2549,7 @@ void iscsi_conn_login_event(struct iscsi_cls_conn *conn, struct sk_buff *skb; struct iscsi_uevent *ev; struct iscsi_internal *priv; + struct net *net; int len = nlmsg_total_size(sizeof(*ev)); priv = iscsi_if_transport_lookup(conn->transport); @@ -2569,7 +2570,9 @@ void iscsi_conn_login_event(struct iscsi_cls_conn *conn, ev->r.conn_login.state = state; ev->r.conn_login.cid = conn->cid; ev->r.conn_login.sid = iscsi_conn_get_sid(conn); - iscsi_multicast_skb(skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); + + net = iscsi_conn_net(conn); + iscsi_multicast_skb(net, skb, ISCSI_NL_GRP_ISCSID, GFP_ATOMIC); iscsi_cls_conn_printk(KERN_INFO, conn, "detected conn login (%d)\n", state); @@ -2580,11 +2583,17 @@ void iscsi_post_host_event(uint32_t host_no, struct 
iscsi_transport *transport, enum iscsi_host_event_code code, uint32_t data_size, uint8_t *data) { + struct Scsi_Host *shost; + struct net *net; struct nlmsghdr *nlh; struct sk_buff *skb; struct iscsi_uevent *ev; int len = nlmsg_total_size(sizeof(*ev)
[PATCH 0/9] use network namespace for iSCSI control interfaces
Hello, I've posted these changes to allow iSCSI management within a container using a network namespace to the SCSI and Open-iSCSI lists, but seeing as it's not really SCSI/block related I'm casting a wider net looking for reviews. These patches apply network namespace to the iSCSI netlink family and sysfs objects from the iSCSI transport class. Thank you, Chris Leech --- This series of changes makes the iSCSI netlink and sysfs control interfaces filtered by network namespace. This is required to run iscsid in any network namespace other than the initial default one. Currently the netlink communication will fail if iscsid is started in a non-default network namespace, as there is no kernel side socket. After fixing that, the rest of these changes are to filter visibility of the iSCSI transport objects by netns. This allows for multiple iscsid instances to be run, one per netns, each controlling its own set of iSCSI sessions. The iSCSI transport objects are filtered, but not the SCSI or block layer devices. So while iSCSI hosts and sessions become limited to a network namespace, any attached devices remain visible system wide. This currently only supports iscsi_tcp running in a new namespace, as it creates a virtual host per session. Support could be added later to allow assignment of iSCSI HBAs to network namespace, much as is done for network interfaces. 
Chris Leech (9): iscsi: create per-net iscsi netlink kernel sockets iscsi: associate endpoints with a host iscsi: sysfs filtering by network namespace iscsi: make all iSCSI netlink multicast namespace aware iscsi: set netns for iscsi_tcp hosts iscsi: check net namespace for all iscsi lookups iscsi: convert flashnode devices from bus to class iscsi: rename iscsi_bus_flash_* to iscsi_flash_* iscsi: filter flashnode sysfs by net namespace drivers/infiniband/ulp/iser/iscsi_iser.c | 7 +- drivers/scsi/be2iscsi/be_iscsi.c | 6 +- drivers/scsi/bnx2i/bnx2i_iscsi.c | 6 +- drivers/scsi/cxgbi/libcxgbi.c| 6 +- drivers/scsi/iscsi_tcp.c | 7 + drivers/scsi/qedi/qedi_iscsi.c | 6 +- drivers/scsi/qla4xxx/ql4_os.c| 62 +-- drivers/scsi/scsi_transport_iscsi.c | 625 ++- include/scsi/scsi_transport_iscsi.h | 63 ++-- 9 files changed, 538 insertions(+), 250 deletions(-) -- 2.9.5
[PATCH 5/9] iscsi: set netns for iscsi_tcp hosts
This lets iscsi_tcp operate in multiple namespaces. It uses current during session creation to find the net namespace, but it might be better to manage to pass it along from the iscsi netlink socket. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/iscsi_tcp.c| 7 +++ drivers/scsi/scsi_transport_iscsi.c | 7 ++- include/scsi/scsi_transport_iscsi.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 4d934d6c3e13..b368c94c884b 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -957,6 +957,11 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev) return 0; } +static struct net *iscsi_sw_tcp_netns(struct Scsi_Host *shost) +{ + return current->nsproxy->net_ns; +} + static struct scsi_host_template iscsi_sw_tcp_sht = { .module = THIS_MODULE, .name = "iSCSI Initiator over TCP/IP", @@ -1013,6 +1018,8 @@ static struct iscsi_transport iscsi_sw_tcp_transport = { .alloc_pdu = iscsi_sw_tcp_pdu_alloc, /* recovery */ .session_recovery_timedout = iscsi_session_recovery_timedout, + /* net namespace */ + .get_netns = iscsi_sw_tcp_netns, }; static int __init iscsi_sw_tcp_init(void) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 1fc5878b1a8c..2ec10f6ac3a2 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1600,11 +1600,16 @@ static int iscsi_setup_host(struct transport_container *tc, struct device *dev, { struct Scsi_Host *shost = dev_to_shost(dev); struct iscsi_cls_host *ihost = shost->shost_data; + struct iscsi_internal *priv = to_iscsi_internal(shost->transportt); + struct iscsi_transport *transport = priv->iscsi_transport; memset(ihost, 0, sizeof(*ihost)); atomic_set(>nr_scans, 0); mutex_init(>mutex); - ihost->netns = _net; + if (transport->get_netns) + ihost->netns = transport->get_netns(shost); + else + ihost->netns = _net; iscsi_bsg_host_add(shost, ihost); /* ignore any 
bsg add error - we just can't do sgio */ diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 8c8191dfdc21..3c4cd4779f72 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -168,6 +168,7 @@ struct iscsi_transport { int (*logout_flashnode_sid) (struct iscsi_cls_session *cls_sess); int (*get_host_stats) (struct Scsi_Host *shost, char *buf, int len); u8 (*check_protection)(struct iscsi_task *task, sector_t *sector); + struct net *(*get_netns)(struct Scsi_Host *shost); }; /* -- 2.9.5
[PATCH 3/9] iscsi: sysfs filtering by network namespace
This makes the iscsi_host, iscsi_session, iscsi_connection, iscsi_iface, and iscsi_endpoint transport class devices only visible in sysfs under a matching network namespace. The network namespace for all of these objects is tracked in the iscsi_cls_host structure. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/scsi_transport_iscsi.c | 128 +++- include/scsi/scsi_transport_iscsi.h | 1 + 2 files changed, 112 insertions(+), 17 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 6ab7ca82b121..d29c095ccc7d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -161,9 +161,31 @@ static void iscsi_endpoint_release(struct device *dev) kfree(ep); } +static struct net *iscsi_host_net(struct iscsi_cls_host *ihost) +{ + return ihost->netns; +} + +static struct net *iscsi_endpoint_net(struct iscsi_endpoint *ep) +{ + struct Scsi_Host *shost = iscsi_endpoint_to_shost(ep); + struct iscsi_cls_host *ihost = shost->shost_data; + + return iscsi_host_net(ihost); +} + +static const void *iscsi_endpoint_namespace(struct device *dev) +{ + struct iscsi_endpoint *ep = iscsi_dev_to_endpoint(dev); + + return iscsi_endpoint_net(ep); +} + static struct class iscsi_endpoint_class = { .name = "iscsi_endpoint", .dev_release = iscsi_endpoint_release, + .ns_type = _ns_type_operations, + .namespace = iscsi_endpoint_namespace, }; static ssize_t @@ -285,10 +307,26 @@ static void iscsi_iface_release(struct device *dev) put_device(parent); } +static struct net *iscsi_iface_net(struct iscsi_iface *iface) +{ + struct Scsi_Host *shost = iscsi_iface_to_shost(iface); + struct iscsi_cls_host *ihost = shost->shost_data; + + return iscsi_host_net(ihost); +} + +static const void *iscsi_iface_namespace(struct device *dev) +{ + struct iscsi_iface *iface = iscsi_dev_to_iface(dev); + + return iscsi_iface_net(iface); +} static struct class iscsi_iface_class = { .name = "iscsi_iface", .dev_release = 
iscsi_iface_release, + .ns_type = _ns_type_operations, + .namespace = iscsi_iface_namespace, }; #define ISCSI_IFACE_ATTR(_prefix, _name, _mode, _show, _store) \ @@ -1566,6 +1604,7 @@ static int iscsi_setup_host(struct transport_container *tc, struct device *dev, memset(ihost, 0, sizeof(*ihost)); atomic_set(>nr_scans, 0); mutex_init(>mutex); + ihost->netns = _net; iscsi_bsg_host_add(shost, ihost); /* ignore any bsg add error - we just can't do sgio */ @@ -1586,23 +1625,78 @@ static int iscsi_remove_host(struct transport_container *tc, return 0; } -static DECLARE_TRANSPORT_CLASS(iscsi_host_class, - "iscsi_host", - iscsi_setup_host, - iscsi_remove_host, - NULL); - -static DECLARE_TRANSPORT_CLASS(iscsi_session_class, - "iscsi_session", - NULL, - NULL, - NULL); - -static DECLARE_TRANSPORT_CLASS(iscsi_connection_class, - "iscsi_connection", - NULL, - NULL, - NULL); +#define DECLARE_TRANSPORT_CLASS_NS(cls, nm, su, rm, cfg, ns, nslookup) \ +struct transport_class cls = { \ + .class = { \ + .name = nm, \ + .ns_type = ns, \ + .namespace = nslookup, \ + }, \ + .setup = su,\ + .remove = rm, \ + .configure = cfg, \ +} + +static const void *iscsi_host_namespace(struct device *dev) +{ + struct Scsi_Host *shost = transport_class_to_shost(dev); + struct iscsi_cls_host *ihost = shost->shost_data; + + return iscsi_host_net(ihost); +} + +static DECLARE_TRANSPORT_CLASS_NS(iscsi_host_class, + "iscsi_host", + iscsi_setup_host, + iscsi_remove_host, + NULL, + _ns_type_operations, + iscsi_host_namespace); + +static struct net *iscsi_sess_net(struct iscsi_cls_session *cls_session) +{ + struct Scsi_Host *sh
[PATCH 8/9] iscsi: rename iscsi_bus_flash_* to iscsi_flash_*
cleanups after the bus to class conversion Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/qla4xxx/ql4_os.c | 52 +- drivers/scsi/scsi_transport_iscsi.c | 102 ++-- include/scsi/scsi_transport_iscsi.h | 48 + 3 files changed, 102 insertions(+), 100 deletions(-) diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 55a729568873..9c80688d0681 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -169,20 +169,20 @@ static int qla4xxx_host_reset(struct Scsi_Host *shost, int reset_type); * iSCSI Flash DDB sysfs entry points */ static int -qla4xxx_sysfs_ddb_set_param(struct iscsi_bus_flash_session *fnode_sess, - struct iscsi_bus_flash_conn *fnode_conn, +qla4xxx_sysfs_ddb_set_param(struct iscsi_flash_session *fnode_sess, + struct iscsi_flash_conn *fnode_conn, void *data, int len); static int -qla4xxx_sysfs_ddb_get_param(struct iscsi_bus_flash_session *fnode_sess, +qla4xxx_sysfs_ddb_get_param(struct iscsi_flash_session *fnode_sess, int param, char *buf); static int qla4xxx_sysfs_ddb_add(struct Scsi_Host *shost, const char *buf, int len); static int -qla4xxx_sysfs_ddb_delete(struct iscsi_bus_flash_session *fnode_sess); -static int qla4xxx_sysfs_ddb_login(struct iscsi_bus_flash_session *fnode_sess, - struct iscsi_bus_flash_conn *fnode_conn); -static int qla4xxx_sysfs_ddb_logout(struct iscsi_bus_flash_session *fnode_sess, - struct iscsi_bus_flash_conn *fnode_conn); +qla4xxx_sysfs_ddb_delete(struct iscsi_flash_session *fnode_sess); +static int qla4xxx_sysfs_ddb_login(struct iscsi_flash_session *fnode_sess, + struct iscsi_flash_conn *fnode_conn); +static int qla4xxx_sysfs_ddb_logout(struct iscsi_flash_session *fnode_sess, + struct iscsi_flash_conn *fnode_conn); static int qla4xxx_sysfs_ddb_logout_sid(struct iscsi_cls_session *cls_sess); static struct qla4_8xxx_legacy_intr_set legacy_intr[] = @@ -3454,8 +3454,8 @@ static int qla4xxx_task_xmit(struct iscsi_task *task) return -ENOSYS; } -static int 
qla4xxx_copy_from_fwddb_param(struct iscsi_bus_flash_session *sess, -struct iscsi_bus_flash_conn *conn, +static int qla4xxx_copy_from_fwddb_param(struct iscsi_flash_session *sess, +struct iscsi_flash_conn *conn, struct dev_db_entry *fw_ddb_entry) { unsigned long options = 0; @@ -3596,8 +3596,8 @@ static int qla4xxx_copy_from_fwddb_param(struct iscsi_bus_flash_session *sess, return rc; } -static int qla4xxx_copy_to_fwddb_param(struct iscsi_bus_flash_session *sess, - struct iscsi_bus_flash_conn *conn, +static int qla4xxx_copy_to_fwddb_param(struct iscsi_flash_session *sess, + struct iscsi_flash_conn *conn, struct dev_db_entry *fw_ddb_entry) { uint16_t options; @@ -7162,7 +7162,7 @@ static void qla4xxx_build_new_nt_list(struct scsi_qla_host *ha, **/ static int qla4xxx_sysfs_ddb_is_non_persistent(struct device *dev, void *data) { - struct iscsi_bus_flash_session *fnode_sess; + struct iscsi_flash_session *fnode_sess; if (!iscsi_is_flashnode_session_dev(dev)) return 0; @@ -7192,8 +7192,8 @@ static int qla4xxx_sysfs_ddb_tgt_create(struct scsi_qla_host *ha, struct dev_db_entry *fw_ddb_entry, uint16_t *idx, int user) { - struct iscsi_bus_flash_session *fnode_sess = NULL; - struct iscsi_bus_flash_conn *fnode_conn = NULL; + struct iscsi_flash_session *fnode_sess = NULL; + struct iscsi_flash_conn *fnode_conn = NULL; int rc = QLA_ERROR; fnode_sess = iscsi_create_flashnode_sess(ha->host, *idx, @@ -7330,8 +7330,8 @@ static int qla4xxx_sysfs_ddb_add(struct Scsi_Host *shost, const char *buf, * This writes the contents of target ddb buffer to Flash with a valid cookie * value in order to make the ddb entry persistent. 
**/ -static int qla4xxx_sysfs_ddb_apply(struct iscsi_bus_flash_session *fnode_sess, - struct iscsi_bus_flash_conn *fnode_conn) +static int qla4xxx_sysfs_ddb_apply(struct iscsi_flash_session *fnode_sess, + struct iscsi_flash_conn *fnode_conn) { struct Scsi_Host *shost = iscsi_flash_session_to_shost(fnode_sess); struct scsi_qla_host *ha = to_qla_host(shost); @@ -7520,8 +7520,8 @@ static int qla4xxx_ddb_login_nt(struct scsi_qla_host *ha, * * This logs in to the specifi
[PATCH 6/9] iscsi: check net namespace for all iscsi lookups
All internal lookups of iSCSI transport objects need to be filtered by net namespace. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/infiniband/ulp/iser/iscsi_iser.c | 5 +- drivers/scsi/be2iscsi/be_iscsi.c | 4 +- drivers/scsi/bnx2i/bnx2i_iscsi.c | 4 +- drivers/scsi/cxgbi/libcxgbi.c| 4 +- drivers/scsi/qedi/qedi_iscsi.c | 4 +- drivers/scsi/qla4xxx/ql4_os.c| 6 +- drivers/scsi/scsi_transport_iscsi.c | 201 +++ include/scsi/scsi_transport_iscsi.h | 5 +- 8 files changed, 150 insertions(+), 83 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 0a4214be4877..6d088634a806 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -464,15 +464,18 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, struct iscsi_conn *conn = cls_conn->dd_data; struct iser_conn *iser_conn; struct iscsi_endpoint *ep; + struct net *net; int error; error = iscsi_conn_bind(cls_session, cls_conn, is_leading); if (error) return error; + /* the transport ep handle comes from user space so it must be * verified against the global ib connections list */ - ep = iscsi_lookup_endpoint(transport_eph); + net = iscsi_sess_net(cls_session); + ep = iscsi_lookup_endpoint(net, transport_eph); if (!ep) { iser_err("can't bind eph %llx\n", (unsigned long long)transport_eph); diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index 33f79f385660..1f4b1b98b4e6 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -181,8 +181,10 @@ int beiscsi_conn_bind(struct iscsi_cls_session *cls_session, struct beiscsi_endpoint *beiscsi_ep; struct iscsi_endpoint *ep; uint16_t cri_index; + struct net *net; - ep = iscsi_lookup_endpoint(transport_fd); + net = iscsi_sess_net(cls_session); + ep = iscsi_lookup_endpoint(net, transport_fd); if (!ep) return -EINVAL; diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c 
index 19fadb5d3b3c..58dca20f0ba0 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -1414,9 +1414,11 @@ static int bnx2i_conn_bind(struct iscsi_cls_session *cls_session, struct bnx2i_hba *hba = iscsi_host_priv(shost); struct bnx2i_endpoint *bnx2i_ep; struct iscsi_endpoint *ep; + struct net *net; int ret_code; - ep = iscsi_lookup_endpoint(transport_fd); + net = iscsi_sess_net(cls_session); + ep = iscsi_lookup_endpoint(net, transport_fd); if (!ep) return -EINVAL; /* diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 558484f72738..e768fe285e85 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -2373,9 +2373,11 @@ int cxgbi_bind_conn(struct iscsi_cls_session *cls_session, struct iscsi_endpoint *ep; struct cxgbi_endpoint *cep; struct cxgbi_sock *csk; + struct net *net; int err; - ep = iscsi_lookup_endpoint(transport_eph); + net = iscsi_sess_net(cls_session); + ep = iscsi_lookup_endpoint(net, transport_eph); if (!ep) return -EINVAL; diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index 5ae589ea1dd2..5cd267a457f4 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -381,8 +381,10 @@ static int qedi_conn_bind(struct iscsi_cls_session *cls_session, struct qedi_ctx *qedi = iscsi_host_priv(shost); struct qedi_endpoint *qedi_ep; struct iscsi_endpoint *ep; + struct net *net; - ep = iscsi_lookup_endpoint(transport_fd); + net = iscsi_sess_net(cls_session); + ep = iscsi_lookup_endpoint(net, transport_fd); if (!ep) return -EINVAL; diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 5785bf6c3ec0..770313d0b986 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -3178,6 +3178,7 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session *cls_session, struct ddb_entry *ddb_entry; struct scsi_qla_host *ha; struct iscsi_session *sess; + struct net *net; sess = 
cls_session->dd_data; ddb_entry = sess->dd_data; @@ -3186,9 +3187,12 @@ static int qla4xxx_conn_bind(struct iscsi_cls_session *cls_session, DEBUG2(ql4_printk(KERN_INFO, ha, "%s: sid = %d, cid = %d\n", __func__, cls_session->sid, cls_conn->cid)); + net = iscsi_sess_net(cls_session
[PATCH 7/9] iscsi: convert flashnode devices from bus to class
The flashnode session and connection devices should be filtered by net namespace along with the iscsi_host, but we can't do that with a bus device. As these don't use any of the bus matching functionality, they make more sense as a class device anyway. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/qla4xxx/ql4_os.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 36 +++- include/scsi/scsi_transport_iscsi.h | 2 ++ 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 770313d0b986..55a729568873 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -7164,7 +7164,7 @@ static int qla4xxx_sysfs_ddb_is_non_persistent(struct device *dev, void *data) { struct iscsi_bus_flash_session *fnode_sess; - if (!iscsi_flashnode_bus_match(dev, NULL)) + if (!iscsi_is_flashnode_session_dev(dev)) return 0; fnode_sess = iscsi_dev_to_flash_session(dev); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index fbec3a019f00..b053d57a482d 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1060,6 +1060,12 @@ static const struct device_type iscsi_flashnode_sess_dev_type = { .release = iscsi_flashnode_sess_release, }; +bool iscsi_is_flashnode_session_dev(struct device *dev) +{ + return dev->type == _flashnode_sess_dev_type; +} +EXPORT_SYMBOL_GPL(iscsi_is_flashnode_session_dev); + /* flash node connection attrs show */ #define iscsi_flashnode_conn_attr_show(type, name, param) \ static ssize_t \ @@ -1246,20 +1252,8 @@ static const struct device_type iscsi_flashnode_conn_dev_type = { .release = iscsi_flashnode_conn_release, }; -static struct bus_type iscsi_flashnode_bus; - -int iscsi_flashnode_bus_match(struct device *dev, -struct device_driver *drv) -{ - if (dev->bus == _flashnode_bus) - return 1; - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_flashnode_bus_match); - -static struct bus_type 
iscsi_flashnode_bus = { +static struct class iscsi_flashnode_bus = { .name = "iscsi_flashnode", - .match = _flashnode_bus_match, }; /** @@ -1290,7 +1284,7 @@ iscsi_create_flashnode_sess(struct Scsi_Host *shost, int index, fnode_sess->transport = transport; fnode_sess->target_id = index; fnode_sess->dev.type = _flashnode_sess_dev_type; - fnode_sess->dev.bus = _flashnode_bus; + fnode_sess->dev.class = _flashnode_bus; fnode_sess->dev.parent = >shost_gendev; dev_set_name(_sess->dev, "flashnode_sess-%u:%u", shost->host_no, index); @@ -1338,7 +1332,7 @@ iscsi_create_flashnode_conn(struct Scsi_Host *shost, fnode_conn->transport = transport; fnode_conn->dev.type = _flashnode_conn_dev_type; - fnode_conn->dev.bus = _flashnode_bus; + fnode_conn->dev.class = _flashnode_bus; fnode_conn->dev.parent = _sess->dev; dev_set_name(_conn->dev, "flashnode_conn-%u:%u:0", shost->host_no, fnode_sess->target_id); @@ -1371,7 +1365,7 @@ EXPORT_SYMBOL_GPL(iscsi_create_flashnode_conn); */ static int iscsi_is_flashnode_conn_dev(struct device *dev, void *data) { - return dev->bus == _flashnode_bus; + return dev->type == _flashnode_conn_dev_type; } static int iscsi_destroy_flashnode_conn(struct iscsi_bus_flash_conn *fnode_conn) @@ -1385,7 +1379,7 @@ static int flashnode_match_index(struct device *dev, void *data) struct iscsi_bus_flash_session *fnode_sess = NULL; int ret = 0; - if (!iscsi_flashnode_bus_match(dev, NULL)) + if (dev->type != _flashnode_sess_dev_type) goto exit_match_index; fnode_sess = iscsi_dev_to_flash_session(dev); @@ -1491,7 +1485,7 @@ EXPORT_SYMBOL_GPL(iscsi_destroy_flashnode_sess); static int iscsi_iter_destroy_flashnode_fn(struct device *dev, void *data) { - if (!iscsi_flashnode_bus_match(dev, NULL)) + if (dev->type != _flashnode_sess_dev_type) return 0; iscsi_destroy_flashnode_sess(iscsi_dev_to_flash_session(dev)); @@ -4752,7 +4746,7 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_conn_class; - err = bus_register(_flashnode_bus); + err = 
class_register(_flashnode_bus); if (err) goto unregister_session_class; @@ -4773,7 +4767,7 @@ static __init int iscsi_transport_init(void) unregister_pernet_subsys: unregister_pernet_subsys(_net_ops); unregister_flashnode_bus: - bus_unregister(_flashnode_bus); + class_unregister(_flashnode
[PATCH 1/9] iscsi: create per-net iscsi netlink kernel sockets
Prepare iSCSI netlink to operate in multiple namespaces. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/scsi_transport_iscsi.c | 67 +++-- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 7404d26895f5..0b23ba346cbe 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include #include @@ -1601,7 +1603,11 @@ static DECLARE_TRANSPORT_CLASS(iscsi_connection_class, NULL, NULL); -static struct sock *nls; +struct iscsi_net { + struct sock *nls; +}; + +static int iscsi_net_id __read_mostly; static DEFINE_MUTEX(rx_queue_mutex); static LIST_HEAD(sesslist); @@ -2317,11 +2323,23 @@ iscsi_if_transport_lookup(struct iscsi_transport *tt) } static int -iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp) +iscsi_multicast_netns(struct net *net, struct sk_buff *skb, + uint32_t group, gfp_t gfp) { + struct sock *nls; + struct iscsi_net *isn; + + isn = net_generic(net, iscsi_net_id); + nls = isn->nls; return nlmsg_multicast(nls, skb, 0, group, gfp); } +static int +iscsi_multicast_skb(struct sk_buff *skb, uint32_t group, gfp_t gfp) +{ + return iscsi_multicast_netns(_net, skb, group, gfp); +} + int iscsi_recv_pdu(struct iscsi_cls_conn *conn, struct iscsi_hdr *hdr, char *data, uint32_t data_size) { @@ -4490,13 +4508,42 @@ int iscsi_unregister_transport(struct iscsi_transport *tt) } EXPORT_SYMBOL_GPL(iscsi_unregister_transport); -static __init int iscsi_transport_init(void) +static int __net_init iscsi_net_init(struct net *net) { - int err; + struct sock *nls; + struct iscsi_net *isn; struct netlink_kernel_cfg cfg = { .groups = 1, .input = iscsi_if_rx, }; + + nls = netlink_kernel_create(net, NETLINK_ISCSI, ); + if (!nls) + return -ENOMEM; + isn = net_generic(net, iscsi_net_id); + isn->nls = nls; + return 0; +} + +static void __net_exit 
iscsi_net_exit(struct net *net) +{ + struct iscsi_net *isn; + + isn = net_generic(net, iscsi_net_id); + netlink_kernel_release(isn->nls); + isn->nls = NULL; +} + +static struct pernet_operations iscsi_net_ops = { + .init = iscsi_net_init, + .exit = iscsi_net_exit, + .id = _net_id, + .size = sizeof(struct iscsi_net), +}; + +static __init int iscsi_transport_init(void) +{ + int err; printk(KERN_INFO "Loading iSCSI transport class v%s.\n", ISCSI_TRANSPORT_VERSION); @@ -4530,8 +4577,8 @@ static __init int iscsi_transport_init(void) if (err) goto unregister_session_class; - nls = netlink_kernel_create(_net, NETLINK_ISCSI, ); - if (!nls) { + err = register_pernet_subsys(_net_ops); + if (err) { err = -ENOBUFS; goto unregister_flashnode_bus; } @@ -4539,13 +4586,13 @@ static __init int iscsi_transport_init(void) iscsi_eh_timer_workq = create_singlethread_workqueue("iscsi_eh"); if (!iscsi_eh_timer_workq) { err = -ENOMEM; - goto release_nls; + goto unregister_pernet_subsys; } return 0; -release_nls: - netlink_kernel_release(nls); +unregister_pernet_subsys: + unregister_pernet_subsys(_net_ops); unregister_flashnode_bus: bus_unregister(_flashnode_bus); unregister_session_class: @@ -4566,7 +4613,7 @@ static __init int iscsi_transport_init(void) static void __exit iscsi_transport_exit(void) { destroy_workqueue(iscsi_eh_timer_workq); - netlink_kernel_release(nls); + unregister_pernet_subsys(_net_ops); bus_unregister(_flashnode_bus); transport_class_unregister(_connection_class); transport_class_unregister(_session_class); -- 2.9.5
[PATCH 9/9] iscsi: filter flashnode sysfs by net namespace
Finished the net namespace support for flashnode sysfs devices Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/scsi/scsi_transport_iscsi.c | 33 + 1 file changed, 33 insertions(+) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 5ffda170ac9d..783971d72c4c 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1268,8 +1268,41 @@ static int iscsi_is_flashnode_conn_dev(struct device *dev, void *data) return dev->type == _flashnode_conn_dev_type; } +static struct net *iscsi_flashnode_sess_net(struct iscsi_flash_session *f_sess) +{ + struct Scsi_Host *shost = iscsi_flash_session_to_shost(f_sess); + struct iscsi_cls_host *ihost = shost->shost_data; + + return iscsi_host_net(ihost); +} + +static struct net *iscsi_flashnode_conn_net(struct iscsi_flash_conn *f_conn) +{ + struct iscsi_flash_session *f_sess = + iscsi_flash_conn_to_flash_session(f_conn); + + return iscsi_flashnode_sess_net(f_sess); +} + +static const void *iscsi_flashnode_namespace(struct device *dev) +{ + struct iscsi_flash_conn *f_conn; + struct iscsi_flash_session *f_sess; + + if (iscsi_is_flashnode_conn_dev(dev, NULL)) { + f_conn = iscsi_dev_to_flash_conn(dev); + return iscsi_flashnode_conn_net(f_conn); + } else if (iscsi_is_flashnode_session_dev(dev)) { + f_sess = iscsi_dev_to_flash_session(dev); + return iscsi_flashnode_sess_net(f_sess); + } + return NULL; +} + static struct class iscsi_flashnode = { .name = "iscsi_flashnode", + .ns_type = _ns_type_operations, + .namespace = iscsi_flashnode_namespace, }; /** -- 2.9.5
[PATCH 2/9] iscsi: associate endpoints with a host
Right now the iscsi_endpoint is only linked to a connection once that connection has been established. For net namespace filtering of the sysfs objects, associate an endpoint with the host that it was allocated for when it is created. Signed-off-by: Chris Leech <cle...@redhat.com> --- drivers/infiniband/ulp/iser/iscsi_iser.c | 2 +- drivers/scsi/be2iscsi/be_iscsi.c | 2 +- drivers/scsi/bnx2i/bnx2i_iscsi.c | 2 +- drivers/scsi/cxgbi/libcxgbi.c| 2 +- drivers/scsi/qedi/qedi_iscsi.c | 2 +- drivers/scsi/qla4xxx/ql4_os.c| 2 +- drivers/scsi/scsi_transport_iscsi.c | 3 ++- include/scsi/scsi_transport_iscsi.h | 6 +- 8 files changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 19624e023ebd..0a4214be4877 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -817,7 +817,7 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, struct iser_conn *iser_conn; struct iscsi_endpoint *ep; - ep = iscsi_create_endpoint(0); + ep = iscsi_create_endpoint(shost, 0); if (!ep) return ERR_PTR(-ENOMEM); diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index a398c54139aa..33f79f385660 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -1157,7 +1157,7 @@ beiscsi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, return ERR_PTR(ret); } - ep = iscsi_create_endpoint(sizeof(struct beiscsi_endpoint)); + ep = iscsi_create_endpoint(shost, sizeof(struct beiscsi_endpoint)); if (!ep) { ret = -ENOMEM; return ERR_PTR(ret); diff --git a/drivers/scsi/bnx2i/bnx2i_iscsi.c b/drivers/scsi/bnx2i/bnx2i_iscsi.c index 03c104b47f31..19fadb5d3b3c 100644 --- a/drivers/scsi/bnx2i/bnx2i_iscsi.c +++ b/drivers/scsi/bnx2i/bnx2i_iscsi.c @@ -384,7 +384,7 @@ static struct iscsi_endpoint *bnx2i_alloc_ep(struct bnx2i_hba *hba) struct bnx2i_endpoint *bnx2i_ep; u32 ec_div; - ep = 
iscsi_create_endpoint(sizeof(*bnx2i_ep)); + ep = iscsi_create_endpoint(hba->shost, sizeof(*bnx2i_ep)); if (!ep) { printk(KERN_ERR "bnx2i: Could not allocate ep\n"); return NULL; diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c index 858e32e8ad2d..558484f72738 100644 --- a/drivers/scsi/cxgbi/libcxgbi.c +++ b/drivers/scsi/cxgbi/libcxgbi.c @@ -2616,7 +2616,7 @@ struct iscsi_endpoint *cxgbi_ep_connect(struct Scsi_Host *shost, goto release_conn; } - ep = iscsi_create_endpoint(sizeof(*cep)); + ep = iscsi_create_endpoint(shost, sizeof(*cep)); if (!ep) { err = -ENOMEM; pr_info("iscsi alloc ep, OOM.\n"); diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c index a02b34ea5cab..5ae589ea1dd2 100644 --- a/drivers/scsi/qedi/qedi_iscsi.c +++ b/drivers/scsi/qedi/qedi_iscsi.c @@ -847,7 +847,7 @@ qedi_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, return ERR_PTR(ret); } - ep = iscsi_create_endpoint(sizeof(struct qedi_endpoint)); + ep = iscsi_create_endpoint(shost, sizeof(struct qedi_endpoint)); if (!ep) { QEDI_ERR(>dbg_ctx, "endpoint create fail\n"); ret = -ENOMEM; diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 64c6fa563fdb..5785bf6c3ec0 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1673,7 +1673,7 @@ qla4xxx_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, } ha = iscsi_host_priv(shost); - ep = iscsi_create_endpoint(sizeof(struct qla_endpoint)); + ep = iscsi_create_endpoint(shost, sizeof(struct qla_endpoint)); if (!ep) { ret = -ENOMEM; return ERR_PTR(ret); diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 0b23ba346cbe..6ab7ca82b121 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -194,7 +194,7 @@ static int iscsi_match_epid(struct device *dev, const void *data) } struct iscsi_endpoint * -iscsi_create_endpoint(int dd_size) 
+iscsi_create_endpoint(struct Scsi_Host *shost, int dd_size) { struct device *dev; struct iscsi_endpoint *ep; @@ -221,6 +221,7 @@ iscsi_create_endpoint(int dd_size) ep->id = id; ep->dev.class = _endpoint_class; + ep->dev.parent = >shost_gendev; dev_set_name(>dev, "ep-%llu", (unsigned long long) id); err = device_register(>dev); if (err) diff --git a/include/scsi/scsi_trans
Re: [PATCH 22/29] drivers, scsi: convert iscsi_task.refcount from atomic_t to refcount_t
On Mon, Mar 06, 2017 at 04:21:09PM +0200, Elena Reshetova wrote: > refcount_t type and corresponding API should be > used instead of atomic_t when the variable is used as > a reference counter. This allows to avoid accidental > refcounter overflows that might lead to use-after-free > situations. > > Signed-off-by: Elena Reshetova <elena.reshet...@intel.com> > Signed-off-by: Hans Liljestrand <ishkam...@gmail.com> > Signed-off-by: Kees Cook <keesc...@chromium.org> > Signed-off-by: David Windsor <dwind...@gmail.com> This looks OK to me. Acked-by: Chris Leech <cle...@redhat.com> > --- > drivers/scsi/libiscsi.c| 8 > drivers/scsi/qedi/qedi_iscsi.c | 2 +- > include/scsi/libiscsi.h| 3 ++- > 3 files changed, 7 insertions(+), 6 deletions(-) > > diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c > index 834d121..7eb1d2c 100644 > --- a/drivers/scsi/libiscsi.c > +++ b/drivers/scsi/libiscsi.c > @@ -516,13 +516,13 @@ static void iscsi_free_task(struct iscsi_task *task) > > void __iscsi_get_task(struct iscsi_task *task) > { > - atomic_inc(>refcount); > + refcount_inc(>refcount); > } > EXPORT_SYMBOL_GPL(__iscsi_get_task); > > void __iscsi_put_task(struct iscsi_task *task) > { > - if (atomic_dec_and_test(>refcount)) > + if (refcount_dec_and_test(>refcount)) > iscsi_free_task(task); > } > EXPORT_SYMBOL_GPL(__iscsi_put_task); > @@ -744,7 +744,7 @@ __iscsi_conn_send_pdu(struct iscsi_conn *conn, struct > iscsi_hdr *hdr, >* released by the lld when it has transmitted the task for >* pdus we do not expect a response for. 
>*/ > - atomic_set(>refcount, 1); > + refcount_set(>refcount, 1); > task->conn = conn; > task->sc = NULL; > INIT_LIST_HEAD(>running); > @@ -1616,7 +1616,7 @@ static inline struct iscsi_task > *iscsi_alloc_task(struct iscsi_conn *conn, > sc->SCp.phase = conn->session->age; > sc->SCp.ptr = (char *) task; > > - atomic_set(>refcount, 1); > + refcount_set(>refcount, 1); > task->state = ISCSI_TASK_PENDING; > task->conn = conn; > task->sc = sc; > diff --git a/drivers/scsi/qedi/qedi_iscsi.c b/drivers/scsi/qedi/qedi_iscsi.c > index b9f79d3..3895bd5 100644 > --- a/drivers/scsi/qedi/qedi_iscsi.c > +++ b/drivers/scsi/qedi/qedi_iscsi.c > @@ -1372,7 +1372,7 @@ static void qedi_cleanup_task(struct iscsi_task *task) > { > if (!task->sc || task->state == ISCSI_TASK_PENDING) { > QEDI_INFO(NULL, QEDI_LOG_IO, "Returning ref_cnt=%d\n", > - atomic_read(>refcount)); > + refcount_read(>refcount)); > return; > } > > diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h > index b0e275d..24d74b5 100644 > --- a/include/scsi/libiscsi.h > +++ b/include/scsi/libiscsi.h > @@ -29,6 +29,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -139,7 +140,7 @@ struct iscsi_task { > > /* state set/tested under session->lock */ > int state; > - atomic_trefcount; > + refcount_t refcount; > struct list_headrunning;/* running cmd list */ > void*dd_data; /* driver/transport data */ > }; > -- > 2.7.4 >
Re: [RFC PATCH 0/4] Make iSCSI network namespace aware
On Wed, May 20, 2015 at 11:45:43AM -0700, Andy Grover wrote: On 05/13/2015 03:12 PM, Chris Leech wrote: This is only about the structures and functionality involved in maintaining the iSCSI session, the SCSI host along with its discovered targets and devices has no association with network namespaces. These patches are functional, but not complete. There's no isolation enforced in the kernel just yet, so it relies on well behaved userspace. I plan on fixing that, but wanted some feedback on the idea and approach so far. Seems like a good direction, to me. What would be the extent of the userspace (open-iscsi) changes needed to go along with this? There's no core changes needed in the open-iscsi tools, it's more a matter of how iscsid is packaged and executed. The control socket between iscsid and iscsiadm binds to an abstract unix domain path, so that works fine as long as you run iscsiadm from within the same net ns as the iscsid instance you want to talk to. The pid file checks clash if /var/run is common between instances. Putting iscsid in a container could provide separate config files and configuration databases, but there may be something that could improve handling there. I've been testing using 'ip netns exec' to run iscsid in a new network namespace (it actually creates a new mount namespace as well, to remount /sys with the new namespace filtered view). My test setup so far has been the following: A VM with two virtio network interfaces on different virtual networks. I have an iSCSI target configured with two portals, one on each virtual network. I create two new network namespaces with 'ip netns add' and then move the nics into them with 'ip link dev netns ns' and bring them online. Using 'ip netns exec' I start up an iscsid instance in each namespace, using the --foreground option to avoid the PID file clash. From within each namespace I can run iscsiadm to manage sessions through one of the iscsid instances. 
With this setup they share the persistent configuration database, so I specifically select which records to start/stop. - Chris -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 0/4] Make iSCSI network namespace aware
I've had a few reports of people trying to run iscsid in a container, which doesn't work at all when using network namespaces. This is the start of me looking at what it would take to make that work, and if it makes sense at all. The first issue is that the kernel side of the iSCSI netlink control protocol only operates in the initial network namespace. But beyond that, if we allow iSCSI to be managed within a namespace we need to decide what that means. I think it makes the most sense to isolate the iSCSI host, along with it's associated endpoints, connections, and sessions, to a network namespace and allow multiple instances of the userspace tools to exist in separate namespaces managing separate hosts. It works well for iscsi_tcp, which creates a host per session. There's no attempt to manage sessions on offloading hosts independently, although future work could include the ability to move an entire host to a new namespace like is supported for network devices. This is only about the structures and functionality involved in maintaining the iSCSI session, the SCSI host along with it's discovered targets and devices has no association with network namespaces. These patches are functional, but not complete. There's no isolation enforced in the kernel just yet, so it relies on well behaved userspace. I plan on fixing that, but wanted some feedback on the idea and approach so far. Thanks, Chris Chris Leech (4): iscsi: create per-net iscsi nl kernel sockets iscsi: sysfs filtering by network namespace iscsi: make all netlink multicast namespace aware iscsi: set netns for iscsi_tcp hosts drivers/scsi/iscsi_tcp.c| 7 + drivers/scsi/scsi_transport_iscsi.c | 264 +--- include/scsi/scsi_transport_iscsi.h | 2 + 3 files changed, 222 insertions(+), 51 deletions(-) -- 2.1.0 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 4/4] iscsi: set netns for iscsi_tcp hosts
This lets iscsi_tcp operate in multiple namespaces. It uses current during session creation to find the net namespace, but it might be better to manage to pass it along from the iscsi netlink socket. --- drivers/scsi/iscsi_tcp.c| 7 +++ drivers/scsi/scsi_transport_iscsi.c | 7 ++- include/scsi/scsi_transport_iscsi.h | 1 + 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 0b8af18..ebe99da 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -948,6 +948,11 @@ static int iscsi_sw_tcp_slave_configure(struct scsi_device *sdev) return 0; } +static struct net *iscsi_sw_tcp_netns(struct Scsi_Host *shost) +{ + return current-nsproxy-net_ns; +} + static struct scsi_host_template iscsi_sw_tcp_sht = { .module = THIS_MODULE, .name = iSCSI Initiator over TCP/IP, @@ -1003,6 +1008,8 @@ static struct iscsi_transport iscsi_sw_tcp_transport = { .alloc_pdu = iscsi_sw_tcp_pdu_alloc, /* recovery */ .session_recovery_timedout = iscsi_session_recovery_timedout, + /* net namespace */ + .get_netns = iscsi_sw_tcp_netns, }; static int __init iscsi_sw_tcp_init(void) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 4fdd4bf..791aacd 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -1590,11 +1590,16 @@ static int iscsi_setup_host(struct transport_container *tc, struct device *dev, { struct Scsi_Host *shost = dev_to_shost(dev); struct iscsi_cls_host *ihost = shost-shost_data; + struct iscsi_internal *priv = to_iscsi_internal(shost-transportt); + struct iscsi_transport *transport = priv-iscsi_transport; memset(ihost, 0, sizeof(*ihost)); atomic_set(ihost-nr_scans, 0); mutex_init(ihost-mutex); - ihost-netns = init_net; + if (transport-get_netns) + ihost-netns = transport-get_netns(shost); + else + ihost-netns = init_net; iscsi_bsg_host_add(shost, ihost); /* ignore any bsg add error - we just can't do sgio */ diff --git 
a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 860ac0c..878bcf2 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -168,6 +168,7 @@ struct iscsi_transport { int (*logout_flashnode_sid) (struct iscsi_cls_session *cls_sess); int (*get_host_stats) (struct Scsi_Host *shost, char *buf, int len); u8 (*check_protection)(struct iscsi_task *task, sector_t *sector); + struct net *(*get_netns)(struct Scsi_Host *shost); }; /* -- 2.1.0 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[VLAN] set_rx_mode support for unicast address list
Reuse the existing logic for multicast list synchronization for the unicast address list. The core of dev_mc_sync/unsync are split out as __dev_addr_sync/unsync and moved from dev_mcast.c to dev.c. These are then used to implement dev_unicast_sync/unsync as well. I'm working on cleaning up Intel's FCoE stack, which generates new MAC addresses from the fibre channel device id assigned by the fabric as per the current draft specification in T11. When using such a protocol in a VLAN environment it would be nice to not always be forced into promiscuous mode, assuming the underlying Ethernet driver supports multiple unicast addresses as well. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/netdevice.h |4 ++ net/8021q/vlan_dev.c |7 ++- net/core/dev.c| 96 + net/core/dev_mcast.c | 39 ++ 4 files changed, 110 insertions(+), 36 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0813c3..047d432 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1414,12 +1414,16 @@ extern void dev_set_rx_mode(struct net_device *dev); extern void__dev_set_rx_mode(struct net_device *dev); extern int dev_unicast_delete(struct net_device *dev, void *addr, int alen); extern int dev_unicast_add(struct net_device *dev, void *addr, int alen); +extern int dev_unicast_sync(struct net_device *to, struct net_device *from); +extern voiddev_unicast_unsync(struct net_device *to, struct net_device *from); extern int dev_mc_delete(struct net_device *dev, void *addr, int alen, int all); extern int dev_mc_add(struct net_device *dev, void *addr, int alen, int newonly); extern int dev_mc_sync(struct net_device *to, struct net_device *from); extern voiddev_mc_unsync(struct net_device *to, struct net_device *from); extern int __dev_addr_delete(struct dev_addr_list **list, int *count, void *addr, int alen, int all); extern int __dev_addr_add(struct dev_addr_list **list, int *count, void *addr, int alen, int newonly); +extern int 
__dev_addr_sync(struct dev_addr_list **to, int *to_count, struct dev_addr_list **from, int *from_count); +extern void__dev_addr_unsync(struct dev_addr_list **to, int *to_count, struct dev_addr_list **from, int *from_count); extern voiddev_set_promiscuity(struct net_device *dev, int inc); extern voiddev_set_allmulti(struct net_device *dev, int inc); extern voidnetdev_state_change(struct net_device *dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 8059fa4..77f04e4 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -563,6 +563,7 @@ static int vlan_dev_stop(struct net_device *dev) struct net_device *real_dev = vlan_dev_info(dev)-real_dev; dev_mc_unsync(real_dev, dev); + dev_unicast_unsync(real_dev, dev); if (dev-flags IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev-flags IFF_PROMISC) @@ -634,9 +635,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change) dev_set_promiscuity(real_dev, dev-flags IFF_PROMISC ? 1 : -1); } -static void vlan_dev_set_multicast_list(struct net_device *vlan_dev) +static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) { dev_mc_sync(vlan_dev_info(vlan_dev)-real_dev, vlan_dev); + dev_unicast_sync(vlan_dev_info(vlan_dev)-real_dev, vlan_dev); } /* @@ -702,7 +704,8 @@ void vlan_setup(struct net_device *dev) dev-open = vlan_dev_open; dev-stop = vlan_dev_stop; dev-set_mac_address= vlan_dev_set_mac_address; - dev-set_multicast_list = vlan_dev_set_multicast_list; + dev-set_rx_mode= vlan_dev_set_rx_mode; + dev-set_multicast_list = vlan_dev_set_rx_mode; dev-change_rx_flags= vlan_dev_change_rx_flags; dev-do_ioctl = vlan_dev_ioctl; dev-destructor = free_netdev; diff --git a/net/core/dev.c b/net/core/dev.c index c9c593e..edaff27 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2962,6 +2962,102 @@ int dev_unicast_add(struct net_device *dev, void *addr, int alen) } EXPORT_SYMBOL(dev_unicast_add); +int __dev_addr_sync(struct dev_addr_list **to, int *to_count, + struct dev_addr_list **from, 
int *from_count) +{ + struct dev_addr_list *da, *next; + int err = 0; + + da = *from; + while (da != NULL) { + next = da-next; + if (!da-da_synced) { + err = __dev_addr_add(to, to_count, +da-da_addr, da-da_addrlen, 0); + if (err 0) + break
Re: [ANNOUNCE] Open-FCoE - Fibre Channel over Ethernet Project
Christoph Hellwig wrote: I just did a very quick glance over the tree. Some extremely highlevel comments to start with before actually starting the source review: Thanks for taking a look Christoph - why do you need your own libcrc? lib/crc32.c has a crc32_le We shouldn't, but we may want to add a CRC and copy routine. - libsa should go. Much of it is just wrappers of kernel functions that should be used directly. Other like that hash, even or state helpers might either be opencoded in the caller or made completely generic in lib/. Probably the former but we'll have to see. Yes, and along with it the last use of the BSD TAILQ macros. Just before Rob set up the open repos I finished converting most of those to list_head, the only one left is in the sa_event mechanism. Rather than convert it I'd like to replace the use of sa_event with notifier call chains. I just need to finish auditing the use to make sure the differences won't cause unexpected problems. After that and unwrapping kernel functions, I think the only thing left before completely removing libsa is to open code the state machines. Similarly I think net_types.h needs to go. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: change the way e1000 is handling short VLAN frames
On 9/21/07, jamal [EMAIL PROTECTED] wrote: On Fri, 2007-21-09 at 08:43 -0700, Ben Greear wrote: I just re-read the spec, and a bridge *may* pad up to 68, but it is not required. On page 166, it says equipment must be able to handle 64 byte minimums. See page 22 (section 7.2) of this document: http://standards.ieee.org/getieee802/download/802.1Q-1998.pdf Also, page 63, 165, 166 Thanks for the enlightnment. Do we need an ethtool interface to turn off hardware accelerated vlans? Jesse is indicating that the intel hardware can only handle the MUST but not the SHOULD of the spec. Actually a more basic question: Can you select one or the other mode in the software based vlans? Inserting the VLAN tag in software will not change the behavior in the way you want anyway, short frames will still be padded to 64 bytes. You'd have to do short packet padding in software to 68 bytes. Or do software padding to 64 bytes and let the hardware insert the VLAN tag after. Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: change the way e1000 is handling short VLAN frames
On 9/21/07, jamal [EMAIL PROTECTED] wrote: On Fri, 2007-21-09 at 14:34 -0700, Kok, Auke wrote: I never saw any bugreports about e1000 not being able to accept vlan packets because of this, so I'm quite certain it works OK, feel free to find me a case where this isn't so :) If you tell me it can be done on the rx, i will take your word for it;- Emil can certainly verify it. The tx you certainly have issues - Look at one of the suggestions from Chris, i think it is resolvable. I'd say that devices that can't receive 64 bytes VLAN tagged frames have an issue, but for the sake of interoperability and solving Emil's problem I'm willing to discuss how a change to e1000 would work ;-) The simplest option is to add software small frame padding all the time. It won't catch software tagged frames if they were generated somehow, but should fix the hardware tagged ones to be 68 bytes on the wire. If you were worried about software tagged frames then replacing ETH_ZLEN with VLAN_ETH_ZLEN would pad all frames, VLAN or not, to 68 bytes. Emil, this patch will probably do what you want. diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 4a22595..34e3d18 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -3284,6 +3284,9 @@ e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return NETDEV_TX_OK; } + if (skb_padto(skb, ETH_ZLEN)) + return NETDEV_TX_OK; + /* 82571 and newer doesn't need the workaround that limited descriptor * length to 4kB */ if (adapter-hw.mac_type = e1000_82571) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [NET 00/02]: MACVLAN driver
On 6/19/07, Stephen Hemminger [EMAIL PROTECTED] wrote: Looks good. I have some changes to allow devices with multiple MAC addresses (never finished). This device could use that. Stephen, Is this patch available somewhere? I'd be interested in taking a look at it. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [GIT PULL] I/OAT updates for 2.6.22
On Wed, 2007-05-02 at 15:44 -0700, David Miller wrote: Christopher, I really really would like you to post these patches early and often to netdev@vger.kernel.org especially because you are touching the TCP code. You're right, I should have sent this to netdev as well. I'm sorry. As for early and often, I have posted all of these patches to netdev, and made suggested changes, and re-posted. And when I have other networking changes, you can bet they'll get sent to netdev for review first before I think about asking that they be included. You aren't doing this, for several rounds, and just submitting your stuff directly to Linus, Andrew, and lkml, and it's starting to annoy me greatly. For several rounds, I've been posting patches that go nowhere. I honestly don't care if they go straight to Linus, through you, or through Andrew. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AF_PACKET how to get the original netdev from a packet received from a bonded master
On 4/18/07, David Miller [EMAIL PROTECTED] wrote: Ok, it will give you one level of decapsulation. What do we tell people who want 2 devices previous? :-) I can tell you that the intent of PJs patch was to provide the ifindex of the physical interface that a packet entered the system on, regardless of how many layers of encapsulation are involved. Of course it may not actually do that ... - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AF_PACKET how to get the original netdev from a packet received from a bonded master
On 4/18/07, David Miller [EMAIL PROTECTED] wrote: Ok, I'll try to remember to high-priority reviewing PJ's patch on my next rebase of the net-2.6.22 tree which should be tonight or tomorrow sometime. Thanks Dave, PJ is offline this week so I'm trying to keep an eye out for discussions related to his various patches :-) Just to give you an idea of our motivation around this, we're looking at layer 2 configuration protocols implemented from user space. As an example Link Layer Discovery Protocol could be used to detect trunking misconfiguration, but only if you can track that information for the underlying interfaces of a bond. Things like 802.1x authenticated links in a bond would have a similar issue of needing to configure each underlying interface before bringing up the bond, but with LLDP there's the added fun of being able to receive updated notifications of configuration changes from the link partner at any time. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] ioatdma: Remove the use of writeq from the ioatdma driver
There's only one now anyway, and it's not in a performance path, so make it behave the same on 32-bit and 64-bit CPUs. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c | 10 -- 1 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index ec11131..cbf93ca 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct ioat_dma_chan *ioat_chan) list_add_tail(desc-node, ioat_chan-used_desc); spin_unlock_bh(ioat_chan-desc_lock); -#if (BITS_PER_LONG == 64) - writeq(desc-phys, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET); -#else - writel((u32) desc-phys, + writel(((u64) desc-phys) 0x, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_LOW); - writel(0, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH); -#endif + writel(((u64) desc-phys) 32, + ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH); + writeb(IOAT_CHANCMD_START, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9] I/OAT fixes
Andrew Morton (1): I/OAT: warning fix Chris Leech (6): ioatdma: Push pending transactions to hardware more frequently ioatdma: Remove the wrappers around read(bwl)/write(bwl) in ioatdma ioatdma: Remove the use of writeq from the ioatdma driver I/OAT: Add documentation for the tcp_dma_copybreak sysctl I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma I/OAT: Only offload copies for TCP when there will be a context switch Dan Aloni (1): I/OAT: fix I/OAT for kexec Jeff Garzik (1): drivers/dma: handle sysfs errors Documentation/networking/ip-sysctl.txt |6 + MAINTAINERS| 12 +++ drivers/dma/dmaengine.c| 22 +- drivers/dma/ioatdma.c | 81 -- drivers/dma/ioatdma_io.h | 118 - net/ipv4/tcp.c | 26 +-- 6 files changed, 100 insertions(+), 165 deletions(-) -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] drivers/dma: handle sysfs errors
From: Jeff Garzik [EMAIL PROTECTED] Signed-off-by: Jeff Garzik [EMAIL PROTECTED] Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/dmaengine.c | 22 -- 1 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..dc65773 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -312,7 +312,7 @@ void dma_async_client_chan_request(struct dma_client *client, int dma_async_device_register(struct dma_device *device) { static int id; - int chancnt = 0; + int chancnt = 0, rc; struct dma_chan* chan; if (!device) @@ -334,8 +334,15 @@ int dma_async_device_register(struct dma_device *device) snprintf(chan-class_dev.class_id, BUS_ID_SIZE, dma%dchan%d, device-dev_id, chan-chan_id); + rc = class_device_register(chan-class_dev); + if (rc) { + chancnt--; + free_percpu(chan-local); + chan-local = NULL; + goto err_out; + } + kref_get(device-refcount); - class_device_register(chan-class_dev); } mutex_lock(dma_list_mutex); @@ -345,6 +352,17 @@ int dma_async_device_register(struct dma_device *device) dma_chans_rebalance(); return 0; + +err_out: + list_for_each_entry(chan, device-channels, device_node) { + if (chan-local == NULL) + continue; + kref_put(device-refcount, dma_async_device_cleanup); + class_device_unregister(chan-class_dev); + chancnt--; + free_percpu(chan-local); + } + return rc; } /** - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] I/OAT: fix I/OAT for kexec
Under kexec, I/OAT initialization breaks over busy resources because the previous kernel did not release them. I'm not sure this fix can be considered a complete one but it works for me. I guess something similar to the *_remove method should occur there.. Signed-off-by: Dan Aloni [EMAIL PROTECTED] Signed-off-by: Chris Leech [EMAIL PROTECTED] Signed-off-by: Andrew Morton [EMAIL PROTECTED] --- drivers/dma/ioatdma.c | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index cbf93ca..1d259e5 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -41,6 +41,7 @@ /* internal functions */ static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void ioat_shutdown(struct pci_dev *pdev); static void __devexit ioat_remove(struct pci_dev *pdev); static int enumerate_dma_channels(struct ioat_device *device) @@ -557,6 +558,7 @@ static struct pci_driver ioat_pci_drv = { .name = ioatdma, .id_table = ioat_pci_tbl, .probe = ioat_probe, + .shutdown = ioat_shutdown, .remove = __devexit_p(ioat_remove), }; @@ -781,9 +783,20 @@ err_request_regions: err_set_dma_mask: pci_disable_device(pdev); err_enable_device: + + printk(KERN_ERR Intel(R) I/OAT DMA Engine initialization failed\n); + return err; } +static void ioat_shutdown(struct pci_dev *pdev) +{ + struct ioat_device *device; + device = pci_get_drvdata(pdev); + + dma_async_device_unregister(device-common); +} + static void __devexit ioat_remove(struct pci_dev *pdev) { struct ioat_device *device; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] ioatdma: Remove the wrappers around read(bwl)/write(bwl) in ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c| 60 +++ drivers/dma/ioatdma_io.h | 118 -- 2 files changed, 28 insertions(+), 150 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0f77a9d..ec11131 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -32,7 +32,6 @@ #include linux/delay.h #include linux/dma-mapping.h #include ioatdma.h -#include ioatdma_io.h #include ioatdma_registers.h #include ioatdma_hw.h @@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct ioat_device *device) int i; struct ioat_dma_chan *ioat_chan; - device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); - xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET); + xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? -1 : (1UL xfercap_scale)); for (i = 0; i device-common.chancnt; i++) { @@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) * In-use bit automatically set by reading chanctrl * If 0, we got it, if 1, someone else did */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); if (chanctrl IOAT_CHANCTRL_CHANNEL_IN_USE) return -EBUSY; @@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) IOAT_CHANCTRL_ERR_INT_EN | IOAT_CHANCTRL_ANY_ERR_ABORT_EN | IOAT_CHANCTRL_ERR_COMPLETION_EN; -ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); +writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); - chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); + chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET); if (chanerr) { printk(IOAT: CHANERR = %x, clearing\n, chanerr); - ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); + writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET); } /* Allocate descriptors */ @@ -161,10 
+160,10 @@ static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) ioat_chan-completion_addr); memset(ioat_chan-completion_virt, 0, sizeof(*ioat_chan-completion_virt)); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, - ((u64) ioat_chan-completion_addr) 0x); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, - ((u64) ioat_chan-completion_addr) 32); + writel(((u64) ioat_chan-completion_addr) 0x, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW); + writel(((u64) ioat_chan-completion_addr) 32, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH); ioat_start_null_desc(ioat_chan); return i; @@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan) ioat_dma_memcpy_cleanup(ioat_chan); - ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); + writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); spin_lock_bh(ioat_chan-desc_lock); list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) { @@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources(struct dma_chan *chan) ioat_chan-last_completion = ioat_chan-completion_addr = 0; /* Tell hw the chan is free */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE; - ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); + writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); } /** @@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, spin_unlock_bh(ioat_chan-desc_lock); if (append) - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); return cookie; } @@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pending(struct dma_chan *chan) if (ioat_chan-pending != 0) { ioat_chan-pending = 0; - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + 
writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base
[PATCH 1/9] ioatdma: Push pending transactions to hardware more frequently
Every 20 descriptors turns out to be too few append commands with newer/faster CPUs. Pushing every 4 still cuts down on MMIO writes to an acceptable level without letting the DMA engine run out of work. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 8e87261..0f77a9d 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, list_splice_init(new_chain, ioat_chan-used_desc.prev); ioat_chan-pending += desc_count; - if (ioat_chan-pending = 20) { + if (ioat_chan-pending = 4) { append = 1; ioat_chan-pending = 0; } @@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct pci_dev *pdev) } /* MODULE API */ -MODULE_VERSION(1.7); +MODULE_VERSION(1.9); MODULE_LICENSE(GPL); MODULE_AUTHOR(Intel Corporation); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] I/OAT: Only offload copies for TCP when there will be a context switch
The performance wins come with having the DMA copy engine doing the copies in parallel with the context switch. If there is enough data ready on the socket at recv time just use a regular copy. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c | 10 +++--- 1 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 74c4d10..5ccd5e1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1110,6 +1110,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; + int available = 0; + struct sk_buff *skb; lock_sock(sk); @@ -1136,7 +1138,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, #ifdef CONFIG_NET_DMA tp-ucopy.dma_chan = NULL; preempt_disable(); - if ((len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) + skb = skb_peek_tail(sk-sk_receive_queue); + if (skb) + available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); + if ((available target) + (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) !sysctl_tcp_low_latency __get_cpu_var(softnet_data).net_dma) { preempt_enable_no_resched(); tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); @@ -1145,7 +1151,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, #endif do { - struct sk_buff *skb; u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ @@ -1433,7 +1438,6 @@ skip_copy: #ifdef CONFIG_NET_DMA if (tp-ucopy.dma_chan) { - struct sk_buff *skb; dma_cookie_t done, used; dma_async_memcpy_issue_pending(tp-ucopy.dma_chan); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] I/OAT: Add documentation for the tcp_dma_copybreak sysctl
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- Documentation/networking/ip-sysctl.txt |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d3aae1f..9541691 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -413,6 +413,12 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 +tcp_dma_copybreak - INTEGER + Lower limit, in bytes, of the size of socket reads that will be + offloaded to a DMA copy engine, if one is present in the system + and CONFIG_NET_DMA is enabled. + Default: 4096 + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- MAINTAINERS | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1dfba85..2dd5d23 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1156,6 +1156,12 @@ M: [EMAIL PROTECTED] L: netdev@vger.kernel.org S: Maintained +DMA GENERIC MEMCPY SUBSYSTEM +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Maintained + DOCBOOK FOR DOCUMENTATION P: Randy Dunlap M: [EMAIL PROTECTED] @@ -1777,6 +1783,12 @@ P: Tigran Aivazian M: [EMAIL PROTECTED] S: Maintained +INTEL I/OAT DMA DRIVER +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Supported + INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT P: Deepak Saxena M: [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] I/OAT: warning fix
net/ipv4/tcp.c: In function 'tcp_recvmsg': net/ipv4/tcp.c:: warning: unused variable 'available' Signed-off-by: Andrew Morton [EMAIL PROTECTED] Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c | 26 -- 1 files changed, 16 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5ccd5e1..69c525d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1110,7 +1110,6 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; - int available = 0; struct sk_buff *skb; lock_sock(sk); @@ -1139,15 +1138,22 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tp-ucopy.dma_chan = NULL; preempt_disable(); skb = skb_peek_tail(sk-sk_receive_queue); - if (skb) - available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); - if ((available target) - (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) - !sysctl_tcp_low_latency __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); - tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); - } else - preempt_enable_no_resched(); + { + int available = 0; + + if (skb) + available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); + if ((available target) + (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) + !sysctl_tcp_low_latency + __get_cpu_var(softnet_data).net_dma) { + preempt_enable_no_resched(); + tp-ucopy.pinned_list = + dma_pin_iovec_pages(msg-msg_iov, len); + } else { + preempt_enable_no_resched(); + } + } #endif do { - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/9] ioatdma: Push pending transactions to hardware more frequently
This sounds like something that will always be wrong -- or in other words, always be right for only the latest CPUs. Can this be made dynamic, based on some timing factor? In fact I think this has been tweaked twice in the vanilla tree already. This is actually just the same tweak you remember me posting before and I never pushed to get it in mainline, but Jeff's right. The problem isn't so much in the driver itself, as in how it's used by I/OAT in the TCP receive code, there are inherent assumptions about how long a context switch takes compared to how long an offloaded memcpy takes. I'm working on using completion interrupts for the device so as not to end up polling when the CPUs are faster than the code was tuned for, and doing it in a way that doesn't introduce extra context switches. I'm hoping to have something ready for 2.6.22, or at least ready for MM in that time frame. As for this change in the short term, we did go back and make sure that it didn't perform worse with the older CPUs supported on these platforms. We should have tested more intermediate values instead of just jumping from 1 to 20 for that threshold. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Question on IOAT
On 2/5/07, Olaf Kirch [EMAIL PROTECTED] wrote: Nowhere in the dma_async_*complete functions can I see any code that would sleep if the DMA is not yet complete. Am I missing something, or are we really busy-waiting on the DMA engine? Wouldn't this kind of defeat the purpose of freeing up the CPU from the chores of memcpying? It is busy waiting, but only because the TCP socket use initiates the DMA copies from the softirq and they have time to complete during the switch back to application context. Going back to sleep and creating more context switching made things worse. I'm working on seeing if completion interrupts could be used with a better thought out implementation, the performance implications aren't fully clear to me yet. For other uses, interrupts are probably desired. I also checked the code in ioatdma.c - I would have expected there to be some kind of interrupt handler that kicks the upper layers when a DMA operation completes. But the interrupt handler seems to be for error reporting exclusively... It's just not there now, but it can be added easily, it's one bit in the descriptor and a register read in the interrupt handler to see which channel(s) need attention. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] I/OAT: Push pending transactions to hardware more frequently
Every 20 descriptors turns out to be too few append commands with newer/faster CPUs. Pushing every 4 still cuts down on MMIO writes to an acceptable level without letting the DMA engine run out of work. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0358419..f3b34b5 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(s list_splice_init(new_chain, ioat_chan-used_desc.prev); ioat_chan-pending += desc_count; - if (ioat_chan-pending = 20) { + if (ioat_chan-pending = 4) { append = 1; ioat_chan-pending = 0; } @@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct } /* MODULE API */ -MODULE_VERSION(1.7); +MODULE_VERSION(1.9); MODULE_LICENSE(GPL); MODULE_AUTHOR(Intel Corporation); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] I/OAT: Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- MAINTAINERS | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5305dd6..533adbe 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -915,6 +915,12 @@ M: [EMAIL PROTECTED] L: linux-kernel@vger.kernel.org S: Maintained +DMA GENERIC MEMCPY SUBSYSTEM +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Maintained + DOCBOOK FOR DOCUMENTATION P: Martin Waitz M: [EMAIL PROTECTED] @@ -1516,6 +1522,12 @@ P: Tigran Aivazian M: [EMAIL PROTECTED] S: Maintained +INTEL I/OAT DMA DRIVER +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Supported + INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT P: Deepak Saxena M: [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] I/OAT: Add documentation for the tcp_dma_copybreak sysctl
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- Documentation/networking/ip-sysctl.txt |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index fd3c0c0..e9ee102 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -375,6 +375,12 @@ tcp_slow_start_after_idle - BOOLEAN be timed out after an idle period. Default: 1 +tcp_dma_copybreak - INTEGER + Lower limit, in bytes, of the size of socket reads that will be + offloaded to a DMA copy engine, if one is present in the system + and CONFIG_NET_DMA is enabled. + Default: 4096 + CIPSOv4 Variables: cipso_cache_enable - BOOLEAN - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/7] drivers/dma I/OAT fixes
Various fixes for the hardware memcpy engine code and ioatdma Most of these I've posted before, except for the patch to handle sysfs errors from Jeff Garzik. I've dropped the controversial change to not offload loopback traffic. These changes can be pulled from git://lost.foo-projects.org/~cleech/linux-2.6 master -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] I/OAT: Only offload copies for TCP when there will be a context switch
The performance wins come with having the DMA copy engine doing the copies in parallel with the context switch. If there is enough data ready on the socket at recv time just use a regular copy. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c | 10 +++--- 1 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 66e9a72..ef0a6cd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1108,6 +1108,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; + int available = 0; + struct sk_buff *skb; lock_sock(sk); @@ -1134,7 +1136,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru #ifdef CONFIG_NET_DMA tp-ucopy.dma_chan = NULL; preempt_disable(); - if ((len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) + skb = skb_peek_tail(sk-sk_receive_queue); + if (skb) + available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); + if ((available target) + (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) !sysctl_tcp_low_latency __get_cpu_var(softnet_data).net_dma) { preempt_enable_no_resched(); tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); @@ -1143,7 +1149,6 @@ int tcp_recvmsg(struct kiocb *iocb, stru #endif do { - struct sk_buff *skb; u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ @@ -1431,7 +1436,6 @@ skip_copy: #ifdef CONFIG_NET_DMA if (tp-ucopy.dma_chan) { - struct sk_buff *skb; dma_cookie_t done, used; dma_async_memcpy_issue_pending(tp-ucopy.dma_chan); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/7] I/OAT: Remove the wrappers around read(bwl)/write(bwl) in ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c| 60 +++ drivers/dma/ioatdma_io.h | 118 -- 2 files changed, 28 insertions(+), 150 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index f3b34b5..ceb03ee 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -32,7 +32,6 @@ #include linux/delay.h #include linux/dma-mapping.h #include ioatdma.h -#include ioatdma_io.h #include ioatdma_registers.h #include ioatdma_hw.h @@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct int i; struct ioat_dma_chan *ioat_chan; - device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); - xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET); + xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? -1 : (1UL xfercap_scale)); for (i = 0; i device-common.chancnt; i++) { @@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources * In-use bit automatically set by reading chanctrl * If 0, we got it, if 1, someone else did */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); if (chanctrl IOAT_CHANCTRL_CHANNEL_IN_USE) return -EBUSY; @@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources IOAT_CHANCTRL_ERR_INT_EN | IOAT_CHANCTRL_ANY_ERR_ABORT_EN | IOAT_CHANCTRL_ERR_COMPLETION_EN; -ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); +writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); - chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); + chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET); if (chanerr) { printk(IOAT: CHANERR = %x, clearing\n, chanerr); - ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); + writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET); } /* Allocate descriptors */ @@ -161,10 +160,10 @@ static int ioat_dma_alloc_chan_resources 
ioat_chan-completion_addr); memset(ioat_chan-completion_virt, 0, sizeof(*ioat_chan-completion_virt)); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, - ((u64) ioat_chan-completion_addr) 0x); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, - ((u64) ioat_chan-completion_addr) 32); + writel(((u64) ioat_chan-completion_addr) 0x, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW); + writel(((u64) ioat_chan-completion_addr) 32, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH); ioat_start_null_desc(ioat_chan); return i; @@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources ioat_dma_memcpy_cleanup(ioat_chan); - ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); + writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); spin_lock_bh(ioat_chan-desc_lock); list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) { @@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources ioat_chan-last_completion = ioat_chan-completion_addr = 0; /* Tell hw the chan is free */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE; - ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); + writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); } /** @@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(s spin_unlock_bh(ioat_chan-desc_lock); if (append) - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); return cookie; } @@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pendin if (ioat_chan-pending != 0) { ioat_chan-pending = 0; - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); } } @@ -449,7 +446,7 @@ static void ioat_dma_memcpy_cleanup(stru if ((chan-completion_virt-full 
IOAT_CHANSTS_DMA_TRANSFER_STATUS
[PATCH 4/7] I/OAT: Remove the use of writeq from the ioatdma driver
There's only one now anyway, and it's not in a performance path, so make it behave the same on 32-bit and 64-bit CPUs. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c | 10 -- 1 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index ceb03ee..2800c19 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct list_add_tail(desc-node, ioat_chan-used_desc); spin_unlock_bh(ioat_chan-desc_lock); -#if (BITS_PER_LONG == 64) - writeq(desc-phys, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET); -#else - writel((u32) desc-phys, + writel(((u64) desc-phys) 0x, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_LOW); - writel(0, ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH); -#endif + writel(((u64) desc-phys) 32, + ioat_chan-reg_base + IOAT_CHAINADDR_OFFSET_HIGH); + writeb(IOAT_CHANCMD_START, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/7] drivers/dma: handle sysfs errors
From: Jeff Garzik [EMAIL PROTECTED] Signed-off-by: Jeff Garzik [EMAIL PROTECTED] Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/dmaengine.c | 22 -- 1 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..dc65773 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -312,7 +312,7 @@ void dma_async_client_chan_request(struc int dma_async_device_register(struct dma_device *device) { static int id; - int chancnt = 0; + int chancnt = 0, rc; struct dma_chan* chan; if (!device) @@ -334,8 +334,15 @@ int dma_async_device_register(struct dma snprintf(chan-class_dev.class_id, BUS_ID_SIZE, dma%dchan%d, device-dev_id, chan-chan_id); + rc = class_device_register(chan-class_dev); + if (rc) { + chancnt--; + free_percpu(chan-local); + chan-local = NULL; + goto err_out; + } + kref_get(device-refcount); - class_device_register(chan-class_dev); } mutex_lock(dma_list_mutex); @@ -345,6 +352,17 @@ int dma_async_device_register(struct dma dma_chans_rebalance(); return 0; + +err_out: + list_for_each_entry(chan, device-channels, device_node) { + if (chan-local == NULL) + continue; + kref_put(device-refcount, dma_async_device_cleanup); + class_device_unregister(chan-class_dev); + chancnt--; + free_percpu(chan-local); + } + return rc; } /** - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/7] [I/OAT] Push pending transactions to hardware more frequently
On 8/18/06, Pavel Machek [EMAIL PROTECTED] wrote: Huh, two version bumps for... ONE ONE-LINER :-). Could we get rid of embedded version? It helps no one. Version numbers for drivers that can be built as modules are very helpful for anyone wanting to upgrade a driver on top of a distribution supported kernel. If you always just use the latest kernel source, you're right it doesn't help you. But that's not everyone. This one skips two versions because I'm trying to sync up a 1.8 version tested internally with the 1.7+ upstream changes that's in the kernel now. I'll accept that the official policy is to not version modules when MODULE_VERSION is removed :-) - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: New driver questions: Attansic L1 gigabit NIC
I thought that support statement sounded familiar, large portions of the source code and documentation are modified from an older release of e1000. Nothing wrong with that as it's released under the GPL, except that the copyright statements have mostly just been switched from Intel to Attansic. It's interesting to see a company that was founded in 2000 claiming copyright back to 1999. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7 v2] [I/OAT] Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- MAINTAINERS | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 21116cc..2d484aa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -881,6 +881,12 @@ M: [EMAIL PROTECTED] L: linux-kernel@vger.kernel.org S: Maintained +DMA GENERIC MEMCPY SUBSYSTEM +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Maintained + DOCBOOK FOR DOCUMENTATION P: Martin Waitz M: [EMAIL PROTECTED] @@ -1469,6 +1475,12 @@ P: Tigran Aivazian M: [EMAIL PROTECTED] S: Maintained +INTEL I/OAT DMA DRIVER +P: Chris Leech +M: [EMAIL PROTECTED] +L: linux-kernel@vger.kernel.org +S: Supported + INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT P: Deepak Saxena M: [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/7] [I/OAT] Only offload copies for TCP when there will be a context switch
The performance wins come with having the DMA copy engine doing the copies in parallel with the context switch. If there is enough data ready on the socket at recv time just use a regular copy. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c | 10 +++--- 1 files changed, 7 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 934396b..36f6b64 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1105,6 +1105,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; + int available = 0; + struct sk_buff *skb; lock_sock(sk); @@ -1131,7 +1133,11 @@ int tcp_recvmsg(struct kiocb *iocb, stru #ifdef CONFIG_NET_DMA tp-ucopy.dma_chan = NULL; preempt_disable(); - if ((len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) + skb = skb_peek_tail(sk-sk_receive_queue); + if (skb) + available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); + if ((available target) + (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) !sysctl_tcp_low_latency __get_cpu_var(softnet_data).net_dma) { preempt_enable_no_resched(); tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); @@ -1140,7 +1146,6 @@ int tcp_recvmsg(struct kiocb *iocb, stru #endif do { - struct sk_buff *skb; u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ @@ -1428,7 +1433,6 @@ skip_copy: #ifdef CONFIG_NET_DMA if (tp-ucopy.dma_chan) { - struct sk_buff *skb; dma_cookie_t done, used; dma_async_memcpy_issue_pending(tp-ucopy.dma_chan); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/7] [I/OAT] Don't offload copies for loopback traffic
Local traffic (loopback) is generally in cache anyway, and the overhead cost of offloading the copy is worse than just doing it with the CPU. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c |4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 36f6b64..7971e73 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1107,6 +1107,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int copied_early = 0; int available = 0; struct sk_buff *skb; + struct dst_entry *dst; lock_sock(sk); @@ -1136,7 +1137,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru skb = skb_peek_tail(sk-sk_receive_queue); if (skb) available = TCP_SKB_CB(skb)-seq + skb-len - (*seq); - if ((available target) + dst = __sk_dst_get(sk); + if ((available target) (!dst || (dst-dev != loopback_dev)) (len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) !sysctl_tcp_low_latency __get_cpu_var(softnet_data).net_dma) { preempt_enable_no_resched(); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] [I/OAT] Push pending transactions to hardware more frequently
Every 20 descriptors turns out to be too few append commands with newer/faster CPUs. Pushing every 4 still cuts down on MMIO writes to an acceptable level without letting the DMA engine run out of work. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index dbd4d6c..be4fdd7 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -310,7 +310,7 @@ static dma_cookie_t do_ioat_dma_memcpy(s list_splice_init(&new_chain, ioat_chan->used_desc.prev); ioat_chan->pending += desc_count; - if (ioat_chan->pending >= 20) { + if (ioat_chan->pending >= 4) { append = 1; ioat_chan->pending = 0; } @@ -818,7 +818,7 @@ static void __devexit ioat_remove(struct } /* MODULE API */ -MODULE_VERSION("1.7"); +MODULE_VERSION("1.9"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Intel Corporation"); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/7] [I/OAT] Remove the use of writeq from the ioatdma driver
There's only one now anyway, and it's not in a performance path, so make it behave the same on 32-bit and 64-bit CPUs. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c | 10 -- 1 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0be426f..d6d817c 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -608,13 +608,11 @@ static void ioat_start_null_desc(struct list_add_tail(&desc->node, &ioat_chan->used_desc); spin_unlock_bh(&ioat_chan->desc_lock); -#if (BITS_PER_LONG == 64) - writeq(desc->phys, ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET); -#else - writel((u32) desc->phys, + writel(((u64) desc->phys) & 0xFFFFFFFF, ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_LOW); - writel(0, ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_HIGH); -#endif + writel(((u64) desc->phys) >> 32, + ioat_chan->reg_base + IOAT_CHAINADDR_OFFSET_HIGH); + writeb(IOAT_CHANCMD_START, ioat_chan->reg_base + IOAT_CHANCMD_OFFSET); } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] [I/OAT] Add entries to MAINTAINERS for the DMA memcpy subsystem and ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- MAINTAINERS | 10 ++ 1 files changed, 10 insertions(+), 0 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 21116cc..9ae73c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -881,6 +881,11 @@ M: [EMAIL PROTECTED] L: linux-kernel@vger.kernel.org S: Maintained +DMA GENERIC MEMCPY SUBSYSTEM +P: Chris Leech +M: [EMAIL PROTECTED] +S: Maintained + DOCBOOK FOR DOCUMENTATION P: Martin Waitz M: [EMAIL PROTECTED] @@ -1469,6 +1474,11 @@ P: Tigran Aivazian M: [EMAIL PROTECTED] S: Maintained +INTEL I/OAT DMA DRIVER +P: Chris Leech +M: [EMAIL PROTECTED] +S: Supported + INTEL IXP4XX RANDOM NUMBER GENERATOR SUPPORT P: Deepak Saxena M: [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] [I/OAT] Remove the wrappers around read(bwl)/write(bwl) in ioatdma
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/ioatdma.c| 60 +++ drivers/dma/ioatdma_io.h | 118 -- 2 files changed, 28 insertions(+), 150 deletions(-) diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index be4fdd7..0be426f 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -32,7 +32,6 @@ #include linux/delay.h #include linux/dma-mapping.h #include ioatdma.h -#include ioatdma_io.h #include ioatdma_registers.h #include ioatdma_hw.h @@ -51,8 +50,8 @@ static int enumerate_dma_channels(struct int i; struct ioat_dma_chan *ioat_chan; - device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); - xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + device-common.chancnt = readb(device-reg_base + IOAT_CHANCNT_OFFSET); + xfercap_scale = readb(device-reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? -1 : (1UL xfercap_scale)); for (i = 0; i device-common.chancnt; i++) { @@ -123,7 +122,7 @@ static int ioat_dma_alloc_chan_resources * In-use bit automatically set by reading chanctrl * If 0, we got it, if 1, someone else did */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); if (chanctrl IOAT_CHANCTRL_CHANNEL_IN_USE) return -EBUSY; @@ -132,12 +131,12 @@ static int ioat_dma_alloc_chan_resources IOAT_CHANCTRL_ERR_INT_EN | IOAT_CHANCTRL_ANY_ERR_ABORT_EN | IOAT_CHANCTRL_ERR_COMPLETION_EN; -ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); +writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); - chanerr = ioatdma_chan_read32(ioat_chan, IOAT_CHANERR_OFFSET); + chanerr = readl(ioat_chan-reg_base + IOAT_CHANERR_OFFSET); if (chanerr) { printk(IOAT: CHANERR = %x, clearing\n, chanerr); - ioatdma_chan_write32(ioat_chan, IOAT_CHANERR_OFFSET, chanerr); + writel(chanerr, ioat_chan-reg_base + IOAT_CHANERR_OFFSET); } /* Allocate descriptors */ @@ -161,10 +160,10 @@ static int ioat_dma_alloc_chan_resources 
ioat_chan-completion_addr); memset(ioat_chan-completion_virt, 0, sizeof(*ioat_chan-completion_virt)); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_LOW, - ((u64) ioat_chan-completion_addr) 0x); - ioatdma_chan_write32(ioat_chan, IOAT_CHANCMP_OFFSET_HIGH, - ((u64) ioat_chan-completion_addr) 32); + writel(((u64) ioat_chan-completion_addr) 0x, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_LOW); + writel(((u64) ioat_chan-completion_addr) 32, + ioat_chan-reg_base + IOAT_CHANCMP_OFFSET_HIGH); ioat_start_null_desc(ioat_chan); return i; @@ -182,7 +181,7 @@ static void ioat_dma_free_chan_resources ioat_dma_memcpy_cleanup(ioat_chan); - ioatdma_chan_write8(ioat_chan, IOAT_CHANCMD_OFFSET, IOAT_CHANCMD_RESET); + writeb(IOAT_CHANCMD_RESET, ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); spin_lock_bh(ioat_chan-desc_lock); list_for_each_entry_safe(desc, _desc, ioat_chan-used_desc, node) { @@ -210,9 +209,9 @@ static void ioat_dma_free_chan_resources ioat_chan-last_completion = ioat_chan-completion_addr = 0; /* Tell hw the chan is free */ - chanctrl = ioatdma_chan_read16(ioat_chan, IOAT_CHANCTRL_OFFSET); + chanctrl = readw(ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); chanctrl = ~IOAT_CHANCTRL_CHANNEL_IN_USE; - ioatdma_chan_write16(ioat_chan, IOAT_CHANCTRL_OFFSET, chanctrl); + writew(chanctrl, ioat_chan-reg_base + IOAT_CHANCTRL_OFFSET); } /** @@ -318,9 +317,8 @@ static dma_cookie_t do_ioat_dma_memcpy(s spin_unlock_bh(ioat_chan-desc_lock); if (append) - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); return cookie; } @@ -417,9 +415,8 @@ static void ioat_dma_memcpy_issue_pendin if (ioat_chan-pending != 0) { ioat_chan-pending = 0; - ioatdma_chan_write8(ioat_chan, - IOAT_CHANCMD_OFFSET, - IOAT_CHANCMD_APPEND); + writeb(IOAT_CHANCMD_APPEND, + ioat_chan-reg_base + IOAT_CHANCMD_OFFSET); } } @@ -449,7 +446,7 @@ static void ioat_dma_memcpy_cleanup(stru if ((chan-completion_virt-full 
IOAT_CHANSTS_DMA_TRANSFER_STATUS
[PATCH 6/7] [I/OAT] Add documentation for the tcp_dma_copybreak sysctl
Signed-off-by: Chris Leech [EMAIL PROTECTED] --- Documentation/networking/ip-sysctl.txt |6 ++ 1 files changed, 6 insertions(+), 0 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index d46338a..841d61e 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -369,6 +369,12 @@ tcp_slow_start_after_idle - BOOLEAN be timed out after an idle period. Default: 1 +tcp_dma_copybreak - INTEGER + Lower limit, in bytes, of the size of socket reads that will be + offloaded to a DMA copy engine, if one is present in the system + and CONFIG_NET_DMA is enabled. + Default: 4096 + IP Variables: ip_local_port_range - 2 INTEGERS - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: problems with e1000 and jumboframes
On 8/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: You're changing the size of the buffer without telling the hardware. In the interrupt context e1000 knows the size of what was DMAed into the skb, but that's after the fact. So e1000 could detect that memory was corrupted, but not prevent it if you don't give it power of 2 buffers. Actually, the power of 2 thing doesn't hold true for all e1000 devices. Some have 1k granularity, but not Arnd's 82540. I can not change it - code checks if requested mtu and additional size is less than allocated aligned buffer it tricks allocator. Or do you mean that even after 9k mtu was setup it is possible that card can receive packets up to 16k? Yes, that's exactly what I mean. For anything above the standard 1500 bytes the e1000 _hardware_ has no concept of MTU, only buffer length. So even if the driver is set to an MTU of 9000, the NIC will still receive 16k frames. Otherwise the driver would simply allocate MTU sized buffers. -Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: problems with e1000 and jumboframes
On 8/3/06, Evgeniy Polyakov [EMAIL PROTECTED] wrote: Strange, why this skb_shared_info cannot be added before first alignment? And what about smaller frames like 1500, does this driver behave similar (first align then add)? It can be. Could attached (completely untested) patch help? Note that e1000 uses power of two buffers because that's what the hardware supports. Also, there's no programmable MTU - only a single bit for long packet enable that disables frame length checks when using jumbo frames. That means that if you tell the e1000 it has a 16k buffer, and a 16k frame shows up on the wire, it's going to write to the entire 16k regardless of your 9k MTU setting. If a 32k frame shows up, two full 16k buffers get written to (OK, assuming the frame can fit into the receive FIFO) That's why I've always been against trying to optimize the allocation sizes in the driver, even with your small change the skb_shinfo area can get corrupted. It may be unlikely, because the frame still has to be valid, but some switches aren't real picky about what sized frame they'll forward on if you enable jumbo support either. So any box on the LAN could send you larger than MTU frames in an attempt to corrupt memory. I believe that if you tell a hardware device it has a buffer of a certain size, you need to be prepared for that entire buffer to get written to. Unfortunately that means wasteful allocations for e1000 if a single buffer per frame is going to be used. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: problems with e1000 and jumboframes
Maximum e1000 frame is 16128 bytes, which is enough before being rounded to 16k to have a space for shared info. My patch just tricks refilling logic to request to allocate slightly less than was setup when mtu was changed. The maximum supported MTU size differs between e1000 devices due to differences in FIFO size. For performance reasons the driver won't enable a MTU that doesn't allow for at least two frames in the Tx FIFO at once - you really want e1000 to be able to DMA the next frame into Tx FIFO while the current one is going out on the wire. This doesn't change the fact that with LPE set, anything that can fit into the Rx FIFO and has a valid CRC will be DMAed into buffers regardless of length. Hardware is not affected, second patch just checks if there is enough space (e1000 stores real mtu). I can not believe that such modern NIC like e1000 can not know in receive interrupt size of the received packet, if it is true, than in generel you are right and some more clever mechanisms shoud be used (at least turn hack off for small packets and only enable it for less than 16 jumbo frames wheere place always is), if size of the received packet is known, then it is enough to compare aligned size and size of the packet to make a decision for allocation. You're changing the size of the buffer without telling the hardware. In the interrupt context e1000 knows the size of what was DMAed into the skb, but that's after the fact. So e1000 could detect that memory was corrupted, but not prevent it if you don't give it power of 2 buffers. Actually, the power of 2 thing doesn't hold true for all e1000 devices. Some have 1k granularity, but not Arnd's 82540. You can't know the size of a received packet before it's DMAed into host memory, no high performance network controller works that way. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: problems with e1000 and jumboframes
On 8/3/06, Arnd Hannemann [EMAIL PROTECTED] wrote: Well you say if a single buffer per frame is going to be used. Well, if I understood you correctly i could set the MTU to, lets say 4000. Then the driver would enable the jumbo frame bit of the hardware, and allocate only a 4k rx buffer, right? (and allocate 16k, because of skb_shinfo) Now if a new 9k frame arrives the hardware will accept it regardless of the 2k MTU and will split it into 3x 4k rx buffers? Does the current driver work in this way? That would be great. Perhaps then one should change the driver in a way that the MTU can changed independently of the buffer size? Yes, e1000 devices will spill over and use multiple buffers for a single frame. We've been trying to find a good way to use multiple buffers to take care of these allocation problems. The structure of the sk_buff does not make it easy. Or should I say that it's the limitation that drivers are not allowed to chain together multiple sk_buffs to represent a single frame that does not make it easy. PCI-Express e1000 devices support a feature called header split, where the protocol headers go into a different buffer from the payload. We use that today to put headers into the kmalloc() allocated skb-data area, and payload into one or more skb-frags[] pages. You don't ever have multiple page allocations from the driver in this mode. We could try and only use page allocations for older e1000 devices, putting headers and payload into skb-frags and copying the headers out into the skb-data area as needed for processing. That would do away with large allocations, but in Jesse's experiments calling alloc_page() is slower than kmalloc(), so there can actually be a performance hit from trying to use page allocations all the time. It's an interesting problem. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] [I/OAT] DMA memcpy subsystem
Provides an API for offloading memory copies to DMA devices Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/Kconfig |2 drivers/Makefile |1 drivers/dma/Kconfig | 13 + drivers/dma/Makefile |1 drivers/dma/dmaengine.c | 408 + include/linux/dmaengine.h | 337 + 6 files changed, 762 insertions(+), 0 deletions(-) diff --git a/drivers/Kconfig b/drivers/Kconfig index aeb5ab2..8b11ceb 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -72,4 +72,6 @@ source drivers/edac/Kconfig source drivers/rtc/Kconfig +source drivers/dma/Kconfig + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 447d8e6..3c51703 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_DMA_ENGINE) += dma/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig new file mode 100644 index 000..f9ac4bc --- /dev/null +++ b/drivers/dma/Kconfig @@ -0,0 +1,13 @@ +# +# DMA engine configuration +# + +menu DMA Engine support + +config DMA_ENGINE + bool Support for DMA engines + ---help--- + DMA engines offload copy operations from the CPU to dedicated + hardware, allowing the copies to happen asynchronously. + +endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile new file mode 100644 index 000..10b7391 --- /dev/null +++ b/drivers/dma/Makefile @@ -0,0 +1 @@ +obj-y += dmaengine.o diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c new file mode 100644 index 000..473c47b --- /dev/null +++ b/drivers/dma/dmaengine.c @@ -0,0 +1,408 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code implements the DMA subsystem. It provides a HW-neutral interface + * for other kernel code to use asynchronous memory copy capabilities, + * if present, and allows different HW DMA drivers to register as providing + * this capability. + * + * Due to the fact we are accelerating what is already a relatively fast + * operation, the code goes to great lengths to avoid additional overhead, + * such as locking. + * + * LOCKING: + * + * The subsystem keeps two global lists, dma_device_list and dma_client_list. + * Both of these are protected by a mutex, dma_list_mutex. + * + * Each device has a channels list, which runs unlocked but is never modified + * once the device is registered, it's just setup by the driver. + * + * Each client has a channels list, it's only modified under the client-lock + * and in an RCU callback, so it's safe to read under rcu_read_lock(). + * + * Each device has a kref, which is initialized to 1 when the device is + * registered. A kref_put is done for each class_device registered. When the + * class_device is released, the coresponding kref_put is done in the release + * method. Every time one of the device's channels is allocated to a client, + * a kref_get occurs. When the channel is freed, the coresponding kref_put + * happens. 
The device's release function does a completion, so + * unregister_device does a remove event, class_device_unregister, a kref_put + * for the first reference, then waits on the completion for all other + * references to finish. + * + * Each channel has an open-coded implementation of Rusty Russell's bigref, + * with a kref and a per_cpu local_t. A single reference is set when on an + * ADDED event, and removed with a REMOVE event. Net DMA client takes an + * extra reference per outstanding transaction. The relase function does a + * kref_put on the device. -ChrisL + */ + +#include linux/init.h +#include linux/module.h +#include linux/device.h +#include linux/dmaengine.h +#include linux/hardirq.h +#include linux/spinlock.h +#include linux/percpu.h +#include linux/rcupdate.h +#include linux/mutex.h + +static DEFINE_MUTEX(dma_list_mutex); +static LIST_HEAD(dma_device_list); +static LIST_HEAD(dma_client_list); + +/* --- sysfs
[PATCH 0/9] I/OAT repost
This is a repost of the I/OAT patches, the only changes from last time are refreshing the patches and removing an unused macro that was causing the vger spam filters to drop patch 2/9. This patch series is the a full release of the Intel(R) I/O Acceleration Technology (I/OAT) for Linux. It includes an in kernel API for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy engine, and changes to the TCP stack to offload copies of received networking data to application space. These changes apply to Linus' tree as of commit 387e2b0439026aa738a9edca15a57e5c0bcb4dfc [BRIDGE]: need to ref count the LLC sap They are available to pull from git://63.64.152.142/~cleech/linux-2.6 ioat-2.6.18 There are 9 patches in the series: 1) The memcpy offload APIs and class code 2) The Intel I/OAT DMA driver (ioatdma) 3) Core networking code to setup networking as a DMA memcpy client 4) Utility functions for sk_buff to iovec offloaded copy 5) Structure changes needed for TCP receive offload 6) Rename cleanup_rbuf to tcp_cleanup_rbuf 7) Make sk_eat_skb aware of early copied packets 8) Add a sysctl to tune the minimum offloaded I/O size for TCP 9) The main TCP receive offload changes -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client
Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig | 12 + include/linux/netdevice.h |4 ++ include/net/netdma.h | 38 net/core/dev.c| 104 + 4 files changed, 158 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0f15e76..30d021d 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -10,6 +10,18 @@ config DMA_ENGINE DMA engines offload copy operations from the CPU to dedicated hardware, allowing the copies to happen asynchronously. +comment DMA Clients + +config NET_DMA + bool Network: TCP receive copy offload + depends on DMA_ENGINE NET + default y + ---help--- + This enables the use of DMA engines in the network stack to + offload receive copy-to-user operations, freeing CPU cycles. + Since this is the main user of the DMA engine, it should be enabled; + say Y here. + comment DMA Devices config INTEL_IOATDMA diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f4169bb..b5760c6 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -37,6 +37,7 @@ #include linux/config.h #include linux/device.h #include linux/percpu.h +#include linux/dmaengine.h struct divert_blk; struct vlan_group; @@ -593,6 +594,9 @@ struct softnet_data struct sk_buff *completion_queue; struct net_device backlog_dev;/* Sorry. 8) */ +#ifdef CONFIG_NET_DMA + struct dma_chan *net_dma; +#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/net/netdma.h b/include/net/netdma.h new file mode 100644 index 000..cbfe89d --- /dev/null +++ b/include/net/netdma.h @@ -0,0 +1,38 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef NETDMA_H +#define NETDMA_H +#include linux/config.h +#ifdef CONFIG_NET_DMA +#include linux/dmaengine.h + +static inline struct dma_chan *get_softnet_dma(void) +{ + struct dma_chan *chan; + rcu_read_lock(); + chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + if (chan) + dma_chan_get(chan); + rcu_read_unlock(); + return chan; +} +#endif /* CONFIG_NET_DMA */ +#endif /* NETDMA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 2dce673..6e78798 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include net/iw_handler.h #include asm/current.h #include linux/audit.h +#include linux/dmaengine.h /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16];/* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. 
@@ -1844,6 +1851,19 @@ static void net_rx_action(struct softirq } } out: +#ifdef CONFIG_NET_DMA + /* +* There may not be any more sk_buffs coming right now, so push +* any pending DMA copies to hardware +*/ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, net_dma_client-channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3298,6 +3318,88 @@ static int dev_cpu_callback(struct notif } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n
[PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies
Provides for pinning user space pages in memory, copying to iovecs, and copying from sk_buffs including fragmented and chained sk_buffs. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Makefile |3 drivers/dma/iovlock.c | 301 + include/linux/dmaengine.h | 22 +++ include/net/netdma.h |6 + net/core/Makefile |1 net/core/user_dma.c | 127 +++ 6 files changed, 459 insertions(+), 1 deletions(-) diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index c8a5f56..bdcfdbd 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,2 +1,3 @@ -obj-y += dmaengine.o +obj-$(CONFIG_DMA_ENGINE) += dmaengine.o +obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c new file mode 100644 index 000..5ed327e --- /dev/null +++ b/drivers/dma/iovlock.c @@ -0,0 +1,301 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. 
+ */ + +#include linux/dmaengine.h +#include linux/pagemap.h +#include net/tcp.h /* for memcpy_toiovec */ +#include asm/io.h +#include asm/uaccess.h + +int num_pages_spanned(struct iovec *iov) +{ + return + ((PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) - + ((unsigned long)iov-iov_base PAGE_MASK)) PAGE_SHIFT); +} + +/* + * Pin down all the iovec pages needed for len bytes. + * Return a struct dma_pinned_list to keep track of pages pinned down. + * + * We are allocating a single chunk of memory, and then carving it up into + * 3 sections, the latter 2 whose size depends on the number of iovecs and the + * total number of pages, respectively. + */ +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) +{ + struct dma_pinned_list *local_list; + struct page **pages; + int i; + int ret; + int nr_iovecs = 0; + int iovec_len_used = 0; + int iovec_pages_used = 0; + long err; + + /* don't pin down non-user-based iovecs */ + if (segment_eq(get_fs(), KERNEL_DS)) + return NULL; + + /* determine how many iovecs/pages there are, up front */ + do { + iovec_len_used += iov[nr_iovecs].iov_len; + iovec_pages_used += num_pages_spanned(iov[nr_iovecs]); + nr_iovecs++; + } while (iovec_len_used len); + + /* single kmalloc for pinned list, page_list[], and the page arrays */ + local_list = kmalloc(sizeof(*local_list) + + (nr_iovecs * sizeof (struct dma_page_list)) + + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); + if (!local_list) { + err = -ENOMEM; + goto out; + } + + /* list of pages starts right after the page list array */ + pages = (struct page **) local_list-page_list[nr_iovecs]; + + for (i = 0; i nr_iovecs; i++) { + struct dma_page_list *page_list = local_list-page_list[i]; + + len -= iov[i].iov_len; + + if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) { + err = -EFAULT; + goto unpin; + } + + page_list-nr_pages = num_pages_spanned(iov[i]); + page_list-base_address = iov[i].iov_base; + + page_list-pages = pages; + pages += 
page_list-nr_pages; + + /* pin pages down */ + down_read(current-mm-mmap_sem); + ret = get_user_pages( + current, + current-mm, + (unsigned long) iov[i].iov_base, + page_list-nr_pages, + 1, /* write */ + 0, /* force */ + page_list-pages, + NULL); + up_read(current-mm-mmap_sem
[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT
Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/skbuff.h |4 include/linux/tcp.h|8 include/net/sock.h |2 ++ include/net/tcp.h |7 +++ net/core/sock.c|6 ++ 5 files changed, 27 insertions(+), 0 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8f2347..23bad3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,6 +29,7 @@ #include linux/net.h #include linux/textsearch.h #include net/checksum.h +#include linux/dmaengine.h #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/ @@ -285,6 +286,9 @@ struct sk_buff { __u16 tc_verd;/* traffic control verdict */ #endif #endif +#ifdef CONFIG_NET_DMA + dma_cookie_tdma_cookie; +#endif /* These elements must be at the end, see alloc_skb() for details. */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542d395..c90daa5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,6 +18,7 @@ #define _LINUX_TCP_H #include linux/types.h +#include linux/dmaengine.h #include asm/byteorder.h struct tcphdr { @@ -233,6 +234,13 @@ struct tcp_sock { struct iovec*iov; int memory; int len; +#ifdef CONFIG_NET_DMA + /* members for async copy */ + struct dma_chan *dma_chan; + int wakeup; + struct dma_pinned_list *pinned_list; + dma_cookie_tdma_cookie; +#endif } ucopy; __u32 snd_wl1;/* Sequence for window update */ diff --git a/include/net/sock.h b/include/net/sock.h index c9fad6f..90c65cb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -132,6 +132,7 @@ struct sock_common { *@sk_receive_queue: incoming packets *@sk_wmem_alloc: transmit queue bytes committed *@sk_write_queue: Packet sending queue + *@sk_async_wait_queue: DMA copied packets *@sk_omem_alloc: o is option or other *@sk_wmem_queued: persistent queue size *@sk_forward_alloc: space allocated forward @@ -205,6 +206,7 @@ struct sock { atomic_tsk_omem_alloc; struct 
sk_buff_head sk_receive_queue; struct sk_buff_head sk_write_queue; + struct sk_buff_head sk_async_wait_queue; int sk_wmem_queued; int sk_forward_alloc; gfp_t sk_allocation; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c989db..d0c2c2f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -28,6 +28,7 @@ #include linux/cache.h #include linux/percpu.h #include linux/skbuff.h +#include linux/dmaengine.h #include net/inet_connection_sock.h #include net/inet_timewait_sock.h @@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(str tp-ucopy.len = 0; tp-ucopy.memory = 0; skb_queue_head_init(tp-ucopy.prequeue); +#ifdef CONFIG_NET_DMA + tp-ucopy.dma_chan = NULL; + tp-ucopy.wakeup = 0; + tp-ucopy.pinned_list = NULL; + tp-ucopy.dma_cookie = 0; +#endif } /* Packet is added to VJ-style prequeue for processing in process diff --git a/net/core/sock.c b/net/core/sock.c index ed2afdb..5d820c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock atomic_set(newsk-sk_omem_alloc, 0); skb_queue_head_init(newsk-sk_receive_queue); skb_queue_head_init(newsk-sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(newsk-sk_async_wait_queue); +#endif rwlock_init(newsk-sk_dst_lock); rwlock_init(newsk-sk_callback_lock); @@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, skb_queue_head_init(sk-sk_receive_queue); skb_queue_head_init(sk-sk_write_queue); skb_queue_head_init(sk-sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(sk-sk_async_wait_queue); +#endif sk-sk_send_head= NULL; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/tcp.h |2 ++ net/ipv4/tcp.c| 10 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d0c2c2f..578cccf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct extern voidtcp_rcv_space_adjust(struct sock *sk); +extern voidtcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b80..1c0cfd7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_ /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency tp-ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. 
*/ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) inet_csk_ack_scheduled(sk)) { icsk-icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val 1)) icsk-icsk_ack.pingpong = 1; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
Any socket recv of less than this amount will not be offloaded Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/sysctl.h |1 + include/net/tcp.h |1 + net/core/user_dma.c|4 net/ipv4/sysctl_net_ipv4.c | 10 ++ 4 files changed, 16 insertions(+), 0 deletions(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 76eaeff..cd9e7c0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index 578cccf..f1f4727 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 9eee91b..b7c98db 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -30,6 +30,10 @@ #include <linux/rtnetlink.h> /* for BUG_TRAP */ #include <net/tcp.h> +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. 
* @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3ad..6a6aa53 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = tcp_dma_copybreak, + .data = sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware
Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/sock.h | 13 - net/dccp/proto.c |4 ++-- net/ipv4/tcp.c |8 net/llc/af_llc.c |2 +- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 90c65cb..75b0e97 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, sk-sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(sk-sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, sk-sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee83..5317fd3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str } dccp_pr_debug(packet_type=%s\n, dccp_packet_name(dh-dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c0cfd7..4e067d2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_ break; } if (skb-h.th-fin) { - sk_eat_skb(sk, 
skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc-count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb-h.th-fin) goto found_fin_ok; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db7..7465170 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb * continue; if (!(flags MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] [I/OAT] Driver for the Intel(R) I/OAT DMA engine
Adds a new ioatdma driver Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig |9 drivers/dma/Makefile|1 drivers/dma/ioatdma.c | 839 +++ drivers/dma/ioatdma.h | 126 ++ drivers/dma/ioatdma_hw.h| 52 ++ drivers/dma/ioatdma_io.h| 118 + drivers/dma/ioatdma_registers.h | 126 ++ 7 files changed, 1271 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index f9ac4bc..0f15e76 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -10,4 +10,13 @@ config DMA_ENGINE DMA engines offload copy operations from the CPU to dedicated hardware, allowing the copies to happen asynchronously. +comment DMA Devices + +config INTEL_IOATDMA + tristate Intel I/OAT DMA support + depends on DMA_ENGINE PCI + default m + ---help--- + Enable support for the Intel(R) I/OAT DMA engine. + endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 10b7391..c8a5f56 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1 +1,2 @@ obj-y += dmaengine.o +obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c new file mode 100644 index 000..11d48b9 --- /dev/null +++ b/drivers/dma/ioatdma.c @@ -0,0 +1,839 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This driver supports an Intel I/OAT DMA engine, which does asynchronous + * copy operations. + */ + +#include linux/init.h +#include linux/module.h +#include linux/pci.h +#include linux/interrupt.h +#include linux/dmaengine.h +#include linux/delay.h +#include ioatdma.h +#include ioatdma_io.h +#include ioatdma_registers.h +#include ioatdma_hw.h + +#define to_ioat_chan(chan) container_of(chan, struct ioat_dma_chan, common) +#define to_ioat_device(dev) container_of(dev, struct ioat_device, common) +#define to_ioat_desc(lh) container_of(lh, struct ioat_desc_sw, node) + +/* internal functions */ +static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent); +static void __devexit ioat_remove(struct pci_dev *pdev); + +static int enumerate_dma_channels(struct ioat_device *device) +{ + u8 xfercap_scale; + u32 xfercap; + int i; + struct ioat_dma_chan *ioat_chan; + + device-common.chancnt = ioatdma_read8(device, IOAT_CHANCNT_OFFSET); + xfercap_scale = ioatdma_read8(device, IOAT_XFERCAP_OFFSET); + xfercap = (xfercap_scale == 0 ? 
-1 : (1UL xfercap_scale)); + + for (i = 0; i device-common.chancnt; i++) { + ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL); + if (!ioat_chan) { + device-common.chancnt = i; + break; + } + + ioat_chan-device = device; + ioat_chan-reg_base = device-reg_base + (0x80 * (i + 1)); + ioat_chan-xfercap = xfercap; + spin_lock_init(ioat_chan-cleanup_lock); + spin_lock_init(ioat_chan-desc_lock); + INIT_LIST_HEAD(ioat_chan-free_desc); + INIT_LIST_HEAD(ioat_chan-used_desc); + /* This should be made common somewhere in dmaengine.c */ + ioat_chan-common.device = device-common; + ioat_chan-common.client = NULL; + list_add_tail(ioat_chan-common.device_node, + device-common.channels); + } + return device-common.chancnt; +} + +static struct ioat_desc_sw *ioat_dma_alloc_descriptor( + struct ioat_dma_chan *ioat_chan, + int flags) +{ + struct ioat_dma_descriptor *desc; + struct ioat_desc_sw *desc_sw; + struct ioat_device *ioat_device; + dma_addr_t phys; + + ioat_device = to_ioat_device(ioat_chan-common.device); + desc = pci_pool_alloc(ioat_device-dma_pool, flags, phys); + if (unlikely(!desc)) + return NULL; + + desc_sw = kzalloc(sizeof(*desc_sw), flags); + if (unlikely(!desc_sw
Re: 2.6.16.13 e1000 reports incorrect PCI-X bus speed?
Any idea why 120 MHz is used instead of 133? It doesn't seem to matter in my performance tests, but I am curious... I think Rick is right, the bus between the bridge on the card and the e1000s is running at 120 MHz. - Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9] I/OAT network recv copy offload
A few changes after going over all the memory allocations, but mostly just keeping the patches up to date. This patch series is a full release of the Intel(R) I/O Acceleration Technology (I/OAT) for Linux. It includes an in-kernel API for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy engine, and changes to the TCP stack to offload copies of received networking data to application space. Changes from last posting: Fixed a struct ioat_dma_chan memory leak on driver unload. Changed a lock that was never held in atomic contexts to a mutex as part of avoiding unneeded GFP_ATOMIC allocations. These changes apply to Linus' tree as of commit 6810b548b25114607e0814612d84125abccc0a4f [PATCH] x86_64: Move ondemand timer into own work queue They are available to pull from git://63.64.152.142/~cleech/linux-2.6 ioat-2.6.17 There are 9 patches in the series: 1) The memcpy offload APIs and class code 2) The Intel I/OAT DMA driver (ioatdma) 3) Core networking code to setup networking as a DMA memcpy client 4) Utility functions for sk_buff to iovec offloaded copy 5) Structure changes needed for TCP receive offload 6) Rename cleanup_rbuf to tcp_cleanup_rbuf 7) Make sk_eat_skb aware of early copied packets 8) Add a sysctl to tune the minimum offloaded I/O size for TCP 9) The main TCP receive offload changes -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] I/OAT network recv copy offload
[I/OAT] Driver for the Intel(R) I/OAT DMA engine From: Chris Leech [EMAIL PROTECTED] Adds a new ioatdma driver Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig |9 drivers/dma/Makefile|1 drivers/dma/ioatdma.c | 839 +++ drivers/dma/ioatdma.h | 126 ++ drivers/dma/ioatdma_hw.h| 52 ++ drivers/dma/ioatdma_io.h| 118 + drivers/dma/ioatdma_registers.h | 128 ++ 7 files changed, 1273 insertions(+), 0 deletions(-) ioatdma_driver.gz Description: GNU Zip compressed data
[PATCH 9/9] [I/OAT] TCP recv offload to I/OAT
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech [EMAIL PROTECTED] --- net/ipv4/tcp.c | 103 -- net/ipv4/tcp_input.c | 74 +--- net/ipv4/tcp_ipv4.c | 18 - net/ipv6/tcp_ipv6.c | 12 +- 4 files changed, 185 insertions(+), 22 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e067d2..ff6ccda 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -263,7 +263,7 @@ #include net/tcp.h #include net/xfrm.h #include net/ip.h - +#include net/netdma.h #include asm/uaccess.h #include asm/ioctls.h @@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, stru target = sock_rcvlowat(sk, flags MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp-ucopy.dma_chan = NULL; + preempt_disable(); + if ((len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) + !sysctl_tcp_low_latency __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru } else sk_wait_data(sk, timeo); +#ifdef CONFIG_NET_DMA + tp-ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1345,39 @@ do_prequeue: } if (!(flags MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg-msg_iov, used); - if (err) { - /* Exception. Bailout! 
*/ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp-ucopy.dma_chan tp-ucopy.pinned_list) + tp-ucopy.dma_chan = get_softnet_dma(); + + if (tp-ucopy.dma_chan) { + tp-ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp-ucopy.dma_chan, skb, offset, + msg-msg_iov, used, + tp-ucopy.pinned_list); + + if (tp-ucopy.dma_cookie 0) { + + printk(KERN_ALERT dma_cookie 0\n); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb-len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg-msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1397,19 @@ skip_copy: if (skb-h.th-fin) goto found_fin_ok; - if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len 0); @@ -1386,6 +1432,36 @@ skip_copy: tp-ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp-ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp-ucopy.dma_chan
[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client
Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig | 12 + include/linux/netdevice.h |4 ++ include/net/netdma.h | 38 net/core/dev.c| 104 + 4 files changed, 158 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0f15e76..30d021d 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -10,6 +10,18 @@ config DMA_ENGINE DMA engines offload copy operations from the CPU to dedicated hardware, allowing the copies to happen asynchronously. +comment DMA Clients + +config NET_DMA + bool Network: TCP receive copy offload + depends on DMA_ENGINE NET + default y + ---help--- + This enables the use of DMA engines in the network stack to + offload receive copy-to-user operations, freeing CPU cycles. + Since this is the main user of the DMA engine, it should be enabled; + say Y here. + comment DMA Devices config INTEL_IOATDMA diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 309f919..06bcabc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -37,6 +37,7 @@ #include linux/config.h #include linux/device.h #include linux/percpu.h +#include linux/dmaengine.h struct divert_blk; struct vlan_group; @@ -594,6 +595,9 @@ struct softnet_data struct sk_buff *completion_queue; struct net_device backlog_dev;/* Sorry. 8) */ +#ifdef CONFIG_NET_DMA + struct dma_chan *net_dma; +#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/net/netdma.h b/include/net/netdma.h new file mode 100644 index 000..cbfe89d --- /dev/null +++ b/include/net/netdma.h @@ -0,0 +1,38 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef NETDMA_H +#define NETDMA_H +#include linux/config.h +#ifdef CONFIG_NET_DMA +#include linux/dmaengine.h + +static inline struct dma_chan *get_softnet_dma(void) +{ + struct dma_chan *chan; + rcu_read_lock(); + chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + if (chan) + dma_chan_get(chan); + rcu_read_unlock(); + return chan; +} +#endif /* CONFIG_NET_DMA */ +#endif /* NETDMA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 9ab3cfa..ab34006 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include net/iw_handler.h #include asm/current.h #include linux/audit.h +#include linux/dmaengine.h /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16];/* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. 
@@ -1844,6 +1851,19 @@ static void net_rx_action(struct softirq } } out: +#ifdef CONFIG_NET_DMA + /* +* There may not be any more sk_buffs coming right now, so push +* any pending DMA copies to hardware +*/ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, net_dma_client-channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3307,6 +3327,88 @@ static int dev_cpu_callback(struct notif } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n
[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/tcp.h |2 ++ net/ipv4/tcp.c| 10 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d0c2c2f..578cccf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct extern voidtcp_rcv_space_adjust(struct sock *sk); +extern voidtcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b80..1c0cfd7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_ /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency tp-ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. 
*/ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) inet_csk_ack_scheduled(sk)) { icsk-icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val 1)) icsk-icsk_ack.pingpong = 1; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware
Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/sock.h | 13 - net/dccp/proto.c |4 ++-- net/ipv4/tcp.c |8 net/llc/af_llc.c |2 +- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 90c65cb..75b0e97 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, sk-sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(sk-sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, sk-sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee83..5317fd3 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str } dccp_pr_debug(packet_type=%s\n, dccp_packet_name(dh-dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c0cfd7..4e067d2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_ break; } if (skb-h.th-fin) { - sk_eat_skb(sk, 
skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc-count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb-h.th-fin) goto found_fin_ok; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db7..7465170 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb * continue; if (!(flags MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] [I/OAT] Utility functions for offloading sk_buff to iovec copies
Provides for pinning user space pages in memory, copying to iovecs, and copying from sk_buffs including fragmented and chained sk_buffs. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Makefile |3 drivers/dma/iovlock.c | 301 + include/linux/dmaengine.h | 22 +++ include/net/netdma.h |6 + net/core/Makefile |1 net/core/user_dma.c | 127 +++ 6 files changed, 459 insertions(+), 1 deletions(-) diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index c8a5f56..bdcfdbd 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,2 +1,3 @@ -obj-y += dmaengine.o +obj-$(CONFIG_DMA_ENGINE) += dmaengine.o +obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c new file mode 100644 index 000..5ed327e --- /dev/null +++ b/drivers/dma/iovlock.c @@ -0,0 +1,301 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. 
+ */ + +#include linux/dmaengine.h +#include linux/pagemap.h +#include net/tcp.h /* for memcpy_toiovec */ +#include asm/io.h +#include asm/uaccess.h + +int num_pages_spanned(struct iovec *iov) +{ + return + ((PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) - + ((unsigned long)iov-iov_base PAGE_MASK)) PAGE_SHIFT); +} + +/* + * Pin down all the iovec pages needed for len bytes. + * Return a struct dma_pinned_list to keep track of pages pinned down. + * + * We are allocating a single chunk of memory, and then carving it up into + * 3 sections, the latter 2 whose size depends on the number of iovecs and the + * total number of pages, respectively. + */ +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) +{ + struct dma_pinned_list *local_list; + struct page **pages; + int i; + int ret; + int nr_iovecs = 0; + int iovec_len_used = 0; + int iovec_pages_used = 0; + long err; + + /* don't pin down non-user-based iovecs */ + if (segment_eq(get_fs(), KERNEL_DS)) + return NULL; + + /* determine how many iovecs/pages there are, up front */ + do { + iovec_len_used += iov[nr_iovecs].iov_len; + iovec_pages_used += num_pages_spanned(iov[nr_iovecs]); + nr_iovecs++; + } while (iovec_len_used len); + + /* single kmalloc for pinned list, page_list[], and the page arrays */ + local_list = kmalloc(sizeof(*local_list) + + (nr_iovecs * sizeof (struct dma_page_list)) + + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); + if (!local_list) { + err = -ENOMEM; + goto out; + } + + /* list of pages starts right after the page list array */ + pages = (struct page **) local_list-page_list[nr_iovecs]; + + for (i = 0; i nr_iovecs; i++) { + struct dma_page_list *page_list = local_list-page_list[i]; + + len -= iov[i].iov_len; + + if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) { + err = -EFAULT; + goto unpin; + } + + page_list-nr_pages = num_pages_spanned(iov[i]); + page_list-base_address = iov[i].iov_base; + + page_list-pages = pages; + pages += 
page_list-nr_pages; + + /* pin pages down */ + down_read(current-mm-mmap_sem); + ret = get_user_pages( + current, + current-mm, + (unsigned long) iov[i].iov_base, + page_list-nr_pages, + 1, /* write */ + 0, /* force */ + page_list-pages, + NULL); + up_read(current-mm-mmap_sem
[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT
Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/skbuff.h |4 include/linux/tcp.h|8 include/net/sock.h |2 ++ include/net/tcp.h |7 +++ net/core/sock.c|6 ++ 5 files changed, 27 insertions(+), 0 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8f2347..23bad3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,6 +29,7 @@ #include linux/net.h #include linux/textsearch.h #include net/checksum.h +#include linux/dmaengine.h #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/ @@ -285,6 +286,9 @@ struct sk_buff { __u16 tc_verd;/* traffic control verdict */ #endif #endif +#ifdef CONFIG_NET_DMA + dma_cookie_tdma_cookie; +#endif /* These elements must be at the end, see alloc_skb() for details. */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542d395..c90daa5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,6 +18,7 @@ #define _LINUX_TCP_H #include linux/types.h +#include linux/dmaengine.h #include asm/byteorder.h struct tcphdr { @@ -233,6 +234,13 @@ struct tcp_sock { struct iovec*iov; int memory; int len; +#ifdef CONFIG_NET_DMA + /* members for async copy */ + struct dma_chan *dma_chan; + int wakeup; + struct dma_pinned_list *pinned_list; + dma_cookie_tdma_cookie; +#endif } ucopy; __u32 snd_wl1;/* Sequence for window update */ diff --git a/include/net/sock.h b/include/net/sock.h index c9fad6f..90c65cb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -132,6 +132,7 @@ struct sock_common { *@sk_receive_queue: incoming packets *@sk_wmem_alloc: transmit queue bytes committed *@sk_write_queue: Packet sending queue + *@sk_async_wait_queue: DMA copied packets *@sk_omem_alloc: o is option or other *@sk_wmem_queued: persistent queue size *@sk_forward_alloc: space allocated forward @@ -205,6 +206,7 @@ struct sock { atomic_tsk_omem_alloc; struct 
sk_buff_head sk_receive_queue; struct sk_buff_head sk_write_queue; + struct sk_buff_head sk_async_wait_queue; int sk_wmem_queued; int sk_forward_alloc; gfp_t sk_allocation; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c989db..d0c2c2f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -28,6 +28,7 @@ #include linux/cache.h #include linux/percpu.h #include linux/skbuff.h +#include linux/dmaengine.h #include net/inet_connection_sock.h #include net/inet_timewait_sock.h @@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(str tp-ucopy.len = 0; tp-ucopy.memory = 0; skb_queue_head_init(tp-ucopy.prequeue); +#ifdef CONFIG_NET_DMA + tp-ucopy.dma_chan = NULL; + tp-ucopy.wakeup = 0; + tp-ucopy.pinned_list = NULL; + tp-ucopy.dma_cookie = 0; +#endif } /* Packet is added to VJ-style prequeue for processing in process diff --git a/net/core/sock.c b/net/core/sock.c index ed2afdb..5d820c3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock atomic_set(newsk-sk_omem_alloc, 0); skb_queue_head_init(newsk-sk_receive_queue); skb_queue_head_init(newsk-sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(newsk-sk_async_wait_queue); +#endif rwlock_init(newsk-sk_dst_lock); rwlock_init(newsk-sk_callback_lock); @@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, skb_queue_head_init(sk-sk_receive_queue); skb_queue_head_init(sk-sk_write_queue); skb_queue_head_init(sk-sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(sk-sk_async_wait_queue); +#endif sk-sk_send_head= NULL; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] [I/OAT] DMA memcpy subsystem
Provides an API for offloading memory copies to DMA devices Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/Kconfig |2 drivers/Makefile |1 drivers/dma/Kconfig | 13 + drivers/dma/Makefile |1 drivers/dma/dmaengine.c | 408 + include/linux/dmaengine.h | 337 + 6 files changed, 762 insertions(+), 0 deletions(-) diff --git a/drivers/Kconfig b/drivers/Kconfig index aeb5ab2..8b11ceb 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -72,4 +72,6 @@ source drivers/edac/Kconfig source drivers/rtc/Kconfig +source drivers/dma/Kconfig + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 447d8e6..3c51703 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_DMA_ENGINE) += dma/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig new file mode 100644 index 000..f9ac4bc --- /dev/null +++ b/drivers/dma/Kconfig @@ -0,0 +1,13 @@ +# +# DMA engine configuration +# + +menu DMA Engine support + +config DMA_ENGINE + bool Support for DMA engines + ---help--- + DMA engines offload copy operations from the CPU to dedicated + hardware, allowing the copies to happen asynchronously. + +endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile new file mode 100644 index 000..10b7391 --- /dev/null +++ b/drivers/dma/Makefile @@ -0,0 +1 @@ +obj-y += dmaengine.o diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c new file mode 100644 index 000..473c47b --- /dev/null +++ b/drivers/dma/dmaengine.c @@ -0,0 +1,408 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code implements the DMA subsystem. It provides a HW-neutral interface + * for other kernel code to use asynchronous memory copy capabilities, + * if present, and allows different HW DMA drivers to register as providing + * this capability. + * + * Due to the fact we are accelerating what is already a relatively fast + * operation, the code goes to great lengths to avoid additional overhead, + * such as locking. + * + * LOCKING: + * + * The subsystem keeps two global lists, dma_device_list and dma_client_list. + * Both of these are protected by a mutex, dma_list_mutex. + * + * Each device has a channels list, which runs unlocked but is never modified + * once the device is registered, it's just setup by the driver. + * + * Each client has a channels list, it's only modified under the client-lock + * and in an RCU callback, so it's safe to read under rcu_read_lock(). + * + * Each device has a kref, which is initialized to 1 when the device is + * registered. A kref_put is done for each class_device registered. When the + * class_device is released, the coresponding kref_put is done in the release + * method. Every time one of the device's channels is allocated to a client, + * a kref_get occurs. When the channel is freed, the coresponding kref_put + * happens. 
The device's release function does a completion, so + * unregister_device does a remove event, class_device_unregister, a kref_put + * for the first reference, then waits on the completion for all other + * references to finish. + * + * Each channel has an open-coded implementation of Rusty Russell's bigref, + * with a kref and a per_cpu local_t. A single reference is set when on an + * ADDED event, and removed with a REMOVE event. Net DMA client takes an + * extra reference per outstanding transaction. The relase function does a + * kref_put on the device. -ChrisL + */ + +#include linux/init.h +#include linux/module.h +#include linux/device.h +#include linux/dmaengine.h +#include linux/hardirq.h +#include linux/spinlock.h +#include linux/percpu.h +#include linux/rcupdate.h +#include linux/mutex.h + +static DEFINE_MUTEX(dma_list_mutex); +static LIST_HEAD(dma_device_list); +static LIST_HEAD(dma_client_list); + +/* --- sysfs
Re: Question on e1000 patch, rx-copy-break related.
On 5/3/06, Ben Greear [EMAIL PROTECTED] wrote: So, as of 2.6.16.13, is the hardware stripping (SERC) enabled? Could you also let me know where this bit is defined in case I want to twiddle it myself (a quick grep for SERC in 2.6.16.13 yields nothing.) You missed a C, it's SECRC (Strip Ethernet CRC) in the RCTL register or E1000_RCTL_SECRC. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/10] [IOAT] I/OAT patches repost
Netperf2 TOT now accesses the buffer that was just recv()'d rather than the one that is about to be recv()'d. We've posted netperf2 results with I/OAT enabled/disabled and the data access option on/off at http://kernel.org/pub/linux/kernel/people/grover/ioat/netperf-icb-1.5-postscaling-both.pdf. This link has also been added to the I/OAT page on the LinuxNet wiki. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9] I/OAT
Could you please describe how struct ioat_dma_chan channels are freed? Sorry, I got distracted by other issues and never ended up following up on this. You're right, and it's just sloppiness on my part for missing it, those structs are being leaked on module unload. I'll fix it. Thanks. -Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
Any socket recv of less than this amount will not be offloaded Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/sysctl.h |1 + include/net/tcp.h |1 + net/core/user_dma.c|4 net/ipv4/sysctl_net_ipv4.c | 10 ++ 4 files changed, 16 insertions(+), 0 deletions(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 76eaeff..cd9e7c0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index ca5bdaf..2e6fdef 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index ec177ef..642a3f3 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -33,6 +33,10 @@ #ifdef CONFIG_NET_DMA +#define NET_DMA_DEFAULT_COPYBREAK 1024 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. 
* @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3ad..6a6aa53 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = tcp_dma_copybreak, + .data = sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/9] [I/OAT] make sk_eat_skb I/OAT aware
Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/sock.h | 13 - net/dccp/proto.c |4 ++-- net/ipv4/tcp.c |8 net/llc/af_llc.c |2 +- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 190809c..e3723b6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1272,11 +1272,22 @@ sock_recv_timestamp(struct msghdr *msg, * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, sk-sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(sk-sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, sk-sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 1ff7328..35d7dfd 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, str } dccp_pr_debug(packet_type=%s\n, dccp_packet_name(dh-dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b10f78c..2346539 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_ break; } if (skb-h.th-fin) { - sk_eat_skb(sk, 
skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc-count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb-h.th-fin) goto found_fin_ok; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db7..7465170 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb * continue; if (!(flags MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len 0); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] I/OAT
[I/OAT] Driver for the Intel(R) I/OAT DMA engine From: Chris Leech [EMAIL PROTECTED] Adds a new ioatdma driver Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig |9 drivers/dma/Makefile|1 drivers/dma/ioatdma.c | 805 +++ drivers/dma/ioatdma.h | 126 ++ drivers/dma/ioatdma_hw.h| 52 +++ drivers/dma/ioatdma_io.h| 118 ++ drivers/dma/ioatdma_registers.h | 128 ++ 7 files changed, 1239 insertions(+), 0 deletions(-) ioatdma_driver.gz Description: GNU Zip compressed data
[PATCH 1/9] [I/OAT] DMA memcpy subsystem
Provides an API for offloading memory copies to DMA devices Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/Kconfig |2 drivers/Makefile |1 drivers/dma/Kconfig | 13 + drivers/dma/Makefile |1 drivers/dma/dmaengine.c | 405 + include/linux/dmaengine.h | 337 + 6 files changed, 759 insertions(+), 0 deletions(-) diff --git a/drivers/Kconfig b/drivers/Kconfig index 9f5c0da..f89ac05 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -72,4 +72,6 @@ source drivers/edac/Kconfig source drivers/rtc/Kconfig +source drivers/dma/Kconfig + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 4249552..9b808a6 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -74,3 +74,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_DMA_ENGINE) += dma/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig new file mode 100644 index 000..f9ac4bc --- /dev/null +++ b/drivers/dma/Kconfig @@ -0,0 +1,13 @@ +# +# DMA engine configuration +# + +menu DMA Engine support + +config DMA_ENGINE + bool Support for DMA engines + ---help--- + DMA engines offload copy operations from the CPU to dedicated + hardware, allowing the copies to happen asynchronously. + +endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile new file mode 100644 index 000..10b7391 --- /dev/null +++ b/drivers/dma/Makefile @@ -0,0 +1 @@ +obj-y += dmaengine.o diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c new file mode 100644 index 000..683456a --- /dev/null +++ b/drivers/dma/dmaengine.c @@ -0,0 +1,405 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code implements the DMA subsystem. It provides a HW-neutral interface + * for other kernel code to use asynchronous memory copy capabilities, + * if present, and allows different HW DMA drivers to register as providing + * this capability. + * + * Due to the fact we are accelerating what is already a relatively fast + * operation, the code goes to great lengths to avoid additional overhead, + * such as locking. + * + * LOCKING: + * + * The subsystem keeps two global lists, dma_device_list and dma_client_list. + * Both of these are protected by a spinlock, dma_list_lock. + * + * Each device has a channels list, which runs unlocked but is never modified + * once the device is registered, it's just setup by the driver. + * + * Each client has a channels list, it's only modified under the client-lock + * and in an RCU callback, so it's safe to read under rcu_read_lock(). + * + * Each device has a kref, which is initialized to 1 when the device is + * registered. A kref_put is done for each class_device registered. When the + * class_device is released, the coresponding kref_put is done in the release + * method. Every time one of the device's channels is allocated to a client, + * a kref_get occurs. When the channel is freed, the coresponding kref_put + * happens. 
The device's release function does a completion, so + * unregister_device does a remove event, class_device_unregister, a kref_put + * for the first reference, then waits on the completion for all other + * references to finish. + * + * Each channel has an open-coded implementation of Rusty Russell's bigref, + * with a kref and a per_cpu local_t. A single reference is set when on an + * ADDED event, and removed with a REMOVE event. Net DMA client takes an + * extra reference per outstanding transaction. The relase function does a + * kref_put on the device. -ChrisL + */ + +#include linux/init.h +#include linux/module.h +#include linux/device.h +#include linux/dmaengine.h +#include linux/hardirq.h +#include linux/spinlock.h +#include linux/percpu.h +#include linux/rcupdate.h + +static DEFINE_SPINLOCK(dma_list_lock); +static LIST_HEAD(dma_device_list); +static LIST_HEAD(dma_client_list); + +/* --- sysfs implementation --- */ + +static
[PATCH 3/9] [I/OAT] Setup the networking subsystem as a DMA client
Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig | 12 + include/linux/netdevice.h |4 ++ include/net/netdma.h | 38 net/core/dev.c| 104 + 4 files changed, 158 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0f15e76..30d021d 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -10,6 +10,18 @@ config DMA_ENGINE DMA engines offload copy operations from the CPU to dedicated hardware, allowing the copies to happen asynchronously. +comment DMA Clients + +config NET_DMA + bool Network: TCP receive copy offload + depends on DMA_ENGINE NET + default y + ---help--- + This enables the use of DMA engines in the network stack to + offload receive copy-to-user operations, freeing CPU cycles. + Since this is the main user of the DMA engine, it should be enabled; + say Y here. + comment DMA Devices config INTEL_IOATDMA diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 950dc55..7fda35f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -37,6 +37,7 @@ #include linux/config.h #include linux/device.h #include linux/percpu.h +#include linux/dmaengine.h struct divert_blk; struct vlan_group; @@ -592,6 +593,9 @@ struct softnet_data struct sk_buff *completion_queue; struct net_device backlog_dev;/* Sorry. 8) */ +#ifdef CONFIG_NET_DMA + struct dma_chan *net_dma; +#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/net/netdma.h b/include/net/netdma.h new file mode 100644 index 000..cbfe89d --- /dev/null +++ b/include/net/netdma.h @@ -0,0 +1,38 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#ifndef NETDMA_H +#define NETDMA_H +#include linux/config.h +#ifdef CONFIG_NET_DMA +#include linux/dmaengine.h + +static inline struct dma_chan *get_softnet_dma(void) +{ + struct dma_chan *chan; + rcu_read_lock(); + chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + if (chan) + dma_chan_get(chan); + rcu_read_unlock(); + return chan; +} +#endif /* CONFIG_NET_DMA */ +#endif /* NETDMA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index a3ab11f..ffd3d6d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include net/iw_handler.h #include asm/current.h #include linux/audit.h +#include linux/dmaengine.h /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16];/* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. 
@@ -1780,6 +1787,19 @@ static void net_rx_action(struct softirq } } out: +#ifdef CONFIG_NET_DMA + /* +* There may not be any more sk_buffs coming right now, so push +* any pending DMA copies to hardware +*/ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, net_dma_client-channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3243,6 +3263,88 @@ static int dev_cpu_callback(struct notif } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n
[PATCH 0/9] I/OAT
This patch series is a full release of the Intel(R) I/O Acceleration Technology (I/OAT) for Linux. It includes an in kernel API for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy engine, and changes to the TCP stack to offload copies of received networking data to application space. Changes from last posting: Fixed a page reference leak that happened when offloaded copies were set up but never used for a recv. Fixed the ioatdma self test to handle failures correctly. Serialized DMA ADD and REMOVE events in the networking core with a lock. Added a long comment in dmaengine.c to describe the locking and reference counting being used. Disabled preempt around a use of get_cpu_var. Made tcp_dma_try_early_copy static, it is only used in one file. Made some GFP_ATOMIC allocations GFP_KERNEL where safe to sleep. Made changes to sk_eat_skb, removing some ifdefs in the TCP code. These changes apply to DaveM's net-2.6.17 tree as of commit 68907dad58cd7ef11536e1db6baeb98b20af91b2 ([DCCP]: Use NULL for pointers, comfort sparse.) They are available to pull from git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17 There are 9 patches in the series: 1) The memcpy offload APIs and class code 2) The Intel I/OAT DMA driver (ioatdma) 3) Core networking code to setup networking as a DMA memcpy client 4) Utility functions for sk_buff to iovec offloaded copy 5) Structure changes needed for TCP receive offload 6) Rename cleanup_rbuf to tcp_cleanup_rbuf 7) Make sk_eat_skb aware of early copied packets 8) Add a sysctl to tune the minimum offloaded I/O size for TCP 9) The main TCP receive offload changes -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] [I/OAT] Structure changes for TCP recv offload to I/OAT
Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff. Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/skbuff.h |4 include/linux/tcp.h|8 include/net/sock.h |2 ++ include/net/tcp.h |7 +++ net/core/sock.c|6 ++ 5 files changed, 27 insertions(+), 0 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 613b951..76861a8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,6 +29,7 @@ #include linux/net.h #include linux/textsearch.h #include net/checksum.h +#include linux/dmaengine.h #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8)*/ @@ -285,6 +286,9 @@ struct sk_buff { __u16 tc_verd;/* traffic control verdict */ #endif #endif +#ifdef CONFIG_NET_DMA + dma_cookie_tdma_cookie; +#endif /* These elements must be at the end, see alloc_skb() for details. */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542d395..c90daa5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,6 +18,7 @@ #define _LINUX_TCP_H #include linux/types.h +#include linux/dmaengine.h #include asm/byteorder.h struct tcphdr { @@ -233,6 +234,13 @@ struct tcp_sock { struct iovec*iov; int memory; int len; +#ifdef CONFIG_NET_DMA + /* members for async copy */ + struct dma_chan *dma_chan; + int wakeup; + struct dma_pinned_list *pinned_list; + dma_cookie_tdma_cookie; +#endif } ucopy; __u32 snd_wl1;/* Sequence for window update */ diff --git a/include/net/sock.h b/include/net/sock.h index af2b054..190809c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -132,6 +132,7 @@ struct sock_common { *@sk_receive_queue: incoming packets *@sk_wmem_alloc: transmit queue bytes committed *@sk_write_queue: Packet sending queue + *@sk_async_wait_queue: DMA copied packets *@sk_omem_alloc: o is option or other *@sk_wmem_queued: persistent queue size *@sk_forward_alloc: space allocated forward @@ -205,6 +206,7 @@ struct sock { atomic_tsk_omem_alloc; struct 
sk_buff_head sk_receive_queue; struct sk_buff_head sk_write_queue; + struct sk_buff_head sk_async_wait_queue; int sk_wmem_queued; int sk_forward_alloc; gfp_t sk_allocation; diff --git a/include/net/tcp.h b/include/net/tcp.h index 9418f4d..54e4367 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -28,6 +28,7 @@ #include linux/cache.h #include linux/percpu.h #include linux/skbuff.h +#include linux/dmaengine.h #include net/inet_connection_sock.h #include net/inet_timewait_sock.h @@ -820,6 +821,12 @@ static inline void tcp_prequeue_init(str tp-ucopy.len = 0; tp-ucopy.memory = 0; skb_queue_head_init(tp-ucopy.prequeue); +#ifdef CONFIG_NET_DMA + tp-ucopy.dma_chan = NULL; + tp-ucopy.wakeup = 0; + tp-ucopy.pinned_list = NULL; + tp-ucopy.dma_cookie = 0; +#endif } /* Packet is added to VJ-style prequeue for processing in process diff --git a/net/core/sock.c b/net/core/sock.c index a96ea7d..d2acd35 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -818,6 +818,9 @@ struct sock *sk_clone(const struct sock atomic_set(newsk-sk_omem_alloc, 0); skb_queue_head_init(newsk-sk_receive_queue); skb_queue_head_init(newsk-sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(newsk-sk_async_wait_queue); +#endif rwlock_init(newsk-sk_dst_lock); rwlock_init(newsk-sk_callback_lock); @@ -1369,6 +1372,9 @@ void sock_init_data(struct socket *sock, skb_queue_head_init(sk-sk_receive_queue); skb_queue_head_init(sk-sk_write_queue); skb_queue_head_init(sk-sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(sk-sk_async_wait_queue); +#endif sk-sk_send_head= NULL; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/tcp.h |2 ++ net/ipv4/tcp.c| 10 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 54e4367..ca5bdaf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct extern voidtcp_rcv_space_adjust(struct sock *sk); +extern voidtcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 87f68e7..b10f78c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_ /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency tp-ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. 
*/ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1853,7 +1853,7 @@ static int do_tcp_setsockopt(struct sock (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) inet_csk_ack_scheduled(sk)) { icsk-icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val 1)) icsk-icsk_ack.pingpong = 1; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Fwd: I/OAT performance data
should have kept this on list -- Forwarded message -- From: Chris Leech [EMAIL PROTECTED] Date: Mar 16, 2006 11:13 AM Subject: Re: I/OAT performance data To: Rick Jones [EMAIL PROTECTED] I must be missing something - if the MTU was 1500 bytes, how did the receiver's offloaded copies get to the 2k level? Were several arriving TCP segments aggregated? Most of the overhead (get_user_pages) is per recv, not on a per packet basis. Regardless of packet size, we offload the copies if the total requested data amount is 2k or greater. So while there's no aggregation of TCP segments before the socket level, we are talking about copying multiple packets per I/O. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Fwd: I/OAT performance data
oops, should have kept this on list -- Forwarded message -- From: Chris Leech [EMAIL PROTECTED] Date: Mar 16, 2006 10:56 AM Subject: Re: I/OAT performance data To: Rick Jones [EMAIL PROTECTED] When it says buffer size for the Chariot stuff, is that the socket buffer size, or the size of the buffer(s) being passed to the transport? That's the I/O size for the application, being passed to the transport. Was the MTU 1500 or 9000 bytes? 1500 byte MTU Can the Chariot do small packet latency tests and/or aggregate small packet performance? Yes, but for small I/O the overhead of pinning pages and initiating the offloaded copy overtakes the benefits. We currently see that cutoff at about 2k. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/OAT performance data
On 3/16/06, Leonid Grossman [EMAIL PROTECTED] wrote: Hi Chris, Do you know what part of the performance delta is contributed by the offload for copy operations, and what part comes from other I/OAT features like header separation, etc. ? This is showing the offloaded copy as the only difference. Header separation is being used in the e1000 driver in both test runs. - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/OAT performance data
Thanks, that clarifies things. So, if I've understood correctly, the benefit kicks in when: 1) I/OAT is enabled :) 2) The user posts a recv() (or the like) of >= 2K 3) There is >= 2K of data available to give them yes? Yes - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: I/OAT performance data
On 3/16/06, Scott Feldman [EMAIL PROTECTED] wrote: Do you have any data to share on header split? Also, can other non- Intel nics use I/OAT copy, and if so, is header-split a requirement for the copy? I don't have any header-split data. The I/OAT copy offload will work for any TCP traffic, regardless of what kind of NIC it was received on (of course you need a system with the additional memcpy engine in the chipset) - Chris - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/8] Intel I/O Acceleration Technology (I/OAT)
This patch series is a full release of the Intel(R) I/O Acceleration Technology (I/OAT) for Linux. It includes an in kernel API for offloading memory copies to hardware, a driver for the I/OAT DMA memcpy engine, and changes to the TCP stack to offload copies of received networking data to application space. Changes from last week's posting: fixed return value from sysfs show functions as suggested by Joe Perches code style fixes suggested by Andrew Morton, David Miller, and others renamed anything related to pinning pages from lock/locked to pin/pinned renamed ioatdma register read/write functions with less generic names return a pinned list from dma_pin_iovec_pages instead of passing in a **dma_pinned_list replaced all cb/CB symbol prefixes in ioatdma with ioat/IOAT, CB was an abbreviation of an early code name use set_page_dirty_lock instead of SetPageDirty pointed out by Andrew Morton rename dma_async_try_early_copy to tcp_dma_try_early_copy and stop exporting I'll be focusing on reducing ifdefs and adding much needed comments, with another release early next week. These changes apply to DaveM's net-2.6.17 tree as of commit 32639ad6b7e3da27f233c0516471f0747f1178f5 ([SPARC]: Fixup SO_*SEC values on 32-bit sparc.) 
They are available to pull from git://198.78.49.142/~cleech/linux-2.6 ioat-2.6.17 There are 8 patches in the series: 1) The memcpy offload APIs and class code 2) The Intel I/OAT DMA driver (ioatdma) 3) Core networking code to setup networking as a DMA memcpy client 4) Utility functions for sk_buff to iovec offloaded copy 5) Structure changes needed for TCP receive offload 6) Rename cleanup_rbuf to tcp_cleanup_rbuf 7) Add a sysctl to tune the minimum offloaded I/O size for TCP 8) The main TCP receive offload changes -- Chris Leech [EMAIL PROTECTED] I/O Acceleration Technology Software Development LAN Access Division / Digital Enterprise Group - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/8] [I/OAT] DMA memcpy subsystem
Provides an API for offloading memory copies to DMA devices Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/Kconfig |2 drivers/Makefile |1 drivers/dma/Kconfig | 13 ++ drivers/dma/Makefile |1 drivers/dma/dmaengine.c | 360 + include/linux/dmaengine.h | 323 6 files changed, 700 insertions(+), 0 deletions(-) diff --git a/drivers/Kconfig b/drivers/Kconfig index bddf431..ce7ffa7 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -70,4 +70,6 @@ source drivers/sn/Kconfig source drivers/edac/Kconfig +source drivers/dma/Kconfig + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 5c69b86..516ba5e 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -73,3 +73,4 @@ obj-$(CONFIG_SGI_SN) += sn/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ +obj-$(CONFIG_DMA_ENGINE) += dma/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig new file mode 100644 index 000..f9ac4bc --- /dev/null +++ b/drivers/dma/Kconfig @@ -0,0 +1,13 @@ +# +# DMA engine configuration +# + +menu DMA Engine support + +config DMA_ENGINE + bool Support for DMA engines + ---help--- + DMA engines offload copy operations from the CPU to dedicated + hardware, allowing the copies to happen asynchronously. + +endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile new file mode 100644 index 000..10b7391 --- /dev/null +++ b/drivers/dma/Makefile @@ -0,0 +1 @@ +obj-y += dmaengine.o diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c new file mode 100644 index 000..35a63d8 --- /dev/null +++ b/drivers/dma/dmaengine.c @@ -0,0 +1,360 @@ +/* +Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 of the License, or (at your option) +any later version. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 +Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +The full GNU General Public License is included in this distribution in the +file called LICENSE. +*/ +#include linux/init.h +#include linux/module.h +#include linux/device.h +#include linux/dmaengine.h +#include linux/hardirq.h +#include linux/spinlock.h +#include linux/percpu.h +#include linux/rcupdate.h + +static DEFINE_SPINLOCK(dma_list_lock); +static LIST_HEAD(dma_device_list); +static LIST_HEAD(dma_client_list); + +/* --- sysfs implementation --- */ + +static ssize_t show_memcpy_count(struct class_device *cd, char *buf) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + unsigned long count = 0; + int i; + + for_each_cpu(i) + count += per_cpu_ptr(chan-local, i)-memcpy_count; + + return sprintf(buf, %lu\n, count); +} + +static ssize_t show_bytes_transferred(struct class_device *cd, char *buf) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + unsigned long count = 0; + int i; + + for_each_cpu(i) + count += per_cpu_ptr(chan-local, i)-bytes_transferred; + + return sprintf(buf, %lu\n, count); +} + +static ssize_t show_in_use(struct class_device *cd, char *buf) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + + return sprintf(buf, %d\n, (chan-client ? 
1 : 0)); +} + +static struct class_device_attribute dma_class_attrs[] = { + __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL), + __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL), + __ATTR(in_use, S_IRUGO, show_in_use, NULL), + __ATTR_NULL +}; + +static void dma_async_device_cleanup(struct kref *kref); + +static void dma_class_dev_release(struct class_device *cd) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + kref_put(chan-device-refcount, dma_async_device_cleanup); +} + +static struct class dma_devclass = { + .name= dma, + .class_dev_attrs = dma_class_attrs, + .release = dma_class_dev_release, +}; + +/* --- client and device registration --- */ + +/** + * dma_client_chan_alloc - try to allocate a channel to a client + * @client: dma_client + * + * Called
[PATCH 3/8] [I/OAT] Setup the networking subsystem as a DMA client
Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech [EMAIL PROTECTED] --- drivers/dma/Kconfig | 12 + include/linux/netdevice.h |6 +++ include/net/netdma.h | 37 + net/core/dev.c| 100 + 4 files changed, 155 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 0f15e76..30d021d 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -10,6 +10,18 @@ config DMA_ENGINE DMA engines offload copy operations from the CPU to dedicated hardware, allowing the copies to happen asynchronously. +comment DMA Clients + +config NET_DMA + bool Network: TCP receive copy offload + depends on DMA_ENGINE NET + default y + ---help--- + This enables the use of DMA engines in the network stack to + offload receive copy-to-user operations, freeing CPU cycles. + Since this is the main user of the DMA engine, it should be enabled; + say Y here. + comment DMA Devices config INTEL_IOATDMA diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 950dc55..25d8610 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -37,6 +37,9 @@ #include linux/config.h #include linux/device.h #include linux/percpu.h +#ifdef CONFIG_NET_DMA +#include linux/dmaengine.h +#endif struct divert_blk; struct vlan_group; @@ -592,6 +595,9 @@ struct softnet_data struct sk_buff *completion_queue; struct net_device backlog_dev;/* Sorry. 8) */ +#ifdef CONFIG_NET_DMA + struct dma_chan *net_dma; +#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/net/netdma.h b/include/net/netdma.h new file mode 100644 index 000..6435aef --- /dev/null +++ b/include/net/netdma.h @@ -0,0 +1,37 @@ +/* +Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation; either version 2 of the License, or (at your option) +any later version. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 +Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +The full GNU General Public License is included in this distribution in the +file called LICENSE. +*/ +#ifndef NETDMA_H +#define NETDMA_H +#ifdef CONFIG_NET_DMA +#include linux/dmaengine.h + +static inline struct dma_chan *get_softnet_dma(void) +{ + struct dma_chan *chan; + rcu_read_lock(); + chan = rcu_dereference(__get_cpu_var(softnet_data.net_dma)); + if (chan) + dma_chan_get(chan); + rcu_read_unlock(); + return chan; +} +#endif /* CONFIG_NET_DMA */ +#endif /* NETDMA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index f7f6f99..d7e61b4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include linux/wireless.h/* Note : will define WIRELESS_EXT */ #include net/iw_handler.h #endif /* CONFIG_NET_RADIO */ +#include linux/dmaengine.h #include asm/current.h /* @@ -149,6 +150,11 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16];/* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. 
@@ -1750,6 +1756,19 @@ static void net_rx_action(struct softirq } } out: +#ifdef CONFIG_NET_DMA + /* +* There may not be any more sk_buffs coming right now, so push +* any pending DMA copies to hardware +*/ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, net_dma_client-channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3205,6 +3224,85 @@ static int dev_cpu_callback(struct notif } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU
[PATCH 6/8] [I/OAT] Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static
Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/tcp.h |2 ++ net/ipv4/tcp.c| 10 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 610f66b..afc4b8a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -296,6 +296,8 @@ extern int tcp_rcv_established(struct extern voidtcp_rcv_space_adjust(struct sock *sk); +extern voidtcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4b0272c..9122520 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -936,7 +936,7 @@ static int tcp_recv_urg(struct sock *sk, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1085,7 +1085,7 @@ int tcp_read_sock(struct sock *sk, read_ /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1219,7 +1219,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency tp-ucopy.task == user_recv) { /* Install new reader */ @@ -1390,7 +1390,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. 
*/ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1852,7 +1852,7 @@ static int do_tcp_setsockopt(struct sock (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) inet_csk_ack_scheduled(sk)) { icsk-icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val 1)) icsk-icsk_ack.pingpong = 1; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/net/netdma.h |1 net/ipv4/tcp.c | 110 +- net/ipv4/tcp_input.c | 74 ++ net/ipv4/tcp_ipv4.c | 18 net/ipv6/tcp_ipv6.c | 12 + 5 files changed, 193 insertions(+), 22 deletions(-) diff --git a/include/net/netdma.h b/include/net/netdma.h index feb499f..3d9c222 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -38,6 +38,7 @@ static inline struct dma_chan *get_softn int dma_skb_copy_datagram_iovec(struct dma_chan* chan, const struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_pinned_list *pinned_list); +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen); #endif /* CONFIG_NET_DMA */ #endif /* NETDMA_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9122520..a277398 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,7 +262,7 @@ #include net/tcp.h #include net/xfrm.h #include net/ip.h - +#include net/netdma.h #include asm/uaccess.h #include asm/ioctls.h @@ -1109,6 +1109,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1132,6 +1133,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru target = sock_rcvlowat(sk, flags MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp-ucopy.dma_chan = NULL; + if ((len sysctl_tcp_dma_copybreak) !(flags MSG_PEEK) !sysctl_tcp_low_latency __get_cpu_var(softnet_data.net_dma)) + tp-ucopy.pinned_list = dma_pin_iovec_pages(msg-msg_iov, len); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1273,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru } else sk_wait_data(sk, timeo); +#ifdef CONFIG_NET_DMA + tp-ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1328,13 +1339,39 @@ do_prequeue: } if (!(flags MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg-msg_iov, 
used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp-ucopy.dma_chan tp-ucopy.pinned_list) + tp-ucopy.dma_chan = get_softnet_dma(); + + if (tp-ucopy.dma_chan) { + tp-ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp-ucopy.dma_chan, skb, offset, + msg-msg_iov, used, + tp-ucopy.pinned_list); + + if (tp-ucopy.dma_cookie 0) { + + printk(KERN_ALERT dma_cookie 0\n); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb-len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg-msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1354,15 +1391,33 @@ skip_copy: if (skb-h.th-fin) goto found_fin_ok; - if (!(flags MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags MSG_PEEK)) { + if (!copied_early) + sk_eat_skb(sk, skb); +#ifdef CONFIG_NET_DMA + else { + __skb_unlink(skb, sk-sk_receive_queue); + __skb_queue_tail(sk-sk_async_wait_queue, skb); + copied_early = 0; + } +#endif
[PATCH 7/8] [I/OAT] Add a sysctl for tuning the I/OAT offloaded I/O threshold
Any socket recv of less than this amount will not be offloaded Signed-off-by: Chris Leech [EMAIL PROTECTED] --- include/linux/sysctl.h |1 + include/net/tcp.h |1 + net/core/user_dma.c|4 net/ipv4/sysctl_net_ipv4.c | 10 ++ 4 files changed, 16 insertions(+), 0 deletions(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 76eaeff..cd9e7c0 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index afc4b8a..f319368 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -221,6 +221,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 24e51eb..a85d1f1 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -33,6 +33,10 @@ file called LICENSE. #ifdef CONFIG_NET_DMA +#define NET_DMA_DEFAULT_COPYBREAK 1024 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. 
* @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3ad..6a6aa53 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = tcp_dma_copybreak, + .data = sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif { .ctl_name = 0 } }; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html