RE: [PATCH net-next] cxgb4: replace ntoh{s,l} and hton{s,l} calls with the generic byteorder
Ignore this. Needs some correction. Using incorrect byte-ordering at 2 places. Will send a V2 for the same. Thanks, Hari From: Hariprasad Shenai [haripra...@chelsio.com] Sent: Tuesday, May 19, 2015 3:43 PM To: netdev@vger.kernel.org Cc: da...@davemloft.net; Casey Leedom; Nirranjan Kirubaharan; Hariprasad S Subject: [PATCH net-next] cxgb4: replace ntoh{s,l} and hton{s,l} calls with the generic byteorder replace ntoh{s,l} and hton{s,l} calls with the generic byteorder in cxgb4/t4_hw.c file Signed-off-by: Hariprasad Shenai haripra...@chelsio.com --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 403 - 1 file changed, 219 insertions(+), 184 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index c626252..e16f406 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -214,8 +214,8 @@ static void fw_asrt(struct adapter *adap, u32 mbox_addr) get_mbox_rpl(adap, (__be64 *)asrt, sizeof(asrt) / 8, mbox_addr); dev_alert(adap-pdev_dev, FW assertion at %.16s:%u, val0 %#x, val1 %#x\n, - asrt.u.assert.filename_0_7, ntohl(asrt.u.assert.line), - ntohl(asrt.u.assert.x), ntohl(asrt.u.assert.y)); + asrt.u.assert.filename_0_7, be32_to_cpu(asrt.u.assert.line), + be32_to_cpu(asrt.u.assert.x), be32_to_cpu(asrt.u.assert.y)); } static void dump_mbox(struct adapter *adap, int mbox, u32 data_reg) @@ -378,7 +378,7 @@ int t4_mc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc) #define MC_DATA(i) MC_BIST_STATUS_REG(mc_bist_status_rdata, i) for (i = 15; i = 0; i--) - *data++ = htonl(t4_read_reg(adap, MC_DATA(i))); + *data++ = cpu_to_be32(t4_read_reg(adap, MC_DATA(i))); if (ecc) *ecc = t4_read_reg64(adap, MC_DATA(16)); #undef MC_DATA @@ -435,7 +435,7 @@ int t4_edc_read(struct adapter *adap, int idx, u32 addr, __be32 *data, u64 *ecc) #define EDC_DATA(i) (EDC_BIST_STATUS_REG(edc_bist_status_rdata, i)) for (i = 15; i = 0; i--) - *data++ = htonl(t4_read_reg(adap, EDC_DATA(i))); + *data++ = cpu_to_be32(t4_read_reg(adap, EDC_DATA(i))); if (ecc) *ecc = t4_read_reg64(adap, EDC_DATA(16)); #undef EDC_DATA @@ -1618,7 +1618,7 @@ int t4_read_flash(struct adapter *adapter, unsigned int addr, if (ret) return ret; if (byte_oriented) - *data = (__force __u32) (htonl(*data)); + *data = (__force __u32)(cpu_to_be32(*data)); } return 0; } @@ -1979,7 +1979,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) FW image size not multiple of 512 bytes\n); return -EINVAL; } - if (ntohs(hdr-len512) * 512 != size) { + if ((unsigned int)be16_to_cpu(hdr-len512) * 512 != size) { dev_err(adap-pdev_dev, FW image size differs from size in FW header\n); return -EINVAL; @@ -1993,7 +1993,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) return -EINVAL; for (csum = 0, i = 0; i size / sizeof(csum); i++) - csum += ntohl(p[i]); + csum += be32_to_cpu(p[i]); if (csum != 0x) { dev_err(adap-pdev_dev, @@ -2012,7 +2012,7 @@ int t4_load_fw(struct adapter *adap, const u8 *fw_data, unsigned int size) * first page with a bad version. */ memcpy(first_page, fw_data, SF_PAGE_SIZE); - ((struct fw_hdr *)first_page)-fw_ver = htonl(0x); + ((struct fw_hdr *)first_page)-fw_ver = cpu_to_be32(0x); ret = t4_write_flash(adap, fw_img_start, SF_PAGE_SIZE, first_page); if (ret) goto out; @@ -2107,19 +2107,22 @@ int t4_link_start(struct adapter *adap, unsigned int mbox, unsigned int port, fc |= FW_PORT_CAP_FC_TX; memset(c, 0, sizeof(c)); - c.op_to_portid = htonl(FW_CMD_OP_V(FW_PORT_CMD) | FW_CMD_REQUEST_F | - FW_CMD_EXEC_F | FW_PORT_CMD_PORTID_V(port)); - c.action_to_len16 = htonl(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) | - FW_LEN16(c)); + c.op_to_portid = cpu_to_be32(FW_CMD_OP_V(FW_PORT_CMD) | +FW_CMD_REQUEST_F | FW_CMD_EXEC_F | +FW_PORT_CMD_PORTID_V(port)); + c.action_to_len16 = + cpu_to_be32(FW_PORT_CMD_ACTION_V(FW_PORT_ACTION_L1_CFG) | + FW_LEN16(c)); if (!(lc-supported FW_PORT_CAP_ANEG)) { - c.u.l1cfg.rcap = htonl((lc-supported ADVERT_MASK) | fc); +
pull-request: mac80211 2015-05-19
Hi Dave, And one more - for the current cycle (and stable as well). Let me know if there's any problem. johannes The following changes since commit ff419b3f95ab7a97c5f72876b53f12a249dacc2a: mac80211: fix 90 kernel-doc warnings (2015-05-04 12:56:13 +0200) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/jberg/mac80211.git tags/mac80211-for-davem-2015-05-19 for you to fetch changes up to 47b4e1fc4972cc43a19121bc2608a60aef3bf216: mac80211: move WEP tailroom size check (2015-05-11 14:51:29 +0200) This has just a single fix, for a WEP tailroom check problem that leads to dropped frames. Janusz Dziedzic (1): mac80211: move WEP tailroom size check net/mac80211/wep.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Experimental new bonding driver mode=batman
On Tue, May 19, 2015 at 02:09:43AM -0400, Patrick Simmons wrote: I've written a new mode for the kernel bonding driver. It's similar to the round-robin mode, but it keeps statistics on TCP resends so as to favor slave devices with more bandwidth when choosing where to send packets. I've tested it on two laptops using WiFi/Ethernet and it seems to work okay, but it should be considered experimental. A description of how is the mode supposed to work would be definitely helpful. Below are some comments from a quick look over the patch, I didn't think too much about the logic of the operation. +struct batnode +{ + struct batnode* next_batnode; + char slvname[IFNAMSIZ]; + unsigned long jiffiestamp; + u32 tcp_seq_num; + u16 port_id; +}; + +struct slave_congestion_node +{ + struct slave_congestion_node* next_slave; + char this_slave[IFNAMSIZ]; + u32 congestion_counter; +}; Referencing the slave by its name doesn't seem very efficient, especially if you are going to do it in the xmit handler. +#define BATTABLE_SIZE 256 +static struct batnode battable[BATTABLE_SIZE]; +static struct slave_congestion_node* slavelist_head_ptr = NULL; Why one global list rather than one list per bond? And why do you need your own list in addition to already existing slave structures? +static struct slave_congestion_node* get_slave_congestion_node(char* slvname) +{ + //printk(In get_slave_congestion_node: 0x%zx\n,the_slave); + struct slave_congestion_node* cur_slave = slavelist_head_ptr; + while(cur_slave) + { + if(strcmp(cur_slave-this_slave,slvname)==0) + { + //printk(Found a match: 0x%zx\n,cur_slave); + return cur_slave; + } + + if(!cur_slave-next_slave) + break; + cur_slave = cur_slave-next_slave; + } + + //Create new slave node + if(!cur_slave) //special case head + { + cur_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); + slavelist_head_ptr = cur_slave; + } + else + { + cur_slave-next_slave = (struct slave_congestion_node*)(kmalloc(sizeof(struct slave_congestion_node),GFP_ATOMIC)); + cur_slave = cur_slave-next_slave; + } You don't handle kmalloc() failure. + + //Initialize new slave node + cur_slave-next_slave = NULL; + strlcpy(cur_slave-this_slave,slvname,IFNAMSIZ); + cur_slave-congestion_counter = 1; + + //printk(Created new congestion node: 0x%zx\n,cur_slave); + return cur_slave; +} + +static void slave_congestion_increment(char* the_slave) +{ + get_slave_congestion_node(the_slave)-congestion_counter++; +} + +static void slave_congestion_decrement(char* the_slave) +{ + get_slave_congestion_node(the_slave)-congestion_counter--; +} Should these three really be in a header file? +static DEFINE_SPINLOCK(batlock); A lot of effort has been done recently to get rid of per-bond spinlocks. Adding a global one kind of goes against this work. @@ -1082,6 +1088,69 @@ static bool bond_should_deliver_exact_ma return false; } +static void batman_normalize_congestion(struct bonding* bond, int* congestion) +{ + static long unsigned int jiffiestamp = 0; + long unsigned int jiffie_temp = jiffies; + struct slave* slave; + struct list_head* i; + if(jiffie_temp!=jiffiestamp jiffie_temp%1000==0) //every thousand packets, normalize congestion Looks more like every 1000 jiffies, not packets. And what if this function doesn't get called in that one jiffy when jiffies is a multiple of 1000? (Similar problem later in bond_xmit_batman().) + printk(KERN_DEBUG batman: congestion normalization has occurred.\n); Use pr_debug() or netdev_dbg(). +static int batman_timer_invoke(void* bond_) +{ + struct bonding* bond = (struct bonding*)(bond_); + struct slave* slave; + struct list_head* i; + + while(!kthread_should_stop()) + { + unsigned long batflags; + long unsigned int jiffiestamp = jiffies; + + msleep(10*1000); + rcu_read_lock(); + spin_lock_irqsave(batlock,batflags); + + long unsigned int jiffie_temp = jiffies; + if(!(jiffie_temp - jiffiestamp 5 * HZ)) + bond_for_each_slave_rcu(bond,slave,i) + { +struct slave_congestion_node* congest_current = get_slave_congestion_node(slave-dev-name); +if(congest_current-congestion_counter = 2) + congest_current-congestion_counter/=2; + } + + spin_unlock_irqrestore(batlock,batflags); + rcu_read_unlock(); + } +} This seems to only run something periodically, why do you need a (per bond) kthread instead of a simple timer? @@
Re: [Xen-devel] [RFC PATCH 12/13] xen-netfront: implement TX persistent grants
On 18 May 2015, at 17:55, David Vrabel david.vra...@citrix.com wrote: On 12/05/15 18:18, Joao Martins wrote: Instead of grant/revoking the buffer related to the skb, it will use an already granted page and memcpy to it. The grants will be mapped by xen-netback and reused overtime, but only unmapped when the vif disconnects, as opposed to every packet. This only happens if the backend supports persistent grants since it would, otherwise, introduce the overhead of a memcpy on top of the grant map. [...] --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c [...] @@ -1610,7 +1622,10 @@ static int xennet_init_queue(struct netfront_queue *queue) for (i = 0; i NET_TX_RING_SIZE; i++) { skb_entry_set_link(queue-tx_skbs[i], i+1); queue-grant_tx[i].ref = GRANT_INVALID_REF; -queue-grant_tx[i].page = NULL; +if (queue-info-feature_persistent) +queue-grant_tx[i].page = alloc_page(GFP_NOIO); Need to check for alloc failure here and unwind correctly? Sorry, I overlooked this check. I will fix that. Why NOIO? May be I am misusing NOIO where I meant __GFP_WAIT. Tough given we are under rtnl_lock() perhaps GFP_ATOMIC should be used instead.-- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] vhost: support upto 509 memory regions
On Mon, 18 May 2015 19:22:34 +0300 Andrey Korolyov and...@xdel.ru wrote: On Wed, Feb 18, 2015 at 7:27 AM, Michael S. Tsirkin m...@redhat.com wrote: On Tue, Feb 17, 2015 at 04:53:45PM -0800, Eric Northup wrote: On Tue, Feb 17, 2015 at 4:32 AM, Michael S. Tsirkin m...@redhat.com wrote: On Tue, Feb 17, 2015 at 11:59:48AM +0100, Paolo Bonzini wrote: On 17/02/2015 10:02, Michael S. Tsirkin wrote: Increasing VHOST_MEMORY_MAX_NREGIONS from 65 to 509 to match KVM_USER_MEM_SLOTS fixes issue for vhost-net. Signed-off-by: Igor Mammedov imamm...@redhat.com This scares me a bit: each region is 32byte, we are talking a 16K allocation that userspace can trigger. What's bad with a 16K allocation? It fails when memory is fragmented. How does kvm handle this issue? It doesn't. Paolo I'm guessing kvm doesn't do memory scans on data path, vhost does. qemu is just doing things that kernel didn't expect it to need. Instead, I suggest reducing number of GPA-HVA mappings: you have GPA 1,5,7 map them at HVA 11,15,17 then you can have 1 slot: 1-11 To avoid libc reusing the memory holes, reserve them with MAP_NORESERVE or something like this. This works beautifully when host virtual address bits are more plentiful than guest physical address bits. Not all architectures have that property, though. AFAIK this is pretty much a requirement for both kvm and vhost, as we require each guest page to also be mapped in qemu memory. We can discuss smarter lookup algorithms but I'd rather userspace didn't do things that we then have to work around in kernel. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Hello, any chance of getting the proposed patch in the mainline? Though it seems that most users will not suffer from relatively slot number ceiling (they can decrease slot 'granularity' for larger VMs and vice-versa), fine slot size, 256M or even 128M, with the large number of slots can be useful for a certain kind of tasks for an orchestration systems. I`ve made a backport series of all seemingly interesting memslot-related improvements to a 3.10 branch, is it worth to be tested with straighforward patch like one from above, with simulated fragmentation of allocations in host? I'm almost done with approach suggested by Paolo, i.e. replace linear search with faster/scalable lookup alg. Hope to post it soon. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] net: fix a double free issue for neighbour entry
On Wed, 2015-05-20 at 09:25 +0800, Ying Xue wrote: Calling __ipv4_neigh_lookup_noref() inside rcu_read_lock_bh() can guarantee that its searched neighbour entry is not freed before RCU grace period, but it cannot ensure that its obtained neighbour will be freed shortly. Exactly saying, it cannot prevent neigh_destroy() from being executed on another context at the same time. For example, if ip_finish_output2() continues to deliver a SKB with a neighbour entry whose refcount is zero, neigh_add_timer() may be called in neigh_resolve_output() subsequently. As a result, neigh_add_timer() takes refcount on the neighbour that already had a refcount of zero. When the neighbour refcount is put before the timer's handler is exited, neigh_destroy() will be called again, meaning crash happens at the moment. To prevent the issue from occurring, we must check whether the refcount of a neighbour searched by __ipv4_neigh_lookup_noref() is decremented to zero or not. If it's zero, we should create a new one. However, as reading neigh's refcount is unsafe through atomic_read() like it doesn't imply any memory barrier and the cost is too expensive if we enforce a proper implicit or explicit memory barrier on it, another checking of identifying whether neigh's dead flag is set or not is involved into __neigh_event_send() to further prevent neigh_add_timer() from holding a neigh's refcount that already hit zero, thereby avoiding what the issue cannot absolutely happen. Reported-by: Joern Engel jo...@logfs.org Cc: Eric Dumazet eduma...@google.com Signed-off-by: Ying Xue ying@windriver.com --- v2: - As Eric pointed that identifying whether neigh's refcnt is zero through atomic_read() is unsafe, another condition checking of verifying neigh's dead flag is set is involved into __neigh_event_send() to further prevent neigh_add_timer() from holding a neigh's refcnt that already hit zero. - Now the patch is created based on net tree considering it's a very fatal issue. net/core/neighbour.c |3 +++ net/ipv4/ip_output.c |2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3de6542..c7a675c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -958,6 +958,9 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) if (neigh-nud_state (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) goto out_unlock_bh; + if (neigh-dead) + goto out_unlock_bh; + if (!(neigh-nud_state (NUD_STALE | NUD_INCOMPLETE))) { if (NEIGH_VAR(neigh-parms, MCAST_PROBES) + NEIGH_VAR(neigh-parms, APP_PROBES)) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c65b93a..5889774 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -200,7 +200,7 @@ static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) rcu_read_lock_bh(); nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)-daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); - if (unlikely(!neigh)) + if (unlikely(!neigh || !atomic_read(neigh-refcnt))) neigh = __neigh_create(arp_tbl, nexthop, dev, false); if (!IS_ERR(neigh)) { int res = dst_neigh_output(dst, neigh, skb); Sorry, this atomic_read() makes no sense to me. When rcu is used, this is perfectly fine to use an object which refcount is 0. If you believe the opposite, then point me to relevant Documentation/RCU/ file. refcount should only protect the object itself, not the use of it by a rcu reader. This has nothing to do with rcu but standard refcounting of objects. The only forbidden thing would be to try to take a reference on it with atomic_inc() instead of the more careful atomic_inc_not_zero(), if the caller needs to exit the rcu_read_lock() section safely (as explained in Documentation/RCU/rcuref.txt around line 52) So far, dst_neigh_output() will not try to take a refcnt. By the time you check atomic_read() the result is meaningless if another cpu decrements the refcount to 0. So what is the point, trying to reduce a race window but not properly remove it ? I repeat again : using atomic_read() in a rcu lookup is absolutely useless and is a clear sign you missed a fundamental issue. So now, we are back to the patch I initially sent, but you did not address Julian Anastasov feedback. Really I am not very happy... -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 net-next 0/4] rocker: transaction fixes
Hi, this series addresses what appear to be errors in the handling of prepare and then commit transactions in the rocker driver. In all cases the problem is that data structures visible outside of the transaction are modified during the prepare phase. In the case of the first two patches this results in the kernel reporting a BUG. I have noted test-cases in the change logs. The third patch is also a bug fix, as noted by Toshiaki Makita, however I have not been able to reliably reproduce the problem and thus have not provided a test case. The last patch is a correctness fix that does not fix a bug that manifests as far as I can tell. Changes: v2-v3 * rocker: do not make neighbour entry changes when preparing transactions - Correct inverted logic - Added ack from Scott Feldman Changes: v1-v2 * rocker: do not make neighbour entry changes when preparing transactions - Revised changelog to reflect information from Toshiaki Makita that there is a bug that can manifest - Update address and ttl regardless of the value of the transaction state * All other patches - Added acks from Scott Feldman Simon Horman (4): rocker: do not delete fdb entries in rocker_port_fdb_flush() when preparing transactions rocker: do not modify fdb table in rocker_port_fdb() when preparing transactions rocker: do not make neighbour entry changes when preparing transactions rocker: make rocker_port_internal_vlan_id_{get,put}() non-transactional drivers/net/ethernet/rocker/rocker.c | 46 +++- 1 file changed, 24 insertions(+), 22 deletions(-) -- 2.1.4 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: iptables: ensure number of counters is 0 in do_replace()
From: Dave Jones da...@codemonkey.org.uk Date: Tue, 19 May 2015 19:42:52 -0400 What's the preferred format, 8 separate patches, or 1 all-in-one diff for all instances of this bug ? Personally, I think you could do this in one patch. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC 12/23] xen: Extend page_to_mfn to take an offset in the page
Hi David, On 19/05/15 14:57, David Vrabel wrote: On 14/05/15 18:00, Julien Grall wrote: With 64KB page granularity support in Linux, a page will be split accross multiple MFN (Xen is using 4KB page granularity). Thoses MFNs may not be contiguous. With the offset in the page, the helper will be able to know which MFN the driver needs to retrieve. I think a gnttab_grant_foreign_access_ref()-like helper that takes a page would be better. You will probably want this helper able to return/fill a set of refs for 64 KiB pages. I will see what I can do. Although, I think this patch is still valid to avoid wrong usage with 64KB page granularity by the caller. The developer may think that MFN are contiguous which is not always true. Regards, -- Julien Grall -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH V6 05/10] audit: log creation and deletion of namespace instances
On 15/05/16, Paul Moore wrote: On Sat, May 16, 2015 at 10:46 AM, Eric W. Biederman ebied...@xmission.com wrote: Paul Moore p...@paul-moore.com writes: On Sat, May 16, 2015 at 5:46 AM, Daniel J Walsh dwa...@redhat.com wrote: On 05/15/2015 05:05 PM, Paul Moore wrote: On Thursday, May 14, 2015 11:23:09 PM Andy Lutomirski wrote: On Thu, May 14, 2015 at 7:32 PM, Richard Guy Briggs r...@redhat.com wrote: On 15/05/14, Paul Moore wrote: * Look at our existing audit records to determine which records should have namespace and container ID tokens added. We may only want to add the additional fields in the case where the namespace/container ID tokens are not the init namespace. If we have a record that ties a set of namespace IDs with a container ID, then I expect we only need to list the containerID along with auid and sessionID. The problem here is that the kernel has no concept of a container, and I don't think it makes any sense to add one just for audit. Container is a marketing term used by some userspace tools. I can imagine that both audit could benefit from a concept of a namespace *path* that understands nesting (e.g. root/2/5/1 or something along those lines). Mapping these to containers belongs in userspace, I think. It might be helpful to climb up a few levels in this thread ... I think we all agree that containers are not a kernel construct. I further believe that the kernel has no business generating container IDs, those should come from userspace and will likely be different depending on how you define container. However, what is less clear to me at this point is how the kernel should handle the setting, reporting, and general management of this container ID token. Wouldn't the easiest thing be to just treat add a containerid to the process context like auid. I believe so. At least that was the point I was trying to get across when I first jumped into this thread. It sounds nice but containers are not just a per process construct. Sometimes you might know anamespace but not which process instigated action to happen on that namespace. From an auditing perspective I'm not sure we will ever hit those cases; did you have a particular example in mind? The example that immediately came to mind when I first read Eric's comment was a packet coming in off a network in a particular network namespace. That could narrow it down to a subset of containers based on which network namespace it inhabits, but since it isn't associated with a particular task yet (other than a kernel thread) it will not be possible to select the precise nsproxy, let alone the container. paul moore - RGB -- Richard Guy Briggs rbri...@redhat.com Senior Software Engineer, Kernel Security, AMER ENG Base Operating Systems, Red Hat Remote, Ottawa, Canada Voice: +1.647.777.2635, Internal: (81) 32635, Alt: +1.613.693.0684x3545 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 06/13] xen-netback: copy buffer on xenvif_start_xmit()
On Tue, May 12, 2015 at 07:18:30PM +0200, Joao Martins wrote: By introducing persistent grants we speed up the RX thread with the decreased copy cost, that leads to a throughput decrease of 20%. It is observed that the rx_queue stays mostly at 10% of its capacity, as opposed to full capacity when using grant copy. And a finer measure with lock_stat (below with pkt_size 64, burst 1) shows much higher wait queue contention on queue-wq, which hints that the RX kthread is waits/wake_up more often, that is actually doing work. Without persistent grants: class namecon-bouncescontentions waittime-min waittime-max waittime-total waittime-avgacq-bounces acquisitions holdtime-min holdtime-max holdtime-total holdtime-avg -- queue-wq: 792792 0.36 24.36 1140.30 1.44 42081002671 0.00 46.75 538164.02 0.54 -- queue-wq326 [8115949f] __wake_up+0x2f/0x80 queue-wq410 [811592bf] finish_wait+0x4f/0xa0 queue-wq 56 [811593eb] prepare_to_wait+0x2b/0xb0 -- queue-wq202 [811593eb] prepare_to_wait+0x2b/0xb0 queue-wq467 [8115949f] __wake_up+0x2f/0x80 queue-wq123 [811592bf] finish_wait+0x4f/0xa0 With persistent grants: queue-wq: 61834 61836 0.32 30.12 99710.27 1.61 2414001125308 0.00 75.61 1106578.82 0.98 -- queue-wq 5079[8115949f] __wake_up+0x2f/0x80 queue-wq56280[811592bf] finish_wait+0x4f/0xa0 queue-wq 479[811593eb] prepare_to_wait+0x2b/0xb0 -- queue-wq 1005[811592bf] finish_wait+0x4f/0xa0 queue-wq56761[8115949f] __wake_up+0x2f/0x80 queue-wq 4072[811593eb] prepare_to_wait+0x2b/0xb0 Also, with persistent grants, we don't require batching grant copy ops (besides the initial copy+map) which makes me believe that deferring the skb to the RX kthread just adds up unnecessary overhead (for this particular case). This patch proposes copying the buffer on xenvif_start_xmit(), which lets us both remove the contention on queue-wq and lock on rx_queue. Here, an alternative to xenvif_rx_action routine is added namely xenvif_rx_map() that maps and copies the buffer to the guest. This is only used when persistent grants are used, since it would otherwise mean an hypercall per packet. Improvements are up to a factor of 2.14 with a single queue getting us from 1.04 Mpps to 1.7 Mpps (burst 1, pkt_size 64) and 1.5 to 2.6 Mpps (burst 2, pkt_size 64) compared to using the kthread. Maximum with grant copy is 1.2 Mpps, irrespective of the burst. All of this, measured on an Intel(R) Xeon(R) CPU E5-1650 v2 @ 3.50GHz. Signed-off-by: Joao Martins joao.mart...@neclab.eu --- drivers/net/xen-netback/common.h| 2 ++ drivers/net/xen-netback/interface.c | 11 +--- drivers/net/xen-netback/netback.c | 52 + 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 23deb6a..f3ece12 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -363,6 +363,8 @@ void xenvif_kick_thread(struct xenvif_queue *queue); int xenvif_dealloc_kthread(void *data); +int xenvif_rx_map(struct xenvif_queue *queue, struct sk_buff *skb); + void xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb); /* Determine whether the needed number of slots (req) are available, diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 1103568..dfe2b7b 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -109,7 +109,8 @@ static irqreturn_t xenvif_rx_interrupt(int irq, void *dev_id) { struct xenvif_queue *queue = dev_id; - xenvif_kick_thread(queue); + if (!queue-vif-persistent_grants) + xenvif_kick_thread(queue); return IRQ_HANDLED; } @@ -168,8 +169,12 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev) cb = XENVIF_RX_CB(skb); cb-expires = jiffies + vif-drain_timeout; - xenvif_rx_queue_tail(queue, skb); - xenvif_kick_thread(queue); + if (!queue-vif-persistent_grants) { + xenvif_rx_queue_tail(queue, skb); + xenvif_kick_thread(queue); + } else if (xenvif_rx_map(queue, skb)) { + return NETDEV_TX_BUSY; + } We now have two different functions for guest RX, one is xenvif_rx_map, the other is xenvif_rx_action. They look very similar. Can we only have one? return
Re: [RFC PATCH 02/13] xen-netback: xenbus feature persistent support
On Tue, May 12, 2015 at 07:18:26PM +0200, Joao Martins wrote: Checks for feature-persistent that indicates persistent grants support. Adds max_persistent_grants module param that specifies the max number of persistent grants, which if set to zero disables persistent grants. Signed-off-by: Joao Martins joao.mart...@neclab.eu This patch needs to be moved later. The feature needs to be implemented first. Also you need to patch netif.h to document this new feature. To do this, you need to patch the master netif.h in Xen tree then sync the change to Linux. Of course I don't mind if we discuss wording in the Linux copy then you devise a patch for Xen. Wei. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next V4 00/12] net/mlx5: ConnectX-4 100G Ethernet driver
From: Amir Vadai am...@mellanox.com Date: Tue, 19 May 2015 12:25:12 +0300 On Sun, May 17, 2015 at 8:05 PM, David Miller da...@davemloft.net wrote: From: Amir Vadai am...@mellanox.com Date: Sun, 17 May 2015 16:02:11 +0300 We didn't get a response yet regarding your comment about the irq renaming [3]. Well then, please hold off on resubmissions of this series until you do get a response and that issue is firmly resolved. Hi, I don't mean to push you, I only want to understand what is expected from me and what are the next steps: How will the issue be resolved? Do you plan to answer my question [1] from last week, and just too busy right now or something like that? I have not seen any response to me explaining why it's ok to change the IRQ name strings in the context where this will occur. Once you explain that, we can make forward progress, but only at that point. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 1/2] Renesas Ethernet AVB driver proper
From: Sergei Shtylyov sergei.shtyl...@cogentembedded.com Date: Sun, 17 May 2015 23:52:42 +0300 +/* Packet transmit function for Ethernet AVB */ +static netdev_tx_t ravb_start_xmit(struct sk_buff *skb, struct net_device *ndev) +{ ... + spin_lock_irqsave(priv-lock, flags); + if (priv-cur_tx[q] - priv-dirty_tx[q] priv-num_tx_ring[q]) { + if (!ravb_tx_free(ndev, q)) { + netif_warn(priv, tx_queued, ndev, TX FD exhausted.\n); + netif_stop_queue(ndev); + spin_unlock_irqrestore(priv-lock, flags); + return NETDEV_TX_BUSY; + } This is not an acceptable way to manage your transmit queue state. When your transmit queue fills up and you cannot accept the next arbitrary packet into your TX queue, you must stop the queue before returning from this function. Therefore, only error conditions should cause NETDEV_TX_BUSY to ever be returned from this routine. I know what argument you are going to make to me in response to this email. You are going to tell me that you have these two queues, one for tstamp packets and one for the rest, and therefore that model of TX queue management can't be made to work. That's not acceptable. Instead, you should classify packets to the different queues using -ndo_select_queue() and therefore have multiple TX queues, which you can management the fullness of independantly. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Intel-wired-lan] [PATCH] pci: Limit VPD reads for all Intel Ethernet devices
On May 19, 2015, at 10:50 AM, Alexander Duyck alexander.h.du...@redhat.com wrote: These two patches are very much related. They are only related because I saw an opportunity to do this while working on the other issue. That is the only relationship. snip material on the other patch Artificially limiting the size of the VPD does nothing but cut off possibly useful data, you would be better of providing all of the data on only the first function than providing only partial data on all functions and adding extra lock overhead. This limit only limits the maximum that the OS will read to what is architecturally possible in these devices. Yes, PCIe architecturally provides for the possibility of more, but these devices do not. More repeating data can be read, however slowly, but there is no possibility of useful content beyond the first 1K. If this limit were set to 0x100, which is more in line what the actual usage is, it would be an artificial limit, but at 1K it is not. Oh and it does include devices made by others that incorporate Intel Ethernet silicon, not just Intel-built devices. As per section 3.4.4 of the X540 datasheet the upper addressable range for the VPD section is 0xFFF which means the upper limit for the hardware is 0x1000, not 0x400. Ok. I have no problem changing it to that. I had been looking at other specs, but 0x1000 really is a hard limit. snip more material mostly relating to the other patch -- Mark Rustad, Networking Division, Intel Corporation signature.asc Description: Message signed with OpenPGP using GPGMail
Re: [PATCH] net-next: ethtool: Added port speed macros.
From: Ben Hutchings b...@decadent.org.uk Date: Tue, 19 May 2015 18:00:35 +0100 On Mon, 2015-05-18 at 11:46 -0400, David Miller wrote: From: Parav Pandit parav.pan...@avagotech.com Date: Mon, 18 May 2015 16:31:47 +0530 Signed-off-by: Parav Pandit parav.pan...@avagotech.com I thought we had decided that we weren't going to keep adding convenience macros for new speeds, and were simply going to use the appropriate constants in the future. Ben? That's what I thought, but you accepted commit dcf972a334dd ethtool, net/mlx4_en: Add 100M, 20G, 56G speeds ethtool reporting support not so long ago. Grumble... my bad. Given that I should probably accept this patch set as well, hopefully we won't get very many more of these any time soon. Sorry for creating this mess. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] net-next: ethtool: Added port speed macros.
From: Parav Pandit parav.pan...@avagotech.com Date: Tue, 19 May 2015 10:15:49 +0530 On Mon, May 18, 2015 at 9:16 PM, David Miller da...@davemloft.net wrote: From: Parav Pandit parav.pan...@avagotech.com Date: Mon, 18 May 2015 16:31:47 +0530 Signed-off-by: Parav Pandit parav.pan...@avagotech.com I thought we had decided that we weren't going to keep adding convenience macros for new speeds, and were simply going to use the appropriate constants in the future. I added them to accommodate any speed/PHY interface specific configuration to make use of it. If convenience macros are not needed, I will provide the code cleanup patch to remove existing convenience macros. With that we will have unified code for old and new constants. I will anyway let Ben comment on it. You cannot remove the existing macros, because they are exported to userspace and you'll break application compilation if you do that. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pull-request: mac80211-next 2015-05-19
From: Johannes Berg johan...@sipsolutions.net Date: Tue, 19 May 2015 09:20:39 +0200 Hi Dave, My last pull request introduced two bugs, and we found two more that were present earlier, so I have four bug fixes now. Please pull. Pulled, thanks Johannes. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net v3 2/2] ipv6: fix ECMP route replacement
From: Michal Kubecek mkube...@suse.cz Date: Mon, 18 May 2015 20:54:00 +0200 (CEST) When replacing an IPv6 multipath route with ip route replace, i.e. NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first matching route without fixing its siblings, resulting in corrupted siblings linked list; removing one of the siblings can then end in an infinite loop. IPv6 ECMP implementation is a bit different from IPv4 so that route replacement cannot work in exactly the same way. This should be a reasonable approximation: 1. If the new route is ECMP-able and there is a matching ECMP-able one already, replace it and all its siblings (if any). 2. If the new route is ECMP-able and no matching ECMP-able route exists, replace first matching non-ECMP-able (if any) or just add the new one. 3. If the new route is not ECMP-able, replace first matching non-ECMP-able route (if any) or add the new route. We also need to remove the NLM_F_REPLACE flag after replacing old route(s) by first nexthop of an ECMP route so that each subsequent nexthop does not replace previous one. Fixes: 51ebd3181572 (ipv6: add support of equal cost multipath (ECMP)) Signed-off-by: Michal Kubecek mkube...@suse.cz Can I get some reviews please from interested parties? As far as I can tell, this faithfully implements the policy we decided upon at the end of the previous thread for this patch. But I could be wrong :-) Thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 3/4] rocker: do not make neighbour entry changes when preparing transactions
On Tue, May 19, 2015 at 09:11:29AM -0700, Scott Feldman wrote: On Mon, May 18, 2015 at 11:24 PM, Simon Horman simon.hor...@netronome.com wrote: rocker_port_ipv4_nh() and in turn rocker_port_ipv4_neigh() may be be called with trans == SWITCHDEV_TRANS_PREPARE and then trans == SWITCHDEV_TRANS_COMMIT from switchdev_port_obj_set() via fib_table_insert(). Although I have been unable to find a scenario where a bug manifests I do see some incorrect behaviour when adding a fib entry with a gateway. The first time that rocker_port_ipv4_nh() is called, with trans == SWITCHDEV_TRANS_PREPARE, _rocker_neigh_add() adds a new entry to the neigh table. And the second time rocker_port_ipv4_nh() is called, with trans == SWITCHDEV_TRANS_COMMIT, that entry is found and _rocker_neigh_update() is called. Assuming the transaction runs to completion this appears to be harmless. I suspect it would be not correct if the transaction was aborted. This problem does not appear to affect deletion as my analysis is that deletion is always performed with trans == SWITCHDEV_TRANS_NONE. For completeness _rocker_neigh_{add,del,prepare} are updated not to manipulate fib table entries if trans == SWITCHDEV_TRANS_PREPARE. Fixes: c4f20321d968 (rocker: support prepare-commit transaction model) Signed-off-by: Simon Horman simon.hor...@netronome.com --- drivers/net/ethernet/rocker/rocker.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) [cut] @@ -2928,8 +2933,11 @@ static void _rocker_neigh_del(struct rocker_port *rocker_port, } static void _rocker_neigh_update(struct rocker_neigh_tbl_entry *entry, +enum switchdev_trans trans, u8 *eth_dst, bool ttl_check) { + if (trans == SWITCHDEV_TRANS_PREPARE) + return; On this one, let move the trans==PREPARE test to the else branch, in case someone down the line is dependent on eth_dst and ttl_check being copied over. Everything else in the patch looks good. Thanks Scott, I'll adjust things as you suggest. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v3 for-next 05/13] IB/cm: Reference count ib_cm_ids
I find Haggai's argument compelling, it is a very small amount of code and data to add a sharing count, and a very large amount to duplicate the whole service id map into cma.c. I get wanting to share the listen list, but we end up with this: drivers/infiniband/core/cm.c | 129 +- that's more code than what the ib_cm module uses to track listens, and the result is a solution that ends up being split in a weird fashion between the ib_cm, iw_cm (TBD), and rdma_cm. I wonder if the existing ib_cm interface is even what we want. Currently, the rdma_cm pushes the private data (i.e. IP address) comparison into the ib_cm. This is only used by the rdma_cm. Should that instead be moved out of the ib_cm and handled in the rdma_cm? And then update the ib_cm to support multiple listens on the same service id. For example, the ib_cm could simply queue all listen requests on the same service id. When a REQ arrives, it just calls each listener back until one 'claims' the REQ. The destruction of a listener could then be synced with the callback. From what I can tell, the current proposal requires that the ib_cm user be prepared to receive a REQ callback for a listen that it has already destroyed. If needed, a flag can be added to the ib_cm_listen to indicate if the service id is shared/exclusive. - Sean -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Default XPS settings in mlx4 driver
Hi Ido, I'm looking at your patch net/mlx4_en: Configure the XPS queue mapping on driver load. We're testing a 40 CPU system and it looks like XPS is being configured by default with forty queues 0-39 where each xps_cpus is (1 i). The problem is that this does not easily align with RX queues and the TX completion interrupt for TX queue z happens in interrupt for (z % num_rx_queues). So it looks like, the default XPS doesn't respect the RX interrupt affinities, so we have TX queues that are bound to a CPU on one numa node but have their TX completions happen on another which is suboptimal. It would be nice if the default XPS could take where the completion interrupt happens into account somehow. Looks like there's a few other drivers that are setting XPS, they might have similar issue. Thanks, Tom -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Default XPS settings in mlx4 driver
From: Tom Herbert t...@herbertland.com Date: Tue, 19 May 2015 13:38:35 -0700 Looks like there's a few other drivers that are setting XPS, they might have similar issue. We really need generic infrastructure for such configuration attempts so that the default policy across all such drivers is consistent. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] net/ipv6/udp: Fix ipv6 multicast socket filter regression
From: Henning Rogge hro...@gmail.com Date: Mon, 18 May 2015 21:08:49 +0200 Commit 5cf3d46192fc (udp: Simplify__udp*_lib_mcast_deliver) simplified the filter for incoming IPv6 multicast but removed the check of the local socket address and the UDP destination address. This patch restores the filter to prevent sockets bound to a IPv6 multicast IP to receive other UDP traffic link unicast. Signed-off-by: Henning Rogge hro...@gmail.com Fixes: 5cf3d46192fc (udp: Simplify__udp*_lib_mcast_deliver) Applied and queued up for -stable, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next v3] tcp: add rfc3168, section 6.1.1.1. fallback
From: Daniel Borkmann dan...@iogearbox.net Date: Tue, 19 May 2015 21:33:42 +0200 This work as a follow-up of commit f7b3bec6f516 (net: allow setting ecn via routing table) and adds RFC3168 section 6.1.1.1. fallback for outgoing ECN connections. In other words, this work adds a retry with a non-ECN setup SYN packet, as suggested from the RFC on the first timeout: [...] A host that receives no reply to an ECN-setup SYN within the normal SYN retransmission timeout interval MAY resend the SYN and any subsequent SYN retransmissions with CWR and ECE cleared. [...] ... Reference: https://www.ietf.org/proceedings/92/slides/slides-92-iccrg-1.pdf Reference: https://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf Signed-off-by: Daniel Borkmann dan...@iogearbox.net Signed-off-by: Florian Westphal f...@strlen.de Signed-off-by: Mirja Kühlewind mirja.kuehlew...@tik.ee.ethz.ch Signed-off-by: Brian Trammell tramm...@tik.ee.ethz.ch Cc: Eric Dumazet eduma...@google.com Cc: Dave Täht dave.t...@gmail.com --- v2 - v3: - Very sorry. Typo happened in Dave's name since v1, getting it right this time, no bad intentions. ;) v1 - v2: - Added suggestion from Eric to let ecn_flags be cleared eventually in tcp_ecn_rcv_synack(), thanks! - Rest as is. Applied, thanks everyone. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] ip: remove unused function prototype
From: Andy Zhou az...@nicira.com Date: Tue, 19 May 2015 12:41:47 -0700 ip_do_nat() function was removed prior to kernel 3.4. Remove the unnecessary function prototype as well. Reported-by: Florian Westphal f...@strlen.de Signed-off-by: Andy Zhou az...@nicira.com Applied, thank you. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next] tcp: Return error instead of partial read for saved syn headers
From: Eric B Munson emun...@akamai.com Date: Mon, 18 May 2015 14:35:58 -0400 Currently the getsockopt() requesting the cached contents of the syn packet headers will fail silently if the caller uses a buffer that is too small to contain the requested data. Rather than fail silently and discard the headers, getsockopt() should return an error and report the required size to hold the data. Signed-off-by: Eric B Munson emun...@akamai.com Applied, thanks Eric. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] be2net: make hwmon interface optional
From: Arnd Bergmann a...@arndb.de Date: Mon, 18 May 2015 23:06:45 +0200 The hwmon interface in the be2net driver causes a link error when be2net is built-in while the hwmon subsystem is a loadable module: drivers/built-in.o: In function `be_probe': drivers/net/ethernet/emulex/benet/be_main.c:5761: undefined reference to `devm_hwmon_device_register_with_groups' This adds a new Kconfig symbol, following the example of multiple other drivers that have the same problem. The new CONFIG_BE2NET_HWMON will not be available when (BE2NET=y HWMON=m) to avoid this problem. We have to also mark be_hwmon_show_temp as 'static' to ensure the compiler can optimize out all the unused code. Signed-off-by: Arnd Bergmann a...@arndb.de Fixes: 29e9122b3a (be2net: Export board temperature using hwmon-sysfs interface.) Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net 1/2] tcp: only undo on partial ACKs in CA_Loss
From: Yuchung Cheng ych...@google.com Date: Mon, 18 May 2015 12:31:44 -0700 Undo based on TCP timestamps should only happen on ACKs that advance SND_UNA, according to the Eifel algorithm in RFC 3522: Section 3.2: (4) If the value of the Timestamp Echo Reply field of the acceptable ACK's Timestamps option is smaller than the value of RetransmitTS, then proceed to step (5), Section Terminology: We use the term 'acceptable ACK' as defined in [RFC793]. That is an ACK that acknowledges previously unacknowledged data. This is because upon receiving an out-of-order packet, the receiver returns the last timestamp that advances RCV_NXT, not the current timestamp of the packet in the DUPACK. Without checking the flag, the DUPACK will cause tcp_packet_delayed() to return true and tcp_try_undo_loss() will revert cwnd reduction. Note that we check the condition in CA_Recovery already by only calling tcp_try_undo_partial() if FLAG_SND_UNA_ADVANCED is set or tcp_try_undo_recovery() if snd_una crosses high_seq. Signed-off-by: Yuchung Cheng ych...@google.com Signed-off-by: Neal Cardwell ncardw...@google.com Applied. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net 2/2] tcp: don't over-send F-RTO probes
From: Yuchung Cheng ych...@google.com Date: Mon, 18 May 2015 12:31:45 -0700 After sending the new data packets to probe (step 2), F-RTO may incorrectly send more probes if the next ACK advances SND_UNA and does not sack new packet. However F-RTO RFC 5682 probes at most once. This bug may cause sender to always send new data instead of repairing holes, inducing longer HoL blocking on the receiver for the application. Signed-off-by: Yuchung Cheng ych...@google.com Signed-off-by: Neal Cardwell ncardw...@google.com Applied. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] irda: irda-usb: use msecs_to_jiffies for conversions
From: Joe Perches j...@perches.com Date: Tue, 19 May 2015 07:32:15 -0700 On Tue, 2015-05-19 at 12:51 +0200, Nicholas Mc Guire wrote: Converting milliseconds to jiffies by val * HZ / 1000 is technically is not a clean solution as it does not handle all corner cases correctly. By changing the conversion to use msecs_to_jiffies(val) conversion is correct in all cases. in the current code: mod_timer(self-rx_defer_timer, jiffies + (10 * HZ / 1000)); for HZ 100 (e.g. CONFIG_HZ == 64|32 in alpha) this effectively results in no delay at all. [] diff --git a/drivers/net/irda/irda-usb.c b/drivers/net/irda/irda-usb.c [] @@ -848,7 +848,9 @@ static void irda_usb_receive(struct urb *urb) * Jean II */ self-rx_defer_timer.function = irda_usb_rx_defer_expired; self-rx_defer_timer.data = (unsigned long) urb; -mod_timer(self-rx_defer_timer, jiffies + (10 * HZ / 1000)); +mod_timer(self-rx_defer_timer, + jiffies + (msecs_to_jiffies(10))); The unnecessary () around msecs_to_jiffies could be removed. Agreed. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pull-request: mac80211 2015-05-19
From: Johannes Berg johan...@sipsolutions.net Date: Tue, 19 May 2015 09:22:34 +0200 And one more - for the current cycle (and stable as well). Let me know if there's any problem. Also pulled, thanks Johannes. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 for-next 05/12] IB/cm: Share listening CM IDs
On Tue, May 19, 2015 at 12:35:45PM -0600, Jason Gunthorpe wrote: On Sun, May 17, 2015 at 08:51:01AM +0300, Haggai Eran wrote: @@ -212,6 +212,8 @@ struct cm_id_private { spinlock_t lock;/* Do not acquire inside cm.lock */ struct completion comp; atomic_t refcount; + /* Number of clients sharing this ib_cm_id. Only valid for listeners. */ + atomic_t sharecount; No need for this atomic, hold the lock The use of the atomic looks racy: + if (!atomic_dec_and_test(cm_id_priv-sharecount)) { + /* The id is still shared. */ + return; + } Might race with this: + if (atomic_inc_return(cm_id_priv-sharecount) == 1) { + /* This ID is already being destroyed */ + atomic_dec(cm_id_priv-sharecount); + goto new_id; + } + Resulting in use-after-free of cm_id_priv-sharecount Actually, there is something else odd here.. I mentioned the above because there wasn't obvious ref'ing on the cm_id_priv. Looking closer the cm.lock should prevent use-after-free, but there is still no ref. The more I look at this, the more I think it is sketchy. Don't try and merge sharecount and refcount together, after cm_find_listen is called you have to increment the refcount before dropping cm.lock. Decrement the refcount when destroying a shared listen. I also don't see how the 'goto new_id' can work, if cm_find_listen succeeds then __ib_cm_listen is guarenteed to fail. Fix the locking to make that impossible, associate sharecount with the cm.lock and, rework how cm_destroy_id grabs the cm_id_priv-lock spinlock: case IB_CM_LISTEN: spin_lock_irq(cm.lock); if (cm_id_priv-sharecount != 0) { cm_id_prv-sharecount--; // paired with in in ib_cm_id_create_and_listen atomic_dec(cm_id_priv-refcount); spin_unlock_irq(cm.lock); return; } rb_erase(cm_id_priv-service_node, cm.listen_service_table); spin_unlock_irq(cm.lock); spin_lock_irq(cm_id_priv-lock); cm_id-state = IB_CM_IDLE; spin_unlock_irq(cm_id_priv-lock); break; Now that condition is eliminated, the unneeded atomic is gone, and refcount still acts like a proper kref should. Jason -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net v2] switchdev: don't abort hardware ipv4 fib offload on failure to program fib entry in hardware
From: Andy Gospodarek go...@cumulusnetworks.com Date: Tue, 19 May 2015 15:47:32 -0400 Are you actually saying that if users complain loudly enough about the current behavior (not the change Roopa has proposed) that you would be open to considering a change the current behavior? I am saying that we have a contract with users not to break existing behavior. Full stop. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv2 net-next 0/2] Remove dead code and replace byte-oder functions
From: Hariprasad Shenai haripra...@chelsio.com Date: Tue, 19 May 2015 18:20:42 +0530 This series removes dead fn t4_read_edc and t4_read_mc, also replaces ntoh{s,l} and hton{s,l} calls with the generic byteorder. PATCH 2/2 was sent as a single PATCH, but had some byte-ordering issues in t4_read_edc and t4_read_mc function. Found that t4_read_edc and t4_read_mc is unused, so PATCH 1/2 is added to remove it. This patch series is created against net-next tree and includes patches on cxgb4 driver. We have included all the maintainers of respective drivers. Kindly review the change and let us know in case of any review comments. Series applied, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 0/4] sfc: add MCDI tracing
From: Edward Cree ec...@solarflare.com Date: Tue, 19 May 2015 18:33:55 +0100 This patchset adds support for logging Management-Controller-to-Driver Interface interactions between the sfc driver and a bound device, to aid in debugging. Solarflare has a tool to decode the resulting traces and will look to open-source this if there is any external interest, but the protocol is already detailed in drivers/net/ethernet/sfc/mcdi_pcol.h. Please properly format your email message body contents to ~80 columns. Thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Kernel crash while using tc script
On 05/19/2015 10:11 PM, Vijay Subramanian wrote: Hi, It seems latest net-next kernel crashes while unloading modules. Please see simple script below to reproduce the crash. === #!/bin/bash while true; do # modules will be loaded automatically tc qdisc add dev eth1 root handle 1: prio tc filter add dev eth1 parent 1: u32 match u32 0 0 flowid 1 tc qdisc del dev eth1 root rmmod cls_u32 rmmod sch_prio done = It seems there is some refcounting or locking issue issue. I am unable to easily post the dump but sometimes it points to crashes in various functions in prio_class_ops (sch_prio.c), such as prio_walk(), prio_dump_class etc. Probably, sch_prio call back functions are invoked when they should not be. I bisected this down to following commit: commit 78fd1d0ab072d4d9b5f0b7c14a1516665170b565 Author: Thomas Graf tg...@suug.ch Date: Tue Oct 21 22:05:38 2014 +0200 netlink: Re-add locking to netlink_lookup() and seq walker If there are suggestions for me to try or you need more info, let me know. Hmm, seems rather like the synchronize_net() removal in netlink_release() must have uncovered a bug elsewhere. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Intel-wired-lan] [PATCH] pci: Limit VPD reads for all Intel Ethernet devices
On May 19, 2015, at 2:17 PM, Alexander Duyck alexander.du...@gmail.com wrote: Any chance you could point me toward the software in question? Just wondering because it seems like what you are fixing with this is an implementation issue in the application since you really shouldn't be accessing areas outside the scope of the VPD data structure, and doing so is undefined in terms of what happens if you do. I don't have it, but if you dump VPD via sysfs you will see it comes out as 32k in size. The kernel just blindly provides access to the full 32K space provided by the spec. I'm sure that we agree that the kernel should not go parse it and find the actual size. If it is read via stdio, say fread, the read access would be whatever buffer size it chooses to use. If you looked at the quirks, you might have noticed that Broadcom limited the VPD access for some devices for functional reasons. That is what gave me the idea for limiting access to what was possibly there. With the existing Intel Ethernet quirk, it seemed like a simple thing to do. -- Mark Rustad, Networking Division, Intel Corporation signature.asc Description: Message signed with OpenPGP using GPGMail
Re: [PATCH] net-next: ethtool: Added port speed macros.
From: David Miller da...@davemloft.net Date: Tue, 19 May 2015 14:52:02 -0400 (EDT) Given that I should probably accept this patch set as well, hopefully we won't get very many more of these any time soon. And that's what I've just done. Applied to net-next, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net v2] switchdev: don't abort hardware ipv4 fib offload on failure to program fib entry in hardware
From: roopa ro...@cumulusnetworks.com Date: Mon, 18 May 2015 22:58:54 -0700 from where I see, with the limitations on these boxes, this requires every app, every `ip route` cmd running on the box to explicitly specify offload when running on this hardware. Users know what they are doing, and will ask for a policy change. The default should be what happens right now. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 4/4] rocker: make rocker_port_internal_vlan_id_{get,put}() non-transactional
On Mon, May 18, 2015 at 11:24 PM, Simon Horman simon.hor...@netronome.com wrote: The motivation for this is that rocker_port_internal_vlan_id_{get,put} appear to only partially implement the transaction model: memory allocation and freeing is transactional, but hash and bitmap manipulation is not. The latter could be fixed, however, as it is not currently exercised due to trans always being SWITCHDEV_TRANS_NONE it seems cleaner to make rocker_port_internal_vlan_id_get non-transactional. Found by inspection. I do not believe that this change should have any run-time effect. Fixes: c4f20321d968 (rocker: support prepare-commit transaction model) Signed-off-by: Simon Horman simon.hor...@netronome.com This works for now. Once we switch over to using bridge's PVID, we don't need any of this internal vlan stuff anyway, so eventually most (all?) of this internal vlan code goes away. Acked-by: Scott Feldman sfel...@gmail.com -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Intel-wired-lan] [PATCH] pci: Use a bus-global mutex to protect VPD operations
On 05/18/2015 05:00 PM, Mark D Rustad wrote: Some devices have a problem with concurrent VPD access to different functions of the same physical device, so move the protecting mutex from the pci_vpd structure to the pci_bus structure. There are a number of reports on support sites for a variety of devices from various vendors getting the vpd r/w failed message. This is likely to at least fix some of them. Thanks to Shannon Nelson for helping to come up with this approach. Signed-off-by: Mark Rustad mark.d.rus...@intel.com Acked-by: Shannon Nelson shannon.nel...@intel.com Acked-by: Jeff Kirsher jeffrey.t.kirs...@intel.com Instead of moving the mutex lock around you would be much better served by simply removing the duplicate VPD entries for a given device in a PCIe quirk. Then you can save yourself the extra pain and effort of having to deal with serialized VPD accesses for a multifunction device. The logic for the quirk should be fairly simple. 1. Scan for any other devices with VPD that share the same bus and device number. 2. If bdf is equal to us keep searching. 3. If bdf is less than our bdf we release our VPD area and set VPD pointer to NULL. - Alex -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 4/4] sfc: Initialise MCDI buffers to 0 on declaration.
From: Jon Cooper jcoo...@solarflare.com Signed-off-by: Edward Cree ec...@solarflare.com --- drivers/net/ethernet/sfc/ef10.c | 24 +--- drivers/net/ethernet/sfc/mcdi.c | 8 drivers/net/ethernet/sfc/mcdi.h | 8 +--- drivers/net/ethernet/sfc/ptp.c | 6 +++--- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 2e8e5de..ed0ec88 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -350,7 +350,7 @@ static int efx_ef10_probe_vf(struct efx_nic *efx __attribute__ ((unused))) static int efx_ef10_free_vis(struct efx_nic *efx) { - MCDI_DECLARE_BUF_OUT_OR_ERR(outbuf, 0); + MCDI_DECLARE_BUF_ERR(outbuf); size_t outlen; int rc = efx_mcdi_rpc_quiet(efx, MC_CMD_FREE_VIS, NULL, 0, outbuf, sizeof(outbuf), outlen); @@ -421,7 +421,7 @@ static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n) static int efx_ef10_link_piobufs(struct efx_nic *efx) { struct efx_ef10_nic_data *nic_data = efx-nic_data; - MCDI_DECLARE_BUF(inbuf, + _MCDI_DECLARE_BUF(inbuf, max(MC_CMD_LINK_PIOBUF_IN_LEN, MC_CMD_UNLINK_PIOBUF_IN_LEN)); struct efx_channel *channel; @@ -432,6 +432,8 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx) BUILD_BUG_ON(MC_CMD_LINK_PIOBUF_OUT_LEN != 0); BUILD_BUG_ON(MC_CMD_UNLINK_PIOBUF_OUT_LEN != 0); + memset(inbuf, 0, sizeof(inbuf)); + /* Link a buffer to each VI in the write-combining mapping */ for (index = 0; index nic_data-n_piobufs; ++index) { MCDI_SET_DWORD(inbuf, LINK_PIOBUF_IN_PIOBUF_HANDLE, @@ -1315,17 +1317,17 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue) { MCDI_DECLARE_BUF(inbuf, MC_CMD_INIT_TXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / EFX_BUF_SIZE)); - MCDI_DECLARE_BUF(outbuf, MC_CMD_INIT_TXQ_OUT_LEN); bool csum_offload = tx_queue-queue EFX_TXQ_TYPE_OFFLOAD; size_t entries = tx_queue-txd.buf.len / EFX_BUF_SIZE; struct efx_channel *channel = tx_queue-channel; struct efx_nic *efx = tx_queue-efx; struct efx_ef10_nic_data *nic_data = efx-nic_data; - size_t inlen, outlen; + size_t inlen; dma_addr_t dma_addr; efx_qword_t *txd; int rc; int i; + BUILD_BUG_ON(MC_CMD_INIT_TXQ_OUT_LEN != 0); MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_SIZE, tx_queue-ptr_mask + 1); MCDI_SET_DWORD(inbuf, INIT_TXQ_IN_TARGET_EVQ, channel-channel); @@ -1350,7 +1352,7 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue) inlen = MC_CMD_INIT_TXQ_IN_LEN(entries); rc = efx_mcdi_rpc(efx, MC_CMD_INIT_TXQ, inbuf, inlen, - outbuf, sizeof(outbuf), outlen); + NULL, 0, NULL); if (rc) goto fail; @@ -1383,7 +1385,7 @@ fail: static void efx_ef10_tx_fini(struct efx_tx_queue *tx_queue) { MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_TXQ_IN_LEN); - MCDI_DECLARE_BUF(outbuf, MC_CMD_FINI_TXQ_OUT_LEN); + MCDI_DECLARE_BUF_ERR(outbuf); struct efx_nic *efx = tx_queue-efx; size_t outlen; int rc; @@ -1690,15 +1692,15 @@ static void efx_ef10_rx_init(struct efx_rx_queue *rx_queue) MCDI_DECLARE_BUF(inbuf, MC_CMD_INIT_RXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 / EFX_BUF_SIZE)); - MCDI_DECLARE_BUF(outbuf, MC_CMD_INIT_RXQ_OUT_LEN); struct efx_channel *channel = efx_rx_queue_channel(rx_queue); size_t entries = rx_queue-rxd.buf.len / EFX_BUF_SIZE; struct efx_nic *efx = rx_queue-efx; struct efx_ef10_nic_data *nic_data = efx-nic_data; - size_t inlen, outlen; + size_t inlen; dma_addr_t dma_addr; int rc; int i; + BUILD_BUG_ON(MC_CMD_INIT_RXQ_OUT_LEN != 0); rx_queue-scatter_n = 0; rx_queue-scatter_len = 0; @@ -1727,7 +1729,7 @@ static void efx_ef10_rx_init(struct efx_rx_queue *rx_queue) inlen = MC_CMD_INIT_RXQ_IN_LEN(entries); rc = efx_mcdi_rpc(efx, MC_CMD_INIT_RXQ, inbuf, inlen, - outbuf, sizeof(outbuf), outlen); + NULL, 0, NULL); if (rc) netdev_WARN(efx-net_dev, failed to initialise RXQ %d\n, efx_rx_queue_index(rx_queue)); @@ -1736,7 +1738,7 @@ static void efx_ef10_rx_init(struct efx_rx_queue *rx_queue) static void efx_ef10_rx_fini(struct efx_rx_queue *rx_queue) { MCDI_DECLARE_BUF(inbuf, MC_CMD_FINI_RXQ_IN_LEN); - MCDI_DECLARE_BUF(outbuf, MC_CMD_FINI_RXQ_OUT_LEN); + MCDI_DECLARE_BUF_ERR(outbuf); struct efx_nic *efx = rx_queue-efx; size_t outlen; int rc; @@ -1898,7