Re: [PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread Alexei Starovoitov
On Wed, Nov 04, 2015 at 11:36:37AM -0800, Yang Shi wrote:
> When running "mod X" operation, if X is 0 the filter has to be halted.
> Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.
> 
> CC: Xi Wang 
> CC: Zi Shen Lim 
> Signed-off-by: Yang Shi 

Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Andy Shevchenko
On Wed, Nov 4, 2015 at 10:06 PM, Sowmini Varadhan
 wrote:
> On (11/04/15 21:59), Andy Shevchenko wrote:
>>
> See earlier response.

So, if maintainer is okay I'm also okay with those and you may take my tag.


-- 
With Best Regards,
Andy Shevchenko
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/8] mm: memcontrol: account socket memory in unified hierarchy memory controller

2015-11-04 Thread Johannes Weiner
Socket memory can be a significant share of overall memory consumed by
common workloads. In order to provide reasonable resource isolation in
the unified hierarchy, this type of memory needs to be included in the
tracking/accounting of a cgroup under active memory resource control.

Overhead is only incurred when a non-root control group is created AND
the memory controller is instructed to track and account the memory
footprint of that group. cgroup.memory=nosocket can be specified on
the boot commandline to override any runtime configuration and
forcibly exclude socket memory from active memory resource control.

Signed-off-by: Johannes Weiner 
---
 include/linux/memcontrol.h |   8 +++-
 mm/memcontrol.c| 110 +
 2 files changed, 97 insertions(+), 21 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f3caf84..7adabb7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -245,6 +245,10 @@ struct mem_cgroup {
struct wb_domain cgwb_domain;
 #endif
 
+#ifdef CONFIG_INET
+   struct work_struct socket_work;
+#endif
+
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
@@ -679,7 +683,7 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback 
*wb,
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 struct sock;
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+#ifdef CONFIG_INET
 extern struct static_key_false mem_cgroup_sockets;
 static inline bool mem_cgroup_do_sockets(void)
 {
@@ -698,7 +702,7 @@ static inline bool mem_cgroup_do_sockets(void)
 {
return false;
 }
-#endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_INET */
 
 #ifdef CONFIG_MEMCG_KMEM
 extern struct static_key memcg_kmem_enabled_key;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 85f212e..2994c9d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -79,6 +79,9 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #define MEM_CGROUP_RECLAIM_RETRIES 5
 
+/* Socket memory accounting disabled? */
+static int cgroup_memory_nosocket;
+
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
 int do_swap_account __read_mostly;
@@ -1916,6 +1919,18 @@ static int memcg_cpu_hotplug_callback(struct 
notifier_block *nb,
return NOTIFY_OK;
 }
 
+static void reclaim_high(struct mem_cgroup *memcg,
+unsigned int nr_pages,
+gfp_t gfp_mask)
+{
+   do {
+   if (page_counter_read(>memory) <= memcg->high)
+   continue;
+   mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+   try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+   } while ((memcg = parent_mem_cgroup(memcg)));
+}
+
 /*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
@@ -1923,20 +1938,13 @@ static int memcg_cpu_hotplug_callback(struct 
notifier_block *nb,
 void mem_cgroup_handle_over_high(void)
 {
unsigned int nr_pages = current->memcg_nr_pages_over_high;
-   struct mem_cgroup *memcg, *pos;
+   struct mem_cgroup *memcg;
 
if (likely(!nr_pages))
return;
 
-   pos = memcg = get_mem_cgroup_from_mm(current->mm);
-
-   do {
-   if (page_counter_read(>memory) <= pos->high)
-   continue;
-   mem_cgroup_events(pos, MEMCG_HIGH, 1);
-   try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
-   } while ((pos = parent_mem_cgroup(pos)));
-
+   memcg = get_mem_cgroup_from_mm(current->mm);
+   reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(>css);
current->memcg_nr_pages_over_high = 0;
 }
@@ -4129,6 +4137,8 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup 
*memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
+static void socket_work_func(struct work_struct *work);
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -4169,6 +4179,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state 
*parent_css)
 #ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(>cgwb_list);
 #endif
+#ifdef CONFIG_INET
+   INIT_WORK(>socket_work, socket_work_func);
+#endif
return >css;
 
 free_out:
@@ -4228,6 +4241,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (ret)
return ret;
 
+   if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+   static_branch_enable(_cgroup_sockets);
+
/*
 * Make sure the memcg is initialized: mem_cgroup_iter()
 * orders reading memcg->initialized against its callers
@@ -4266,6 +4282,8 @@ static void mem_cgroup_css_free(struct 
cgroup_subsys_state *css)
 {
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
+   

[PATCH 5/8] net: tcp_memcontrol: consolidate socket buffer tracking and accounting

2015-11-04 Thread Johannes Weiner
The tcp memory controller has extensive provisions for future memory
accounting interfaces that won't materialize after all. Cut the code
base down to what's actually used, now and in the likely future.

- There won't be any different protocol counters in the future, so a
  direct sock->sk_memcg linkage is enough. This eliminates a lot of
  callback maze and boilerplate code, and restores most of the socket
  allocation code to pre-tcp_memcontrol state.

- There won't be a tcp control soft limit, so integrating the memcg
  code into the global skmem limiting scheme complicates things
  unnecessarily. Replace all that with simple and clear charge and
  uncharge calls--hidden behind a jump label--to account skb memory.

  Without a soft limit, the per-memcg pressure state is questionable
  as well, but for now we still enter it when the hard limit is hit,
  and packets are dropped, to let other sockets in the cgroup know
  that they shouldn't grow their transmit windows, either. However,
  because network performance will already be in the toilet at this
  point, keep it simple: leave memory pressure lazily when the next
  packet is accepted, and delete the code that checks synchronously
  when memory is released. This should be acceptable.

- The previous jump label code was an elaborate state machine that
  tracked the number of cgroups with an active socket limit in order
  to enable the skmem tracking and accounting code only when actively
  necessary. But this is overengineered: it was meant to protect the
  people who never use this feature in the first place. Simply enable
  the branches once when the first limit is set until the next reboot.

Signed-off-by: Johannes Weiner 
---
 include/linux/memcontrol.h   |  60 ++
 include/net/sock.h   | 126 +++--
 include/net/tcp.h|   5 +-
 include/net/tcp_memcontrol.h |   7 ---
 mm/memcontrol.c  | 103 --
 net/core/sock.c  |  78 ++-
 net/ipv4/sysctl_net_ipv4.c   |   1 -
 net/ipv4/tcp.c   |   3 +-
 net/ipv4/tcp_ipv4.c  |   9 +--
 net/ipv4/tcp_memcontrol.c| 147 +++
 net/ipv4/tcp_output.c|   6 +-
 net/ipv6/tcp_ipv6.c  |   3 -
 12 files changed, 137 insertions(+), 411 deletions(-)
 delete mode 100644 include/net/tcp_memcontrol.h

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8929685..f3caf84 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,34 +85,6 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
 };
 
-/*
- * Bits in struct cg_proto.flags
- */
-enum cg_proto_flags {
-   /* Currently active and new sockets should be assigned to cgroups */
-   MEMCG_SOCK_ACTIVE,
-   /* It was ever activated; we must disarm static keys on destruction */
-   MEMCG_SOCK_ACTIVATED,
-};
-
-struct cg_proto {
-   struct page_counter memory_allocated;   /* Current allocated 
memory. */
-   struct percpu_counter   sockets_allocated;  /* Current number of 
sockets. */
-   int memory_pressure;
-   longsysctl_mem[3];
-   unsigned long   flags;
-   /*
-* memcg field is used to find which memcg we belong directly
-* Each memcg struct can hold more than one cg_proto, so container_of
-* won't really cut.
-*
-* The elegant solution would be having an inverse function to
-* proto_cgroup in struct proto, but that means polluting the structure
-* for everybody, instead of just for memcg users.
-*/
-   struct mem_cgroup   *memcg;
-};
-
 #ifdef CONFIG_MEMCG
 struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
@@ -185,8 +157,16 @@ struct mem_cgroup {
 
/* Accounted resources */
struct page_counter memory;
+
+   /*
+* Legacy non-resource counters. In unified hierarchy, all
+* memory is accounted and limited through memcg->memory.
+* Consumer breakdown happens in the statistics.
+*/
struct page_counter memsw;
struct page_counter kmem;
+   struct page_counter skmem;
+   bool skmem_breached;/* (ancestral) skmem.limit breached */
 
/* Normal memory consumption range */
unsigned long low;
@@ -246,9 +226,6 @@ struct mem_cgroup {
 */
struct mem_cgroup_stat_cpu __percpu *stat;
 
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-   struct cg_proto tcp_mem;
-#endif
 #if defined(CONFIG_MEMCG_KMEM)
 /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
@@ -678,12 +655,6 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum 
vm_event_item idx)
 }
 #endif /* CONFIG_MEMCG */
 
-enum {
-   UNDER_LIMIT,
-   SOFT_LIMIT,
-   OVER_LIMIT,
-};
-
 

[PATCH 8/8] mm: memcontrol: hook up vmpressure to socket pressure

2015-11-04 Thread Johannes Weiner
Let the networking stack know when a memcg is under reclaim pressure
so that it can clamp its transmit windows accordingly.

Whenever the reclaim efficiency of a cgroup's LRU lists drops low
enough for a MEDIUM or HIGH vmpressure event to occur, assert a
pressure state in the socket and tcp memory code that tells it to curb
consumption growth from sockets associated with said control group.

vmpressure events are naturally edge triggered, so for hysteresis
assert socket pressure for a second to allow for subsequent vmpressure
events to occur before letting the socket code return to normal.

This will likely need fine-tuning for a wider variety of workloads, but
for now stick to the vmpressure presets and keep hysteresis simple.

Signed-off-by: Johannes Weiner 
---
 include/linux/memcontrol.h | 27 +--
 mm/memcontrol.c| 15 +--
 mm/vmpressure.c| 25 -
 3 files changed, 46 insertions(+), 21 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7adabb7..d45379a 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -247,6 +247,7 @@ struct mem_cgroup {
 
 #ifdef CONFIG_INET
struct work_struct socket_work;
+   unsigned long socket_pressure;
 #endif
 
/* List of events which userspace want to receive */
@@ -292,18 +293,34 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *, 
struct zone *);
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg);
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 
+#define mem_cgroup_from_counter(counter, member)   \
+   container_of(counter, struct mem_cgroup, member)
+
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
   struct mem_cgroup *,
   struct mem_cgroup_reclaim_cookie *);
 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
 
+/**
+ * parent_mem_cgroup - find the accounting parent of a memcg
+ * @memcg: memcg whose parent to find
+ *
+ * Returns the parent memcg, or NULL if this is the root or the memory
+ * controller is in legacy no-hierarchy mode.
+ */
+static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
+{
+   if (!memcg->memory.parent)
+   return NULL;
+   return mem_cgroup_from_counter(memcg->memory.parent, memory);
+}
+
 static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
  struct mem_cgroup *root)
 {
@@ -695,7 +712,13 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, 
unsigned int nr_pages);
 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int 
nr_pages);
 static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
 {
-   return memcg->skmem_breached;
+   if (memcg->skmem_breached)
+   return true;
+   do {
+   if (time_before(jiffies, memcg->socket_pressure))
+   return true;
+   } while ((memcg = parent_mem_cgroup(memcg)));
+   return false;
 }
 #else
 static inline bool mem_cgroup_do_sockets(void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2994c9d..e10637f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1084,9 +1084,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct 
mem_cgroup *memcg)
return ret;
 }
 
-#define mem_cgroup_from_counter(counter, member)   \
-   container_of(counter, struct mem_cgroup, member)
-
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
@@ -4126,17 +4123,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
kfree(memcg);
 }
 
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
-   if (!memcg->memory.parent)
-   return NULL;
-   return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
 static void socket_work_func(struct work_struct *work);
 
 static struct cgroup_subsys_state * __ref
@@ -4181,6 +4167,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state 
*parent_css)
 #endif
 #ifdef CONFIG_INET
INIT_WORK(>socket_work, socket_work_func);
+   memcg->socket_pressure = jiffies;
 #endif
return >css;
 
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 4c25e62..07e8440 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
 };
 
 static bool vmpressure_event(struct vmpressure *vmpr,
-unsigned long scanned, unsigned long reclaimed)
+   

[PATCH net] net: dsa: mv88e6xxx: isolate unbridged ports

2015-11-04 Thread Vivien Didelot
The DSA documentation specifies that each port must be capable of
forwarding frames to the CPU port. The last changes on bridging support
for the mv88e6xxx driver broke this requirement for non-bridged ports.

So as for the bridged ports, reserve a few VLANs (4000+) in the switch
to isolate ports that have not been bridged yet.

By default, a port will be isolated with the CPU and DSA ports. When the
port joins a bridge, it will leave its reserved VLAN. When it is removed
from a bridge, it will join its reserved VLAN again.

Fixes: 5fe7f68016ff ("net: dsa: mv88e6xxx: fix hardware bridging")
Reported-by: Andrew Lunn 
Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6171.c |  2 ++
 drivers/net/dsa/mv88e6352.c |  2 ++
 drivers/net/dsa/mv88e6xxx.c | 42 ++
 drivers/net/dsa/mv88e6xxx.h |  2 ++
 4 files changed, 48 insertions(+)

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 54aa000..6e18213 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -103,6 +103,8 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
 #endif
.get_regs_len   = mv88e6xxx_get_regs_len,
.get_regs   = mv88e6xxx_get_regs,
+   .port_join_bridge   = mv88e6xxx_port_bridge_join,
+   .port_leave_bridge  = mv88e6xxx_port_bridge_leave,
.port_stp_update= mv88e6xxx_port_stp_update,
.port_pvid_get  = mv88e6xxx_port_pvid_get,
.port_vlan_prepare  = mv88e6xxx_port_vlan_prepare,
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index ff846d0..cc6c545 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -323,6 +323,8 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
.set_eeprom = mv88e6352_set_eeprom,
.get_regs_len   = mv88e6xxx_get_regs_len,
.get_regs   = mv88e6xxx_get_regs,
+   .port_join_bridge   = mv88e6xxx_port_bridge_join,
+   .port_leave_bridge  = mv88e6xxx_port_bridge_leave,
.port_stp_update= mv88e6xxx_port_stp_update,
.port_pvid_get  = mv88e6xxx_port_pvid_get,
.port_vlan_prepare  = mv88e6xxx_port_vlan_prepare,
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 04cff58..b06dba0 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -1462,6 +1462,10 @@ int mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, 
int port,
const struct switchdev_obj_port_vlan *vlan,
struct switchdev_trans *trans)
 {
+   /* We reserve a few VLANs to isolate unbridged ports */
+   if (vlan->vid_end >= 4000)
+   return -EOPNOTSUPP;
+
/* We don't need any dynamic resource from the kernel (yet),
 * so skip the prepare phase.
 */
@@ -1870,6 +1874,36 @@ unlock:
return err;
 }
 
+int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members)
+{
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
+   int err;
+
+   /* The port joined a bridge, so leave its reserved VLAN */
+   mutex_lock(>smi_mutex);
+   err = _mv88e6xxx_port_vlan_del(ds, port, pvid);
+   if (!err)
+   err = _mv88e6xxx_port_pvid_set(ds, port, 0);
+   mutex_unlock(>smi_mutex);
+   return err;
+}
+
+int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members)
+{
+   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
+   const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
+   int err;
+
+   /* The port left the bridge, so join its reserved VLAN */
+   mutex_lock(>smi_mutex);
+   err = _mv88e6xxx_port_vlan_add(ds, port, pvid, true);
+   if (!err)
+   err = _mv88e6xxx_port_pvid_set(ds, port, pvid);
+   mutex_unlock(>smi_mutex);
+   return err;
+}
+
 static void mv88e6xxx_bridge_work(struct work_struct *work)
 {
struct mv88e6xxx_priv_state *ps;
@@ -2140,6 +2174,14 @@ int mv88e6xxx_setup_ports(struct dsa_switch *ds)
ret = mv88e6xxx_setup_port(ds, i);
if (ret < 0)
return ret;
+
+   if (dsa_is_cpu_port(ds, i) || dsa_is_dsa_port(ds, i))
+   continue;
+
+   /* setup the unbridged state */
+   ret = mv88e6xxx_port_bridge_leave(ds, i, 0);
+   if (ret < 0)
+   return ret;
}
return 0;
 }
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h
index fb9a873..21c8daa 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx.h
@@ -468,6 +468,8 @@ int mv88e6xxx_phy_write_indirect(struct dsa_switch *ds, int 
addr, int regnum,
 int mv88e6xxx_get_eee(struct dsa_switch *ds, 

RE: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Nelson, Shannon
> From: Sowmini Varadhan [mailto:sowmini.varad...@oracle.com]
> Sent: Wednesday, November 04, 2015 11:40 AM
> 
> 
> This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
> address in Open Firmware or IDPROM").
> 
> As with that fix, attempt to look up the MAC address in Open Firmware
> on systems that support it, and use IDPROM on SPARC if no OF address
> is found.
> 
> In the case of the i40e there is an assumption that the default mac
> address has already been set up as the primary mac filter on probe,
> so if this filter is obtained from the Open Firmware or IDPROM, an
> explicit write is needed via i40e_aq_mac_address_write() and
> i40e_aq_add_macvlan() invocation.
> 
> Reviewed-by: Martin K. Petersen 
> Signed-off-by: Sowmini Varadhan 
> ---
> v2, v3: Andy Shevchenko comments
> v4: Shannon Nelson review: explicitly set up mac filters before
> register_netdev
> v5: Shannon Nelson code style comments
> 
>  drivers/net/ethernet/intel/i40e/i40e_main.c |   84
> ++-
>  1 files changed, 83 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index b825f97..a3883cf 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -24,6 +24,15 @@
>   *
> 
> **
> /
> 
> +#include 
> +#include 
> +#include 
> +
> +#ifdef CONFIG_SPARC
> +#include 
> +#include 
> +#endif
> +
>  /* Local includes */
>  #include "i40e.h"
>  #include "i40e_diag.h"
> @@ -9213,6 +9222,44 @@ static struct i40e_vsi
> *i40e_vsi_reinit_setup(struct i40e_vsi *vsi)
>  }
> 
>  /**
> + * i40e_macaddr_init - explicitly write the mac address filters. This
> + * is needed when the macaddr has been obtained by other means than
> + * the default, e.g., from Open Firmware or IDPROM.

Note that this should be a simple single line, function name and short summary; 
anything more detailed goes into a description after the variables.


[...]

> 
>  /**
> + * i40e_get_platform_mac_addr - get mac address from Open Firmware
> + * or IDPROM if supported by the platform

Again, single line.

Thanks for your work on this, Sowmini.  If you can do a quick repost with these 
little function header comment bits tweaked, I'm willing to ACK this patch and 
I think we'll be ready for Jeff to include it into his tree.

sln

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v4] ipv6: clean up dev_snmp6 proc entry when we fail to initialize inet6_dev

2015-11-04 Thread Cong Wang
On Wed, Nov 4, 2015 at 9:30 AM, Sabrina Dubroca  wrote:
> In ipv6_add_dev, when addrconf_sysctl_register fails, we do not clean up
> the dev_snmp6 entry that we have already registered for this device.
> Call snmp6_unregister_dev in this case.
>
> Fixes: a317a2f19da7d ("ipv6: fail early when creating netdev named all or 
> default")
> Reported-by: Dmitry Vyukov 
> Signed-off-by: Sabrina Dubroca 

Reviewed-by: Cong Wang 

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/4] ipv4: add option to drop unicast encapsulated in L2 multicast

2015-11-04 Thread Johannes Berg
On Wed, 2015-11-04 at 22:59 +0200, Julian Anastasov wrote:
> 
> Patches 1 and 3 look correct to me,
> 
> Reviewed-by: Julian Anastasov 

Thanks for checking!

> If the patches are lost in the merge window you
> can also consider one minor optimization, see below...

Oh, yeah, they probably are - sorry Dave.

> > - } else if (rt->rt_type == RTN_BROADCAST)
> > + } else if (rt->rt_type == RTN_BROADCAST) {
> >   IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb-
> >len);
> > + } else {
> 
> } else if (unlikely(skb->pkt_type != PACKET_HOST)) {
> 
> Maybe such a check can save some cycles because
> it is more common to see PACKET_HOST packets...

I had thought about that based on your earlier comments, but then I
didn't quite see the point. However, perhaps we could invert the check
below to check the pkt_type first, hoping it'll have some kind of cache
effect?

johannes
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread Z Lim
On Wed, Nov 4, 2015 at 11:36 AM, Yang Shi  wrote:
> When running "mod X" operation, if X is 0 the filter has to be halted.
> Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.
>
> CC: Xi Wang 
> CC: Zi Shen Lim 
> Signed-off-by: Yang Shi 
> ---

Acked-by: Zi Shen Lim 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Nelson, Shannon
> From: Andy Shevchenko [mailto:andy.shevche...@gmail.com]
> Sent: Wednesday, November 04, 2015 11:59 AM
> 
> On Wed, Nov 4, 2015 at 9:39 PM, Sowmini Varadhan
>  wrote:
> >
> > This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
> > address in Open Firmware or IDPROM").

[...]

> > +   }
> > +
> > +   memset(, 0, sizeof(element));
> > +   ether_addr_copy(element.mac_addr, macaddr);
> > +   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
> > +   ret = i40e_aq_add_macvlan(>back->hw, vsi->seid, ,
> 1, NULL);
> > +   aq_err = vsi->back->hw.aq.asq_last_status;
> 
> Do you really need a separate variable (aq_err)?

These are two separate error values that we're tracking - one from the 
communication between the driver and the firmware (aq_err) and one from the 
driver activity.  Sometimes there may be an AQ error that we want to report, 
but it might not actually be a driver error.  Alternatively, there are times 
when the AQ error needs to get interpreted different ways depending on which 
task the driver is performing.  Lastly, the AQ error gives us more detail on 
whatever the transaction error may have been which gives us more useful debug 
info.

sln


[PATCH 3/8] mm: page_counter: let page_counter_try_charge() return bool

2015-11-04 Thread Johannes Weiner
page_counter_try_charge() currently returns 0 on success and -ENOMEM
on failure, which is surprising behavior given the function name.

Make it follow the expected pattern of try_stuff() functions that
return a boolean true to indicate success, or false for failure.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
---
 include/linux/page_counter.h |  6 +++---
 mm/hugetlb_cgroup.c  |  3 ++-
 mm/memcontrol.c  | 11 +--
 mm/page_counter.c| 14 +++---
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 17fa4f8..7e62920 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -36,9 +36,9 @@ static inline unsigned long page_counter_read(struct 
page_counter *counter)
 
 void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_charge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_try_charge(struct page_counter *counter,
-   unsigned long nr_pages,
-   struct page_counter **fail);
+bool page_counter_try_charge(struct page_counter *counter,
+unsigned long nr_pages,
+struct page_counter **fail);
 void page_counter_uncharge(struct page_counter *counter, unsigned long 
nr_pages);
 int page_counter_limit(struct page_counter *counter, unsigned long limit);
 int page_counter_memparse(const char *buf, const char *max,
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 6a44263..d8fb10d 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -186,7 +186,8 @@ again:
}
rcu_read_unlock();
 
-   ret = page_counter_try_charge(_cg->hugepage[idx], nr_pages, );
+   if (!page_counter_try_charge(_cg->hugepage[idx], nr_pages, ))
+   ret = -ENOMEM;
css_put(_cg->css);
 done:
*ptr = h_cg;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7049e55..e54f434 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2018,8 +2018,8 @@ retry:
return 0;
 
if (!do_swap_account ||
-   !page_counter_try_charge(>memsw, batch, )) {
-   if (!page_counter_try_charge(>memory, batch, ))
+   page_counter_try_charge(>memsw, batch, )) {
+   if (page_counter_try_charge(>memory, batch, ))
goto done_restock;
if (do_swap_account)
page_counter_uncharge(>memsw, batch);
@@ -2383,14 +2383,13 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t 
gfp, int order,
 {
unsigned int nr_pages = 1 << order;
struct page_counter *counter;
-   int ret = 0;
+   int ret;
 
if (!memcg_kmem_is_active(memcg))
return 0;
 
-   ret = page_counter_try_charge(>kmem, nr_pages, );
-   if (ret)
-   return ret;
+   if (!page_counter_try_charge(>kmem, nr_pages, ))
+   return -ENOMEM;
 
ret = try_charge(memcg, gfp, nr_pages);
if (ret) {
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 11b4bed..7c6a63d 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -56,12 +56,12 @@ void page_counter_charge(struct page_counter *counter, 
unsigned long nr_pages)
  * @nr_pages: number of pages to charge
  * @fail: points first counter to hit its limit, if any
  *
- * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
- * its ancestors has hit its configured limit.
+ * Returns %true on success, or %false and @fail if the counter or one
+ * of its ancestors has hit its configured limit.
  */
-int page_counter_try_charge(struct page_counter *counter,
-   unsigned long nr_pages,
-   struct page_counter **fail)
+bool page_counter_try_charge(struct page_counter *counter,
+unsigned long nr_pages,
+struct page_counter **fail)
 {
struct page_counter *c;
 
@@ -99,13 +99,13 @@ int page_counter_try_charge(struct page_counter *counter,
if (new > c->watermark)
c->watermark = new;
}
-   return 0;
+   return true;
 
 failed:
for (c = counter; c != *fail; c = c->parent)
page_counter_cancel(c, nr_pages);
 
-   return -ENOMEM;
+   return false;
 }
 
 /**
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/8] net: tcp_memcontrol: remove bogus hierarchy pressure propagation

2015-11-04 Thread Johannes Weiner
When a cgroup currently breaches its socket memory limit, it enters
memory pressure mode for itself and its *parents*. This throttles
transmission in unrelated groups that have nothing to do with the
breached limit.

On the contrary, breaching a limit should make that group and its
*children* enter memory pressure mode. But this happens already,
albeit lazily: if a parent limit is breached, siblings will enter
memory pressure on their own once the next packet arrives for them.

So no additional hierarchy code is needed. Remove the bogus stuff.

Signed-off-by: Johannes Weiner 
---
 include/net/sock.h | 19 ---
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 59a7196..d541bed 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1152,14 +1152,8 @@ static inline void sk_leave_memory_pressure(struct sock 
*sk)
if (*memory_pressure)
*memory_pressure = 0;
 
-   if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-   struct cg_proto *cg_proto = sk->sk_cgrp;
-   struct proto *prot = sk->sk_prot;
-
-   for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-   cg_proto->memory_pressure = 0;
-   }
-
+   if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+   sk->sk_cgrp->memory_pressure = 0;
 }
 
 static inline void sk_enter_memory_pressure(struct sock *sk)
@@ -1167,13 +1161,8 @@ static inline void sk_enter_memory_pressure(struct sock 
*sk)
if (!sk->sk_prot->enter_memory_pressure)
return;
 
-   if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
-   struct cg_proto *cg_proto = sk->sk_cgrp;
-   struct proto *prot = sk->sk_prot;
-
-   for (; cg_proto; cg_proto = parent_cg_proto(prot, cg_proto))
-   cg_proto->memory_pressure = 1;
-   }
+   if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+   sk->sk_cgrp->memory_pressure = 1;
 
sk->sk_prot->enter_memory_pressure(sk);
 }
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/8] mm: memcontrol: export root_mem_cgroup

2015-11-04 Thread Johannes Weiner
A later patch will need this symbol in files other than memcontrol.c,
so export it now and replace mem_cgroup_root_css at the same time.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h | 3 ++-
 mm/backing-dev.c   | 2 +-
 mm/memcontrol.c| 5 ++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 805da1f..19ff87b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -275,7 +275,8 @@ struct mem_cgroup {
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
 };
-extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
+extern struct mem_cgroup *root_mem_cgroup;
 
 /**
  * mem_cgroup_events - count memory events against a cgroup
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 095b23b..73ab967 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -702,7 +702,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
 
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
-   bdi->wb.memcg_css = mem_cgroup_root_css;
+   bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c71fe40..7049e55 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -76,9 +76,9 @@
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 EXPORT_SYMBOL(memory_cgrp_subsys);
 
+struct mem_cgroup *root_mem_cgroup __read_mostly;
+
 #define MEM_CGROUP_RECLAIM_RETRIES 5
-static struct mem_cgroup *root_mem_cgroup __read_mostly;
-struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -4214,7 +4214,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state 
*parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
-   mem_cgroup_root_css = &memcg->css;
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/8] mm: memcontrol: prepare for unified hierarchy socket accounting

2015-11-04 Thread Johannes Weiner
The unified hierarchy memory controller will account socket
memory. Move the infrastructure functions accordingly.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
---
 mm/memcontrol.c | 140 
 1 file changed, 70 insertions(+), 70 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d649b56..85f212e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -287,76 +287,6 @@ static inline struct mem_cgroup 
*mem_cgroup_from_id(unsigned short id)
return mem_cgroup_from_css(css);
 }
 
-/* Writing them here to avoid exposing memcg's inner layout */
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
-
-DEFINE_STATIC_KEY_FALSE(mem_cgroup_sockets);
-
-void sock_update_memcg(struct sock *sk)
-{
-   struct mem_cgroup *memcg;
-   /*
-* Socket cloning can throw us here with sk_cgrp already
-* filled. It won't however, necessarily happen from
-* process context. So the test for root memcg given
-* the current task's memcg won't help us in this case.
-*
-* Respecting the original socket's memcg is a better
-* decision in this case.
-*/
-   if (sk->sk_memcg) {
-   BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
-   css_get(&sk->sk_memcg->css);
-   return;
-   }
-
-   rcu_read_lock();
-   memcg = mem_cgroup_from_task(current);
-   if (css_tryget_online(&memcg->css))
-   sk->sk_memcg = memcg;
-   rcu_read_unlock();
-}
-EXPORT_SYMBOL(sock_update_memcg);
-
-void sock_release_memcg(struct sock *sk)
-{
-   if (sk->sk_memcg)
-   css_put(&sk->sk_memcg->css);
-}
-
-/**
- * mem_cgroup_charge_skmem - charge socket memory
- * @memcg: memcg to charge
- * @nr_pages: number of pages to charge
- *
- * Charges @nr_pages to @memcg. Returns %true if the charge fit within
- * the memcg's configured limit, %false if the charge had to be forced.
- */
-bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
-   struct page_counter *counter;
-
-   if (page_counter_try_charge(&memcg->skmem, nr_pages, &counter)) {
-   memcg->skmem_breached = false;
-   return true;
-   }
-   page_counter_charge(&memcg->skmem, nr_pages);
-   memcg->skmem_breached = true;
-   return false;
-}
-
-/**
- * mem_cgroup_uncharge_skmem - uncharge socket memory
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
-{
-   page_counter_uncharge(&memcg->skmem, nr_pages);
-}
-
-#endif
-
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -5523,6 +5453,76 @@ void mem_cgroup_replace_page(struct page *oldpage, 
struct page *newpage)
commit_charge(newpage, memcg, true);
 }
 
+/* Writing them here to avoid exposing memcg's inner layout */
+#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
+
+DEFINE_STATIC_KEY_FALSE(mem_cgroup_sockets);
+
+void sock_update_memcg(struct sock *sk)
+{
+   struct mem_cgroup *memcg;
+   /*
+* Socket cloning can throw us here with sk_cgrp already
+* filled. It won't however, necessarily happen from
+* process context. So the test for root memcg given
+* the current task's memcg won't help us in this case.
+*
+* Respecting the original socket's memcg is a better
+* decision in this case.
+*/
+   if (sk->sk_memcg) {
+   BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
+   css_get(&sk->sk_memcg->css);
+   return;
+   }
+
+   rcu_read_lock();
+   memcg = mem_cgroup_from_task(current);
+   if (css_tryget_online(&memcg->css))
+   sk->sk_memcg = memcg;
+   rcu_read_unlock();
+}
+EXPORT_SYMBOL(sock_update_memcg);
+
+void sock_release_memcg(struct sock *sk)
+{
+   if (sk->sk_memcg)
+   css_put(&sk->sk_memcg->css);
+}
+
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ *
+ * Charges @nr_pages to @memcg. Returns %true if the charge fit within
+ * the memcg's configured limit, %false if the charge had to be forced.
+ */
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+   struct page_counter *counter;
+
+   if (page_counter_try_charge(&memcg->skmem, nr_pages, &counter)) {
+   memcg->skmem_breached = false;
+   return true;
+   }
+   page_counter_charge(&memcg->skmem, nr_pages);
+   memcg->skmem_breached = true;
+   return false;
+}
+
+/**
+ * mem_cgroup_uncharge_skmem - uncharge socket memory
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+   page_counter_uncharge(&memcg->skmem, nr_pages);
+}
+
+#endif
+
 

[PATCH 2/8] mm: vmscan: simplify memcg vs. global shrinker invocation

2015-11-04 Thread Johannes Weiner
Letting shrink_slab() handle the root_mem_cgroup, and implicitly the
!CONFIG_MEMCG case, allows shrink_zone() to invoke the shrinkers
unconditionally from within the memcg iteration loop.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h |  2 ++
 mm/vmscan.c| 31 ---
 2 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 19ff87b..8929685 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -502,6 +502,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
+#define root_mem_cgroup NULL
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 enum mem_cgroup_events_index idx,
 unsigned int nr)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9b52ecf..ecc2125 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -411,6 +411,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
 
+   /* Global shrinker mode */
+   if (memcg == root_mem_cgroup)
+   memcg = NULL;
+
if (memcg && !memcg_kmem_is_active(memcg))
return 0;
 
@@ -2417,11 +2421,22 @@ static bool shrink_zone(struct zone *zone, struct 
scan_control *sc,
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
 
-   if (memcg && is_classzone)
+   /*
+* Shrink the slab caches in the same proportion that
+* the eligible LRU pages were scanned.
+*/
+   if (is_classzone) {
shrink_slab(sc->gfp_mask, zone_to_nid(zone),
memcg, sc->nr_scanned - scanned,
lru_pages);
 
+   if (reclaim_state) {
+   sc->nr_reclaimed +=
+   reclaim_state->reclaimed_slab;
+   reclaim_state->reclaimed_slab = 0;
+   }
+   }
+
/*
 * Direct reclaim and kswapd have to scan all memory
 * cgroups to fulfill the overall scan target for the
@@ -2439,20 +2454,6 @@ static bool shrink_zone(struct zone *zone, struct 
scan_control *sc,
}
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
-   /*
-* Shrink the slab caches in the same proportion that
-* the eligible LRU pages were scanned.
-*/
-   if (global_reclaim(sc) && is_classzone)
-   shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
-   sc->nr_scanned - nr_scanned,
-   zone_lru_pages);
-
-   if (reclaim_state) {
-   sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-   reclaim_state->reclaimed_slab = 0;
-   }
-
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
   sc->nr_scanned - nr_scanned,
   sc->nr_reclaimed - nr_reclaimed);
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/8] mm: memcontrol: account socket memory in unified hierarchy v2

2015-11-04 Thread Johannes Weiner
Hi,

this is version 2 of the patches to add socket memory accounting to
the unified hierarchy memory controller. Changes from v1 include:

- No accounting overhead unless a dedicated cgroup is created and the
  memory controller instructed to track that group's memory footprint.
  Distribution kernels enable CONFIG_MEMCG, and users (incl. systemd)
  might create cgroups only for process control or resources other
  than memory. As noted by David and Michal, these setups shouldn't
  pay any overhead for this.

- Continue to enter the socket pressure state when hitting the memory
  controller's hard limit. Vladimir noted that there is at least some
  value in telling other sockets in the cgroup to not increase their
  transmit windows when one of them is already dropping packets.

- Drop the controversial vmpressure rework. Instead of changing the
  level where pressure is noted, keep noting pressure in its origin
  and then make the pressure check hierarchical. As noted by Michal
  and Vladimir, we shouldn't risk changing user-visible behavior.

---

Socket buffer memory can make up a significant share of a workload's
memory footprint that can be directly linked to userspace activity,
and so it needs to be part of the memory controller to provide proper
resource isolation/containment.

Historically, socket buffers were accounted in a separate counter,
without any pressure equalization between anonymous memory, page
cache, and the socket buffers. When the socket buffer pool was
exhausted, buffer allocations would fail hard and cause network
performance to tank, regardless of whether there was still memory
available to the group or not. Likewise, struggling anonymous or cache
workingsets could not dip into an idle socket memory pool. Because of
this, the feature was not usable for many real life applications.

To not repeat this mistake, the new memory controller will account all
types of memory pages it is tracking on behalf of a cgroup in a single
pool. Upon pressure, the VM reclaims and shrinks and puts pressure on
whatever memory consumer in that pool is within its reach.

For socket memory, pressure feedback is provided through vmpressure
events. When the VM has trouble freeing memory, the network code is
instructed to stop growing the cgroup's transmit windows.

---

This series begins with a rework of the existing tcp memory controller
that simplifies and cleans up the code while allowing us to have only
one set of networking hooks for both memory controller versions. The
original behavior of the existing tcp controller should be preserved.

It then adds socket accounting to the v2 memory controller, including
the use of the per-cpu charge cache and async memory.high enforcement
from socket memory charges.

Lastly, vmpressure is hooked up to the socket code so that it stops
growing transmit windows when the VM has trouble reclaiming memory.

 include/linux/memcontrol.h   |  98 ---
 include/linux/page_counter.h |   6 +-
 include/net/sock.h   | 137 ++---
 include/net/tcp.h|   5 +-
 include/net/tcp_memcontrol.h |   7 --
 mm/backing-dev.c |   2 +-
 mm/hugetlb_cgroup.c  |   3 +-
 mm/memcontrol.c  | 262 +
 mm/page_counter.c|  14 +--
 mm/vmpressure.c  |  25 +++-
 mm/vmscan.c  |  31 ++---
 net/core/sock.c  |  78 +++-
 net/ipv4/sysctl_net_ipv4.c   |   1 -
 net/ipv4/tcp.c   |   3 +-
 net/ipv4/tcp_ipv4.c  |   9 +-
 net/ipv4/tcp_memcontrol.c| 147 ---
 net/ipv4/tcp_output.c|   6 +-
 net/ipv6/tcp_ipv6.c  |   3 -
 18 files changed, 328 insertions(+), 509 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread Xi Wang
On Wed, Nov 4, 2015 at 11:36 AM, Yang Shi  wrote:
> When running "mod X" operation, if X is 0 the filter has to be halted.
> Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.
>
> CC: Xi Wang 
> CC: Zi Shen Lim 
> Signed-off-by: Yang Shi 

Acked-by: Xi Wang 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: dsa: mv88e6xxx: isolate unbridged ports

2015-11-04 Thread Florian Fainelli
On 04/11/15 14:23, Vivien Didelot wrote:
> The DSA documentation specifies that each port must be capable of
> forwarding frames to the CPU port. The last changes on bridging support
> for the mv88e6xxx driver broke this requirement for non-bridged ports.
> 
> So as for the bridged ports, reserve a few VLANs (4000+) in the switch
> to isolate ports that have not been bridged yet.
> 
> By default, a port will be isolated with the CPU and DSA ports. When the
> port joins a bridge, it will leave its reserved VLAN. When it is removed
> from a bridge, it will join its reserved VLAN again.

This looks fine but the logic is a little hard to understand at first
glance.

> 
> Fixes: 5fe7f68016ff ("net: dsa: mv88e6xxx: fix hardware bridging")
> Reported-by: Andrew Lunn 
> Signed-off-by: Vivien Didelot 
> ---
>  drivers/net/dsa/mv88e6171.c |  2 ++
>  drivers/net/dsa/mv88e6352.c |  2 ++
>  drivers/net/dsa/mv88e6xxx.c | 42 ++
>  drivers/net/dsa/mv88e6xxx.h |  2 ++
>  4 files changed, 48 insertions(+)
> 
> diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
> index 54aa000..6e18213 100644
> --- a/drivers/net/dsa/mv88e6171.c
> +++ b/drivers/net/dsa/mv88e6171.c
> @@ -103,6 +103,8 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
>  #endif
>   .get_regs_len   = mv88e6xxx_get_regs_len,
>   .get_regs   = mv88e6xxx_get_regs,
> + .port_join_bridge   = mv88e6xxx_port_bridge_join,
> + .port_leave_bridge  = mv88e6xxx_port_bridge_leave,
>   .port_stp_update= mv88e6xxx_port_stp_update,
>   .port_pvid_get  = mv88e6xxx_port_pvid_get,
>   .port_vlan_prepare  = mv88e6xxx_port_vlan_prepare,
> diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
> index ff846d0..cc6c545 100644
> --- a/drivers/net/dsa/mv88e6352.c
> +++ b/drivers/net/dsa/mv88e6352.c
> @@ -323,6 +323,8 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
>   .set_eeprom = mv88e6352_set_eeprom,
>   .get_regs_len   = mv88e6xxx_get_regs_len,
>   .get_regs   = mv88e6xxx_get_regs,
> + .port_join_bridge   = mv88e6xxx_port_bridge_join,
> + .port_leave_bridge  = mv88e6xxx_port_bridge_leave,
>   .port_stp_update= mv88e6xxx_port_stp_update,
>   .port_pvid_get  = mv88e6xxx_port_pvid_get,
>   .port_vlan_prepare  = mv88e6xxx_port_vlan_prepare,
> diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
> index 04cff58..b06dba0 100644
> --- a/drivers/net/dsa/mv88e6xxx.c
> +++ b/drivers/net/dsa/mv88e6xxx.c
> @@ -1462,6 +1462,10 @@ int mv88e6xxx_port_vlan_prepare(struct dsa_switch *ds, 
> int port,
>   const struct switchdev_obj_port_vlan *vlan,
>   struct switchdev_trans *trans)
>  {
> + /* We reserve a few VLANs to isolate unbridged ports */
> + if (vlan->vid_end >= 4000)
> + return -EOPNOTSUPP;

Since this constant is repeated 3 times, you might want to create a
local define for it and size it based on the number of ports present in
the switch rather than leaving 95 numbers?

> +
>   /* We don't need any dynamic resource from the kernel (yet),
>* so skip the prepare phase.
>*/
> @@ -1870,6 +1874,36 @@ unlock:
>   return err;
>  }
>  
> +int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 members)
> +{
> + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
> + const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
> + int err;
> +
> + /* The port joined a bridge, so leave its reserved VLAN */
> + mutex_lock(&ps->smi_mutex);
> + err = _mv88e6xxx_port_vlan_del(ds, port, pvid);
> + if (!err)
> + err = _mv88e6xxx_port_pvid_set(ds, port, 0);

Does that mean that the following happens:

- bridge is created and port joins it
- port is configured to be in pvid 0 while joining
- port is then configured again by the bridge layer to be in whatever
pvid the user has decided

The other question is, does that break isolation between multiple
bridges on the same switch? Should we use the bridge ifindex here
somehow as a pvid indication?

> + mutex_unlock(&ps->smi_mutex);
> + return err;
> +}
> +
> +int mv88e6xxx_port_bridge_leave(struct dsa_switch *ds, int port, u32 members)
> +{
> + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
> + const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
> + int err;
> +
> + /* The port left the bridge, so join its reserved VLAN */
> + mutex_lock(&ps->smi_mutex);
> + err = _mv88e6xxx_port_vlan_add(ds, port, pvid, true);
> + if (!err)
> + err = _mv88e6xxx_port_pvid_set(ds, port, pvid);
> + mutex_unlock(&ps->smi_mutex);
> + return err;
> +}
> +
>  static void mv88e6xxx_bridge_work(struct work_struct *work)
>  {
>   struct mv88e6xxx_priv_state *ps;
> @@ 

Re: [PATCH v2 1/4] ipv4: add option to drop unicast encapsulated in L2 multicast

2015-11-04 Thread Julian Anastasov

Hello,

On Wed, 4 Nov 2015, Johannes Berg wrote:

> From: Johannes Berg 
> 
> In order to solve a problem with 802.11, the so-called hole-196 attack,
> add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
> enabled, causes the stack to drop IPv4 unicast packets encapsulated in
> link-layer multi- or broadcast frames. Such frames can (as an attack)
> be created by any member of the same wireless network and transmitted
> as valid encrypted frames since the symmetric key for broadcast frames
> is shared between all stations.
> 
> Additionally, enabling this option provides compliance with a SHOULD
> clause of RFC 1122.
> 
> Signed-off-by: Johannes Berg 

Patches 1 and 3 look correct to me,

Reviewed-by: Julian Anastasov 

If the patches are lost in the merge window you
can also consider one minor optimization, see below...

> ---
>  Documentation/networking/ip-sysctl.txt |  7 +++
>  include/uapi/linux/ip.h|  1 +
>  net/ipv4/devinet.c |  2 ++
>  net/ipv4/ip_input.c| 26 +-
>  4 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt 
> b/Documentation/networking/ip-sysctl.txt
> index 05915be86235..35c4c43dd8de 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -1208,6 +1208,13 @@ promote_secondaries - BOOLEAN
>   promote a corresponding secondary IP address instead of
>   removing all the corresponding secondary IP addresses.
>  
> +drop_unicast_in_l2_multicast - BOOLEAN
> + Drop any unicast IP packets that are received in link-layer
> + multicast (or broadcast) frames.
> + This behavior (for multicast) is actually a SHOULD in RFC
> + 1122, but is disabled by default for compatibility reasons.
> + Default: off (0)
> +
>  
>  tag - INTEGER
>   Allows you to write a number, which can be used as required.
> diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
> index 08f894d2ddbd..584834f7e95c 100644
> --- a/include/uapi/linux/ip.h
> +++ b/include/uapi/linux/ip.h
> @@ -165,6 +165,7 @@ enum
>   IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
>   IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
>   IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
> + IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
>   __IPV4_DEVCONF_MAX
>  };
>  
> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
> index cebd9d31e65a..dbbab28a52a4 100644
> --- a/net/ipv4/devinet.c
> +++ b/net/ipv4/devinet.c
> @@ -2192,6 +2192,8 @@ static struct devinet_sysctl_table {
> "promote_secondaries"),
>   DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
> "route_localnet"),
> + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
> +   "drop_unicast_in_l2_multicast"),
>   },
>  };
>  
> diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> index b1209b63381f..a442b6bd9441 100644
> --- a/net/ipv4/ip_input.c
> +++ b/net/ipv4/ip_input.c
> @@ -359,8 +359,32 @@ static int ip_rcv_finish(struct net *net, struct sock 
> *sk, struct sk_buff *skb)
>   rt = skb_rtable(skb);
>   if (rt->rt_type == RTN_MULTICAST) {
>   IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
> - } else if (rt->rt_type == RTN_BROADCAST)
> + } else if (rt->rt_type == RTN_BROADCAST) {
>   IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
> + } else {

} else if (unlikely(skb->pkt_type != PACKET_HOST)) {

May be such check can save some cycles because
it is more common to see PACKET_HOST packets...

> + struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
> +
> + /* RFC 1122 3.3.6:
> +  *
> +  *   When a host sends a datagram to a link-layer broadcast
> +  *   address, the IP destination address MUST be a legal IP
> +  *   broadcast or IP multicast address.
> +  *
> +  *   A host SHOULD silently discard a datagram that is received
> +  *   via a link-layer broadcast (see Section 2.4) but does not
> +  *   specify an IP multicast or broadcast destination address.
> +  *
> +  * This doesn't explicitly say L2 *broadcast*, but broadcast is
> +  * in a way a form of multicast and the most common use case for
> +  * this is 802.11 protecting against cross-station spoofing (the
> +  * so-called "hole-196" attack) so do it for both.
> +  */
> + if (in_dev &&
> + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST) &&
> + (skb->pkt_type == PACKET_BROADCAST ||
> +  skb->pkt_type == 

Re: [PATCH] sh_eth: merge sh_eth_free_dma_buffer() into sh_eth_ring_free()

2015-11-04 Thread David Miller
From: Sergei Shtylyov 
Date: Wed, 04 Nov 2015 00:55:13 +0300

> While the ring allocation is done by a single function, sh_eth_ring_init(),
> the ring deallocation was split into two functions (almost always called
> one after the other) for no good reason. Merge  sh_eth_free_dma_buffer()
> into sh_eth_ring_free() which allows us  to save space not only on the
> direct calls  of the former function but also on the sh_eth_ring_init()'s
> simplified error path...
> 
> Signed-off-by: Sergei Shtylyov 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] sh_eth: kill 'ret' variable in sh_eth_ring_init()

2015-11-04 Thread David Miller
From: Sergei Shtylyov 
Date: Wed, 04 Nov 2015 00:17:08 +0300

> The 'ret' local variable in sh_eth_ring_init() serves no useful purpose as
> the only  values it gets assigned are 0 and -ENOMEM both of which could be
> returned directly...
> 
> Signed-off-by: Sergei Shtylyov 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] commit c6825c0976fa7893692e0e43b09740b419b23c09 upstream.

2015-11-04 Thread Ani Sinha
(removed a bunch of people from CC list)

On Mon, Oct 26, 2015 at 1:06 PM, Pablo Neira Ayuso  wrote:

> Then we can review and, if no major concerns, I can submit this to
> -stable.

Now that Neal has sufficiently tested the patches, is it OK to apply
to -stable or do you guys want me to do anything more?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] drivers: net: cpsw: Add support for fixed-link PHY

2015-11-04 Thread David Miller
From: Markus Brunner 
Date: Tue, 03 Nov 2015 22:09:51 +0100

> Add support for a fixed-link devicetree sub-node in case the 
> cpsw MAC is directly connected to a non-mdio PHY/device. 
> 
> Signed-off-by: Markus Brunner 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Andy Shevchenko
On Thu, Nov 5, 2015 at 12:53 AM, Nelson, Shannon
 wrote:
>> From: Andy Shevchenko [mailto:andy.shevche...@gmail.com]
>> Sent: Wednesday, November 04, 2015 11:59 AM
>>
>> On Wed, Nov 4, 2015 at 9:39 PM, Sowmini Varadhan
>>  wrote:
>> >
>> > This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
>> > address in Open Firmware or IDPROM").
>
> [...]
>
>> > +   }
>> > +
>> > +   memset(&element, 0, sizeof(element));
>> > +   ether_addr_copy(element.mac_addr, macaddr);
>> > +   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
>> > +   ret = i40e_aq_add_macvlan(&vsi->back->hw, vsi->seid, &element,
>> 1, NULL);
>> > +   aq_err = vsi->back->hw.aq.asq_last_status;
>>
>> Do you really need a separate variable (aq_err)?
>
> These are two separate error values that we're tracking - one from the 
> communication between the driver and the firmware (aq_err) and one from the 
> driver activity.  Sometimes there may be an AQ error that we want to report, 
> but it might not actually be a driver error.  Alternatively, there are times 
> when the AQ error needs to get interpreted different ways depending on which 
> task the driver is performing.  Lastly, the AQ error gives us more detail on 
> whatever the transaction error may have been which gives us more useful debug 
> info.

Understandable, though in this particular function I don't see why we
can't drop it. It is used like this:

var x;

x = y;
if (x) {
...
}

Which is just
if (y) {
...
}


>
> sln



-- 
With Best Regards,
Andy Shevchenko
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch net v2] ipv4: fix a potential deadlock in mcast getsockopt() path

2015-11-04 Thread David Miller
From: Cong Wang 
Date: Tue,  3 Nov 2015 15:41:16 -0800

> Sasha reported the following lockdep warning:
> 
>   Possible unsafe locking scenario:
> 
> CPU0CPU1
> 
>lock(sk_lock-AF_INET);
> lock(rtnl_mutex);
> lock(sk_lock-AF_INET);
>lock(rtnl_mutex);
> 
> This is due to that for IP_MSFILTER and MCAST_MSFILTER, we take
> rtnl lock before the socket lock in setsockopt() path, but take
> the socket lock before rtnl lock in getsockopt() path. All the
> rest optnames are setsockopt()-only.
> 
> Fix this by aligning the getsockopt() path with the setsockopt()
> path, so that all mcast socket path would be locked in the same
> order.
> 
> Note, IPv6 part is different where rtnl lock is not held.
> 
> Fixes: 54ff9ef36bdf ("ipv4, ipv6: kill ip_mc_{join, leave}_group and 
> ipv6_sock_mc_{join, drop}")
> Reported-by: Sasha Levin 
> Cc: Marcelo Ricardo Leitner 
> Signed-off-by: Cong Wang 

I don't like conditional locking, but I can't think of a better
way to suggest fixing this, so applied.

And queued up for -stable as well.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v6] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

Reviewed-by: Martin K. Petersen 
Signed-off-by: Sowmini Varadhan 
---
v2, v3: Andy Shevchenko comments
v4: Shannon Nelson review: explicitly set up mac filters before register_netdev
v5: Shannon Nelson code style comments
v6: Shannon Nelson code style comments

 drivers/net/ethernet/intel/i40e/i40e_main.c |   83 ++-
 1 files changed, 82 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b825f97..e355873 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9213,6 +9222,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
 }
 
 /**
+ * i40e_macaddr_init - explicitly write the mac address filters.
+ *
+ * @vsi: pointer to the vsi.
+ * @macaddr: the MAC address
+ *
+ * This is needed when the macaddr has been obtained by other
+ * means than the default, e.g., from Open Firmware or IDPROM.
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret, aq_err;
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(&vsi->back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret) {
+   dev_info(&vsi->back->pdev->dev,
+"Addr change for VSI failed: %d\n", ret);
+   return -EADDRNOTAVAIL;
+   }
+
+   memset(&element, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   ret = i40e_aq_add_macvlan(&vsi->back->hw, vsi->seid, &element, 1, NULL);
+   aq_err = vsi->back->hw.aq.asq_last_status;
+   if (aq_err != I40E_AQ_RC_OK) {
+   dev_info(&vsi->back->pdev->dev,
+"add filter failed err %s aq_err %s\n",
+i40e_stat_str(&vsi->back->hw, ret),
+i40e_aq_str(&vsi->back->hw, aq_err));
+   }
+   return ret;
+}
+
+/**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @type: VSI type
@@ -9341,6 +9388,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
ret = i40e_config_netdev(vsi);
if (ret)
goto err_netdev;
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
+   if (ret)
+   goto err_netdev;
ret = register_netdev(vsi->netdev);
if (ret)
goto err_netdev;
@@ -10163,6 +10213,35 @@ static void i40e_print_features(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_get_platform_mac_addr - get platform-specific MAC address
+ *
+ * @pdev: PCI device information struct
+ * @mac_addr: the MAC address to be returned
+ *
+ * Look up the MAC address in Open Firmware  on systems that support it,
+ * and use IDPROM on SPARC if no OF address is found.
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
+{
+   struct device_node *dp = pci_device_to_OF_node(pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#else
+   return -EINVAL;
+#endif /* CONFIG_SPARC */
+}
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10360,7 +10439,9 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
i40e_aq_stop_lldp(hw, true, NULL);
}
 
-   i40e_get_mac_addr(hw, hw->mac.addr);
+   err = i40e_get_platform_mac_addr(pdev, hw->mac.addr);
+   if (err)
+   i40e_get_mac_addr(hw, hw->mac.addr);
if (!is_valid_ether_addr(hw->mac.addr)) {
 

Re: [PATCH v2 2/4] e1000e: Do not read icr in Other interrupt

2015-11-04 Thread Benjamin Poirier
On 2015/10/30 12:19, Alexander Duyck wrote:
> On 10/30/2015 10:31 AM, Benjamin Poirier wrote:
> >Using eiac instead of reading icr allows us to avoid interference with
> >rx and tx interrupts in the Other interrupt handler.
> >
> >According to the 82574 datasheet section 10.2.4.1, interrupt causes that
> >trigger the Other interrupt are
> >1) Link Status Change.
> >2) Receiver Overrun.
> >3) MDIO Access Complete.
> >4) Small Receive Packet Detected.
> >5) Receive ACK Frame Detected.
> >6) Manageability Event Detected.
> >
> >Causes 3, 4, 5 are related to features which are not enabled by the
> >driver. Always assume that cause 1 is what triggered the Other interrupt
> >and set get_link_status. Cause 2 and 6 should be rare enough that the
> >extra cost of needlessly re-reading the link status is negligible.
> >
> >Signed-off-by: Benjamin Poirier 
> 
> You might want to instead use a write of LSC to the ICR instead of just
> using auto-clear and not enabling LSC.  My concern is that you might no
> longer be getting link status change events at all.  An easy test is to just
> unplug/plug the cable a few times, or run "ethtool -r" on the link partner
> if connected back to back.  You should see messages appear in the dmesg log
> indicating that the link state changed.
> 
> In addition you should probably clear the IAME bit in the CTRL_EXT register
> so that you don't risk masking the interrupts on the ICR read or write.

Thanks, your concern about not getting LSC events was right. After more
experimentation I noticed that in order for the Other interrupt to be
raised for each of these six conditions, the IMS bit for that condition
must also be set. I've restored setting LSC in IMS. OTOH, I don't see a
need to clear LSC from ICR. Even without an ICR read or write-to-clear
to clear the LSC bit, Other interrupts are raised to signal LSC events.

I'll wait for net-next to reopen and send v3.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 2/4] e1000e: Do not read icr in Other interrupt

2015-11-04 Thread Alexander Duyck

On 11/04/2015 03:19 PM, Benjamin Poirier wrote:

On 2015/10/30 12:19, Alexander Duyck wrote:

On 10/30/2015 10:31 AM, Benjamin Poirier wrote:

Using eiac instead of reading icr allows us to avoid interference with
rx and tx interrupts in the Other interrupt handler.

According to the 82574 datasheet section 10.2.4.1, interrupt causes that
trigger the Other interrupt are
1) Link Status Change.
2) Receiver Overrun.
3) MDIO Access Complete.
4) Small Receive Packet Detected.
5) Receive ACK Frame Detected.
6) Manageability Event Detected.

Causes 3, 4, 5 are related to features which are not enabled by the
driver. Always assume that cause 1 is what triggered the Other interrupt
and set get_link_status. Cause 2 and 6 should be rare enough that the
extra cost of needlessly re-reading the link status is negligible.

Signed-off-by: Benjamin Poirier 

You might want to instead use a write of LSC to the ICR instead of just
using auto-clear and not enabling LSC.  My concern is that you might no
longer be getting link status change events at all.  An easy test is to just
unplug/plug the cable a few times, or run "ethtool -r" on the link partner
if connected back to back.  You should see messages appear in the dmesg log
indicating that the link state changed.

In addition you should probably clear the IAME bit in the CTRL_EXT register
so that you don't risk masking the interrupts on the ICR read or write.

Thanks, your concern about not getting LSC events was right. After more
experimentation I noticed that in order for the Other interrupt to be
raised for each of these six conditions, the IMS bit for that condition
must also be set. I've restored setting LSC in IMS. OTOH, I don't see a
need to clear LSC from ICR. Even without an ICR read or write-to-clear
to clear the LSC bit, Other interrupts are raised to signal LSC events.

I'll wait for net-next to reopen and send v3.


You probably don't need to wait.  The Intel-wired tree operates outside 
of Dave's merge window, and it will take some time for the patches to be 
validated before Jeff can submit them to Dave.


- Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC net-next 2/2] tcp: Add Redundant Data Bundling (RDB)

2015-11-04 Thread Bendik Rønning Opstad
On Monday, November 02, 2015 09:37:54 AM David Laight wrote:
> From: Bendik Rønning Opstad
> > Sent: 23 October 2015 21:50
> > RDB is a mechanism that enables a TCP sender to bundle redundant
> > (already sent) data with TCP packets containing new data. By bundling
> > (retransmitting) already sent data with each TCP packet containing new
> > data, the connection will be more resistant to sporadic packet loss
> > which reduces the application layer latency significantly in congested
> > scenarios.
> 
> What sort of traffic flows do you expect this to help?

As mentioned in the cover letter, RDB is aimed at reducing the
latencies for "thin-stream" traffic often produced by
latency-sensitive applications. This blog post describes RDB and the
underlying motivation:
http://mlab.no/blog/2015/10/redundant-data-bundling-in-tcp

Further information is available in the links referred to in the blog
post.

> An ssh (or similar) connection will get additional data to send,
> but that sort of data flow needs Nagle in order to reduce the
> number of packets sent.

Whether an application needs to reduce the number of packets sent
depends on the perspective of who you ask. If low latency is of high
priority for the application it may need to increase the number of
packets sent by disabling Nagle to reduce the segments' sojourn times
on the sender side.

As for SSH clients, it seems OpenSSH disables Nagle for interactive
sessions.

> OTOH it might benefit from including unacked data if the Nagle
> timer expires.
> Being able to set the Nagle timer on a per-connection basis
> (or maybe using something based on the RTT instead of 2 secs)
> might make packet loss less problematic.

There is no timer for Nagle? The current (Minshall variant)
implementation restricts sending a small segment as long as the
previously transmitted packet was small and is not yet ACKed.

> Data flows that already have Nagle disabled (probably anything that
> isn't command-response and isn't unidirectional bulk data) are
> likely to generate a lot of packets within the RTT.

How many packets such applications need to transmit for optimal
latency varies to a great extent. Packets per RTT is not a very useful
metric in this regard, considering the strict dependency on the RTT.

This is why we propose a dynamic packets in flight limit (DPIFL) that
indirectly relies on the application write frequency, i.e. how often
the application performs write system calls. This limit is used to
ensure that only applications that write data less frequently than a
certain limit may utilize RDB.

> Resending unacked data will just eat into available network bandwidth
> and could easily make any congestion worse.
>
> I think that means you shouldn't resend data more than once, and/or
> should make sure that the resent data isn't a significant overhead
> on the packet being sent.

It is important to remember what type of traffic flows we are
discussing. The applications RDB is aimed at helping produce
application-limited flows that transmit small amounts of data, both in
terms of payload per packet and packets per second.

Analysis of traces from latency-sensitive applications producing
traffic with thin-stream characteristics show inter-transmission times
ranging from a few ms (typically 20-30 ms on average) to many hundred
ms.
(http://mlab.no/blog/2015/10/redundant-data-bundling-in-tcp/#thin_streams)

Increasing the amount of transmitted data will certainly contribute to
congestion to some degree, but it is not (necessarily) an unreasonable
trade-off considering the relatively small amounts of data such
applications transmit compared to greedy flows.

RDB does not cause more packets to be sent through the network, as it
uses available "free" space in packets already scheduled for
transmission. With a bundling limitation of only one previous segment,
the bandwidth requirement is doubled - accounting for headers it would
be less.

By increasing the BW requirement for an application that produces
relatively little data, we still end up with a low BW requirement.
The suggested minimum lower bound inter-transmission time is 10 ms,
meaning that when an application writes data more frequently than
every 10 ms (on average) it will not be allowed to utilize RDB.

To what degree RDB affects competing traffic will of course depend on
the link capacity and the number of simultaneous flows utilizing RDB.
We have performed tests to assess how RDB affects competing traffic. In
one of the test scenarios, 10 RDB-enabled thin streams and 10 regular
TCP thin streams compete against 5 greedy TCP flows over a shared
bottleneck limited to 5Mbit/s. The results from this test show that by
only bundling one previous segment with each packet (segment size: 120
bytes), the effect on the competing thin-stream traffic is modest.
(http://mlab.no/blog/2015/10/redundant-data-bundling-in-tcp/#latency_test_with_cross_traffic).

Also relevant to the discussion is the paper "Reducing web latency:

Re: [Patch net] ipv4: disable BH when changing ip local port range

2015-11-04 Thread David Miller
From: Cong Wang 
Date: Tue,  3 Nov 2015 14:32:57 -0800

> This fixes the following lockdep warning:
 ...
> Fixes: b8f1a55639e6 ("udp: Add function to make source port for UDP tunnels")
> Cc: Tom Herbert 
> Signed-off-by: Cong Wang 

Applied and queued up for -stable, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v6] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Nelson, Shannon
> From: Sowmini Varadhan [mailto:sowmini.varad...@oracle.com]
> Sent: Wednesday, November 04, 2015 3:21 PM
> 
> This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
> address in Open Firmware or IDPROM").
> 
> As with that fix, attempt to look up the MAC address in Open Firmware
> on systems that support it, and use IDPROM on SPARC if no OF address
> is found.
> 
> In the case of the i40e there is an assumption that the default mac
> address has already been set up as the primary mac filter on probe,
> so if this filter is obtained from the Open Firmware or IDPROM, an
> explicit write is needed via i40e_aq_mac_address_write() and
> i40e_aq_add_macvlan() invocation.
> 
> Reviewed-by: Martin K. Petersen 
> Signed-off-by: Sowmini Varadhan 
> ---
> v2, v3: Andy Shevchenko comments
> v4: Shannon Nelson review: explicitly set up mac filters before
> register_netdev
> v5: Shannon Nelson code style comments
> v6: Shannon Nelson code style comments
> 
>  drivers/net/ethernet/intel/i40e/i40e_main.c |   83
> ++-
>  1 files changed, 82 insertions(+), 1 deletions(-)
> 

Acked-by: Shannon Nelson 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: fix trivial comment typo

2015-11-04 Thread Matthew Fernandez

On 03/11/15 20:48, Daniel Borkmann wrote:

On 11/02/2015 10:48 PM, Matthew Fernandez wrote:

On 03/11/15 08:31, David Miller wrote:

From: Matthew Fernandez 
Date: Mon, 2 Nov 2015 11:59:03 +1100


bpf: fix trivial comment typo

Signed-off-by: Matthew Fernandez 


This doesn't apply to any tree.


I'm sorry, I think I must be missing something. This seems to apply cleanly to 
the current tip of
mainline (e86328c489d7) to me. Was this not in the expected format? It wasn't 
my intention to
waste your time, so I apologise for any newbie errors.


You might want to check Documentation/networking/netdev-FAQ.txt ;),
and rebase your spelling fix f.e. to the latest net-next tree.


Ah right, sorry about that. Thanks for the info, Daniel. From netdev-FAQ it sounds like this patch 
would be more appropriate for the net tree, but as the target file is identical in both right now 
I'll leave it to someone else's discretion to decide which is more suitable. Will rebase and resubmit.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net v2] bpf: fix trivial comment typo

2015-11-04 Thread Matthew Fernandez

bpf: fix trivial comment typo

Signed-off-by: Matthew Fernandez 

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bd..0bd41f5 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -462,7 +462,7 @@ select_insn:

/* ARG1 at this point is guaranteed to point to CTX from
 * the verifier side due to the fact that the tail call is
-* handeled like a helper, that is, bpf_tail_call_proto,
+* handled like a helper, that is, bpf_tail_call_proto,
 * where arg1_type is ARG_PTR_TO_CTX.
 */
insn = prog->insnsi;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] Netfilter fixes for net

2015-11-04 Thread David Miller
From: Pablo Neira Ayuso 
Date: Wed,  4 Nov 2015 15:00:35 +0100

> The following patchset contains Netfilter fixes for your net tree,
> they are:
> 
> 1) Fix crash when TEE target is used with no --oif, from Eric Dumazet.
> 
> 2) Oneliner to fix a crash on the redirect traffic to localhost
>infrastructure when interface has not yet an address, from
>Munehisa Kamata.
> 
> 3) Oneliner not to request module all the time from nfnetlink due to
>wrong type value, from Florian Westphal.
> 
> I'll make sure these patches 1 and 2 hit -stable.

Pulled.

Since net-next got merged, and I fast-forwarded net to Linus's
tree, there were some minor conflicts.

Please double check my work.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] net: dsa: mv88e6xxx: isolate unbridged ports

2015-11-04 Thread Andrew Lunn
> > +int mv88e6xxx_port_bridge_join(struct dsa_switch *ds, int port, u32 
> > members)
> > +{
> > +   struct mv88e6xxx_priv_state *ps = ds_to_priv(ds);
> > +   const u16 pvid = 4000 + ds->index * DSA_MAX_PORTS + port;
> > +   int err;
> > +
> > +   /* The port joined a bridge, so leave its reserved VLAN */
> > +   mutex_lock(&ps->smi_mutex);
> > +   err = _mv88e6xxx_port_vlan_del(ds, port, pvid);
> > +   if (!err)
> > +   err = _mv88e6xxx_port_pvid_set(ds, port, 0);
> 
> Does that mean that the following happens:
> 
> - bridge is created and port joins it
> - port is configured to be in pvid 0 while joining
> - port is then configured again by the bridge layer to be in whatever
> pvid the user has decided
> 
> The other question is, does that break isolation between multiple
> bridges on the same switch? Should we use the bridge ifindex here
> somehow as a pvid indication?

Hi Florian

The old code which got changed when VLAN support was added used some
property from the bridge to handle multiple bridges.

But that is probably a different bug to the one being fixed here.
This is all about using ports individually.

 Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] net/core: ensure features get disabled on new lower devs

2015-11-04 Thread David Miller
From: Jarod Wilson 
Date: Tue,  3 Nov 2015 23:09:32 -0500

> With moving netdev_sync_lower_features() after the .ndo_set_features
> calls, I neglected to verify that devices added *after* a flag had been
> disabled on an upper device were properly added with that flag disabled as
> well. This currently happens, because we exit __netdev_update_features()
> when we see dev->features == features for the upper dev. We can retain the
> optimization of leaving without calling .ndo_set_features with a bit of
> tweaking and a goto here.
> 
> Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features 
> down stack")
 ...
> Reported-by: Nikolay Aleksandrov 
> Signed-off-by: Jarod Wilson 
> ---
> v2: Based on suggestions from Alex, and with not changing err to ret, this
> patch actually becomes quite minimal and doesn't ugly up the code much.

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


AF_PACKET mmap() v4...

2015-11-04 Thread David Miller

As part of fixing y2038 problems, Arnd is going to have to make a new
version of the AF_PACKET mmap() tpacket descriptors in order to extend
the time values to 64-bit.

So I want everyone to think about whether there are any other changes
we might want to make given that we have to make a v4 anyways.

Particularly, I am rather certain that the buffer management could be
improved.  Some have complained that v3 is kinda awkward to use and/or
suboptimal in various ways.

Thanks.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v2] bpf: fix trivial comment typo

2015-11-04 Thread David Miller
From: Matthew Fernandez 
Date: Thu, 5 Nov 2015 11:09:52 +1100

> bpf: fix trivial comment typo
> 
> Signed-off-by: Matthew Fernandez 

This does not apply.  It looks like your email client has
corrupted the patch.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: AF_PACKET mmap() v4...

2015-11-04 Thread Richard Cochran
On Thu, Nov 05, 2015 at 12:04:14AM -0500, David Miller wrote:
> So I want everyone to think about whether there are any other changes
> we might want to make given that we have to make a v4 anyways.

One thing I would like to see is a field for a desired transmit time.
Time based scheduling is a new topic, never discussed on this list
before, afaict.  HW already supports this, for example, the Intel i210
card has a high priority queue where you can tell it a Tx time in
terms of the PTP clock.

This functionality is useful in industrial Ethernet protocols.  There
must be a dozen of these out there, and a new IEEE standard is in the
works by Time Sensitive Networking (TSN) group.

I haven't thought too much about how to implement this, but the
eventual goal would be a generic time based scheduler that either uses
special HW features or does best effort in SW.  User space would have
a socket option for desired Tx time, and this should also be available
over the mmap interface.

Thanks,
Richard


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bnxt_en: add VXLAN dependency

2015-11-04 Thread David Miller
From: Arnd Bergmann 
Date: Wed, 04 Nov 2015 16:00:32 +0100

> VXLAN may be a loadable module, and this driver cannot be built-in
> in that case, or we get a link error:
> 
> drivers/built-in.o: In function `__bnxt_open_nic':
> drivers/net/ethernet/broadcom/bnxt/bnxt.c:4581: undefined reference to 
> `vxlan_get_rx_port'
> 
> This adds a Kconfig dependency that ensures that either VXLAN is
> disabled (which the driver handles correctly), or we depend on
> VXLAN itself and disallow built-in compilation when VXLAN is
> a module.
> 
> Signed-off-by: Arnd Bergmann 
> Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next] net: add forgotten IFF_L3MDEV_SLAVE define

2015-11-04 Thread David Miller
From: Jiri Pirko 
Date: Wed,  4 Nov 2015 14:59:06 +0100

> From: Jiri Pirko 
> 
> Fixes: fee6d4c77 ("net: Add netif_is_l3_slave")
> Signed-off-by: Jiri Pirko 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v3] ipv6: clean up dev_snmp6 proc entry when we fail to initialize inet6_dev

2015-11-04 Thread David Miller
From: Sabrina Dubroca 
Date: Wed,  4 Nov 2015 18:00:13 +0100

> In ipv6_add_dev, when addrconf_sysctl_register fails, we do not clean up
> the dev_snmp6 entry that we have already registered for this device.
> Call snmp6_unregister_dev in this case.
> 
> Fixes: a317a2f19da7d ("ipv6: fail early when creating netdev named all or 
> default")
> Reported-by: Dmitry Vyukov 
> Signed-off-by: Sabrina Dubroca 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] tun_dst: Fix potential NULL dereference

2015-11-04 Thread David Miller
From: Tobias Klauser 
Date: Wed,  4 Nov 2015 13:49:49 +0100

> In tun_dst_unclone() the return value of skb_metadata_dst() is checked
> for being NULL after it is dereferenced. Fix this by moving the
> dereference after the NULL check.
> 
> Found by the Coverity scanner (CID 1338068).
> 
> Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.")
> Cc: Pravin B Shelar 
> Signed-off-by: Tobias Klauser 

Applied and queued up for -stable, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch -next] qlogic: qed: fix a test for MODE_MF_SI

2015-11-04 Thread David Miller
From: Dan Carpenter 
Date: Wed, 4 Nov 2015 16:29:11 +0300

> MODE_MF_SI is 9.  We should be testing bit 9 instead of AND 0x9.
> 
> Fixes: fe56b9e6a8d9 ('qed: Add module with basic common support')
> Signed-off-by: Dan Carpenter 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch -next] qlogic/qed: remove bogus NULL check

2015-11-04 Thread David Miller
From: Dan Carpenter 
Date: Wed, 4 Nov 2015 16:27:16 +0300

> We check if "p_hwfn" is NULL and then dereference it in the error
> handling code.  I read the code and it isn't NULL so let's remove the
> check.
> 
> Signed-off-by: Dan Carpenter 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [MM PATCH V4 6/6] slub: optimize bulk slowpath free by detached freelist

2015-11-04 Thread Joonsoo Kim
On Wed, Oct 21, 2015 at 09:57:09AM +0200, Jesper Dangaard Brouer wrote:
> On Wed, 14 Oct 2015 14:15:25 +0900
> Joonsoo Kim  wrote:
> 
> > On Tue, Sep 29, 2015 at 05:48:26PM +0200, Jesper Dangaard Brouer wrote:
> > > This change focus on improving the speed of object freeing in the
> > > "slowpath" of kmem_cache_free_bulk.
> > > 
> > > The calls slab_free (fastpath) and __slab_free (slowpath) have been
> > > extended with support for bulk free, which amortize the overhead of
> > > the (locked) cmpxchg_double.
> > > 
> > > To use the new bulking feature, we build what I call a detached
> > > freelist.  The detached freelist takes advantage of three properties:
> > > 
> > >  1) the free function call owns the object that is about to be freed,
> > > thus writing into this memory is synchronization-free.
> > > 
> > >  2) many freelist's can co-exist side-by-side in the same slab-page
> > > each with a separate head pointer.
> > > 
> > >  3) it is the visibility of the head pointer that needs synchronization.
> > > 
> > > Given these properties, the brilliant part is that the detached
> > > freelist can be constructed without any need for synchronization.  The
> > > freelist is constructed directly in the page objects, without any
> > > synchronization needed.  The detached freelist is allocated on the
> > > stack of the function call kmem_cache_free_bulk.  Thus, the freelist
> > > head pointer is not visible to other CPUs.
> > > 
> > > All objects in a SLUB freelist must belong to the same slab-page.
> > > Thus, constructing the detached freelist is about matching objects
> > > that belong to the same slab-page.  The bulk free array is scanned in
> > > a progressive manner with a limited look-ahead facility.
> [...]
> 
> 
> > Hello, Jesper.
> > 
> > AFAIK, it is uncommon to clear pointer to object in argument array.
> > At least, it is better to comment it on somewhere.
> 
> In this case, I think clearing the array is a good thing, as
> using/referencing objects after they have been free'ed is a bug (which
> can be hard to detect).

Okay.

> 
> > Or, how about removing  lookahead facility? Does it have real benefit?
> 
> In my earlier patch series I had a version with and without lookahead
> facility.  Just so I could benchmark the difference.  With Alex'es help
> we/I tuned the code with the lookahead feature to be just as fast.
> Thus, I merged the two patches. (Also did testing for worstcase [1])
> 
> I do wonder if the lookahead have any real benefit.  In micro
> benchmarking it might be "just-as-fast", but I do suspect (just the code
> size increase) it can affect real use-cases... Should we remove it?
> 
> [1] 
> https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/slab_bulk_test03.c

If it were not implemented yet, I would suggest starting with a simple
one first. But we already have a well-implemented one, so we don't
need to remove it. :)

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] net: Fix prefsrc lookups

2015-11-04 Thread David Miller
From: David Ahern 
Date: Tue,  3 Nov 2015 15:59:28 -0800

> A bug report (https://bugzilla.kernel.org/show_bug.cgi?id=107071) noted
> that the following ip command is failing with v4.3:
> 
> $ ip route add 10.248.5.0/24 dev bond0.250 table vlan_250 src 10.248.5.154
> RTNETLINK answers: Invalid argument
> 
> 021dd3b8a142d changed the lookup of the given preferred source address to
> use the table id passed in, but this assumes the local entries are in the
> given table which is not necessarily true for non-VRF use cases. When
> validating the preferred source fallback to the local table on failure.
> 
> Fixes: 021dd3b8a142d ("net: Add routes to the table associated with the 
> device")
> Signed-off-by: David Ahern 

Applied and queued up for -stable, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] VSOCK: call sk->sk_data_ready() on accept()

2015-11-04 Thread David Miller
From: Stefan Hajnoczi 
Date: Wed,  4 Nov 2015 12:58:42 +

> When a listen socket enqueues a connection for userspace to accept(),
> the sk->sk_data_ready() callback should be invoked.  In-kernel socket
> users rely on this callback to detect when incoming connections are
> available.
> 
> Currently the sk->sk_state_change() callback is invoked by
> vmci_transport.c.  This happens to work for userspace applications since
> sk->sk_state_change = sock_def_wakeup() and sk->sk_data_ready =
> sock_def_readable() both wake up the accept() waiter.  In-kernel socket
> users, on the other hand, fail to detect incoming connections.
> 
> Signed-off-by: Stefan Hajnoczi 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread David Miller
From: Yang Shi 
Date: Wed,  4 Nov 2015 11:36:37 -0800

> When running "mod X" operation, if X is 0 the filter has to be halted.
> Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.
> 
> CC: Xi Wang 
> CC: Zi Shen Lim 
> Signed-off-by: Yang Shi 

Applied, thank you.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] net: stmmac: remove unneeded phy_iface variable

2015-11-04 Thread David Miller
From: LABBE Corentin 
Date: Wed,  4 Nov 2015 21:08:12 +0100

> The variable phy_iface is double-initialized and finally is not necessary
> at all.
> 
> Reported-by: coverity (CID 1271141)
> Signed-off-by: LABBE Corentin 

The value is used in two locations, so having it computed once in a
variable is useful.

Please just eliminate the double assignment.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


pull request: bluetooth 2015-11-05

2015-11-04 Thread Johan Hedberg
Hi Dave,

The following set of Bluetooth patches would be good to get into 4.4-rc1
if possible:

 - Fix for missing LE CoC parameter validity checks
 - Fix for potential deadlock in btusb
 - Fix for issuing unsupported commands during HCI init

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit e1b8d903c6c3862160d2d5036806a94786c8fc4e:

  net: Fix prefsrc lookups (2015-11-04 21:34:37 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth.git 
for-upstream

for you to fetch changes up to 40624183c202278e7e0edd01d1273efc87ddd1f2:

  Bluetooth: L2CAP: Add missing checks for invalid LE DCID (2015-11-05 04:04:15 
+0100)


Johan Hedberg (3):
  Bluetooth: L2CAP: Fix returning correct LE CoC response codes
  Bluetooth: L2CAP: Fix checked range when allocating new CID
  Bluetooth: L2CAP: Add missing checks for invalid LE DCID

Kuba Pawlak (1):
  Bluetooth: Fix possible deadlock in btusb

Marcel Holtmann (1):
  Bluetooth: Check for supported white list before issuing commands

 drivers/bluetooth/btusb.c |  6 --
 include/net/bluetooth/l2cap.h |  2 ++
 net/bluetooth/hci_core.c  | 17 +++--
 net/bluetooth/l2cap_core.c| 20 +---
 4 files changed, 34 insertions(+), 11 deletions(-)


signature.asc
Description: PGP signature


RE: kernel 3.14.53 + bnx2x loss of connectivity / parity errors / MCP SCPAD

2015-11-04 Thread Yuval Mintz
> on a production server (HP DL380 Gen9 with HP 10GE dual port card - bnx2x
> driver), I just encountered a full loss of connectivity through the 10 GE 
> ports.
> Kernel in use is vanilla 3.14.53.
> 
> On the console I could see this (timestamps omitted, have to type by hand,
> damn ILO console does not let me copy+paste text...)
> 
> MCP SCPAD
> MCP SCPAD
> bnx2x :04:00.1 eth1: Parity errors detected in blocks:
> MCP SCPAD
> MCP SCPAD
> bnx2x :04:00.0 eth0: Parity errors detected in blocks:
> bnx2x: [bnx2x_attn_int_deasserted3:4080(eth0)]LATCHED attention
> 0x8000
> (masked)
> MCP SCPAD
> ...
> systemd-journald[491]: /dev/kmsg buffer overrun, some messages lost.
> 
> Some googling around finds:
> 
> https://github.com/torvalds/linux/commit/ad6afbe9578d1fa26680faf78c846bd
> 8c00d1d6e
> 
> which might be related. If I read that correctly (and I have no real idea what I'm
> talking about, sorry...) that patch removes superfluous printks which might, e.g. in
> our case, hide the real cause. i.e. even with that patch we would have had a
> problem / loss of connectivity, but we might know better why.

> 
> Maybe that changeset would be suitable for backporting to long term stable
> kernels?
> 
> Incidentally, how should these parity events be judged generally? Hope it's a 
> one
> time cosmic ray incident? Cry "faulty hardware, please repair" to the 
> supplier?
> Anything else?

A couple of things to note - 
1. On older kernels, MCP SCPAD parity on its own would have resulted in
entering the parity recovery flows and, assuming those had failed, would
have left the adapter in an unsteady state.
But 3.14.53 should be past that point, and only log MCP SCPAD errors
instead of initiating recovery.

2. Since the SCPAD is not on the datapath, even assuming a real parity
would occur, if that's the only problem then it shouldn't have stopped traffic.

3. In most cases SCPAD is due to utilities, e.g., `ethtool -d' or `ethtool -t'
that are run on the adapter's network interface; theoretically, if there's some
unexpected incompatibility between driver and management FW it might
also happen.

4. The patch you've listed merely removes the MCP SCPAD prints, as they're
unavoidable in certain scenarios; It doesn't actually solve anything.

Having said that, do you know if anything happened to the setup that
triggered this? I.e., some configuration change, new utility, etc.?
Alternatively, did the log show anything else in addition to the MCP SCPAD?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/8] mm: memcontrol: account socket memory on unified hierarchy

2015-11-04 Thread Johannes Weiner
On Wed, Nov 04, 2015 at 11:42:40AM +0100, Michal Hocko wrote:
> On Thu 29-10-15 09:10:09, Johannes Weiner wrote:
> > On Thu, Oct 29, 2015 at 04:25:46PM +0100, Michal Hocko wrote:
> > > On Tue 27-10-15 09:42:27, Johannes Weiner wrote:
> [...]
> > > > You carefully skipped over this part. We can ignore it for socket
> > > > memory but it's something we need to figure out when it comes to slab
> > > > accounting and tracking.
> > > 
> > > I am sorry, I didn't mean to skip this part, I thought it would be clear
> > > from the previous text. I think kmem accounting falls into the same
> > > category. Have a sane default and a global boottime knob to override it
> > > for those that think differently - for whatever reason they might have.
> > 
> > Yes, that makes sense to me.
> > 
> > Like cgroup.memory=nosocket, would you think it makes sense to include
> > slab in the default for functional/semantical completeness and provide
> > a cgroup.memory=noslab for powerusers?
> 
> I am still not sure whether the kmem accounting is stable enough to be
> enabled by default. If for nothing else the allocation failures, which
> are not allowed for the global case and easily triggered by the hard
> limit, might be a big problem. My last attempts to allow GFP_NOFS to
> fail made me quite skeptical. I still believe this is something which
> will be solved in the long term but the current state might be still too
> fragile. So I would rather be conservative and have the kmem accounting
> disabled by default with a config option and boot parameter to override.
> If somebody is confident that the desired load is stable then the config
> can be enabled easily.

I agree with your assessment of the current kmem code state, but I
think your conclusion is completely backwards here.

The interface will be set in stone forever, whereas any stability
issues will be transient and will have to be addressed in a finite
amount of time anyway. It doesn't make sense to design an interface
based on temporary quality of implementation. Only one of those two
can ever be changed.

Because it goes without saying that once the cgroupv2 interface is
released, and people use it in production, there is no way we can then
*add* dentry cache, inode cache, and others to memory.current. That
would be an unacceptable change in interface behavior. On the other
hand, people will be prepared for hiccups in the early stages of
cgroupv2 release, and we're providing cgroup.memory=noslab to let them
workaround severe problems in production until we fix it without
forcing them to fully revert to cgroupv1.

So if we agree that there are no fundamental architectural concerns
with slab accounting, i.e. nothing that can't be addressed in the
implementation, we have to make the call now.

And I maintain that not accounting dentry cache and inode cache is a
gaping hole in memory isolation, so it should be included by default.
(The rest of the slabs is arguable, but IMO the risk of missing
something important is higher than the cost of including them.)


As far as your allocation failure concerns go, I think the kmem code
is currently not behaving as Glauber originally intended, which is to
force charge if reclaim and OOM killing weren't able to make enough
space. See this recently rewritten section of the kmem charge path:

-   /*
-* try_charge() chose to bypass to root due to OOM kill or
-* fatal signal.  Since our only options are to either fail
-* the allocation or charge it to this cgroup, do it as a
-* temporary condition. But we can't fail. From a kmem/slab
-* perspective, the cache has already been selected, by
-* mem_cgroup_kmem_get_cache(), so it is too late to change
-* our minds.
-*
-* This condition will only trigger if the task entered
-* memcg_charge_kmem in a sane state, but was OOM-killed
-* during try_charge() above. Tasks that were already dying
-* when the allocation triggers should have been already
-* directed to the root cgroup in memcontrol.h
-*/
-   page_counter_charge(&memcg->memory, nr_pages);
-   if (do_swap_account)
-   page_counter_charge(&memcg->memsw, nr_pages);

It could be that this never properly worked as it was tied to the
-EINTR bypass trick, but the idea was these charges never fail.

And this makes sense. If the allocator semantics are such that we
never fail these page allocations for slab, and the callsites rely on
that, surely we should not fail them in the memory controller, either.

And it makes a lot more sense to account them in excess of the limit
than pretend they don't exist. We might not be able to completely
fulfill the containment part of the memory controller (although these
slab charges will still create significant pressure before that), but
at least we 

Re: 4.1.12 kernel crash in rtnetlink_put_metrics

2015-11-04 Thread Daniel Borkmann

Hi Andrew,

thanks for the report!

On 11/04/2015 05:00 PM, Andrew wrote:

Hi all.

Today I've got a crash on one of servers (PPPoE BRAS with BGP/OSPF). This 
server becomes unstable after updating from 3.2.x kernel to 4.1.x (other 
servers with slightly different CPUs/MBs also have troubles - but they hang 
less frequently).

Place in kernel code:
(gdb) list *rtnetlink_put_metrics+0x50
0xc131c7d0 is in rtnetlink_put_metrics 
(/var/testpoint/LEAF/source/i486-unknown-linux-uclibc/linux/linux-4.1/net/core/rtnetlink.c:672).
667mx = nla_nest_start(skb, RTA_METRICS);
668if (mx == NULL)
669return -ENOBUFS;
670
671for (i = 0; i < RTAX_MAX; i++) {
672if (metrics[i]) {


( Making the trace a bit more readable ... )

[41358.475254]BUG:unable to handle kernel NULL pointer dereference at (null)
[41358.475333]IP:[]rtnetlink_put_metrics+0x50/0x180
[...]
CallTrace:
[41358.476522][]?__nla_reserve+0x23/0xe0
[41358.476557][]?__nla_put+0x9/0xb0
[41358.476595][]?fib_dump_info+0x15e/0x3e0
[41358.476636][]?irq_entries_start+0x639/0x678
[41358.476671][]?fib_table_dump+0xf3/0x180
[41358.476708][]?inet_dump_fib+0x7d/0x100
[41358.476746][]?netlink_dump+0x121/0x270
[41358.476781][]?skb_free_datagram+0x12/0x40
[41358.476818][]?netlink_recvmsg+0x244/0x360
[41358.476855][]?sock_recvmsg+0x1d/0x30
[41358.476890][]?sock_recvmsg_nosec+0x30/0x30
[41358.476924][]?___sys_recvmsg+0x9c/0x120
[41358.476958][]?sock_recvmsg_nosec+0x30/0x30
[41358.476994][]?update_cfs_rq_blocked_load+0xc4/0x130
[41358.477030][]?hrtimer_forward+0xa4/0x1c0
[41358.477065][]?sockfd_lookup_light+0x1d/0x80
[41358.477099][]?__sys_recvmsg+0x3e/0x80
[41358.477134][]?SyS_socketcall+0xb1/0x2a0
[41358.477168][]?handle_irq_event+0x3c/0x60
[41358.477203][]?handle_edge_irq+0x7d/0x100
[41358.477238][]?rps_trigger_softirq+0x26/0x30
[41358.477273][]?flush_smp_call_function_queue+0x83/0x120
[41358.477307][]?syscall_call+0x7/0x7
[...]

Strange that rtnetlink_put_metrics() itself is not part of the above
call trace (it's an exported symbol).

So, your analysis suggests that metrics itself is NULL in this case?
(Can you confirm that?)

How frequently does this trigger? Are the seen call traces all the same kind?

Is there an easy way to reproduce this?

I presume you don't use any per route congestion control settings, right?

Thanks,
Daniel


673if (i == RTAX_CC_ALGO - 1) {
674char tmp[TCP_CA_NAME_MAX], *name;
675
676name = tcp_ca_get_name_by_key(metrics[i], tmp);


Here's trace:

[41358.475254]BUG:unable to handle kernel NULL pointer dereference at 
(null)[41358.475333]IP:[]rtnetlink_put_metrics+0x50/0x180[41358.475376]*pdpt
 =26d58001*pde =[41358.475413]Oops:[#1] SMP 
[41358.475453]Moduleslinked in:act_mirred pppoe pppox ppp_generic slhc iptable_filter 
xt_length xt_TCPMSS xt_tcpudp xt_mark xt_dscp iptable_mangle ip_tables x_tables ipv6 
sch_sfq sch_htb cls_u32 sch_ingress sch_prio sch_tbf cls_flow cls_fw act_police ifb 
8021qmrp garp stp llc softdog parport_pc parport acpi_cpufreq processor thermal_sys 
igb(O)k10temp hwmon dca ohci_pci ohci_hcd ptp pps_core i2c_piix4 i2c_core sp5100_tco 
sd_mod pata_acpi pata_atiixp pcspkr ata_generic ahci libahci libata ehci_pci ehci_hcd 
scsi_mod usbcore usb_common ext4 mbcache jbd2 crc16 vfat fat isofs 
[41358.475807]CPU:2PID:10877Comm:bird Tainted:G   O 4.1.12-i686 
#1[41358.475880]Hardwarename:MICRO-STAR INTERNATIONAL CO.,LTD 
MS-7596/760GM-E51(MS-7596),BIOS
V3.301/12/2012[41358.475955]task:f5302da0 ti:e1364000 task.ti:e1364000 
[41358.475993]EIP:0060:[]EFLAGS:00010282CPU:2[41358.476030]EIP isat 
rtnetlink_put_metrics+0x50/0x180[41358.476066]EAX:EBX:0001ECX:0004EDX:[41358.476106]ESI:EDI:e0b38000
 EBP:e1365ca8 ESP:e1365c78 [41358.476143] 
DS:007bES:007bFS:00d8GS:0033SS:0068[41358.476179]CR0:8005003bCR2:CR3:34966ac0CR4:06f0[41358.476216]Stack:[41358.476249]c1213873
 d4316f64 e0b38000 e1365d00 c1213989 0fe4[41358.476330] e0b38000 
d4316f30 e0b38000 e1365d00 c138362e e1365cd8 
000c[41358.476405]00020002c13bba01 e0b38000

[PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread Yang Shi
When running "mod X" operation, if X is 0 the filter has to be halted.
Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.

CC: Xi Wang 
CC: Zi Shen Lim 
Signed-off-by: Yang Shi 
---
 lib/test_bpf.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index d137739..10cd186 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -5056,6 +5056,36 @@ static struct bpf_test tests[] = {
{ {0x1, 0x0 } },
},
{
+   "MOD default X",
+   .u.insns = {
+   /*
+* A = 0x42
+* A = A mod X ; this halt the filter execution if X is 0
+* ret 0x42
+*/
+   BPF_STMT(BPF_LD | BPF_IMM, 0x42),
+   BPF_STMT(BPF_ALU | BPF_MOD | BPF_X, 0),
+   BPF_STMT(BPF_RET | BPF_K, 0x42),
+   },
+   CLASSIC | FLAG_NO_DATA,
+   {},
+   { {0x1, 0x0 } },
+   },
+   {
+   "MOD default A",
+   .u.insns = {
+   /*
+* A = A mod 1
+* ret A
+*/
+   BPF_STMT(BPF_ALU | BPF_MOD | BPF_K, 0x1),
+   BPF_STMT(BPF_RET | BPF_A, 0x0),
+   },
+   CLASSIC | FLAG_NO_DATA,
+   {},
+   { {0x1, 0x0 } },
+   },
+   {
"JMP EQ default A",
.u.insns = {
/*
-- 
2.0.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Andy Shevchenko
On Wed, Nov 4, 2015 at 9:39 PM, Sowmini Varadhan
 wrote:
>
> This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
> address in Open Firmware or IDPROM").
>
> As with that fix, attempt to look up the MAC address in Open Firmware
> on systems that support it, and use IDPROM on SPARC if no OF address
> is found.
>
> In the case of the i40e there is an assumption that the default mac
> address has already been set up as the primary mac filter on probe,
> so if this filter is obtained from the Open Firmware or IDPROM, an
> explicit write is needed via i40e_aq_mac_address_write() and
> i40e_aq_add_macvlan() invocation.
>

A few comments (mostly stylistic).
And take my

Reviewed-by: Andy Shevchenko 

> Reviewed-by: Martin K. Petersen 
> Signed-off-by: Sowmini Varadhan 
> ---
> v2, v3: Andy Shevchenko comments
> v4: Shannon Nelson review: explicitly set up mac filters before 
> register_netdev
> v5: Shannon Nelson code style comments
>
>  drivers/net/ethernet/intel/i40e/i40e_main.c |   84 
> ++-
>  1 files changed, 83 insertions(+), 1 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index b825f97..a3883cf 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -24,6 +24,15 @@
>   *
>   
> **/
>
> +#include 
> +#include 
> +#include 
> +
> +#ifdef CONFIG_SPARC
> +#include 
> +#include 
> +#endif
> +
>  /* Local includes */
>  #include "i40e.h"
>  #include "i40e_diag.h"
> @@ -9213,6 +9222,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
> i40e_vsi *vsi)
>  }
>
>  /**
> + * i40e_macaddr_init - explicitly write the mac address filters. This
> + * is needed when the macaddr has been obtained by other means than
> + * the default, e.g., from Open Firmware or IDPROM.
> + *
> + * @vsi: pointer to the vsi.
> + * @macaddr: the MAC address
> + *
> + * Returns 0 on success, negative on failure

Usually the structure of kernel doc is something like following

/**
 * func - summary
 * @paramx: desc
 *
 * Description:
 * Long description in many lines and / or paragraphs
 *
 * Returns:
 * 0 on success or errno otherwise.
 */


> + **/

No need two stars.

> +static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
> +{
> +   int ret, aq_err;
> +   struct i40e_aqc_add_macvlan_element_data element;

Usually

struct something whatever;
int ret;

looks better.

> +
> +   ret = i40e_aq_mac_address_write(&vsi->back->hw,
> +   I40E_AQC_WRITE_TYPE_LAA_WOL,
> +   macaddr, NULL);
> +   if (ret) {
> +   dev_info(&vsi->back->pdev->dev,
> +"Addr change for VSI failed: %d\n", ret);

dev_err() or dev_warn() I would say.

> +   return -EADDRNOTAVAIL;
> +   }
> +
> +   memset(&element, 0, sizeof(element));
> +   ether_addr_copy(element.mac_addr, macaddr);
> +   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
> +   ret = i40e_aq_add_macvlan(&vsi->back->hw, vsi->seid, &element, 1, 
> NULL);
> +   aq_err = vsi->back->hw.aq.asq_last_status;

Do you really need a separate variable (aq_err)?

> +   if (aq_err != I40E_AQ_RC_OK) {
> +   dev_info(&vsi->back->pdev->dev,
> +"add filter failed err %s aq_err %s\n",
> +i40e_stat_str(&vsi->back->hw, ret),
> +i40e_aq_str(&vsi->back->hw, aq_err));
> +   }
> +   return ret;
> +}
> +
> +/**
>   * i40e_vsi_setup - Set up a VSI by a given type
>   * @pf: board private structure
>   * @type: VSI type
> @@ -9341,6 +9388,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
> type,
> ret = i40e_config_netdev(vsi);
> if (ret)
> goto err_netdev;
> +   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
> +   if (ret)
> +   goto err_netdev;
> ret = register_netdev(vsi->netdev);
> if (ret)
> goto err_netdev;
> @@ -10163,6 +10213,36 @@ static void i40e_print_features(struct i40e_pf *pf)
>  }
>
>  /**
> + * i40e_get_platform_mac_addr - get mac address from Open Firmware
> + * or IDPROM if supported by the platform
> + *
> + * @pdev: PCI device information struct
> + * @mac_addr: the MAC address to be returned
> + *
> + * Look up the MAC address in Open Firmware  on systems that support it,
> + * and use IDPROM on SPARC if no OF address is found.
> + *
> + * Returns 0 on success, negative on failure
> + **/

Same about kernel doc.

> +static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
> +{
> +   struct device_node *dp = 

Re: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan
On (11/04/15 21:59), Andy Shevchenko wrote:
> 
> Usually the structure of kernel doc is something like following
> 
> /**
>  * func - summary
>  * @paramx: desc
>  *
>  * Description:
>  * Long description in many lines and / or paragraphs
>  *
>  * Returns:
>  * 0 on success or errno otherwise.
>  */
> 
> 
> > + **/
> 
> No need two stars.

I was actually following the exact comment style of 
the function just before i40e_macaddr_init, namely:

/**
 * i40e_vsi_setup - Set up a VSI by a given type
 * @pf: board private structure
 * @type: VSI type
 * @uplink_seid: the switch element to link to
 * @param1: usage depends upon VSI type. For VF types, indicates VF id
 *
 * This allocates the sw VSI structure and its queue resources, then add a VSI
 * to the identified VEB.
 *
 * Returns pointer to the successfully allocated and configure VSI sw struct on
 * success, otherwise returns NULL on failure.
 **/
struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
u16 uplink_seid, u32 param1)

So I'm not sure we need to really bike-shed this one?
> > +   macaddr, NULL);
> > +   if (ret) {
> > +   dev_info(&vsi->back->pdev->dev,
> > +"Addr change for VSI failed: %d\n", ret);
> 
> dev_err() or dev_warn() I would say.

again, this was a cut/paste of code from i40e_set_mac()
which does netdev_info.

> > +   ret = i40e_aq_add_macvlan(&vsi->back->hw, vsi->seid, &element, 1, 
> > NULL);
> > +   aq_err = vsi->back->hw.aq.asq_last_status;
> 
> Do you really need a separate variable (aq_err)?

That seems to be the convention used elsewhere, where ret is
distinguished from aq_err, see i40e_sync_vsi_filters()

> > +   if (aq_err != I40E_AQ_RC_OK) {
> > +   dev_info(&vsi->back->pdev->dev,
> > +"add filter failed err %s aq_err %s\n",
> > +i40e_stat_str(&vsi->back->hw, ret),
> > +i40e_aq_str(&vsi->back->hw, aq_err));
> > +   }
> > +   return ret;

> Same about kernel doc.
See earlier response.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] net: stmmac: remove unneeded phy_iface variable

2015-11-04 Thread LABBE Corentin
The variable phy_iface is double-initialized and finally is not necessary
at all.

Reported-by: coverity (CID 1271141)
Signed-off-by: LABBE Corentin 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
index 11baa4b..26a11b7 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
@@ -354,11 +354,9 @@ static int gmac_clk_init(struct rk_priv_data *bsp_priv)
 
 static int gmac_clk_enable(struct rk_priv_data *bsp_priv, bool enable)
 {
-   int phy_iface = phy_iface = bsp_priv->phy_iface;
-
if (enable) {
if (!bsp_priv->clk_enabled) {
-   if (phy_iface == PHY_INTERFACE_MODE_RMII) {
+   if (bsp_priv->phy_iface == PHY_INTERFACE_MODE_RMII) {
if (!IS_ERR(bsp_priv->mac_clk_rx))
clk_prepare_enable(
bsp_priv->mac_clk_rx);
@@ -390,7 +388,7 @@ static int gmac_clk_enable(struct rk_priv_data *bsp_priv, 
bool enable)
}
} else {
if (bsp_priv->clk_enabled) {
-   if (phy_iface == PHY_INTERFACE_MODE_RMII) {
+   if (bsp_priv->phy_iface == PHY_INTERFACE_MODE_RMII) {
if (!IS_ERR(bsp_priv->mac_clk_rx))
clk_disable_unprepare(
bsp_priv->mac_clk_rx);
-- 
2.4.10

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bpf: add mod default A and X test cases

2015-11-04 Thread Daniel Borkmann

On 11/04/2015 08:36 PM, Yang Shi wrote:

When running "mod X" operation, if X is 0 the filter has to be halted.
Add new test cases to cover A = A mod X if X is 0, and A = A mod 1.

CC: Xi Wang 
CC: Zi Shen Lim 
Signed-off-by: Yang Shi 


LGTM!

Acked-by: Daniel Borkmann 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

Reviewed-by: Martin K. Petersen 
Signed-off-by: Sowmini Varadhan 
---
v2, v3: Andy Shevchenko comments
v4: Shannon Nelson review: explicitly set up mac filters before register_netdev
v5: Shannon Nelson code style comments

 drivers/net/ethernet/intel/i40e/i40e_main.c |   84 ++-
 1 files changed, 83 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b825f97..a3883cf 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9213,6 +9222,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
 }
 
 /**
+ * i40e_macaddr_init - explicitly write the mac address filters. This
+ * is needed when the macaddr has been obtained by other means than
+ * the default, e.g., from Open Firmware or IDPROM.
+ *
+ * @vsi: pointer to the vsi.
+ * @macaddr: the MAC address
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret, aq_err;
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(&vsi->back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret) {
+   dev_info(&vsi->back->pdev->dev,
+"Addr change for VSI failed: %d\n", ret);
+   return -EADDRNOTAVAIL;
+   }
+
+   memset(&element, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   ret = i40e_aq_add_macvlan(&vsi->back->hw, vsi->seid, &element, 1, NULL);
+   aq_err = vsi->back->hw.aq.asq_last_status;
+   if (aq_err != I40E_AQ_RC_OK) {
+   dev_info(&vsi->back->pdev->dev,
+"add filter failed err %s aq_err %s\n",
+i40e_stat_str(&vsi->back->hw, ret),
+i40e_aq_str(&vsi->back->hw, aq_err));
+   }
+   return ret;
+}
+
+/**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @type: VSI type
@@ -9341,6 +9388,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
ret = i40e_config_netdev(vsi);
if (ret)
goto err_netdev;
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
+   if (ret)
+   goto err_netdev;
ret = register_netdev(vsi->netdev);
if (ret)
goto err_netdev;
@@ -10163,6 +10213,36 @@ static void i40e_print_features(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_get_platform_mac_addr - get mac address from Open Firmware
+ * or IDPROM if supported by the platform
+ *
+ * @pdev: PCI device information struct
+ * @mac_addr: the MAC address to be returned
+ *
+ * Look up the MAC address in Open Firmware  on systems that support it,
+ * and use IDPROM on SPARC if no OF address is found.
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
+{
+   struct device_node *dp = pci_device_to_OF_node(pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#else
+   return -EINVAL;
+#endif /* CONFIG_SPARC */
+}
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10360,7 +10440,9 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
i40e_aq_stop_lldp(hw, true, NULL);
}
 
-   i40e_get_mac_addr(hw, hw->mac.addr);
+   err = i40e_get_platform_mac_addr(pdev, hw->mac.addr);
+   if (err)
+   i40e_get_mac_addr(hw, hw->mac.addr);
if (!is_valid_ether_addr(hw->mac.addr)) {

Re: [Patch net v2] ipv4: fix a potential deadlock in mcast getsockopt() path

2015-11-04 Thread Marcelo Ricardo Leitner

Em 03-11-2015 21:41, Cong Wang escreveu:

Sasha reported the following lockdep warning:

   Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(sk_lock-AF_INET);
                               lock(rtnl_mutex);
                               lock(sk_lock-AF_INET);
  lock(rtnl_mutex);

This is because, for IP_MSFILTER and MCAST_MSFILTER, we take the
rtnl lock before the socket lock in the setsockopt() path, but take
the socket lock before the rtnl lock in the getsockopt() path. All the
remaining optnames are setsockopt()-only.

Fix this by aligning the getsockopt() path with the setsockopt()
path, so that all mcast socket path would be locked in the same
order.

Note, IPv6 part is different where rtnl lock is not held.

Fixes: 54ff9ef36bdf ("ipv4, ipv6: kill ip_mc_{join, leave}_group and 
ipv6_sock_mc_{join, drop}")
Reported-by: Sasha Levin 
Cc: Marcelo Ricardo Leitner 
Signed-off-by: Cong Wang 


Reviewed-by: Marcelo Ricardo Leitner 

Thanks


---
  net/ipv4/igmp.c| 12 
  net/ipv4/ip_sockglue.c | 45 ++---
  2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d38b8b6..a2429b7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2392,11 +2392,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter 
*msf,
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);

+   ASSERT_RTNL();
+
if (!ipv4_is_multicast(addr))
return -EINVAL;

-   rtnl_lock();
-
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
@@ -2417,7 +2417,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
-   rtnl_unlock();
if (!psl) {
len = 0;
count = 0;
@@ -2436,7 +2435,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
return -EFAULT;
return 0;
  done:
-   rtnl_unlock();
return err;
  }

@@ -2450,6 +2448,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter 
*gsf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;

+   ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
@@ -2457,8 +2457,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter 
*gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;

-   rtnl_lock();
-
err = -EADDRNOTAVAIL;

for_each_pmc_rtnl(inet, pmc) {
@@ -2470,7 +2468,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter 
*gsf,
goto done;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
-   rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
@@ -2490,7 +2487,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter 
*gsf,
}
return 0;
  done:
-   rtnl_unlock();
return err;
  }

diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c3c359a..5f73a7c 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1251,11 +1251,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
   *the _received_ ones. The set sets the _sent_ ones.
   */

+static bool getsockopt_needs_rtnl(int optname)
+{
+   switch (optname) {
+   case IP_MSFILTER:
+   case MCAST_MSFILTER:
+   return true;
+   }
+   return false;
+}
+
  static int do_ip_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen, unsigned 
int flags)
  {
struct inet_sock *inet = inet_sk(sk);
-   int val;
+   bool needs_rtnl = getsockopt_needs_rtnl(optname);
+   int val, err = 0;
int len;

if (level != SOL_IP)
@@ -1269,6 +1280,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, 
int optname,
if (len < 0)
return -EINVAL;

+   if (needs_rtnl)
+   rtnl_lock();
lock_sock(sk);

switch (optname) {
@@ -1386,39 +1399,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, 
int optname,
case IP_MSFILTER:
{
struct ip_msfilter msf;
-   int err;

if (len < IP_MSFILTER_SIZE(0)) {
-   release_sock(sk);
-   return -EINVAL;
+   err = -EINVAL;
+   goto out;
}
if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
-   

Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan
On (11/02/15 14:57), Sowmini Varadhan wrote:
> On (11/02/15 17:26), Nelson, Shannon wrote:
> > > I assume you mean .1q 
> > 
> > Yes, this is what I had in mind.
> 
> I dont think we're quite there yet, even without vlans.
> 

Ok finally got all the .1q stuff verified (took a bit longer
than it should, because of some lab/administrative hassles in
getting the vlans adjusted at intermediate switches). But
the fix as I'd sent has been tested, and it is good for .1q.

The issues I was seeing with promisc were orthogonal to my fix,
or even to my platform - I understand those are being addressed
separately.

I'm probably too late for the merge window, and I dont know if
this fix qualifies as "critical" for net, so I'll re-target v5 
for net-next, to be conservative.

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


NETIF_F_GSO_SOFTWARE vs NETIF_F_GSO

2015-11-04 Thread Jason A. Donenfeld
Hello,

I am making a network device driver that receives packets in
ndo_start_xmit, "does something to them", and then sends the resultant
packet out of a kernelspace UDP socket.

The routine looks something along the lines of:

size_t outgoing_len = calculate_outgoing_length(skb);
struct sk_buff *outgoing = alloc_skb(outgoing_len);
u8 *output_buffer = skb_put(outgoing, outgoing_len);

struct scatterlist sglist[MAX_SKB_FRAGS + 1] = { 0 };
sg_init_table(sglist, skb_shinfo(skb)->nr_frags + 1);
skb_to_sgvec(skb, sglist, 0, skb->len);

magic_transformer state;
begin_magic(&state, outgoing_buffer);
for (struct scatterlist *sg = &sglist[0]; sg; sg = sg_next(sg)) {
u8 *vaddr = kmap_atomic(sg_page(sg));
update_magic(&state, vaddr + sg->offset, sg->length);
kunmap_atomic(vaddr);
}
finish_magic(&state);

send_udp(outgoing);

Hopefully that's straight-forward enough. I make the skb into a
scatterlist, and then iterate over the scatterlist, to apply a
particular transformation to each part, and then finally I send it
out.

For this, I'm using these netdev features:

#define MY_FEATURES (NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SG \
 | NETIF_F_GSO | NETIF_F_HIGHDMA)
dev->features |= MY_FEATURES;
dev->hw_features |= MY_FEATURES;
dev->hw_enc_features |= MY_FEATURES;

Using this set of features, everything works well. But the performance
isn't great. I suspect this has something to do with having to
traverse the network stack. So I've looked into offloading features.

Strangely, the performance does not change at all regardless of
whether or not NETIF_F_GSO is specified.

However, the performance becomes incredible when I use
NETIF_F_GSO_SOFTWARE instead of NETIF_F_GSO. But, when using
NETIF_F_GSO_SOFTWARE, skb->len is bigger than the MTU! This poses some
problems for me. Perhaps this is intended behavior? I'm not really
sure. My question is: how can I gain the performance benefits of
NETIF_F_GSO_SOFTWARE while still having skbs that fit inside the MTU?
And what's the difference between specifying NETIF_F_GSO and
NETIF_F_GSO_SOFTWARE?

Thank you,
Jason
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] ipv6: gro: support sit protocol

2015-11-04 Thread Eric Dumazet
On Wed, 2015-11-04 at 13:19 +0100, Wolfgang Walter wrote:

> Today I found a problem: on a router forwarding GRE-packets (ipv4) (it is not 
> the endpoint) the interface (intel igb) stops sending packets after some 
> time. 
> I think this happens when an ISATAP packet is inside the GRE-packet.
> 
>   gre packets arrives on eth0
>   eth1 stops sending (receiving still works)
>   ethtool -r eth1
>   eth1 works again for some time
> 
> Switching GRO off on eth0 "fixes" the problem.
> 
> I didn't test vanilla 4.1.12 yet, though. Until today 4.1.11 has been running 
> on the router. What I tested was your patch
>   "gre_gso_segment() chokes if SIT frames were aggregated by GRO engine."
> but did not solve the problem.
> 
> So I would not recommend to backport it to longterm 4.1.
> 
> My plans are:
> 
> * test vanilla 4.1.12
> * test 4.3
> 
> I want to test 4.3 on another router first, though.

If the NIC stops sending packets after some time, it might be an igb
issue.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next] net: add forgotten IFF_L3MDEV_SLAVE define

2015-11-04 Thread David Ahern

On 11/4/15 6:59 AM, Jiri Pirko wrote:

From: Jiri Pirko 

Fixes: fee6d4c77 ("net: Add netif_is_l3_slave")
Signed-off-by: Jiri Pirko 
---
  include/linux/netdevice.h | 1 +
  1 file changed, 1 insertion(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ac653b..2c00772 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1322,6 +1322,7 @@ enum netdev_priv_flags {
  #define IFF_L3MDEV_MASTER IFF_L3MDEV_MASTER
  #define IFF_NO_QUEUE  IFF_NO_QUEUE
  #define IFF_OPENVSWITCH   IFF_OPENVSWITCH
+#define IFF_L3MDEV_SLAVE   IFF_L3MDEV_SLAVE

  /**
   *struct net_device - The DEVICE structure.



Acked-by: David Ahern 

Thanks, Jiri.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kbuild-all] [PATCH] mpls: fix semicolon.cocci warnings

2015-11-04 Thread David Miller
From: Julia Lawall 
Date: Wed, 4 Nov 2015 07:50:05 +0100 (CET)

> The whole idea of a macro that declares variables some of which are then 
> used in the code under the macro also seems quite unpleasant.  Other 
> iterators don't do this, and so they don't need the end marker.

I'm open minded about cleaning these iterators up.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] ipv6: gro: support sit protocol

2015-11-04 Thread Eric Dumazet
On Wed, 2015-11-04 at 15:09 +0100, Wolfgang Walter wrote:

> 
> Yes, maybe igb has a problem sending a gro-packet if it is an isatap in gre.

We might detect this condition properly from igb ndo_features_check
method.

It currently uses plain passthru_features_check()



> 
> igb has no problem sending gro-packets which are pure isatap or which are 
> ipv4 
> (tcp/udp) in gre with 4.1.12 + these patches.
> 
> And it had no problem with 4.1.11 with isatap in gre.
> 
> Disabling gso for the interface does help.

My patch was aimed for 4.4, not sure about backports to old kernels...


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Marek Vasut
On Wednesday, November 04, 2015 at 04:03:16 PM, Vostrikov Andrey wrote:
> Hi, Marek.

Hi,

> > About the parity -- can we add some flag into the datagram to indicate we
> > want hardware to calculate the parity for that particular datagram for
> > us? And we'd also need to indicate what type of parity. I dunno if this
> > is worth the hassle.
> 
> This  is HW configuration property, it does not belong to  datagram. Also
> for TX channels,  parity could  be  two  kinds:  odd and even, for RX it
> is only on/off.

There are datagrams which do contain parity and ones which do not contain it,
correct ? Thus, it's a property of that particular datagram.

> Parity  is  not  the  only property that needs to be configured in HW,
> following could be needed as well,
> - label bit flipping (on or off)

This is hardware property.

> - rate change (low / high)

This is again hardware configuration -- you have to configure the link speed
before you do RX/TX.

> -  label  filters  and  label  priority matching (this could be HoltIC
> specific)
> 
> I  suppose  all  these  properties  are  configured  only  once, at
> interface initialization.

These three above, yes. Parity, I don't think so.

Best regards,
Marek Vasut
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] ipv6: gro: support sit protocol

2015-11-04 Thread Wolfgang Walter
Am Mittwoch, 4. November 2015, 04:40:51 schrieb Eric Dumazet:
> On Wed, 2015-11-04 at 13:19 +0100, Wolfgang Walter wrote:
> > Today I found a problem: on a router forwarding GRE-packets (ipv4) (it is
> > not the endpoint) the interface (intel igb) stops sending packets after
> > some time. I think this happens when an ISATAP packet is inside the
> > GRE-packet.
> > 
> > gre packets arrives on eth0
> > eth1 stops sending (receiving still works)
> > ethtool -r eth1
> > eth1 works again for some time
> > 
> > Switching GRO off on eth0 "fixes" the problem.
> > 
> > I didn't test vanilla 4.1.12 yet, though. Until today 4.1.11 has been
> > running on the router. What I tested was your patch
> > 
> > "gre_gso_segment() chokes if SIT frames were aggregated by GRO 
engine."
> > 
> > but did not solve the problem.
> > 
> > So I would not recommend to backport it to longterm 4.1.
> > 
> > My plans are:
> > 
> > * test vanilla 4.1.12
> > * test 4.3
> > 
> > I want to test 4.3 on another router first, though.
> 
> If the NIC stops sending packets after some time, it might be an igb
> issue.

Yes, maybe igb has a problem sending a gro-packet if it is an isatap in gre.

igb has no problem sending gro-packets which are pure isatap or which are ipv4 
(tcp/udp) in gre with 4.1.12 + these patches.

And it had no problem with 4.1.11 with isatap in gre.

Disabling gso for the interface does help.

I'll test pure 4.1.12 soon.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Vostrikov Andrey
Hi, Marek.

> About the parity -- can we add some flag into the datagram to indicate we
> want hardware to calculate the parity for that particular datagram for us?
> And we'd also need to indicate what type of parity. I dunno if this is worth
> the hassle.
This  is HW configuration property, it does not belong to  datagram. Also for TX
channels,  parity could  be  two  kinds:  odd and even, for RX it is only
on/off.

Parity  is  not  the  only property that needs to be configured in HW,
following could be needed as well,
- label bit flipping (on or off)
- rate change (low / high)
-  label  filters  and  label  priority matching (this could be HoltIC
specific)

I  suppose  all  these  properties  are  configured  only  once, at interface
initialization.

-- 
Best regards,
Andrey Vostrikov

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Vostrikov Andrey
Hi, Marek.

>> > About the parity -- can we add some flag into the datagram to indicate we
>> > want hardware to calculate the parity for that particular datagram for
>> > us? And we'd also need to indicate what type of parity. I dunno if this
>> > is worth the hassle.
>> 
>> This  is HW configuration property, it does not belong to  datagram. Also
>> for TX channels,  parity could  be  two  kinds:  odd and even, for RX it
>> is only on/off.

> There are datagrams which do contain parity and ones which do not contain it,
> correct ? Thus, it's a property of that particular datagram.
For   RX  side  it  is  both: datagram and HW (is it checked by receiver or not)
But  for TX side it is HW property of transmitter, either OFF or ON (odd or 
even).

-- 
Best regards,
Andrey Vostrikov

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] bnxt_en: add VXLAN dependency

2015-11-04 Thread Arnd Bergmann
VXLAN may be a loadable module, and this driver cannot be built-in
in that case, or we get a link error:

drivers/built-in.o: In function `__bnxt_open_nic':
drivers/net/ethernet/broadcom/bnxt/bnxt.c:4581: undefined reference to 
`vxlan_get_rx_port'

This adds a Kconfig dependency that ensures that either VXLAN is
disabled (which the driver handles correctly), or we depend on
VXLAN itself and disallow built-in compilation when VXLAN is
a module.

Signed-off-by: Arnd Bergmann 
Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")

diff --git a/drivers/net/ethernet/broadcom/Kconfig 
b/drivers/net/ethernet/broadcom/Kconfig
index 67a7d520d9f5..8550df189ceb 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -173,6 +173,7 @@ config SYSTEMPORT
 config BNXT
tristate "Broadcom NetXtreme-C/E support"
depends on PCI
+   depends on VXLAN || VXLAN=n
select FW_LOADER
select LIBCRC32C
---help---

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Aleksander Morgado
On Wed, Nov 4, 2015 at 4:18 PM, Vostrikov Andrey
 wrote:
>>> > About the parity -- can we add some flag into the datagram to indicate we
>>> > want hardware to calculate the parity for that particular datagram for
>>> > us? And we'd also need to indicate what type of parity. I dunno if this
>>> > is worth the hassle.
>>>
>>> This  is HW configuration property, it does not belong to  datagram. Also
>>> for TX channels,  parity could  be  two  kinds:  odd and even, for RX it
>>> is only on/off.
>
>> There are datagrams which do contain parity and ones which do not contain it,
>> correct ? Thus, it's a property of that particular datagram.

All ARINC words have bit #31 as parity bit; whether it's used or not
depends on the setup as Andrey says below.

> For   RX  side  it  is  both: datagram and HW (is it checked by receiver or 
> not)
> But  for TX side it is HW property of transmitter, either OFF or ON (odd or 
> even).




-- 
Aleksander
https://aleksander.es
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v2] ipv6: clean up dev_snmp6 proc entry when we fail to initialize inet6_dev

2015-11-04 Thread Eric Dumazet
On Wed, 2015-11-04 at 14:47 +0100, Sabrina Dubroca wrote:
> In ipv6_add_dev, when addrconf_sysctl_register fails, we do not clean up
> the dev_snmp6 entry that we have already registered for this device.
> Call snmp6_unregister_dev in this case.
> 
> Reported-by: Dmitry Vyukov 
> Signed-off-by: Sabrina Dubroca 
> ---
> 
> v2: we cannot call snmp6_unregister_dev from addrconf_core.c, this
> breaks CONFIG_IPV6=m, instead do the clean up directly from
> ipv6_add_dev
> thanks Cong.

Any idea when the bug was added ?

Can we please add a proper Fixes: tag for patches that need to be
backported to stable versions ?

It seems to be

Fixes: a317a2f19da7d ("ipv6: fail early when creating netdev named all or 
default")

So this goes back to linux-3.17 ?

Thanks a lot Sabrina !


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch -next] qlogic: qed: fix a test for MODE_MF_SI

2015-11-04 Thread Dan Carpenter
MODE_MF_SI is 9.  We should be testing bit 9 instead of AND 0x9.

Fixes: fe56b9e6a8d9 ('qed: Add module with basic common support')
Signed-off-by: Dan Carpenter 

diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c 
b/drivers/net/ethernet/qlogic/qed/qed_dev.c
index b9b7b7e..774b223 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_dev.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c
@@ -562,7 +562,7 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
}
 
/* Enable classification by MAC if needed */
-   if (hw_mode & MODE_MF_SI) {
+   if (hw_mode & (1 << MODE_MF_SI)) {
DP_VERBOSE(p_hwfn, NETIF_MSG_HW,
   "Configuring TAGMAC_CLS_TYPE\n");
STORE_RT_REG(p_hwfn,
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net v2] ipv6: clean up dev_snmp6 proc entry when we fail to initialize inet6_dev

2015-11-04 Thread Sabrina Dubroca
In ipv6_add_dev, when addrconf_sysctl_register fails, we do not clean up
the dev_snmp6 entry that we have already registered for this device.
Call snmp6_unregister_dev in this case.

Reported-by: Dmitry Vyukov 
Signed-off-by: Sabrina Dubroca 
---

v2: we cannot call snmp6_unregister_dev from addrconf_core.c, this
breaks CONFIG_IPV6=m, instead do the clean up directly from
ipv6_add_dev
thanks Cong.

 net/ipv6/addrconf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 36b85bd05ac8..dd00828863a0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -417,6 +417,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device 
*dev)
if (err) {
ipv6_mc_destroy_dev(ndev);
del_timer(&ndev->regen_timer);
+   snmp6_unregister_dev(ndev);
goto err_release;
}
/* protected by rtnl_lock */
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] netfilter: xt_TEE: fix NULL dereference

2015-11-04 Thread Pablo Neira Ayuso
From: Eric Dumazet 

iptables -I INPUT ... -j TEE --gateway 10.1.2.3

crashes because --oif was not specified

tee_tg_check() sets ->priv pointer to NULL in this case.

Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6")
Signed-off-by: Eric Dumazet 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/xt_TEE.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index fd980aa..c5fdea1 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -31,8 +31,9 @@ static unsigned int
 tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
 {
const struct xt_tee_tginfo *info = par->targinfo;
+   int oif = info->priv ? info->priv->oif : 0;
 
-   nf_dup_ipv4(skb, par->hooknum, &info->gw.in, info->priv->oif);
+   nf_dup_ipv4(skb, par->hooknum, &info->gw.in, oif);
 
return XT_CONTINUE;
 }
@@ -42,8 +43,9 @@ static unsigned int
 tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
 {
const struct xt_tee_tginfo *info = par->targinfo;
+   int oif = info->priv ? info->priv->oif : 0;
 
-   nf_dup_ipv6(skb, par->hooknum, &info->gw.in6, info->priv->oif);
+   nf_dup_ipv6(skb, par->hooknum, &info->gw.in6, oif);
 
return XT_CONTINUE;
 }
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] netfilter: nf_nat_redirect: add missing NULL pointer check

2015-11-04 Thread Pablo Neira Ayuso
From: Munehisa Kamata 

Commit 8b13eddfdf04cbfa561725cfc42d6868fe896f56 ("netfilter: refactor NAT
redirect IPv4 to use it from nf_tables") has introduced a trivial logic
change which can result in the following crash.

BUG: unable to handle kernel NULL pointer dereference at 0030
IP: [] nf_nat_redirect_ipv4+0x2d/0xa0 [nf_nat_redirect]
PGD 3ba662067 PUD 3ba661067 PMD 0
Oops:  [#1] SMP
Modules linked in: ipv6(E) xt_REDIRECT(E) nf_nat_redirect(E) xt_tcpudp(E) 
iptable_nat(E) nf_conntrack_ipv4(E) nf_defrag_ipv4(E) nf_nat_ipv4(E) nf_nat(E) 
nf_conntrack(E) ip_tables(E) x_tables(E) binfmt_misc(E) xfs(E) libcrc32c(E) 
evbug(E) evdev(E) psmouse(E) i2c_piix4(E) i2c_core(E) acpi_cpufreq(E) button(E) 
ext4(E) crc16(E) jbd2(E) mbcache(E) dm_mirror(E) dm_region_hash(E) dm_log(E) 
dm_mod(E)
CPU: 0 PID: 2536 Comm: ip Tainted: GE   4.1.7-15.23.amzn1.x86_64 #1
Hardware name: Xen HVM domU, BIOS 4.2.amazon 05/06/2015
task: 8800eb438000 ti: 8803ba664000 task.ti: 8803ba664000
[...]
Call Trace:
 
 [] redirect_tg4+0x15/0x20 [xt_REDIRECT]
 [] ipt_do_table+0x2b9/0x5e1 [ip_tables]
 [] iptable_nat_do_chain+0x25/0x30 [iptable_nat]
 [] nf_nat_ipv4_fn+0x13d/0x1f0 [nf_nat_ipv4]
 [] ? iptable_nat_ipv4_fn+0x20/0x20 [iptable_nat]
 [] nf_nat_ipv4_in+0x2e/0x90 [nf_nat_ipv4]
 [] iptable_nat_ipv4_in+0x15/0x20 [iptable_nat]
 [] nf_iterate+0x57/0x80
 [] nf_hook_slow+0x97/0x100
 [] ip_rcv+0x314/0x400

unsigned int
nf_nat_redirect_ipv4(struct sk_buff *skb,
...
{
...
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
if (indev != NULL) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local; <---
}
rcu_read_unlock();
...
}

Before the commit, 'ifa' had been always checked before access. After the
commit, however, it could be accessed even if it's NULL. Interestingly,
this was once fixed in 2003.

http://marc.info/?l=netfilter-devel&m=106668497403047&w=2

In addition to the original one, we have seen the crash when packets that
need to be redirected somehow arrive on an interface which hasn't been
yet fully configured.

This change just reverts the logic to the old behavior to avoid the crash.

Fixes: 8b13eddfdf04 ("netfilter: refactor NAT redirect IPv4 to use it from 
nf_tables")
Signed-off-by: Munehisa Kamata 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_nat_redirect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 97b75f9..d438698 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
-   if (indev != NULL) {
+   if (indev && indev->ifa_list) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local;
}
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] netfilter: nfnetlink: don't probe module if it exists

2015-11-04 Thread Pablo Neira Ayuso
From: Florian Westphal 

nfnetlink_bind request_module()s all the time as nfnetlink_get_subsys()
shifts the argument by 8 to obtain the subsys id.

So using type instead of type << 8 always returns NULL.

Fixes: 03292745b02d11 ("netlink: add nlk->netlink_bind hook for module 
auto-loading")
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nfnetlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 70277b1..27b93da 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -492,7 +492,7 @@ static int nfnetlink_bind(struct net *net, int group)
type = nfnl_group2type[group];
 
rcu_read_lock();
-   ss = nfnetlink_get_subsys(type);
+   ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
request_module("nfnetlink-subsys-%d", type);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] Netfilter fixes for net

2015-11-04 Thread Pablo Neira Ayuso
Hi David,

The following patchset contains Netfilter fixes for your net tree,
they are:

1) Fix crash when TEE target is used with no --oif, from Eric Dumazet.

2) Oneliner to fix a crash on the redirect traffic to localhost
   infrastructure when interface has not yet an address, from
   Munehisa Kamata.

3) Oneliner not to request module all the time from nfnetlink due to
   wrong type value, from Florian Westphal.

I'll make sure these patches 1 and 2 hit -stable.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 104eb270e665f4fcd8cb8c8ab4c4d4538c604e92:

  net: sun4i-emac: Properly free resources on probe failure and remove 
(2015-10-21 19:47:45 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to dbc3617f4c1f9fcbe63612048cb9583fea1e11ab:

  netfilter: nfnetlink: don't probe module if it exists (2015-10-28 03:40:50 
+0100)


Eric Dumazet (1):
  netfilter: xt_TEE: fix NULL dereference

Florian Westphal (1):
  netfilter: nfnetlink: don't probe module if it exists

Munehisa Kamata (1):
  netfilter: nf_nat_redirect: add missing NULL pointer check

 net/netfilter/nf_nat_redirect.c | 2 +-
 net/netfilter/nfnetlink.c   | 2 +-
 net/netfilter/xt_TEE.c  | 6 --
 3 files changed, 6 insertions(+), 4 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] tun_dst: Fix potential NULL dereference

2015-11-04 Thread Tobias Klauser
In tun_dst_unclone() the return value of skb_metadata_dst() is checked
for being NULL after it is dereferenced. Fix this by moving the
dereference after the NULL check.

Found by the Coverity scanner (CID 1338068).

Fixes: fc4099f17240 ("openvswitch: Fix egress tunnel info.")
Cc: Pravin B Shelar 
Signed-off-by: Tobias Klauser 
---
 include/net/dst_metadata.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index ce00971..6816f0f 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -63,12 +63,13 @@ static inline struct metadata_dst *tun_rx_dst(int md_size)
 static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
 {
struct metadata_dst *md_dst = skb_metadata_dst(skb);
-   int md_size = md_dst->u.tun_info.options_len;
+   int md_size;
struct metadata_dst *new_md;
 
if (!md_dst)
return ERR_PTR(-EINVAL);
 
+   md_size = md_dst->u.tun_info.options_len;
new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
if (!new_md)
return ERR_PTR(-ENOMEM);
-- 
2.6.1.148.g7927db1


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] VSOCK: call sk->sk_data_ready() on accept()

2015-11-04 Thread Stefan Hajnoczi
When a listen socket enqueues a connection for userspace to accept(),
the sk->sk_data_ready() callback should be invoked.  In-kernel socket
users rely on this callback to detect when incoming connections are
available.

Currently the sk->sk_state_change() callback is invoked by
vmci_transport.c.  This happens to work for userspace applications since
sk->sk_state_change = sock_def_wakeup() and sk->sk_data_ready =
sock_def_readable() both wake up the accept() waiter.  In-kernel socket
users, on the other hand, fail to detect incoming connections.

Signed-off-by: Stefan Hajnoczi 
---
 net/vmw_vsock/vmci_transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 7555cad..9363abb 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1236,7 +1236,7 @@ vmci_transport_recv_connecting_server(struct sock 
*listener,
/* Callers of accept() will be be waiting on the listening socket, not
 * the pending socket.
 */
-   listener->sk_state_change(listener);
+   listener->sk_data_ready(listener);
 
return 0;
 
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch -next] qlogic/qed: remove bogus NULL check

2015-11-04 Thread Dan Carpenter
We check if "p_hwfn" is NULL and then dereference it in the error
handling code.  I read the code and it isn't NULL so let's remove the
check.

Signed-off-by: Dan Carpenter 

diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c 
b/drivers/net/ethernet/qlogic/qed/qed_int.c
index 2e399b6..de50e84 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -251,11 +251,6 @@ void qed_int_sp_dpc(unsigned long hwfn_cookie)
int arr_size;
u16 rc = 0;
 
-   if (!p_hwfn) {
-   DP_ERR(p_hwfn->cdev, "DPC called - no hwfn!\n");
-   return;
-   }
-
if (!p_hwfn->p_sp_sb) {
DP_ERR(p_hwfn->cdev, "DPC called - no p_sp_sb\n");
return;
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] ipv6: allow routes to be configured with expire values

2015-11-04 Thread Xin Long
Add the support for adding expire value to routes,  requested by
Tom Gundersen  for systemd-networkd, and NetworkManager
wants it too.

add it by using the field rta_expires of rta_cacheinfo

Signed-off-by: Xin Long 
Signed-off-by: Hannes Frederic Sowa 
---
 net/ipv6/route.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 946880a..49780bc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2700,6 +2700,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] 
= {
[RTA_PREF]  = { .type = NLA_U8 },
[RTA_ENCAP_TYPE]= { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+   [RTA_CACHEINFO] = { .len = sizeof(struct rta_cacheinfo) },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2800,6 +2801,16 @@ static int rtm_to_fib6_config(struct sk_buff *skb, 
struct nlmsghdr *nlh,
if (tb[RTA_ENCAP_TYPE])
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
 
+   if (tb[RTA_CACHEINFO]) {
+   struct rta_cacheinfo *ci = nla_data(tb[RTA_CACHEINFO]);
+   unsigned long timeout = addrconf_timeout_fixup(ci->rta_expires, 
HZ);
+
+   if (addrconf_finite_timeout(timeout)) {
+   cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
+   cfg->fc_flags |= RTF_EXPIRES;
+   }
+   }
+
err = 0;
 errout:
return err;
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [patch -next] qlogic: qed: fix a test for MODE_MF_SI

2015-11-04 Thread Yuval Mintz
> MODE_MF_SI is 9.  We should be testing bit 9 instead of AND 0x9.
> 
> Fixes: fe56b9e6a8d9 ('qed: Add module with basic common support')
> Signed-off-by: Dan Carpenter 

True indeed. Thanks.
Acked-by: Yuval Mintz 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch net-next] net: add forgotten IFF_L3MDEV_SLAVE define

2015-11-04 Thread Jiri Pirko
From: Jiri Pirko 

Fixes: fee6d4c77 ("net: Add netif_is_l3_slave")
Signed-off-by: Jiri Pirko 
---
 include/linux/netdevice.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4ac653b..2c00772 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1322,6 +1322,7 @@ enum netdev_priv_flags {
 #define IFF_L3MDEV_MASTER  IFF_L3MDEV_MASTER
 #define IFF_NO_QUEUE   IFF_NO_QUEUE
 #define IFF_OPENVSWITCHIFF_OPENVSWITCH
+#define IFF_L3MDEV_SLAVE   IFF_L3MDEV_SLAVE
 
 /**
  * struct net_device - The DEVICE structure.
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Marek Vasut
On Wednesday, November 04, 2015 at 10:34:50 AM, Aleksander Morgado wrote:
> On Tue, Nov 3, 2015 at 10:43 PM, Marek Vasut  wrote:
> > On Tuesday, November 03, 2015 at 08:28:43 PM, Oliver Hartkopp wrote:
> >> On 11/03/2015 08:19 PM, Marek Vasut wrote:
> >> > On Tuesday, November 03, 2015 at 07:03:26 PM, Oliver Hartkopp wrote:
> >> >> On 11/03/2015 06:41 PM, Marek Vasut wrote:
> >> >>> On Tuesday, November 03, 2015 at 06:32:12 PM, Oliver Hartkopp wrote:
> >> >>> 
> >> >>> [...]
> >> >>> 
> >>  It looks like you need to shift the stuff in user space every time.
> >>  
> >>  So you might better think of something like this:
> >>  struct a429_frame {
> >>  
> >>  __u32   label;   /* ARINC 429 label */
> >>  __u8length;  /* always set to 8 */
> >>  __u8__pad;   /* padding */
> >>  __u8__res0;  /* reserved / padding */
> >>  __u8__res1;  /* reserved / padding */
> >>  __u32   data __attribute__((aligned(8)));
> >>  __u8p;   /* p */
> >>  __u8ssm; /* ssm */
> >>  __u8sdi; /* sdi */
> >>  __u8__end;   /* padding */
> >>  
> >>  };
> >> >>> 
> >> >>> You don't want to interpret those P(arity)/SSM/SDI bits, since they
> >> >>> differ depending on whatever the remote party sends. That's why I
> >> >>> decided to just make those into 3-bytes of data and let the userland
> >> >>> application deal with it as seen fit. Besides, the ARINC "FTP"
> >> >>> really uses those 3 bytes as plain data.
> >> >> 
> >> >> Ok. I did not know what P was for :-)
> >> > 
> >> > Oh yeah. P is parity and it's optional as well and can be odd/even
> >> > depending on the remote endpoint (sigh).
> >> > 
> >> >> Btw. it can make sense to introduce a union struct where different
> >> >> options to access the content are possible.
> >> > 
> >> > This would be pretty nasty I think. By reading the ARINC
> >> > specification, the SSM can be either 2 or 3 bits, the SDI is
> >> > who-knows-what depending on the remote endpoint and the P is also not
> >> > always present. I'm not convinced that the kernel should interpret
> >> > the 3 byte ARINC payload in any way. (but I wonder if my argument
> >> > presented above is convincing at all either ...).
> >> 
> >> Right.
> >> 
> >> When we define a user visible data structure, this is written into
> >> stone.
> >> 
> >> When ARINC isn't even sure about the detailed interpretation we should
> >> definitely keep our fingers away from doing it ourselves.
> > 
> > Right. Besides, such extension to the ABI can be done later if the need
> > arises (which I seriously doubt), can't it ? Handling the payload as a
> > CAN payload makes sense.
> 
> Agree on this, the three non-label bytes in an ARINC word should be
> taken as opaque payload. The only exception would be the parity most
> significant bit, but I don't think it'd be an issue to have that in
> the opaque payload.

About the parity -- can we add some flag into the datagram to indicate we
want hardware to calculate the parity for that particular datagram for us?
And we'd also need to indicate what type of parity. I dunno if this is worth
the hassle.

Best regards,
Marek Vasut
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kernel 3.14.53 + bnx2x loss of connectivity / parity errors / MCP SCPAD

2015-11-04 Thread Patrick Schaaf
Dear netdevs,

on a production server (HP DL380 Gen9 with HP 10GE dual port card - bnx2x 
driver), I just encountered a full loss of connectivity through the 10 GE 
ports. Kernel in use is vanilla 3.14.53.

On the console I could see this (timestamps omitted, have to type by hand, 
damn ILO console does not let me copy+paste text...)

MCP SCPAD
MCP SCPAD
bnx2x :04:00.1 eth1: Parity errors detected in blocks:
MCP SCPAD
MCP SCPAD
bnx2x :04:00.0 eth0: Parity errors detected in blocks:
bnx2x: [bnx2x_attn_int_deasserted3:4080(eth0)]LATCHED attention 0x8000 
(masked)
MCP SCPAD
...
systemd-journald[491]: /dev/kmsg buffer overrun, some messages lost.

Some googling around finds:

https://github.com/torvalds/linux/commit/ad6afbe9578d1fa26680faf78c846bd8c00d1d6e
 

which might be related. If I read that correctly (and I have no real idea what 
I'm talking about, sorry...) that patch removes superfluous printks which 
might, e.g. in our case, hide the real cause. I.e., even with that patch we 
would have had a problem / loss of connectivity, but we might know better why.

Maybe that changeset would be suitable for backporting to long term stable 
kernels?

Incidentally, how should these parity events be judged generally? Hope it's a 
one time cosmic ray incident? Cry "faulty hardware, please repair" to the 
supplier? Anything else?

best regards
  Patrick
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] ipv6: gro: support sit protocol

2015-11-04 Thread Wolfgang Walter
Am Dienstag, 3. November 2015, 05:07:33 schrieb Eric Dumazet:
> On Tue, 2015-11-03 at 13:57 +0100, Wolfgang Walter wrote:
> > Am Montag, 19. Oktober 2015, 20:40:17 schrieb Eric Dumazet:
> > > From: Eric Dumazet 
> > > 
> > > Tom Herbert added SIT support to GRO with commit
> > > 19424e052fb4 ("sit: Add gro callbacks to sit_offload"),
> > > later reverted by Herbert Xu.
> > > 
> > > The problem came because Tom patch was building GRO
> > > packets without proper meta data : If packets were locally
> > > delivered, we would not care.
> > > 
> > > But if packets needed to be forwarded, GSO engine was not
> > > able to segment individual segments.
> > > 
> > > With the following patch, we correctly set skb->encapsulation
> > > and inner network header. We also update gso_type.
> > 
> > I'm running 4.1.11 / 4.1.12 with this patch on top now since over a week.
> > ISATAP works fine.
> 
> Perfect ! thanks a lot for testing !

Today I found a problem: on a router forwarding GRE-packets (ipv4) (it is not 
the endpoint) the interface (intel igb) stops sending packets after some time. 
I think this happens when an ISATAP packet is inside the GRE-packet.

GRE packets arrive on eth0
eth1 stops sending (receiving still works)
ethtool -r eth1
eth1 works again for some time

Switching GRO off on eth0 "fixes" the problem.

I didn't test vanilla 4.1.12 yet, though. Until today 4.1.11 has been running 
on the router. What I tested was your patch
"gre_gso_segment() chokes if SIT frames were aggregated by GRO engine."
but it did not solve the problem.

So I would not recommend backporting it to longterm 4.1.

My plans are:

* test vanilla 4.1.12
* test 4.3

I want to test 4.3 on another router first, though.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [patch -next] qlogic/qed: remove bogus NULL check

2015-11-04 Thread Yuval Mintz
> We check if "p_hwfn" is NULL and then dereference it in the error handling
> code.  I read the code and it isn't NULL so let's remove the check.
> 
> Signed-off-by: Dan Carpenter 

Our current interrupt handling logic is being uber-defensive. Thanks.

Acked-by: Yuval Mintz 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] net: arinc429: Add ARINC-429 stack

2015-11-04 Thread Marek Vasut
On Wednesday, November 04, 2015 at 04:19:45 PM, Aleksander Morgado wrote:
> On Wed, Nov 4, 2015 at 4:18 PM, Vostrikov Andrey
> 
>  wrote:
> >>> > About the parity -- can we add some flag into the datagram to
> >>> > indicate we want hardware to calculate the parity for that
> >>> > particular datagram for us? And we'd also need to indicate what type
> >>> > of parity. I dunno if this is worth the hassle.
> >>> 
> >>> This  is HW configuration property, it does not belong to  datagram.
> >>> Also for TX channels,  parity could  be  two  kinds:  odd and even,
> >>> for RX it is only on/off.
> >> 
> >> There are datagrams which do contain parity and ones which do not
> >> contain it, correct ? Thus, it's a property of that particular
> >> datagram.
> 
> All ARINC words have bit #31 as parity bit; whether it's used or not
> depends on the setup as Andrey says below.

Can bit 31 ever be used for DATA instead of parity? Or is this just me
not understanding the parlance of the specification, where "DATA" actually
means "DATA with parity"?

Best regards,
Marek Vasut
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [iproute PATCH] iproute: fix filter_nlmsg

2015-11-04 Thread Phil Sutter
On Tue, Nov 03, 2015 at 04:33:59PM -0800, Stephen Hemminger wrote:
> On Thu, 29 Oct 2015 12:15:47 +0100
> Phil Sutter  wrote:
> 
> > This patch is based upon an old Fedora bug[1] regarding the routing
> > setup of PPP links. I'm not quite sure if it still applies today or how
> > to trigger it, but looking at the change introducing this, it's
> > obviously a bug.
> > 
> > [1] https://bugzilla.redhat.com/show_bug.cgi?id=622782
> > 
> > Fixes: 4479282 ("iproute2: filter routing entries based on clone flag")
> > Signed-off-by: Phil Sutter 
> > ---
> >  ip/iproute.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/ip/iproute.c b/ip/iproute.c
> > index eab512d..ea7e9aa 100644
> > --- a/ip/iproute.c
> > +++ b/ip/iproute.c
> > @@ -151,7 +151,7 @@ static int filter_nlmsg(struct nlmsghdr *n, struct 
> > rtattr **tb, int host_len)
> > if (r->rtm_family == AF_INET6 && table != RT_TABLE_MAIN)
> > ip6_multiple_tables = 1;
> >  
> > -   if (filter.cloned == !(r->rtm_flags_F_CLONED))
> > +   if (filter.cloned && !(r->rtm_flags_F_CLONED))
> > return 0;
> >  
> > if (r->rtm_family == AF_INET6 && !ip6_multiple_tables) {
> 
> Holding off on this, until there is an obvious reproduction.
> The patch looks right but this code has been around a long time and don't want
> any surprised users.

Looking more into this, I found commit c73f3e0 ("iproute2: dont filter
cached routes on iproute_get") which bases explicitly on the behaviour
as we have now (comparison instead of boolean AND).

The above change at least affects showing routes for IPv6. With it
applied, 'ip r s' prints the routing cache along with normal routing
table entries, without it one has to explicitly ask for cached entries
in order for them to show up ('ip r s cached'). Which do you think
is the correct behaviour?

Thanks, Phil
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


4.1.12 kernel crash in rtnetlink_put_metrics

2015-11-04 Thread Andrew

Hi all.

Today I got a crash on one of our servers (PPPoE BRAS with BGP/OSPF). 
This server became unstable after updating from the 3.2.x kernel to 4.1.x 
(other servers with slightly different CPUs/MBs also have troubles - but 
they hang less frequently).


Place in kernel code:
(gdb) list *rtnetlink_put_metrics+0x50
0xc131c7d0 is in rtnetlink_put_metrics 
(/var/testpoint/LEAF/source/i486-unknown-linux-uclibc/linux/linux-4.1/net/core/rtnetlink.c:672).

667mx = nla_nest_start(skb, RTA_METRICS);
668if (mx == NULL)
669return -ENOBUFS;
670
671for (i = 0; i < RTAX_MAX; i++) {
672if (metrics[i]) {
673if (i == RTAX_CC_ALGO - 1) {
674char tmp[TCP_CA_NAME_MAX], *name;
675
676name = tcp_ca_get_name_by_key(metrics[i], tmp);


Here's trace:

[41358.475254]BUG:unable to handle kernel NULL pointer dereference at 
(null)[41358.475333]IP:[]rtnetlink_put_metrics+0x50/0x180[41358.475376]*pdpt 
=26d58001*pde =[41358.475413]Oops:[#1] SMP 
[41358.475453]Moduleslinked in:act_mirred pppoe pppox ppp_generic slhc 
iptable_filter xt_length xt_TCPMSS xt_tcpudp xt_mark xt_dscp 
iptable_mangle ip_tables x_tables ipv6 sch_sfq sch_htb cls_u32 
sch_ingress sch_prio sch_tbf cls_flow cls_fw act_police ifb 8021qmrp 
garp stp llc softdog parport_pc parport acpi_cpufreq processor 
thermal_sys igb(O)k10temp hwmon dca ohci_pci ohci_hcd ptp pps_core 
i2c_piix4 i2c_core sp5100_tco sd_mod pata_acpi pata_atiixp pcspkr 
ata_generic ahci libahci libata ehci_pci ehci_hcd scsi_mod usbcore 
usb_common ext4 mbcache jbd2 crc16 vfat fat isofs 
[41358.475807]CPU:2PID:10877Comm:bird Tainted:G   O 4.1.12-i686 
#1[41358.475880]Hardwarename:MICRO-STAR INTERNATIONAL CO.,LTD 
MS-7596/760GM-E51(MS-7596),BIOS 
V3.301/12/2012[41358.475955]task:f5302da0 ti:e1364000 task.ti:e1364000 
[41358.475993]EIP:0060:[]EFLAGS:00010282CPU:2[41358.476030]EIP 
isat 
rtnetlink_put_metrics+0x50/0x180[41358.476066]EAX:EBX:0001ECX:0004EDX:[41358.476106]ESI:EDI:e0b38000 
EBP:e1365ca8 ESP:e1365c78 
[41358.476143] DS:007bES:007bFS:00d8GS:0033SS:0068[41358.476179]CR0:8005003bCR2:CR3:34966ac0CR4:06f0[41358.476216]Stack:[41358.476249]c1213873 
d4316f64 e0b38000 e1365d00 c1213989 
0fe4[41358.476330] e0b38000 d4316f30 e0b38000 e1365d00 
c138362e e1365cd8 
000c[41358.476405]00020002c13bba01 e0b38000 
00fe007d8196[41358.476482]CallTrace:[41358.476522][]?__nla_reserve+0x23/0xe0[41358.476557][]?__nla_put+0x9/0xb0[41358.476595][]?fib_dump_info+0x15e/0x3e0[41358.476636][]?irq_entries_start+0x639/0x678[41358.476671][]?fib_table_dump+0xf3/0x180[41358.476708][]?inet_dump_fib+0x7d/0x100[41358.476746][]?netlink_dump+0x121/0x270[41358.476781][]?skb_free_datagram+0x12/0x40[41358.476818][]?netlink_recvmsg+0x244/0x360[41358.476855][]?sock_recvmsg+0x1d/0x30[41358.476890][]?sock_recvmsg_nosec+0x30/0x30[41358.476924][]?___sys_recvmsg+0x9c/0x120[41358.476958][]?sock_recvmsg_nosec+0x30/0x30[41358.476994][]?update_cfs_rq_blocked_load+0xc4/0x130[41358.477030][]?hrtimer_forward+0xa4/0x1c0[41358.477065][]?sockfd_lookup_light+0x1d/0x80[41358.477099][]?__sys_recvmsg+0x3e/0x80[41358.477134][]?SyS_socketcall+0xb1/0x2a0[41358.477168][]?handle_irq_event+0x3c/0x60[41358.477203][]?handle_edge_irq+0x7d/0x100[41358.477238][]?rps_trigger_softirq+0x26/0x30[41358.477273][]?flush_smp_call_function_queue+0x83/0x120[41358.477307][]?syscall_call+0x7/0x7[41358.477341]Code:008945d8 
89c3 89f8 e8 7e72ef ff 85c0 0f889e0085db 0f849600bb 0100c7 
45dc 6690<8b>449efc 85c0 742b83fb 100f8484008945e0 
8d[41358.477509]EIP:[]rtnetlink_put_metrics+0x50/0x180SS:ESP 
0068:e1365c78 
[41358.477576]CR2:[41358.477880]---[endtrace 
6e3e7e6b81407c0a]---[41358.499813][cut here 
][41358.499879]WARNING:CPU:2PID:0at 
/var/testpoint/LEAF/source/i486-unknown-linux-uclibc/linux/linux-4.1/net/netlink/af_netlink.c:944netlink_sock_destruct+0xa8/0xc0()[41358.53]Moduleslinked 
in:act_mirred pppoe pppox ppp_generic slhc iptable_filter xt_length 
xt_TCPMSS xt_tcpudp xt_mark xt_dscp iptable_mangle ip_tables x_tables 
ipv6 sch_sfq sch_htb cls_u32 sch_ingress sch_prio sch_tbf cls_flow 
cls_fw act_police ifb 8021qmrp garp stp llc softdog parport_pc parport 
acpi_cpufreq processor thermal_sys igb(O)k10temp hwmon dca ohci_pci 
ohci_hcd ptp pps_core i2c_piix4 i2c_core sp5100_tco sd_mod pata_acpi 
pata_atiixp pcspkr ata_generic ahci libahci libata ehci_pci ehci_hcd 
scsi_mod usbcore usb_common ext4 mbcache jbd2 crc16 vfat fat isofs 
[41358.502110]CPU:2PID:0Comm:swapper/2Tainted:G  DO 4.1.12-i686 
#1[41358.502213]Hardwarename:MICRO-STAR INTERNATIONAL CO.,LTD 
MS-7596/760GM-E51(MS-7596),BIOS V3.301/12/2012[41358.502305] c14b0540 
f5259f40 c13b6ee2 c104b5a3 c1475fd4 
0002[41358.502610] c14b0540 03b0c13373e8 
0009c13373e8 f2204c00 

Re: [Bug 106241] New: shutdown(3)/close(3) behaviour is incorrect for sockets in accept(3)

2015-11-04 Thread Al Viro
On Wed, Nov 04, 2015 at 03:54:09PM +, David Laight wrote:
On Wed, Nov 04, 2015 at 03:54:09PM +0000, David Laight wrote:
> > Sigh...  The kernel has no idea when other threads are done with "all
> > io activities using that fd" - it can wait for them to leave the
> > kernel mode, but there's fuck-all it can do about e.g. a userland
> > loop doing write() until there's more data to send.  And no, you can't
> > rely upon them catching EBADF on the next iteration - by the time they
> > get there, close() might very well have returned and open() from yet
> > another thread might've grabbed the same descriptor.  Welcome to your
> > data being written to hell knows what...
> 
> That just means that the application must use dup2() rather than close().
> It must do that anyway since the thread it is trying to stop might be
> sleeping in the system call stub in libc at the time a close() and open()
> happen.

Oh, _lovely_.  So instead of continuation of that write(2) going down
the throat of something opened by unrelated thread, it (starting from a
pretty arbitrary point) will go into the descriptor the closing thread
passed to dup2().  Until it, in turn, gets closed, at which point we
are back to square one.  That, of course, makes it so much better -
whatever had I been thinking about that made me miss that?

> The listening (in this case) thread would need to look at its global
> data to determine that it is supposed to exit, and then close the fd itself.

Right until it crosses into the kernel mode and does descriptor-to-file
lookup, presumably?  Because prior to that point this kernel-side
"protection" doesn't come into play.  In other words, this is inherently
racy, and AFAICS you are the first poster in that thread who disagrees
with that.

_Any_ userland code that would be racy without that kludge of semantics
in close()/dup2() is *still* racy with it.  If that crap gets triggered
at all, the userland code responsible for that is broken.  Said crap
makes the race windows more narrow, but it doesn't really close them.
And IMO it's rather misguided, especially since it's a) quiet and b)
costly as hell.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] bnxt_en: add VXLAN dependency

2015-11-04 Thread Michael Chan
On Wed, 2015-11-04 at 16:00 +0100, Arnd Bergmann wrote: 
> VXLAN may be a loadable module, and this driver cannot be built-in
> in that case, or we get a link error:
> 
> drivers/built-in.o: In function `__bnxt_open_nic':
> drivers/net/ethernet/broadcom/bnxt/bnxt.c:4581: undefined reference to 
> `vxlan_get_rx_port'
> 
> This adds a Kconfig dependency that ensures that either VXLAN is
> disabled (which the driver handles correctly), or we depend on
> VXLAN itself and disallow built-in compilation when VXLAN is
> a module.
> 
> Signed-off-by: Arnd Bergmann 

Thanks.
Acked-by: Michael Chan 

> Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.")
> 
> diff --git a/drivers/net/ethernet/broadcom/Kconfig 
> b/drivers/net/ethernet/broadcom/Kconfig
> index 67a7d520d9f5..8550df189ceb 100644
> --- a/drivers/net/ethernet/broadcom/Kconfig
> +++ b/drivers/net/ethernet/broadcom/Kconfig
> @@ -173,6 +173,7 @@ config SYSTEMPORT
>  config BNXT
>   tristate "Broadcom NetXtreme-C/E support"
>   depends on PCI
> + depends on VXLAN || VXLAN=n
>   select FW_LOADER
>   select LIBCRC32C
>   ---help---
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v2] ipv6: clean up dev_snmp6 proc entry when we fail to initialize inet6_dev

2015-11-04 Thread Sabrina Dubroca
2015-11-04, 07:23:14 -0800, Eric Dumazet wrote:
> On Wed, 2015-11-04 at 14:47 +0100, Sabrina Dubroca wrote:
> > In ipv6_add_dev, when addrconf_sysctl_register fails, we do not clean up
> > the dev_snmp6 entry that we have already registered for this device.
> > Call snmp6_unregister_dev in this case.
> > 
> > Reported-by: Dmitry Vyukov 
> > Signed-off-by: Sabrina Dubroca 
> > ---
> > 
> > v2: we cannot call snmp6_unregister_dev from addrconf_core.c, this
> > breaks CONFIG_IPV6=m, instead do the clean up directly from
> > ipv6_add_dev
> > thanks Cong.
> 
> Any idea when the bug was added ?
> 
> Can we please add a proper Fixes: tag for patches that need to be
> backported to stable versions ?
> 
> It seems to be
> 
> Fixes: a317a2f19da7d ("ipv6: fail early when creating netdev named all or 
> default")
> 
> So this goes back to linux-3.17 ?
> 
> Thanks a lot Sabrina !

Sorry, I didn't do the archeology (well, run git blame).  That looks
correct, thanks Eric.

-- 
Sabrina
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Expected behaviour of `tc qdisc replace`?

2015-11-04 Thread Toke Høiland-Jørgensen
Hi

I recently noticed that the behaviour of `tc qdisc replace` differs
depending on whether the qdisc being replaced is of the same kind as the
replacement. I.e.:

# tc qdisc del dev eno1 root
# tc qdisc replace dev eno1 root fq_codel target 100ms interval 200ms
# tc qdisc replace dev eno1 root fq_codel target 5ms 
# tc qdisc
qdisc fq_codel 8007: dev eno1 root refcnt 2 limit 10240p flows 1024 quantum 
1514 target 5.0ms interval 200.0ms ecn 
# tc qdisc del dev eno1 root 
# tc qdisc replace dev eno1 root fq_codel target 5ms
# tc qdisc  
qdisc fq_codel 8008: dev eno1 root refcnt 2 limit 10240p flows 1024 quantum 
1514 target 5.0ms interval 100.0ms ecn 

Notice the difference in interval between the two output lines from tc.

Now, according to the tc man page, `tc qdisc replace` "Performs a nearly
atomic remove/add on an existing node id." In which case I would expect
it to *not* retain settings from the previous configuration.

So my question is: is this a bug, or is it expected behaviour? And if
the latter, what is the difference between `tc qdisc replace` and `tc
qdisc change` then supposed to be?

-Toke
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] drivers: net: cpsw: Add support for fixed-link PHY

2015-11-04 Thread Mugunthan V N
On Wednesday 04 November 2015 02:39 AM, Markus Brunner wrote:
> Add support for a fixed-link devicetree sub-node in case the 
> cpsw MAC is directly connected to a non-mdio PHY/device. 
> 
> Signed-off-by: Markus Brunner 

Looks good to me.

Acked-by: Mugunthan V N 

Regards
Mugunthan V N
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/4] ipv6: add option to drop unicast encapsulated in L2 multicast

2015-11-04 Thread Johannes Berg
From: Johannes Berg 

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv6 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Signed-off-by: Johannes Berg 
---
 Documentation/networking/ip-sysctl.txt |  6 ++
 include/linux/ipv6.h   |  1 +
 include/uapi/linux/ipv6.h  |  1 +
 net/ipv6/addrconf.c|  8 
 net/ipv6/ip6_input.c   | 10 ++
 5 files changed, 26 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 35c4c43dd8de..a2169651254e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1660,6 +1660,12 @@ stable_secret - IPv6 address
 
By default the stable secret is unset.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+   Drop any unicast IPv6 packets that are received in link-layer
+   multicast (or broadcast) frames.
+
+   By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 0ef2a97ccdb5..34317cb6a6fc 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
__s32   mc_forwarding;
 #endif
__s32   disable_ipv6;
+   __s32   drop_unicast_in_l2_multicast;
__s32   accept_dad;
__s32   force_tllao;
__s32   ndisc_notify;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 38b4fef20219..4c413570efe8 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -174,6 +174,7 @@ enum {
DEVCONF_USE_OIF_ADDRS_ONLY,
DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+   DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d72fa90d6feb..35f880bcf626 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4672,6 +4672,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf 
*cnf,
array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = 
cnf->ignore_routes_with_linkdown;
/* we omit DEVCONF_STABLE_SECRET for now */
array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+   array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = 
cnf->drop_unicast_in_l2_multicast;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5734,6 +5735,13 @@ static struct addrconf_sysctl_table
.proc_handler   = 
addrconf_sysctl_ignore_routes_with_linkdown,
},
{
+   .procname   = "drop_unicast_in_l2_multicast",
+   .data   = 
_devconf.drop_unicast_in_l2_multicast,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec,
+   },
+   {
/* sentinel */
}
},
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..31ac3c56da4b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt
IPV6_ADDR_MC_SCOPE(>daddr) == 1)
goto err;
 
+   /* If enabled, drop unicast packets that were encapsulated in link-layer
+* multicast or broadcast to protect against the so-called "hole-196"
+* attack in 802.11 wireless.
+*/
+   if (!ipv6_addr_is_multicast(>daddr) &&
+   (skb->pkt_type == PACKET_BROADCAST ||
+skb->pkt_type == PACKET_MULTICAST) &&
+   idev->cnf.drop_unicast_in_l2_multicast)
+   goto err;
+
/* RFC4291 2.7
 * Nodes must not originate a packet to a multicast address whose scope
 * field contains the reserved value 0; if such a packet is received, it
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >