[PATCHv3 net-next 2/2] cxgb4/cxgb4vf: Add set VF mac address support

2016-08-10 Thread Hariprasad Shenai
Add ndo_set_vf_mac support, which allows setting the MAC address
for cxgb4vf interfaces from the host.

Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  3 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c| 24 -
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c | 41 ++
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c| 24 +
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h |  3 ++
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 63 +++---
 6 files changed, 151 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 2e2aa9fec9bb..bcfa51226b46 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1521,4 +1521,7 @@ void t4_idma_monitor_init(struct adapter *adapter,
 void t4_idma_monitor(struct adapter *adapter,
 struct sge_idma_monitor_state *idma,
 int hz, int ticks);
+int t4_set_vf_mac_acl(struct adapter *adapter, unsigned int vf,
+ unsigned int naddr, u8 *addr);
+
 #endif /* __CXGB4_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 490388239b7f..a13da5ae98ab 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3078,6 +3078,26 @@ static int cxgb_change_mtu(struct net_device *dev, int 
new_mtu)
return ret;
 }
 
+#ifdef CONFIG_PCI_IOV
+static int cxgb_set_vf_mac(struct net_device *dev, int vf, u8 *mac)
+{
+   struct port_info *pi = netdev_priv(dev);
+   struct adapter *adap = pi->adapter;
+
+   /* verify MAC addr is valid */
+   if (!is_valid_ether_addr(mac)) {
+   dev_err(pi->adapter->pdev_dev,
+   "Invalid Ethernet address %pM for VF %d\n",
+   mac, vf);
+   return -EINVAL;
+   }
+
+   dev_info(pi->adapter->pdev_dev,
+"Setting MAC %pM on VF %d\n", mac, vf);
+   return t4_set_vf_mac_acl(adap, vf + 1, 1, mac);
+}
+#endif
+
 static int cxgb_set_mac_addr(struct net_device *dev, void *p)
 {
int ret;
@@ -3136,10 +3156,12 @@ static const struct net_device_ops cxgb4_netdev_ops = {
 #ifdef CONFIG_NET_RX_BUSY_POLL
.ndo_busy_poll= cxgb_busy_poll,
 #endif
-
 };
 
 static const struct net_device_ops cxgb4_mgmt_netdev_ops = {
+#ifdef CONFIG_PCI_IOV
+   .ndo_set_vf_mac   = cxgb_set_vf_mac,
+#endif
 };
 
 static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index dc92c80a75f4..2a476cc4e073 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -8264,3 +8264,44 @@ void t4_idma_monitor(struct adapter *adapter,
t4_sge_decode_idma_state(adapter, idma->idma_state[i]);
}
 }
+
+/**
+ * t4_set_vf_mac_acl - Set MAC address for the specified VF
+ * @adapter: The adapter
+ * @vf: one of the VFs instantiated by the specified PF
+ * @naddr: the number of MAC addresses
+ * @addr: the MAC address(es) to be set to the specified VF
+ */
+int t4_set_vf_mac_acl(struct adapter *adapter, unsigned int vf,
+ unsigned int naddr, u8 *addr)
+{
+   struct fw_acl_mac_cmd cmd;
+
+   memset(&cmd, 0, sizeof(cmd));
+   cmd.op_to_vfn = cpu_to_be32(FW_CMD_OP_V(FW_ACL_MAC_CMD) |
+   FW_CMD_REQUEST_F |
+   FW_CMD_WRITE_F |
+   FW_ACL_MAC_CMD_PFN_V(adapter->pf) |
+   FW_ACL_MAC_CMD_VFN_V(vf));
+
+   /* Note: Do not enable the ACL */
+   cmd.en_to_len16 = cpu_to_be32((unsigned int)FW_LEN16(cmd));
+   cmd.nmac = naddr;
+
+   switch (adapter->pf) {
+   case 3:
+   memcpy(cmd.macaddr3, addr, sizeof(cmd.macaddr3));
+   break;
+   case 2:
+   memcpy(cmd.macaddr2, addr, sizeof(cmd.macaddr2));
+   break;
+   case 1:
+   memcpy(cmd.macaddr1, addr, sizeof(cmd.macaddr1));
+   break;
+   case 0:
+   memcpy(cmd.macaddr0, addr, sizeof(cmd.macaddr0));
+   break;
+   }
+
+   return t4_wr_mbox(adapter, adapter->mbox, &cmd, sizeof(cmd), &cmd);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c 
b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index e116bb8d1729..f2951bf68992 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -2777,6 +2777,7 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev,
struct adapter *adapter;
struct port_info *pi;
struct net_device 

[PATCHv3 net-next 1/2] cxgb4: Add control net_device for configuring PCIe VF

2016-08-10 Thread Hariprasad Shenai
Issue:
For instance, the current APIs assume a 1-to-1 mapping of Network Ports,
Physical Functions and the SR-IOV Virtual Functions of those Physical
Functions. This is not the case with our cards, where any Virtual
Function can be hooked up to any Port -- or any number of Ports. The
current Linux APIs also assume only 1 Network Interface/Port can be
accessed per Virtual Function.

Another issue is that these APIs assume that the Administrative Driver
is attached to the Physical Function Associated with a Virtual Function.
This is not the case with our card where all administration is performed
by a Driver which is not attached to any of the Physical Functions which
have SR-IOV PCI Capabilities.

Another consequence of these assumptions is the inability to utilize all
of the card's SR-IOV resources. For instance, our cards have SR-IOV
Capabilities on Physical Functions 0..3 and the administrative Driver
attaches to Physical Function 4. Each of the Physical Functions 0..3 can
support up to 16 Virtual Functions. With the current Linux APIs, a
2-Port card would only be able to use the Virtual Functions on Physical
Function 0..1 and not allow the Virtual Functions on Physical Functions
2..3 to be used since there are no Ports 2..3 on a 2-Port card.

Fix:
Since the control node is always the netdevice for all VF ACL commands,
create a dummy netdevice for each Physical Function from 0 to 3 through
which one can control its VFs. The device won't be associated with
any port, since it doesn't need to transmit/receive. It's used purely
for VF management purposes. The device will be registered only when
VFs for a particular PF are configured using the PCI sysfs interface, and
unregistered when pci_disable_sriov() is called for the PF.

The interface will be named "mgmtpf<adapter no><PF no>"; for example,
the interface for PF1 of adapter 0 will be named 'mgmtpf01'.

Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 115 
 1 file changed, 97 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index c45de49dc963..490388239b7f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3139,6 +3139,24 @@ static const struct net_device_ops cxgb4_netdev_ops = {
 
 };
 
+static const struct net_device_ops cxgb4_mgmt_netdev_ops = {
+};
+
+static void get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+   struct adapter *adapter = netdev2adap(dev);
+
+   strlcpy(info->driver, cxgb4_driver_name, sizeof(info->driver));
+   strlcpy(info->version, cxgb4_driver_version,
+   sizeof(info->version));
+   strlcpy(info->bus_info, pci_name(adapter->pdev),
+   sizeof(info->bus_info));
+}
+
+static const struct ethtool_ops cxgb4_mgmt_ethtool_ops = {
+   .get_drvinfo   = get_drvinfo,
+};
+
 void t4_fatal_err(struct adapter *adap)
 {
t4_set_reg_field(adap, SGE_CONTROL_A, GLOBALENABLE_F, 0);
@@ -4836,19 +4854,12 @@ static int get_chip_type(struct pci_dev *pdev, u32 
pl_rev)
 #ifdef CONFIG_PCI_IOV
 static int cxgb4_iov_configure(struct pci_dev *pdev, int num_vfs)
 {
+   struct adapter *adap = pci_get_drvdata(pdev);
int err = 0;
int current_vfs = pci_num_vf(pdev);
u32 pcie_fw;
-   void __iomem *regs;
 
-   regs = pci_ioremap_bar(pdev, 0);
-   if (!regs) {
-   dev_err(&pdev->dev, "cannot map device registers\n");
-   return -ENOMEM;
-   }
-
-   pcie_fw = readl(regs + PCIE_FW_A);
-   iounmap(regs);
+   pcie_fw = readl(adap->regs + PCIE_FW_A);
/* Check if cxgb4 is the MASTER and fw is initialized */
if (!(pcie_fw & PCIE_FW_INIT_F) ||
!(pcie_fw & PCIE_FW_MASTER_VLD_F) ||
@@ -4875,6 +4886,8 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int 
num_vfs)
 */
if (!num_vfs) {
pci_disable_sriov(pdev);
+   if (adap->port[0]->reg_state == NETREG_REGISTERED)
+   unregister_netdev(adap->port[0]);
return num_vfs;
}
 
@@ -4882,6 +4895,12 @@ static int cxgb4_iov_configure(struct pci_dev *pdev, int 
num_vfs)
err = pci_enable_sriov(pdev, num_vfs);
if (err)
return err;
+
+   if (adap->port[0]->reg_state == NETREG_UNINITIALIZED) {
+   err = register_netdev(adap->port[0]);
+   if (err < 0)
+   pr_info("Unable to register VF mgmt netdev\n");
+   }
}
return num_vfs;
 }
@@ -4893,9 +4912,14 @@ static int init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
struct port_info *pi;
bool highdma = false;
struct adapter *adapter = NULL;
+   struct net_device *netdev;
+#ifdef CONFIG_PCI_IOV
+   char 

[PATCHv3 net-next 0/2] Add support for IFLA_VF_MAC

2016-08-10 Thread Hariprasad Shenai
Hi,

We're struggling to implement the PCI SR-IOV management features for
administering Virtual Functions which represent networking devices using
the current Linux APIs. The problem is that these APIs incorporate all
sorts of assumptions which don't match chelsio networking cards.

For instance, the current APIs assume a 1-to-1 mapping of Network Ports,
Physical Functions and the SR-IOV Virtual Functions of those Physical
Functions. This is not the case with our cards, where any Virtual Function
can be hooked up to any Port -- or any number of Ports. The current Linux
APIs also assume only 1 Network Interface/Port can be accessed per Virtual
Function.

Another issue is that these APIs assume that the Administrative Driver is
attached to the Physical Function Associated with a Virtual Function. This
is not the case with our card where all administration is performed by a
Driver which is not attached to any of the Physical Functions which have
SR-IOV PCI Capabilities.

Another consequence of these assumptions is the inability to utilize all
of the card's SR-IOV resources. For instance, our cards have SR-IOV
Capabilities on Physical Functions 0..3 and the administrative Driver
attaches to Physical Function 4. Each of the Physical Functions 0..3 can
support up to 16 Virtual Functions. With the current Linux APIs, a 2-Port
card would only be able to use the Virtual Functions on Physical
Function 0..1 and not allow the Virtual Functions on Physical
Functions 2..3 to be used since there are no Ports 2..3 on a 2-Port card.

Patch 1/2 adds support to create a management interface for each PF to control
its corresponding VFs when SR-IOV VFs are configured via sysfs.
Patch 2/2 adds support for ndo_set_vf_mac.

This patch series has been created against net-next tree.

We have included all the maintainers of respective drivers. Kindly review
the change and let us know in case of any review comments.

V3: Based on review comment by Yuval Mintz, removed extra parameter pf
added to IFLA_VF API's and created a net_device corresponding to
each PF for controlling their VF. Based on review comment by
Yuval Mintz 

V2: Fixed check for MAC address in Patch 2/2, based on review comments by
Yuval Mintz  

Hariprasad Shenai (2):
  cxgb4: Add control net_device for configuring PCIe VF
  cxgb4/cxgb4vf: Add set VF mac address support

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c| 137 ++---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c |  41 ++
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c|  24 
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_common.h |   3 +
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c |  63 +-
 6 files changed, 247 insertions(+), 24 deletions(-)

-- 
2.3.4



Re: [PATCH 11/21] net: thunderx: Add support for 16 LMACs of 83xx

2016-08-10 Thread kbuild test robot
Hi Sunil,

[auto build test ERROR on net-next/master]
[also build test ERROR on v4.8-rc1 next-20160809]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/sunil-kovvuri-gmail-com/net-thunderx-Support-for-newer-chips-and-miscellaneous-patches/20160811-051837
config: x86_64-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

Note: the 
linux-review/sunil-kovvuri-gmail-com/net-thunderx-Support-for-newer-chips-and-miscellaneous-patches/20160811-051837
 HEAD 5dcb590bc5f427045688ce0ace2e3d39d22e979f builds fine.
  It only hurts bisectability.

All errors (new ones prefixed by >>):

   drivers/net/ethernet/cavium/thunder/nic_main.c: In function 
'nic_mbx_send_ready':
>> drivers/net/ethernet/cavium/thunder/nic_main.c:177:11: error: 'MAX_LMAC' 
>> undeclared (first use in this function)
 if (vf < MAX_LMAC) {
  ^~~~
   drivers/net/ethernet/cavium/thunder/nic_main.c:177:11: note: each undeclared 
identifier is reported only once for each function it appears in

vim +/MAX_LMAC +177 drivers/net/ethernet/cavium/thunder/nic_main.c

4863dea3 Sunil Goutham 2015-05-26  171  
4863dea3 Sunil Goutham 2015-05-26  172  mbx.nic_cfg.msg = 
NIC_MBOX_MSG_READY;
4863dea3 Sunil Goutham 2015-05-26  173  mbx.nic_cfg.vf_id = vf;
4863dea3 Sunil Goutham 2015-05-26  174  
4863dea3 Sunil Goutham 2015-05-26  175  mbx.nic_cfg.tns_mode = 
NIC_TNS_BYPASS_MODE;
4863dea3 Sunil Goutham 2015-05-26  176  
92dc8769 Sunil Goutham 2015-08-30 @177  if (vf < MAX_LMAC) {
4863dea3 Sunil Goutham 2015-05-26  178  bgx_idx = 
NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
4863dea3 Sunil Goutham 2015-05-26  179  lmac = 
NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]);
4863dea3 Sunil Goutham 2015-05-26  180  

:: The code at line 177 was first introduced by commit
:: 92dc87697e6a71675a9e9eec04ebecd8cf4837a3 net: thunderx: Support for upto 
96 queues for a VF

:: TO: Sunil Goutham 
:: CC: David S. Miller 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data


Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Ding Tianhong
On 2016/8/11 8:58, Jay Vosburgh wrote:
> Jörn Engel  wrote:
> 
>> On Wed, Aug 10, 2016 at 02:26:49PM -0700, Jörn Engel wrote:
>>>
>>> Having to set one more parameter is a bit annoying.  It would have to be
>>> documented in a prominent place and people would still often miss it.
>>> So I wonder if we can make the interface a little nicer.
>>>
>>> Options:
>>> - If there are no slaves yet and the first slave added is tun, we trust
>>>   the users to know what they are doing.  Automatically set
>>>   bond->params.fail_over_mac = BOND_FOM_KEEPMAC
>>>   Maybe do a printk to inform the user in case of a mistake.
> 
>   I don't think this is feasible, as I don't see a reliable way to
> test for a slave being a tun device (ARPHRD_NONE is not just tun, and we
> cannot check the ops as they are not statically built into the kernel).
> I'm also not sure that heuristics are the proper way to enable this
> functionality in general.
> 
>>> - If we get an error and the slave device is tun, do a printk giving the
>>>   user enough information to find this parameter.
> 
>   This could probably be done as a change the existing logic, e.g.,
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 1f276fa30ba6..019c1a689aae 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1443,6 +1443,9 @@ int bond_enslave(struct net_device *bond_dev, struct 
> net_device *slave_dev)
>   res = -EOPNOTSUPP;
>   goto err_undo_flags;
>   }
> + } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP &&
> +bond->params.fail_over_mac != BOND_FOM_KEEPMAC) {
> + netdev_err(bond_dev, "The slave device 
> specified does not support setting the MAC address, but fail_over_mac is not 
> set to keepmac\n");
>   }
>   }
>  
>   I haven't tested this, and I'm not sure it will get all corner
> cases correct, but this should basically cover it.
> 

Looks fine to cover the case, but if we still let it pass, I am not sure it is 
suitable.

>   -J
> 
>>> I'm leaning towards the former, but you probably know a reason why I am
>>> wrong again.
>>
>> Patch below is an implementation of the former.  Not sure if something
>> like this is worth considering.
>>
>> Jörn
>>
>> --
>> To announce that there must be no criticism of the President, or that we
>> are to stand by the President, right or wrong, is not only unpatriotic
>> and servile, but is morally treasonable to the American public.
>> -- Theodore Roosevelt, Kansas City Star, 1918
>>
>>
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 1f276fa30ba6..306909a44fab 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -1482,8 +1482,9 @@ int bond_enslave(struct net_device *bond_dev, struct 
>> net_device *slave_dev)
>>   */
>>  ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
>>
>> -if (!bond->params.fail_over_mac ||
>> -BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
>> +if (bond_dev->type != ARPHRD_NONE &&
>> +(!bond->params.fail_over_mac ||
>> + BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
>>  /* Set slave to master's mac address.  The application already
>>   * set the master's mac address to that of the first slave
>>   */
>> -- 
>> 2.1.4
>>
> 
> ---
>   -Jay Vosburgh, jay.vosbu...@canonical.com
> 
> .
> 




Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Ding Tianhong
On 2016/8/11 1:41, Jay Vosburgh wrote:
> Ding Tianhong  wrote:
> 
>> On 2016/8/10 7:51, Jay Vosburgh wrote:
>>> Jörn Engel  wrote:
>>>
 On Tue, Aug 09, 2016 at 12:06:36PM -0700, David Miller wrote:
>> On Tue, Aug 09, 2016 at 09:28:45PM +0800, Ding Tianhong wrote:
>>
>> Simply not checking errors when setting the mac address solves the
>> problem for me.  No new features needed.
>
> But it only works in certain modes.
>
> So the best we can do is enforce the MAC address setting in the
> modes that absolutely require it.  We cannot ignore the MAC
> address setting unilaterally.

 Something like this?

 [PATCH] bonding: Allow tun-interfaces as slaves in balance-rr mode

 Up until 00503b6f702e (part of 3.14-rc1), the bonding driver could be
 used to enslave tun-interfaces.  00503b6f702e broke that behaviour,
 afaics as an unintended side-effect.

 For the purpose of bond-over-tun in balance-rr mode, simply ignoring the
 error from dev_set_mac_address() is good enough.

 Signed-off-by: Joern Engel 
 ---
 drivers/net/bonding/bond_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/drivers/net/bonding/bond_main.c 
 b/drivers/net/bonding/bond_main.c
 index 1f276fa30ba6..2f686bfe4304 100644
 --- a/drivers/net/bonding/bond_main.c
 +++ b/drivers/net/bonding/bond_main.c
 @@ -1490,7 +1490,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
 net_device *slave_dev)
memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
addr.sa_family = slave_dev->type;
res = dev_set_mac_address(slave_dev, );
 -  if (res) {
 +  /* round-robin mode works fine without a mac address */
 +  if (res && BOND_MODE(bond) != BOND_MODE_ROUNDROBIN) {
>>>
>>> This will cause balance-rr to add the slave to the bond if any
>>> device's dev_set_mac_address call fails.
>>>
>>> If a bond of regular Ethernet devices is connected to a static
>>> link aggregation (Etherchannel channel group), a set_mac failure would
>>> result in that slave having a different MAC address than the bond, which
>>> in turn would cause traffic inbound from the switch to that slave to be
>>> dropped (as the destination MAC would not pass the device MAC filters).
>>>
>>> The failure check for the set_mac call serves a legitimate
>>> purpose, and I don't believe we should bypass it without making the
>>> bypass an option that is explicitly enabled for those special cases that
>>> need it.
>>>
>>> E.g., something like the following (which I have not tested);
>>> this would also need documentation and iproute2 updates to go with it.
>>> This would be enabled with "fail_over_mac=keepmac".
>>>
>>> diff --git a/drivers/net/bonding/bond_main.c 
>>> b/drivers/net/bonding/bond_main.c
>>> index 1f276fa30ba6..d2283fc23b16 100644
>>> --- a/drivers/net/bonding/bond_main.c
>>> +++ b/drivers/net/bonding/bond_main.c
>>> @@ -1483,7 +1483,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
>>> net_device *slave_dev)
>>> ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
>>>  
>>> if (!bond->params.fail_over_mac ||
>>> -   BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
>>> +   (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP &&
>>> +bond->params.fail_over_mac != BOND_FOM_KEEPMAC)) {
>>> /* Set slave to master's mac address.  The application already
>>>  * set the master's mac address to that of the first slave
>>>  */
>>> diff --git a/drivers/net/bonding/bond_options.c 
>>> b/drivers/net/bonding/bond_options.c
>>> index 577e57cad1dc..f9653fe4d622 100644
>>> --- a/drivers/net/bonding/bond_options.c
>>> +++ b/drivers/net/bonding/bond_options.c
>>> @@ -125,6 +125,7 @@ static const struct bond_opt_value 
>>> bond_fail_over_mac_tbl[] = {
>>> { "none",   BOND_FOM_NONE,   BOND_VALFLAG_DEFAULT},
>>> { "active", BOND_FOM_ACTIVE, 0},
>>> { "follow", BOND_FOM_FOLLOW, 0},
>>> +   { "keepmac", BOND_FOM_KEEPMAC, 0},
>>> { NULL, -1,  0},
>>>  };
>>>  
>>> diff --git a/include/net/bonding.h b/include/net/bonding.h
>>> index 6360c259da6d..ec3442b3aa83 100644
>>> --- a/include/net/bonding.h
>>> +++ b/include/net/bonding.h
>>> @@ -420,6 +420,7 @@ static inline bool bond_slave_can_tx(struct slave 
>>> *slave)
>>>  #define BOND_FOM_NONE  0
>>>  #define BOND_FOM_ACTIVE1
>>>  #define BOND_FOM_FOLLOW2
>>> +#define BOND_FOM_KEEPMAC   3
>>>  
>>>  #define BOND_ARP_TARGETS_ANY   0
>>>  #define BOND_ARP_TARGETS_ALL   1
>>>
>>>
>>> -J
>>>
>> Hi jorn:
>>
>> Could you please test this patch? I built this patch based on Jay's 
>> suggestion and I think it could fix your problem.
>>
>> ---
>> 

Re: [PATCH v2] drivers: net: cpsw: fix kmemleak false-positive reports for sk buffers

2016-08-10 Thread David Miller
From: Grygorii Strashko 
Date: Wed, 10 Aug 2016 20:02:53 +0300

> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index 0805855..5caef77 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -732,6 +732,11 @@ static void cpsw_rx_handler(void *token, int len, int 
> status)
>   netif_receive_skb(skb);
>   ndev->stats.rx_bytes += len;
>   ndev->stats.rx_packets++;
> + /* SKB pointer will be stored in CPPI RAM (SRAM) which belongs
> +  * to MMIO space, as result false positive memory leak report
> +  * will be generated.
> +  */
> + kmemleak_not_leak(new_skb);
>   } else {
>   ndev->stats.rx_dropped++;
>   new_skb = skb;

There is already a kmemleak_not_leak() statement here in the current
driver.

Please always develop and generate patches against current sources.


Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Jay Vosburgh
Jörn Engel  wrote:

>On Wed, Aug 10, 2016 at 02:26:49PM -0700, Jörn Engel wrote:
>> 
>> Having to set one more parameter is a bit annoying.  It would have to be
>> documented in a prominent place and people would still often miss it.
>> So I wonder if we can make the interface a little nicer.
>> 
>> Options:
>> - If there are no slaves yet and the first slave added is tun, we trust
>>   the users to know what they are doing.  Automatically set
>>   bond->params.fail_over_mac = BOND_FOM_KEEPMAC
>>   Maybe do a printk to inform the user in case of a mistake.

I don't think this is feasible, as I don't see a reliable way to
test for a slave being a tun device (ARPHRD_NONE is not just tun, and we
cannot check the ops as they are not statically built into the kernel).
I'm also not sure that heuristics are the proper way to enable this
functionality in general.

>> - If we get an error and the slave device is tun, do a printk giving the
>>   user enough information to find this parameter.

This could probably be done as a change to the existing logic, e.g.,

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1f276fa30ba6..019c1a689aae 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1443,6 +1443,9 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
res = -EOPNOTSUPP;
goto err_undo_flags;
}
+   } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP &&
+  bond->params.fail_over_mac != BOND_FOM_KEEPMAC) {
+   netdev_err(bond_dev, "The slave device 
specified does not support setting the MAC address, but fail_over_mac is not 
set to keepmac\n");
}
}
 
I haven't tested this, and I'm not sure it will get all corner
cases correct, but this should basically cover it.

-J

>> I'm leaning towards the former, but you probably know a reason why I am
>> wrong again.
>
>Patch below is an implementation of the former.  Not sure if something
>like this is worth considering.
>
>Jörn
>
>--
>To announce that there must be no criticism of the President, or that we
>are to stand by the President, right or wrong, is not only unpatriotic
>and servile, but is morally treasonable to the American public.
>-- Theodore Roosevelt, Kansas City Star, 1918
>
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 1f276fa30ba6..306909a44fab 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -1482,8 +1482,9 @@ int bond_enslave(struct net_device *bond_dev, struct 
>net_device *slave_dev)
>*/
>   ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
> 
>-  if (!bond->params.fail_over_mac ||
>-  BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
>+  if (bond_dev->type != ARPHRD_NONE &&
>+  (!bond->params.fail_over_mac ||
>+   BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
>   /* Set slave to master's mac address.  The application already
>* set the master's mac address to that of the first slave
>*/
>-- 
>2.1.4
>

---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: [PATCH 1/1 linux-next] net: hns: fix typo in g_gmac_stats_string[]

2016-08-10 Thread David Miller
From: Fabian Frederick 
Date: Wed, 10 Aug 2016 17:48:36 +0200

> s/gamc/gmac/
> 
> Signed-off-by: Fabian Frederick 

Applied, thank you.


Re: [PATCH net v1 1/1] tipc: fix variable dereference before NULL check

2016-08-10 Thread David Miller
From: Parthasarathy Bhuvaragan 
Date: Wed, 10 Aug 2016 14:07:34 +0200

> In commit cf6f7e1d5109 ("tipc: dump monitor attributes"),
> I dereferenced a pointer before checking if it's valid.
> This is reported by static check Smatch as:
> net/tipc/monitor.c:733 tipc_nl_add_monitor_peer()
>  warn: variable dereferenced before check 'mon' (see line 731)
> 
> In this commit, we check for a valid monitor before proceeding
> with any other operation.
> 
> Fixes: cf6f7e1d5109 ("tipc: dump monitor attributes")
> Reported-by: Dan Carpenter 
> Signed-off-by: Parthasarathy Bhuvaragan 
> 

Applied, thank you.


[PATCH 4/5] net: add dscp ranges to net cgroup

2016-08-10 Thread Anoop Naravaram
dscp ranges
-----------
This property controls which dscp values the processes in a cgroup are
allowed to use. A process in a cgroup will receive an EACCES error if it
tries to do any of these things:
* set a socket's IP_TOS option to a value whose dscp field (bits 7:2) is
  outside the range
* use a socket to send a message in which the IP_TOS ancillary data is
  set to a value whose dscp field is outside the range

This property is exposed to userspace through the 'net.dscp_ranges' file,
similar to the bind and listen port ranges.

Tested: wrote python to attempt to setsockopt the IP_TOS option to a
value with an out-of-range dscp field, and expect a failure

Signed-off-by: Anoop Naravaram 
---
 Documentation/cgroup-v1/net.txt | 14 ++
 include/net/net_cgroup.h|  6 ++
 net/core/net_cgroup.c   | 34 --
 net/ipv4/ip_sockglue.c  | 13 +
 net/ipv6/datagram.c |  9 +
 net/ipv6/ipv6_sockglue.c|  8 
 6 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/Documentation/cgroup-v1/net.txt b/Documentation/cgroup-v1/net.txt
index a14fd1c..ea2f1db 100644
--- a/Documentation/cgroup-v1/net.txt
+++ b/Documentation/cgroup-v1/net.txt
@@ -30,6 +30,20 @@ This property is exposed to userspace through the 
'net.listen_port_ranges' file,
 as ranges of ports that the processes can listen on (as described in the HOW TO
 INTERACT WITH RANGES FILES section).
 
+dscp ranges
+-----------
+This property controls which dscp values the processes in a cgroup are
+allowed to use. A process in a cgroup will receive an EACCES error if it
+tries to do any of these things:
+* set a socket's IP_TOS option to a value whose dscp field (bits 7:2) is
+  outside the range
+* use a socket to send a message in which the IP_TOS ancillary data is
+  set to a value whose dscp field is outside the range
+
+This property is exposed to userspace through the 'net.dscp_ranges' file, as
+ranges of dscp values that the process can use (as described in the HOW TO
+INTERACT WITH RANGES FILES section).
+
 udp port usage and limit
 
 This property controls the limit of udp ports that can be used by the
diff --git a/include/net/net_cgroup.h b/include/net/net_cgroup.h
index 25a9def..d89e98d 100644
--- a/include/net/net_cgroup.h
+++ b/include/net/net_cgroup.h
@@ -23,6 +23,7 @@
 enum {
NETCG_LISTEN_RANGES,
NETCG_BIND_RANGES,
+   NETCG_DSCP_RANGES,
NETCG_NUM_RANGE_TYPES
 };
 
@@ -73,6 +74,7 @@ struct net_cgroup {
 
 bool net_cgroup_bind_allowed(u16 port);
 bool net_cgroup_listen_allowed(u16 port);
+bool net_cgroup_dscp_allowed(u8 dscp);
 bool net_cgroup_acquire_udp_port(void);
 void net_cgroup_release_udp_port(void);
 
@@ -85,6 +87,10 @@ static inline bool net_cgroup_listen_allowed(u16 port)
 {
return true;
 }
+static inline bool net_cgroup_dscp_allowed(u8 dscp)
+{
+   return true;
+}
 static inline bool net_cgroup_acquire_udp_port(void)
 {
return true;
diff --git a/net/core/net_cgroup.c b/net/core/net_cgroup.c
index 2f58e13..73dc5e7 100644
--- a/net/core/net_cgroup.c
+++ b/net/core/net_cgroup.c
@@ -21,6 +21,9 @@
 #define MIN_PORT_VALUE 0
 #define MAX_PORT_VALUE 65535
 
+#define MIN_DSCP_VALUE 0
+#define MAX_DSCP_VALUE 63
+
 /* Deriving MAX_ENTRIES from MAX_WRITE_SIZE as a rough estimate */
 #define MAX_ENTRIES ((MAX_WRITE_SIZE - offsetof(struct net_ranges, range)) /   
\
 BYTES_PER_ENTRY)
@@ -161,7 +164,10 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
MIN_PORT_VALUE, MAX_PORT_VALUE) ||
alloc_init_net_ranges(
>whitelists[NETCG_LISTEN_RANGES],
-   MIN_PORT_VALUE, MAX_PORT_VALUE)) {
+   MIN_PORT_VALUE, MAX_PORT_VALUE) ||
+   alloc_init_net_ranges(
+   >whitelists[NETCG_DSCP_RANGES],
+   MIN_DSCP_VALUE, MAX_DSCP_VALUE)) {
free_net_cgroup(netcg);
/* if any of these cause an error, return ENOMEM */
return ERR_PTR(-ENOMEM);
@@ -178,7 +184,11 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
alloc_copy_net_ranges(
>whitelists[NETCG_LISTEN_RANGES],
MIN_PORT_VALUE, MAX_PORT_VALUE,
-   
_netcg->whitelists[NETCG_LISTEN_RANGES])) {
+   _netcg->whitelists[NETCG_LISTEN_RANGES]) 
||
+   alloc_copy_net_ranges(
+   >whitelists[NETCG_DSCP_RANGES],
+   MIN_DSCP_VALUE, MAX_DSCP_VALUE,
+   _netcg->whitelists[NETCG_DSCP_RANGES])) {
free_net_cgroup(netcg);
   

[PATCH 1/5] net: create the networking cgroup controller

2016-08-10 Thread Anoop Naravaram
This is a skeleton implementation of a cgroup controller for networking
properties. It will be used for:
* limiting the specific ports that a process in a cgroup is allowed to bind
  to or listen on
* restricting which dscp values processes can use with their sockets
* limiting the total number of udp ports that can be used by a process

Also there is new documentation of this controller in
Documentation/cgroup-v1/net.txt.

Signed-off-by: Anoop Naravaram 
---
 Documentation/cgroup-v1/net.txt |  9 ++
 include/linux/cgroup_subsys.h   |  4 +++
 include/net/net_cgroup.h| 27 ++
 net/Kconfig | 10 +++
 net/core/Makefile   |  1 +
 net/core/net_cgroup.c   | 62 +
 6 files changed, 113 insertions(+)
 create mode 100644 Documentation/cgroup-v1/net.txt
 create mode 100644 include/net/net_cgroup.h
 create mode 100644 net/core/net_cgroup.c

diff --git a/Documentation/cgroup-v1/net.txt b/Documentation/cgroup-v1/net.txt
new file mode 100644
index 0000000..580c214
--- /dev/null
+++ b/Documentation/cgroup-v1/net.txt
@@ -0,0 +1,9 @@
+Networking cgroup
+=
+
+The net cgroup controller keeps track of the following networking related
+properties for each process group:
+* bind port ranges
+* listen port ranges
+* dscp ranges
+* udp port usage and limit
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..81ff75b 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -40,6 +40,10 @@ SUBSYS(freezer)
 SUBSYS(net_cls)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_NET)
+SUBSYS(net)
+#endif
+
 #if IS_ENABLED(CONFIG_CGROUP_PERF)
 SUBSYS(perf_event)
 #endif
diff --git a/include/net/net_cgroup.h b/include/net/net_cgroup.h
new file mode 100644
index 000..8e98803
--- /dev/null
+++ b/include/net/net_cgroup.h
@@ -0,0 +1,27 @@
+/*
+ * net_cgroup.hNetworking Control Group
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Authors:Anoop Naravaram 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#ifndef _NET_CGROUP_H
+#define _NET_CGROUP_H
+
+#include 
+
+#ifdef CONFIG_CGROUP_NET
+
+struct net_cgroup {
+   struct cgroup_subsys_state  css;
+};
+
+#endif /* CONFIG_CGROUP_NET */
+#endif  /* _NET_CGROUP_H */
diff --git a/net/Kconfig b/net/Kconfig
index c2cdbce..47f68bd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -278,6 +278,16 @@ config CGROUP_NET_CLASSID
  Cgroup subsystem for use as general purpose socket classid marker 
that is
  being used in cls_cgroup and for netfilter matching.
 
+config CGROUP_NET
+   bool "Networking cgroup"
+   depends on CGROUPS
+   ---help---
+ Cgroup subsystem for use in managing several networking properties,
+ such as restricting which ports are available for processes to bind
+ and listen on, restricting which dscp values processes can use with
+ their sockets, and limiting the number of udp ports that can be
+ acquired by processes from the cgroup.
+
 config NET_RX_BUSY_POLL
bool
default y
diff --git a/net/core/Makefile b/net/core/Makefile
index d6508c2..9dbc8b6 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -23,6 +23,7 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
 obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
 obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
+obj-$(CONFIG_CGROUP_NET) += net_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
diff --git a/net/core/net_cgroup.c b/net/core/net_cgroup.c
new file mode 100644
index 000..3a46960
--- /dev/null
+++ b/net/core/net_cgroup.c
@@ -0,0 +1,62 @@
+/*
+ * net/core/net_cgroup.c   Networking Control Group
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * Authors:Anoop Naravaram 
+ */
+
+#include 
+#include 
+
+static struct net_cgroup *css_to_net_cgroup(struct cgroup_subsys_state *css)
+{
+   return css ? container_of(css, struct net_cgroup, css) : NULL;
+}
+
+static struct net_cgroup *task_to_net_cgroup(struct task_struct *p)
+{
+   return css_to_net_cgroup(task_css(p, net_cgrp_id));
+}
+
+static struct net_cgroup *net_cgroup_to_parent(struct net_cgroup *netcg)
+{
+   return css_to_net_cgroup(netcg->css.parent);
+}
+
+static void 

[PATCH 2/5] net: add bind/listen ranges to net cgroup

2016-08-10 Thread Anoop Naravaram
bind port ranges

This property controls which ports the processes in a cgroup are allowed
to bind to. If a process in a cgroup tries to bind a socket to a port
that is not within the range(s) permitted by the cgroup, it will receive an
EACCES error.

From userspace, you can get or set the bind port ranges by accessing the
'net.bind_port_ranges' file. To set the ranges of a cgroup, write the
comma-separated ranges to the file, where each range could be either a
pair of ports separated by a hyphen (-), or just an individual port. For
example, say you want to allow all the processes in a cgroup to be allowed
to bind to ports 100 through 200 (inclusive), 300 through 320 (inclusive)
and 350. Then you can write the string "100-200,300-320,350" to the
'net.bind_port_ranges' file. When reading the file, any individual ports
will be read as a "start-end" range where the start and end are equal.
The example above would be read as "100-200,300-320,350-350".

The controller imposes the invariant that the ranges of any cgroup must
be a subset (or equal set) of the ranges of its parents (i.e., processes
in a cgroup cannot be allowed to bind to any port that is not allowed
by the parent cgroup). This constraint allows you to ensure that not
only do the processes in a cgroup follow the bind range, but so do the
processes in any of the cgroup's descendants. This is enforced
in two ways: 1) when a cgroup is initialized, its ranges
are inherited from its parent, and 2) when attempting to set the ranges of
a cgroup, the kernel ensures that the condition is true for the current
cgroup and all its children, or otherwise fails to change the ranges
with error EINVAL.

listen port ranges
--
This property controls which ports the processes in a cgroup are allowed
to listen on. If a process in a cgroup tries to listen using a socket
bound to a port that is not within the range(s) permitted by the cgroup,
it will receive an EACCES error.

Configuring this property works the same way as with bind port ranges,
except using the file 'net.listen_port_ranges' instead of
'net.bind_port_ranges'. The range subset invariant is imposed
independently for bind and listen port ranges. For now the kernel does
not enforce that the listen range must be a subset of the bind range.

Tested: Used a python unittest to set the range and try
binding/listening to ports inside and outside the range, and ensure
that an error occurred only when it should. Also, ensures that an error
occurs when trying to violate the subset condition.

Signed-off-by: Anoop Naravaram 
---
 Documentation/cgroup-v1/net.txt |  46 ++
 include/net/net_cgroup.h|  41 +
 net/core/net_cgroup.c   | 341 
 net/ipv4/af_inet.c  |   8 +
 net/ipv4/inet_connection_sock.c |   7 +
 net/ipv6/af_inet6.c |   7 +
 6 files changed, 450 insertions(+)

diff --git a/Documentation/cgroup-v1/net.txt b/Documentation/cgroup-v1/net.txt
index 580c214..8c50c61 100644
--- a/Documentation/cgroup-v1/net.txt
+++ b/Documentation/cgroup-v1/net.txt
@@ -7,3 +7,49 @@ properties for each process group:
 * listen port ranges
 * dscp ranges
 * udp port usage and limit
+
+bind port ranges
+
+This property controls which ports the processes in a cgroup are allowed
+to bind to. If a process in a cgroup tries to bind a socket to a port
+that is not within the range(s) permitted by the cgroup, it will receive an
+EACCES error.
+
+This property is exposed to userspace through the 'net.bind_port_ranges' file,
+as ranges of ports that the processes can bind to (as described in the HOW TO
+INTERACT WITH RANGES FILES section).
+
+listen port ranges
+--
+This property controls which ports the processes in a cgroup are allowed
+to listen on. If a process in a cgroup tries to listen using a socket
+bound to a port that is not within the range(s) permitted by the cgroup,
+it will receive an EACCES error.
+
+This property is exposed to userspace through the 'net.listen_port_ranges' 
file,
+as ranges of ports that the processes can listen on (as described in the HOW TO
+INTERACT WITH RANGES FILES section).
+
+HOW TO INTERACT WITH RANGES FILES
+-
+Some cgroup properties can be expressed as ranges of allowed integers. From
+userspace, you can get or set them by accessing the cgroup file corresponding 
to
+the property you want to interact with. To set the ranges, write a list of
+comma-separated ranges to the file, where each range could be either a pair of
+integers separated by a hyphen (-), or just an individual integer. For example,
+say you want a cgroup to allow the integers 100 through 200 (inclusive), 300
+through 320 (inclusive) and 350. Then you can write the string
+"100-200,300-320,350" to the file. When reading the file, any individual
+integers will be read as a "start-end" range where the start and 

[PATCH 5/5] net: add test for net cgroup

2016-08-10 Thread Anoop Naravaram
Created a file scripts/cgroup/net_cgroup_test.py that tests the
functionality of the net cgroup as described in previous commit logs.

Signed-off-by: Anoop Naravaram 
---
 scripts/cgroup/net_cgroup_test.py | 359 ++
 1 file changed, 359 insertions(+)
 create mode 100755 scripts/cgroup/net_cgroup_test.py

diff --git a/scripts/cgroup/net_cgroup_test.py 
b/scripts/cgroup/net_cgroup_test.py
new file mode 100755
index 000..604f662
--- /dev/null
+++ b/scripts/cgroup/net_cgroup_test.py
@@ -0,0 +1,359 @@
+#!/usr/grte/v4/bin/python2.7
+import unittest
+import os
+import socket
+import shutil
+import multiprocessing
+
+cgroup_net_root = '/dev/cgroup/net'
+
+def create_cgroup(name):
+'''
+Creates a cgroup with the given name. The name should also include the 
names
+of all ancestors separated by slashes, such as 'a/b/c'. Returns a path to
+the directory of the newly created cgroup.
+'''
+cgroup_dir = os.path.join(cgroup_net_root, name)
+while True:
+try:
+os.mkdir(cgroup_dir)
+break
+except OSError as e:
+# remove it if it already exists, then try to create again
+# there will be errors when rmtree tries to remove the cgroup 
files,
+# but these errors should be ignored because we only care about
+# rmdir'ing the directories, which will automatically get rid of 
the
+# files inside them
+shutil.rmtree(cgroup_dir, ignore_errors=True)
+
+return cgroup_dir
+
+
+def parse_ranges(ranges_str):
+'''
+Converts a range string like "100-200,300-400" into a set of 2-tuples like
+{(100,200),(300,400)}.
+'''
+return set(tuple(int(l) for l in r.strip().split('-'))
+   for r in ranges_str.split(','))
+
+def acquire_udp_ports(cgroup_dir, e2, n, addr, numfailq):
+'''
+Waits for the event e1, attempts to acquire n udp ports connected to addr,
+and then puts the number of failures on the queue and waits for e2. Then,
+all sockets are closed. (Intended to be called as a subprocess.)
+
+While waiting for e1, the parent process can set this process's cgroup.
+While waiting for e2, the parent process can read the udp statistics while
+this process is still alive.
+'''
+
+with open(os.path.join(cgroup_dir, 'tasks'), 'w') as f:
+# add proc1 to cgroup a/b
+f.write(str(os.getpid()))
+
+socketset = set()
+numfail = 0
+for _ in xrange(n):
+s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+try:
+s.connect(addr)
+except socket.error as e:
+numfail += 1
+socketset.add(s)
+
+if numfailq is not None:
+numfailq.put(numfail)
+
+if e2 is not None:
+e2.wait()
+
+for s in socketset:
+s.close()
+
+
+class NetCgroupTest(unittest.TestCase):
+
+def test_port_range_check(self):
+'''
+Test that the kernel is correctly checking that a port is in the
+relevant range when a process in a net cgroup is trying to bind a 
socket
+to it, or trying to listen on a socket bound to it.
+'''
+
+# create a new cgroup a
+cgroup_a_dir = create_cgroup('a')
+
+# add current process to cgroup a
+with open(os.path.join(cgroup_a_dir, 'tasks'), 'w') as f:
+f.write(str(os.getpid()))
+
+# set bind and listen range of cgroup a
+with open(os.path.join(cgroup_a_dir, 'net.bind_port_ranges'), 'w') as 
f:
+f.write('300-400,500')
+with open(os.path.join(cgroup_a_dir, 'net.listen_port_ranges'), 'w') 
as f:
+f.write('350-400')
+
+# try binding and listening on various ports, and check if they succeed
+# or fail appropriately
+s = socket.socket()
+try:
+s.bind(('0.0.0.0', 350)) # should bind and listen successfully
+try:
+s.listen(5)
+except socket.error:
+self.fail('unexpectedly failed to listen')
+except socket.error:
+self.fail('unexpectedly failed to bind')
+s.close()
+
+s = socket.socket()
+try:
+s.bind(('0.0.0.0', 370)) # should bind and listen successfully
+try:
+s.listen(5)
+except socket.error:
+self.fail('unexpectedly failed to listen')
+except socket.error:
+self.fail('unexpectedly failed to bind')
+s.close()
+
+s = socket.socket()
+try:
+s.bind(('0.0.0.0', 320)) # should bind successfully but fail to 
listen
+with self.assertRaises(socket.error):
+s.listen(5)
+except socket.error:
+self.fail('unexpectedly failed to bind')
+s.close()
+
+s = socket.socket()
+try:
+s.bind(('0.0.0.0', 500)) # should bind successfully 

[PATCH 3/5] net: add udp limit to net cgroup

2016-08-10 Thread Anoop Naravaram
udp port limit
--
This property controls the limit of udp ports that can be used by the
processes in a cgroup. The controller manages udp statistics (usage,
limit, etc) for each cgroup. Every cgroup also keeps track of the udp
ports acquired by its descendants. If a process tries to acquire a port
when its cgroup has already reached its limit, it will fail with error
EACCES. It will also fail if one of the cgroup's ancestors has reached its
limit. There are 5 files exposed to userspace to configure this property:

* 'net.udp_usage': Reading this file gives the number of udp ports used by
processes in this cgroup and all its descendants.
* 'net.udp_limit': Writing this file sets the total number of udp ports
that can be used by processes in this cgroup and all
its descendants. This file can also be read.
* 'net.udp_maxusage': Reading this file gives the highest value of
net.udp_usage that has been seen for this cgroup.
* 'net.udp_failcnt': Reading this file gives the number of times a
process in this cgroup or one of its descendants has attempted to acquire
a udp port but failed because the limit of this cgroup was reached.
* 'net.udp_underflowcnt': Reading this file gives the number of times a
process in this cgroup or one of its descendants released a udp port when
the usage value of this cgroup was 0.

When a new cgroup is created, its udp limit is copied from its parent.

Tested: Set the udp limit, then used python to use several udp ports,
ensuring that it is successful up until the limit, after which there
should be an error. Also tried different limits at different levels of the
hierarchy.

Signed-off-by: Anoop Naravaram 
---
 Documentation/cgroup-v1/net.txt |  26 
 include/net/net_cgroup.h|  29 +
 net/core/net_cgroup.c   | 273 
 net/ipv4/udp.c  |   8 ++
 4 files changed, 336 insertions(+)

diff --git a/Documentation/cgroup-v1/net.txt b/Documentation/cgroup-v1/net.txt
index 8c50c61..a14fd1c 100644
--- a/Documentation/cgroup-v1/net.txt
+++ b/Documentation/cgroup-v1/net.txt
@@ -30,6 +30,32 @@ This property is exposed to userspace through the 
'net.listen_port_ranges' file,
 as ranges of ports that the processes can listen on (as described in the HOW TO
 INTERACT WITH RANGES FILES section).
 
+udp port usage and limit
+
+This property controls the limit of udp ports that can be used by the
+processes in a cgroup. The controller manages udp statistics (usage, limit, 
etc)
+for each cgroup. Every cgroup also keeps track of the udp ports acquired by its
+descendants. If a process tries to acquire a port when its cgroup has
+already reached its limit, it will fail with error EACCES. It will also fail if
+one of the cgroup's ancestors has reached its limit. There are 5 files
+exposed to userspace to configure this property:
+
+* 'net.udp_usage': Reading this file gives the number of udp ports used by
+processes in this cgroup and all its descendants.
+* 'net.udp_limit': Writing this file sets the total number of udp ports
+that can be used by processes in this cgroup and all
+its descendants. This file can also be read.
+* 'net.udp_maxusage': Reading this file gives the highest value of
+net.udp_usage that has been seen for this cgroup.
+* 'net.udp_failcnt': Reading this file gives the number of times a
+process in this cgroup or one of its descendants has attempted to acquire a
+udp port but failed because the limit of this cgroup was reached.
+* 'net.udp_underflowcnt': Reading this file gives the number of times a
+process in this cgroup or one of its descendants released a udp port when the
+usage value of this cgroup was 0.
+
+When a new cgroup is created, its udp limit is copied from its parent.
+
 HOW TO INTERACT WITH RANGES FILES
 -
 Some cgroup properties can be expressed as ranges of allowed integers. From
diff --git a/include/net/net_cgroup.h b/include/net/net_cgroup.h
index 6ee79d5..25a9def 100644
--- a/include/net/net_cgroup.h
+++ b/include/net/net_cgroup.h
@@ -26,6 +26,16 @@ enum {
NETCG_NUM_RANGE_TYPES
 };
 
+/* udp statistic type */
+enum {
+   NETCG_LIMIT_UDP,
+   NETCG_USAGE_UDP,
+   NETCG_MAXUSAGE_UDP,
+   NETCG_FAILCNT_UDP,
+   NETCG_UNDERFLOWCNT_UDP,
+   NETCG_NUM_UDP_STATS
+};
+
 struct net_range {
u16 min_value;
u16 max_value;
@@ -43,9 +53,19 @@ struct net_range_types {
u16 upper_limit;
 };
 
+struct cgroup_udp_stats {
+   /* Use atomics to protect against multiple writers */
+   atomic64_t  udp_limitandusage; /* 32MSB => limit, 32LSB => usage */
+   atomic_tudp_maxusage;
+   atomic_tudp_failcnt;
+   atomic_tudp_underflowcnt;
+};
+
 struct net_cgroup {
struct cgroup_subsys_state  css;
 
+   struct cgroup_udp_stats udp_stats;
+
/* these fields are required for 

[PATCH 0/5] Networking cgroup controller

2016-08-10 Thread Anoop Naravaram
This patchset introduces a cgroup controller for the networking subsystem as a
whole. As of now, this controller will be used for:

* Limiting the specific ports that a process in a cgroup is allowed to bind
  to or listen on. For example, you can say that all the processes in a
  cgroup can only bind to ports 1000-2000, and listen on ports 1000-1100, which
  guarantees that the remaining ports will be available for other processes.

* Restricting which DSCP values processes can use with their sockets. For
  example, you can say that all the processes in a cgroup can only send
  packets with a DSCP tag between 48 and 63 (corresponding to TOS values of
  192 to 255).

* Limiting the total number of udp ports that can be used by a process in a
  cgroup. For example, you can say that all the processes in one cgroup are
  allowed to use a total of up to 100 udp ports. Since the total number of udp
  ports that can be used by all processes is limited, this is useful for
  rationing out the ports to different process groups.

In the future, more networking-related properties may be added to this
controller.

Anoop Naravaram (5):
  net: create the networking cgroup controller
  net: add bind/listen ranges to net cgroup
  net: add udp limit to net cgroup
  net: add dscp ranges to net cgroup
  net: add test for net cgroup

 Documentation/cgroup-v1/net.txt   |  95 +
 include/linux/cgroup_subsys.h |   4 +
 include/net/net_cgroup.h  | 103 ++
 net/Kconfig   |  10 +
 net/core/Makefile |   1 +
 net/core/net_cgroup.c | 706 ++
 net/ipv4/af_inet.c|   8 +
 net/ipv4/inet_connection_sock.c   |   7 +
 net/ipv4/ip_sockglue.c|  13 +
 net/ipv4/udp.c|   8 +
 net/ipv6/af_inet6.c   |   7 +
 net/ipv6/datagram.c   |   9 +
 net/ipv6/ipv6_sockglue.c  |   8 +
 scripts/cgroup/net_cgroup_test.py | 359 +++
 14 files changed, 1338 insertions(+)
 create mode 100644 Documentation/cgroup-v1/net.txt
 create mode 100644 include/net/net_cgroup.h
 create mode 100644 net/core/net_cgroup.c
 create mode 100755 scripts/cgroup/net_cgroup_test.py

-- 
2.8.0.rc3.226.g39d4020



Re: [PATCH 1/1 linux-next] net: hns: fix typo in g_gmac_stats_string[]

2016-08-10 Thread Yisen Zhuang
Hi Fabian,

This patch is fine to me, many thanks.

Yisen

在 2016/8/10 23:48, Fabian Frederick 写道:
> s/gamc/gmac/
> 
> Signed-off-by: Fabian Frederick 
> ---
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c 
> b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
> index 1235c7f..1e1eb92 100644
> --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
> +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
> @@ -17,7 +17,7 @@ static const struct mac_stats_string g_gmac_stats_string[] 
> = {
>   {"gmac_rx_octets_total_ok", MAC_STATS_FIELD_OFF(rx_good_bytes)},
>   {"gmac_rx_octets_bad", MAC_STATS_FIELD_OFF(rx_bad_bytes)},
>   {"gmac_rx_uc_pkts", MAC_STATS_FIELD_OFF(rx_uc_pkts)},
> - {"gamc_rx_mc_pkts", MAC_STATS_FIELD_OFF(rx_mc_pkts)},
> + {"gmac_rx_mc_pkts", MAC_STATS_FIELD_OFF(rx_mc_pkts)},
>   {"gmac_rx_bc_pkts", MAC_STATS_FIELD_OFF(rx_bc_pkts)},
>   {"gmac_rx_pkts_64octets", MAC_STATS_FIELD_OFF(rx_64bytes)},
>   {"gmac_rx_pkts_65to127", MAC_STATS_FIELD_OFF(rx_65to127)},
> 



Re: [PATCH] net: macb: Add 64 bit addressing support for GEM

2016-08-10 Thread David Miller
From: Harini Katakam 
Date: Tue, 9 Aug 2016 13:15:53 +0530

> This patch adds support for 64 bit addressing and BDs.
> -> Enable 64 bit addressing in DMACFG register.
> -> Set DMA mask when design config register shows support for 64 bit addr.
> -> Add new BD words for higher address when 64 bit DMA support is present.
> -> Add and update TBQPH and RBQPH for MSB of BD pointers.
> -> Change extraction and updation of buffer addresses to use
> 64 bit address.
> -> In gem_rx extract address in one place instead of two and use a
> separate flag for RXUSED.
> 
> Signed-off-by: Harini Katakam 

It's not the cleanest looking thing in the world with all of these
ifdefs, but I can't suggest anything better, so applied to net-next
thanks.


Re: [PATCH net-next] qed*: Add support for ethtool link_ksettings callbacks.

2016-08-10 Thread David Miller
From: Sudarsana Reddy Kalluru 
Date: Tue, 9 Aug 2016 03:51:23 -0400

> This patch adds the driver implementation for ethtool link_ksettings
> callbacks. qed driver now defines/uses the qed specific masks for
> representing link capability values. qede driver maps these values to
> new link modes defined by the kernel implementation of link_ksettings.
> 
> Please consider applying this to 'net-next' branch.
> 
> Signed-off-by: Sudarsana Reddy Kalluru 
> Signed-off-by: Yuval Mintz 

Applied, thanks.


Re: [PATCH] net: ip_finish_output_gso: If skb_gso_network_seglen exceeds MTU, allow segmentation for gre tunneled skbs

2016-08-10 Thread David Miller
From: we...@ucloud.cn
Date: Tue,  9 Aug 2016 15:04:21 +0800

> From: wenxu 
> 
> commit b8247f095edd ("net: ip_finish_output_gso: If skb_gso_network_seglen
> exceeds MTU, allow segmentation for local udp tunneled skbs")
> 
> Given:
>  - tap0 and ovs-gre
>  - ovs-gre stacked on eth0, eth0 having the small mtu
> 
> After encapsulation these skbs have skb_gso_network_seglen that exceed
> eth0's ip_skb_dst_mtu. So each final segment would be larger than
> eth0 mtu. These packets may be dropped.
> 
> It has the same problem if tap0 bridge with ipgre or gretap device. So
> the IPSKB_FRAG_SEGS flags should also be set in gre tunneled skbs.
> 
> Signed-off-by: wenxu 

I am rather certain that this test is intentionally restricted to
UDP tunnel endpoints, because GRE and other tunnel types are PMTU safe.

Hannes and Shmulik?


Re: [PATCH v3 00/13] net: ethernet: ti: cpsw: split driver data and per ndev data

2016-08-10 Thread David Miller
From: Ivan Khoronzhuk 
Date: Wed, 10 Aug 2016 02:22:31 +0300

> In dual_emac mode the driver can handle 2 network devices. Each of them can 
> use
> its own private data and common data/resources. This patchset splits common 
> driver
> data/resources and private per net device data.
> It leads to:
> - reduce memory usage
> - increase code readability
> - allows adding a bunch of simplifications
> - create prerequisites to add multi-channel support,
>   when channels are shared between net devices
 ...

Series applied to net-next, thank you.


Re: [PATCH v6 1/1] rps: Inspect PPTP encapsulated by GRE to get flow hash

2016-08-10 Thread David Miller
From: f...@ikuai8.com
Date: Tue,  9 Aug 2016 12:38:24 +0800

> From: Gao Feng 
> 
> The PPTP is encapsulated by GRE header with that GRE_VERSION bits
> must contain one. But current GRE RPS needs the GRE_VERSION must be
> zero. So RPS does not work for PPTP traffic.
> 
> In my test environment, there are four MIPS cores, and all traffic
> are passed through by PPTP. As a result, only one core is 100% busy
> while other three cores are very idle. After this patch, the usage
> of four cores are balanced well.
> 
> Signed-off-by: Gao Feng 

Applied to net-next, thanks.


Re: [PATCH 0/2] Convert qdisc linked list into a hashtable

2016-08-10 Thread David Miller
From: Jiri Kosina 
Date: Wed, 10 Aug 2016 11:00:42 +0200 (CEST)

> This is a respin of the v6 of the original patch [1], split into two-patch 
> series as requested by davem; first patch fixes all symbol conflicts 
> that'd happen once netdevice.h starts to include hashtable.h, the second 
> one performs the actual switch to hashtable.
> 
> I've preserved Cong's Reviewed-by:, as code-wise this series is identical 
> to the original v6 of the patch.
> 
> [1] lkml.kernel.org/r/alpine.lnx.2.00.1608011220580.22...@cbobk.fhfr.pm

Series applied, thank you.


Re: [PATCHv2 3/4] pci: Determine actual VPD size on first access

2016-08-10 Thread Benjamin Herrenschmidt
On Wed, 2016-08-10 at 08:47 -0700, Alexander Duyck wrote:
> 
> The problem is if we don't do this it becomes possible for a guest to
> essentially cripple a device on the host by just accessing VPD
> regions that aren't actually viable on many devices. 

And ? We already can cripple the device in so many different ways
simply because we have pretty much full BAR access to it...

>  We are much better off
> in terms of security and stability if we restrict access to what
> should be accessible. 

Bollox. I've heard that argument over and over again, it never stood
and still doesn't.

We have full BAR access for god sake. We can already destroy the device
in many cases (think: reflashing microcode, internal debug bus access
with a route to the config space, voltage/freq control ).

We aren't protecting anything more here, we are just adding layers of
bloat, complication and bugs.

>  In this case what has happened is that the
> vendor threw in an extra out-of-spec block and just expected it to
> work.

Like vendors do all the time in all sorts of places

I still completely fail to see the point in acting as a filtering
middle man.

> In order to work around it we just need to add a small function
> to drivers/pci/quirks.c that would update the VPD size reported so
> that it matches what the hardware is actually providing instead of
> what we can determine based on the VPD layout.
> 
> Really working around something like this is not much different than
> what we would have to do if the vendor had stuffed the data in some
> reserved section of their PCI configuration space.

It is, in both cases we shouldn't have VFIO or the host involved. We
should just let the guest config space accesses go through.

>   We end up needing
> to add special quirks any time a vendor goes out-of-spec for some
> one-off configuration interface that only they are ever going to use.

Cheers,
Ben.



[PATCH 1/1 v3] net: i40e: use matching format identifiers

2016-08-10 Thread Heinrich Schuchardt
i is defined as int but output as %u several times.
Adjust the format identifiers.

Signed-off-by: Heinrich Schuchardt 

---

v3:
fix typos in title, old title:
net: i10e: use matching format indentifiers

v2:
Keep definition of i as int.

 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index c912e04..94ac712 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1560,13 +1560,13 @@ static void i40e_get_strings(struct net_device *netdev, 
u32 stringset,
}
 #endif
for (i = 0; i < vsi->num_queue_pairs; i++) {
-   snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_packets", i);
+   snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_packets", i);
p += ETH_GSTRING_LEN;
-   snprintf(p, ETH_GSTRING_LEN, "tx-%u.tx_bytes", i);
+   snprintf(p, ETH_GSTRING_LEN, "tx-%d.tx_bytes", i);
p += ETH_GSTRING_LEN;
-   snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_packets", i);
+   snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_packets", i);
p += ETH_GSTRING_LEN;
-   snprintf(p, ETH_GSTRING_LEN, "rx-%u.rx_bytes", i);
+   snprintf(p, ETH_GSTRING_LEN, "rx-%d.rx_bytes", i);
p += ETH_GSTRING_LEN;
}
if (vsi != pf->vsi[pf->lan_vsi] || pf->hw.partition_id != 1)
@@ -1581,16 +1581,16 @@ static void i40e_get_strings(struct net_device *netdev, 
u32 stringset,
}
for (i = 0; i < I40E_MAX_TRAFFIC_CLASS; i++) {
snprintf(p, ETH_GSTRING_LEN,
-"veb.tc_%u_tx_packets", i);
+"veb.tc_%d_tx_packets", i);
p += ETH_GSTRING_LEN;
snprintf(p, ETH_GSTRING_LEN,
-"veb.tc_%u_tx_bytes", i);
+"veb.tc_%d_tx_bytes", i);
p += ETH_GSTRING_LEN;
snprintf(p, ETH_GSTRING_LEN,
-"veb.tc_%u_rx_packets", i);
+"veb.tc_%d_rx_packets", i);
p += ETH_GSTRING_LEN;
snprintf(p, ETH_GSTRING_LEN,
-"veb.tc_%u_rx_bytes", i);
+"veb.tc_%d_rx_bytes", i);
p += ETH_GSTRING_LEN;
}
}
@@ -1601,23 +1601,23 @@ static void i40e_get_strings(struct net_device *netdev, 
u32 stringset,
}
for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
snprintf(p, ETH_GSTRING_LEN,
-"port.tx_priority_%u_xon", i);
+"port.tx_priority_%d_xon", i);
p += ETH_GSTRING_LEN;
snprintf(p, ETH_GSTRING_LEN,
-"port.tx_priority_%u_xoff", i);
+"port.tx_priority_%d_xoff", i);
p += ETH_GSTRING_LEN;
}
for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
snprintf(p, ETH_GSTRING_LEN,
-"port.rx_priority_%u_xon", i);
+"port.rx_priority_%d_xon", i);
p += ETH_GSTRING_LEN;
snprintf(p, ETH_GSTRING_LEN,
-"port.rx_priority_%u_xoff", i);
+"port.rx_priority_%d_xoff", i);
p += ETH_GSTRING_LEN;
}
for (i = 0; i < I40E_MAX_USER_PRIORITY; i++) {
snprintf(p, ETH_GSTRING_LEN,
-"port.rx_priority_%u_xon_2_xoff", i);
+"port.rx_priority_%d_xon_2_xoff", i);
p += ETH_GSTRING_LEN;
}
/* BUG_ON(p - data != I40E_STATS_LEN * ETH_GSTRING_LEN); */
-- 
2.1.4



Re: [PATCH v6 2/3] Documentation: DT: net: Add Xilinx gmiitorgmii converter device tree binding documentation

2016-08-10 Thread Rob Herring
On Wed, Aug 10, 2016 at 11:20:07AM +0530, Kedareswara rao Appana wrote:
> Device-tree binding documentation for xilinx gmiitorgmii converter.
> 
> Signed-off-by: Kedareswara rao Appana 
> ---
> Changes for v6:
> ---> Removed mdio description as suggested by Florian.
> Changes for v5:
> ---> Fixed Indentation in the example as suggested by Michal.
> Changes for v4:
> --> Modified compatible as suggested by Rob.
> --> Removed underscores from the converter node name as suggested by Rob.
> Changes for v3:
> --> None.
> Changes for v2:
> --> New patch
> 
>  .../devicetree/bindings/net/xilinx_gmii2rgmii.txt  | 35 
> ++
>  1 file changed, 35 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt

Acked-by: Rob Herring 


Re: [PATCH v2 1/1] Fix unbound rx buffer

2016-08-10 Thread David Miller

All of these patches need to be resubmitted with a proper subsystem
prefix in the Subject line.

Thank you.


Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Jörn Engel
On Wed, Aug 10, 2016 at 02:26:49PM -0700, Jörn Engel wrote:
> 
> Having to set one more parameter is a bit annoying.  It would have to be
> documented in a prominent place and people would still often miss it.
> So I wonder if we can make the interface a little nicer.
> 
> Options:
> - If there are no slaves yet and the first slave added is tun, we trust
>   the users to know what they are doing.  Automatically set
>   bond->params.fail_over_mac = BOND_FOM_KEEPMAC
>   Maybe do a printk to inform the user in case of a mistake.
> - If we get an error and the slave device is tun, do a printk giving the
>   user enough information to find this parameter.
> 
> I'm leaning towards the former, but you probably know a reason why I am
> wrong again.

Patch below is an implementation of the former.  Not sure if something
like this is worth considering.

Jörn

--
To announce that there must be no criticism of the President, or that we
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918


diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1f276fa30ba6..306909a44fab 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1482,8 +1482,9 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
 */
ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
 
-   if (!bond->params.fail_over_mac ||
-   BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+   if (bond_dev->type != ARPHRD_NONE &&
+   (!bond->params.fail_over_mac ||
+BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
/* Set slave to master's mac address.  The application already
 * set the master's mac address to that of the first slave
 */
-- 
2.1.4



Re: [PATCH net-next 2/4] flow_dissector: Get vlan priority in addition to vlan id

2016-08-10 Thread kbuild test robot
Hi Hadar,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Hadar-Hen-Zion/flow_dissector-Get-vlan-info-from-skb-vlan_tci-instead-of-skb-data/20160811-042500
config: cris-etrax-100lx_v2_defconfig (attached as .config)
compiler: cris-linux-gcc (GCC) 4.6.3
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=cris 

All errors (new ones prefixed by >>):

   In function 'flow_keys_hash_length.isra.6',
   inlined from '__flow_hash_from_keys' at net/core/flow_dissector.c:599:26,
   inlined from 'flow_hash_from_keys' at net/core/flow_dissector.c:610:2:
>> net/core/flow_dissector.c:512:2: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
   In function 'flow_keys_hash_length.isra.6',
   inlined from '__flow_hash_from_keys' at net/core/flow_dissector.c:599:26,
   inlined from '__skb_get_hash_symmetric' at 
net/core/flow_dissector.c:663:2:
>> net/core/flow_dissector.c:512:2: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
   In function 'flow_keys_hash_length.isra.6',
   inlined from '__flow_hash_from_keys' at net/core/flow_dissector.c:599:26,
   inlined from '__skb_get_hash' at net/core/flow_dissector.c:620:2:
>> net/core/flow_dissector.c:512:2: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
   In function 'flow_keys_hash_length.isra.6',
   inlined from '__flow_hash_from_keys' at net/core/flow_dissector.c:599:26,
   inlined from 'skb_get_hash_perturb' at net/core/flow_dissector.c:620:2:
>> net/core/flow_dissector.c:512:2: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)

vim +/__compiletime_assert_512 +512 net/core/flow_dissector.c

20a17bf6 David S. Miller  2015-09-01  506   return (const u32 *)(p + 
FLOW_KEYS_HASH_OFFSET);
42aecaa9 Tom Herbert  2015-06-04  507  }
42aecaa9 Tom Herbert  2015-06-04  508  
20a17bf6 David S. Miller  2015-09-01  509  static inline size_t 
flow_keys_hash_length(const struct flow_keys *flow)
42aecaa9 Tom Herbert  2015-06-04  510  {
c3f83241 Tom Herbert  2015-06-04  511   size_t diff = 
FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
42aecaa9 Tom Herbert  2015-06-04 @512   BUILD_BUG_ON((sizeof(*flow) - 
FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
c3f83241 Tom Herbert  2015-06-04  513   
BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
c3f83241 Tom Herbert  2015-06-04  514sizeof(*flow) - 
sizeof(flow->addrs));
c3f83241 Tom Herbert  2015-06-04  515  
c3f83241 Tom Herbert  2015-06-04  516   switch 
(flow->control.addr_type) {
c3f83241 Tom Herbert  2015-06-04  517   case 
FLOW_DISSECTOR_KEY_IPV4_ADDRS:
c3f83241 Tom Herbert  2015-06-04  518   diff -= 
sizeof(flow->addrs.v4addrs);
c3f83241 Tom Herbert  2015-06-04  519   break;
c3f83241 Tom Herbert  2015-06-04  520   case 
FLOW_DISSECTOR_KEY_IPV6_ADDRS:
c3f83241 Tom Herbert  2015-06-04  521   diff -= 
sizeof(flow->addrs.v6addrs);
c3f83241 Tom Herbert  2015-06-04  522   break;
9f249089 Tom Herbert  2015-06-04  523   case 
FLOW_DISSECTOR_KEY_TIPC_ADDRS:
9f249089 Tom Herbert  2015-06-04  524   diff -= 
sizeof(flow->addrs.tipcaddrs);
9f249089 Tom Herbert  2015-06-04  525   break;
c3f83241 Tom Herbert  2015-06-04  526   }
c3f83241 Tom Herbert  2015-06-04  527   return (sizeof(*flow) - diff) / 
sizeof(u32);
66415cf8 Hannes Frederic Sowa 2013-10-23  528  }
66415cf8 Hannes Frederic Sowa 2013-10-23  529  
c3f83241 Tom Herbert  2015-06-04  530  __be32 flow_get_u32_src(const 
struct flow_keys *flow)
5ed20a68 Tom Herbert  2014-07-01  531  {
c3f83241 Tom Herbert  2015-06-04  532   switch 
(flow->control.addr_type) {
c3f83241 Tom Herbert  2015-06-04  533   case 
FLOW_DISSECTOR_KEY_IPV4_ADDRS:
c3f83241 Tom Herbert  2015-06-04  534   return 
flow->addrs.v4addrs.src;
c3f83241 Tom Herbert  2015-06-04  535   case 
FLOW_DISSECTOR_KEY_IPV6_ADDRS:
c3f83241 Tom Herbert  2015-06-04  536   return (__force 
__be32)ipv6_addr_hash(
c3f83241 Tom Herbert  2015-06-04  537   
&flow->addrs.v6addrs.src);
9f249089 Tom Herbert  2015-06-04  538   case 
FLOW_DISSECTOR_KEY_TIPC_ADDRS:
9f249089 Tom Herbert  2015-06-04  539   return 
flow->addrs.tipcaddrs.srcnode;

[PATCH 2/9] dsa: mv88e6xxx: hide unused functions

2016-08-10 Thread Arnd Bergmann
When CONFIG_NET_DSA_HWMON is disabled, we get warnings about two unused
functions whose only callers are all inside of an #ifdef:

drivers/net/dsa/mv88e6xxx.c:3257:12: 'mv88e6xxx_mdio_page_write' defined but 
not used [-Werror=unused-function]
drivers/net/dsa/mv88e6xxx.c:3244:12: 'mv88e6xxx_mdio_page_read' defined but not 
used [-Werror=unused-function]

This adds another ifdef around the function definitions. The warnings
appeared after the functions were marked 'static', but the problem
was already there before that.

Signed-off-by: Arnd Bergmann 
Fixes: 57d3231057e9 ("net: dsa: mv88e6xxx: fix style issues")
Reviewed-by: Vivien Didelot 
---
When I first submitted it on June 23, there was a clash with some
other patches, this version is based on top of current mainline,
which contains those patches already, so it should apply cleanly.
---
 drivers/net/dsa/mv88e6xxx/chip.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index d36aedde8cb9..d1d9d3cf9139 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -3187,6 +3187,7 @@ static int mv88e6xxx_set_addr(struct dsa_switch *ds, u8 
*addr)
return err;
 }
 
+#ifdef CONFIG_NET_DSA_HWMON
 static int mv88e6xxx_mdio_page_read(struct dsa_switch *ds, int port, int page,
int reg)
 {
@@ -3212,6 +3213,7 @@ static int mv88e6xxx_mdio_page_write(struct dsa_switch 
*ds, int port, int page,
 
return ret;
 }
+#endif
 
 static int mv88e6xxx_port_to_mdio_addr(struct mv88e6xxx_chip *chip, int port)
 {
-- 
2.9.0



[PATCH 0/9] v4.8 build regressions

2016-08-10 Thread Arnd Bergmann
This is a set of patches to address build warnings and errors that have
come up in linux-4.8 but that worked fine in v4.7. I've added the
tinyconfig warning patch in there as well, which is not a regression
but is something that shows up in the kernelci.org build bots.
The other patches address issues that either came up in kernelci.org
or in my randconfig test setup.

I have three more fixes queued up in arm-soc/fixes that I plan
to send as a pull request for -rc2 along with the other bugfixes.

All patches have been posted before, but for some reason or another
failed to make it into the merge window. I have updated the ones
that were waiting for a new version from me now, the others are
sent without modifications.

Hopefully we can get them all merged into v4.8. Please pick up
patches from the middle of the series if appropriate.

Arnd

Arnd Bergmann (7):
  kconfig: tinyconfig: provide whole choice blocks to avoid warnings
  dsa: mv88e6xxx: hide unused functions
  drm/mediatek: add COMMON_CLK dependency
  drm/mediatek: add CONFIG_OF dependency
  drm/mediatek: add ARM_SMCCC dependency
  clocksource: kona: fix get_counter error handling
  8250/fintek: rename IRQ_MODE macro

Geert Uytterhoeven (1):
  test/hash: Fix warning in two-dimensional array init

George Spelvin (1):
  test/hash: Fix warning in preprocessor symbol evaluation

 arch/x86/configs/tiny.config  |  2 ++
 drivers/clocksource/bcm_kona_timer.c  | 16 ++--
 drivers/gpu/drm/mediatek/Kconfig  |  3 +++
 drivers/net/dsa/mv88e6xxx/chip.c  |  2 ++
 drivers/tty/serial/8250/8250_fintek.c |  4 ++--
 kernel/configs/tiny.config|  8 
 lib/test_hash.c   |  8 
 7 files changed, 31 insertions(+), 12 deletions(-)

-- 
2.9.0

Cc: Andrew Lunn 
Cc: Andrew Morton 
Cc: Daniel Lezcano 
Cc: David Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: Geert Uytterhoeven 
Cc: George Spelvin 
Cc: Greg Kroah-Hartman 
Cc: Jiri Slaby 
Cc: Ji-Ze Hong (Peter Hong) 
Cc: Josh Triplett 
Cc: kernel-build-repo...@lists.linaro.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-ker...@vger.kernel.org
Cc: Masahiro Yamada 
Cc: Matthias Brugger 
Cc: netdev@vger.kernel.org
Cc: Philipp Zabel 
Cc: Ricardo Ribalda Delgado 
Cc: Thomas Gleixner 
Cc: Vivien Didelot 
Cc: x...@kernel.org


Re: [PATCH] net: ipconfig: fix use after free

2016-08-10 Thread David Miller
From: Geert Uytterhoeven 
Date: Wed, 10 Aug 2016 12:00:35 +0200

> Hi Uwe,
> 
> On Wed, Aug 10, 2016 at 11:44 AM, Uwe Kleine-König
>  wrote:
>> ic_close_devs() calls kfree() for all devices' ic_device. Since commit
>> 2647cffb2bc6 ("net: ipconfig: Support using "delayed" DHCP replies")
>> the active device's ic_device is still used however to print the
>> ipconfig summary which results in an oops if the memory is already
>> changed. So delay freeing until after the autoconfig results are
>> reported.
> 
> Thanks!
> 
>> Fixes: 2647cffb2bc6 ("net: ipconfig: Support using "delayed" DHCP replies")
>> Reported-by: Geert Uytterhoeven 
>> Signed-off-by: Uwe Kleine-König 
> 
> Tested-by: Geert Uytterhoeven 

Applied, thanks everyone.


Re: [PATCH net-next 2/4] flow_dissector: Get vlan priority in addition to vlan id

2016-08-10 Thread kbuild test robot
Hi Hadar,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Hadar-Hen-Zion/flow_dissector-Get-vlan-info-from-skb-vlan_tci-instead-of-skb-data/20160811-042500
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/linkage.h:4:0,
from include/linux/kernel.h:6,
from net/core/flow_dissector.c:1:
   In function 'flow_keys_hash_length.isra.3',
   inlined from 'flow_hash_from_keys' at net/core/flow_dissector.c:599:9:
>> include/linux/compiler.h:491:38: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
   include/linux/bug.h:75:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^
>> net/core/flow_dissector.c:512:2: note: in expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
 ^
   In function 'flow_keys_hash_length.isra.3',
   inlined from '__skb_get_hash' at net/core/flow_dissector.c:599:9:
>> include/linux/compiler.h:491:38: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
   include/linux/bug.h:75:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^
>> net/core/flow_dissector.c:512:2: note: in expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
 ^
   In function 'flow_keys_hash_length.isra.3',
   inlined from 'skb_get_hash_perturb' at net/core/flow_dissector.c:599:9:
>> include/linux/compiler.h:491:38: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^
   include/linux/compiler.h:491:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/bug.h:51:37: note: in expansion of macro 'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^
   include/linux/bug.h:75:2: note: in expansion of macro 'BUILD_BUG_ON_MSG'
 BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition)
 ^
>> net/core/flow_dissector.c:512:2: note: in expansion of macro 'BUILD_BUG_ON'
 BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
 ^
   In function 'flow_keys_hash_length.isra.3',
   inlined from '__skb_get_hash_symmetric' at 
net/core/flow_dissector.c:599:9:
>> include/linux/compiler.h:491:38: error: call to '__compiletime_assert_512' 
>> declared with attribute error: BUILD_BUG_ON failed: (sizeof(*flow) - 
>> FLOW_KEYS_HASH_OFFSET) % sizeof(u32)
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:474:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## 

Re: [PATCH] ravb: use proper names for suspend/resume functions

2016-08-10 Thread David Miller
From: Niklas Söderlund 
Date: Wed, 10 Aug 2016 13:09:49 +0200

> The patch 'ravb: add sleep PM suspend/resume support' used incorrect
> function names containing 'runtime' for the suspend and resume
> functions.
> 
> Reported-by: Sergei Shtylyov 
> Signed-off-by: Niklas Söderlund 

Applied, thanks.


[PATCH v2 2/3] proc: make proc entries inherit ownership from parent

2016-08-10 Thread Dmitry Torokhov
There are certain parameters that belong to net namespace and that are
exported in /proc. They should be controllable by the container's owner,
but are currently owned by global root and thus not available.

Let's change the proc code to inherit ownership from the parent entry, and
when creating the per-ns "net" proc entry, set it up as owned by the
container's owner.

Signed-off-by: Dmitry Torokhov 
---
 fs/proc/generic.c  |  2 ++
 fs/proc/proc_net.c | 13 +
 2 files changed, 15 insertions(+)

diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index c633476..bca66d8 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -390,6 +390,8 @@ static struct proc_dir_entry *__proc_create(struct 
proc_dir_entry **parent,
atomic_set(&ent->count, 1);
spin_lock_init(&ent->pde_unload_lock);
INIT_LIST_HEAD(&ent->pde_openers);
+   proc_set_user(ent, (*parent)->uid, (*parent)->gid);
+
 out:
return ent;
 }
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index c8bbc68..7ae6b1d 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -185,6 +186,8 @@ const struct file_operations proc_net_operations = {
 static __net_init int proc_net_ns_init(struct net *net)
 {
struct proc_dir_entry *netd, *net_statd;
+   kuid_t uid;
+   kgid_t gid;
int err;
 
err = -ENOMEM;
@@ -199,6 +202,16 @@ static __net_init int proc_net_ns_init(struct net *net)
netd->parent = &proc_root;
memcpy(netd->name, "net", 4);
 
+   uid = make_kuid(net->user_ns, 0);
+   if (!uid_valid(uid))
+   uid = netd->uid;
+
+   gid = make_kgid(net->user_ns, 0);
+   if (!gid_valid(gid))
+   gid = netd->gid;
+
+   proc_set_user(netd, uid, gid);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
-- 
2.8.0.rc3.226.g39d4020



[PATCH v2 0/3] Make /proc per net namespace objects belong to container

2016-08-10 Thread Dmitry Torokhov
Currently [almost] all /proc objects belong to the global root, even if
data belongs to a given namespace within a container and (at least for
sysctls) we work around permissions checks to allow container's root to
access the data.

This series changes ownership of net namespace /proc objects
(/proc/net/self/* and /proc/sys/net/*) to be container's root and not
global root when there exists mapping for container's root in user
namespace.

This helps when running Android CTS in a container, but I think it makes
sense regardless.

Changes from V1:

- added fix for crash when !CONFIG_NET_NS (new patch #1)
- addressed Eric's comments for error handling style in patch #3 and
  added his Ack
- adjusted patch #2 to use the same style of error handling
- sent out as series instead of separate patches

Dmitry Torokhov (3):
  netns: do not call pernet ops for not yet set up init_net namespace
  proc: make proc entries inherit ownership from parent
  net: make net namespace sysctls belong to container's owner

 fs/proc/generic.c|  2 ++
 fs/proc/proc_net.c   | 13 +
 fs/proc/proc_sysctl.c|  5 +
 include/linux/sysctl.h   |  4 
 net/core/net_namespace.c | 21 +
 net/sysctl_net.c | 29 -
 6 files changed, 61 insertions(+), 13 deletions(-)

-- 
2.8.0.rc3.226.g39d4020



[PATCH v2 1/3] netns: do not call pernet ops for not yet set up init_net namespace

2016-08-10 Thread Dmitry Torokhov
When CONFIG_NET_NS is disabled, registering pernet operations causes
init() to be called immediately with init_net as an argument. Unfortunately
this leads to some pernet ops, such as proc_net_ns_init(), being called too
early, when the init_net namespace has not been fully initialized. This causes
issues when we want to change pernet ops to use more data from the net
namespace in question, for example reference user namespace that owns our
network namespace.

To fix this we could either play a game of musical chairs and rearrange init
order, or we could do the same as when CONFIG_NET_NS is enabled, and
postpone calling pernet ops->init() until namespace is set up properly.

Note that we can not simply undo commit ed160e839d2e ("[NET]: Cleanup
pernet operation without CONFIG_NET_NS") and use the same implementations
for __register_pernet_operations() and __unregister_pernet_operations(),
because many pernet ops are marked as __net_initdata and will be discarded,
which wreaks havoc on our ops lists. Here we rely on the fact that we only
use lists until init_net is fully initialized, which happens much earlier
than discarding __net_initdata sections.

Signed-off-by: Dmitry Torokhov 
---
 net/core/net_namespace.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2c2eb1b..1fe5816 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -37,6 +37,8 @@ struct net init_net = {
 };
 EXPORT_SYMBOL(init_net);
 
+static bool init_net_initialized;
+
 #define INITIAL_NET_GEN_PTRS   13 /* +1 for len +2 for rcu_head */
 
 static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
@@ -750,6 +752,8 @@ static int __init net_ns_init(void)
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
 
+   init_net_initialized = true;
+
rtnl_lock();
list_add_tail_rcu(&init_net.list, &net_namespace_list);
rtnl_unlock();
@@ -811,15 +815,24 @@ static void __unregister_pernet_operations(struct 
pernet_operations *ops)
 static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
 {
+   if (!init_net_initialized) {
+   list_add_tail(&ops->list, list);
+   return 0;
+   }
+
return ops_init(ops, &init_net);
 }
 
 static void __unregister_pernet_operations(struct pernet_operations *ops)
 {
-   LIST_HEAD(net_exit_list);
-   list_add(&init_net.exit_list, &net_exit_list);
-   ops_exit_list(ops, &net_exit_list);
-   ops_free_list(ops, &net_exit_list);
+   if (!init_net_initialized) {
+   list_del(&ops->list);
+   } else {
+   LIST_HEAD(net_exit_list);
+   list_add(&init_net.exit_list, &net_exit_list);
+   ops_exit_list(ops, &net_exit_list);
+   ops_free_list(ops, &net_exit_list);
+   }
 }
 
 #endif /* CONFIG_NET_NS */
-- 
2.8.0.rc3.226.g39d4020



[PATCH v2 3/3] net: make net namespace sysctls belong to container's owner

2016-08-10 Thread Dmitry Torokhov
If net namespace is attached to a user namespace let's make container's
root owner of sysctls affecting said network namespace instead of global
root.

This also allows us to clean up net_ctl_permissions() because we do not
need to fudge permissions anymore for the container's owner since it now
owns the objects in question.

Acked-by: "Eric W. Biederman" 
Signed-off-by: Dmitry Torokhov 
---
 fs/proc/proc_sysctl.c  |  5 +
 include/linux/sysctl.h |  4 
 net/sysctl_net.c   | 29 -
 3 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5e57c3e..28f9085 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -430,6 +430,7 @@ static int sysctl_perm(struct ctl_table_header *head, 
struct ctl_table *table, i
 static struct inode *proc_sys_make_inode(struct super_block *sb,
struct ctl_table_header *head, struct ctl_table *table)
 {
+   struct ctl_table_root *root = head->root;
struct inode *inode;
struct proc_inode *ei;
 
@@ -457,6 +458,10 @@ static struct inode *proc_sys_make_inode(struct 
super_block *sb,
if (is_empty_dir(head))
make_empty_dir_inode(inode);
}
+
+   if (root->set_ownership)
+   root->set_ownership(head, table, &inode->i_uid, &inode->i_gid);
+
 out:
return inode;
 }
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index fa7bc29..55bec2f 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /* For the /proc/sys support */
@@ -156,6 +157,9 @@ struct ctl_table_root {
struct ctl_table_set default_set;
struct ctl_table_set *(*lookup)(struct ctl_table_root *root,
   struct nsproxy *namespaces);
+   void (*set_ownership)(struct ctl_table_header *head,
+ struct ctl_table *table,
+ kuid_t *uid, kgid_t *gid);
int (*permissions)(struct ctl_table_header *head, struct ctl_table 
*table);
 };
 
diff --git a/net/sysctl_net.c b/net/sysctl_net.c
index ed98c1f..5bc1a3d 100644
--- a/net/sysctl_net.c
+++ b/net/sysctl_net.c
@@ -42,26 +42,37 @@ static int net_ctl_permissions(struct ctl_table_header 
*head,
   struct ctl_table *table)
 {
struct net *net = container_of(head->set, struct net, sysctls);
-   kuid_t root_uid = make_kuid(net->user_ns, 0);
-   kgid_t root_gid = make_kgid(net->user_ns, 0);
 
/* Allow network administrator to have same access as root. */
-   if (ns_capable(net->user_ns, CAP_NET_ADMIN) ||
-   uid_eq(root_uid, current_euid())) {
+   if (ns_capable(net->user_ns, CAP_NET_ADMIN)) {
int mode = (table->mode >> 6) & 7;
return (mode << 6) | (mode << 3) | mode;
}
-   /* Allow netns root group to have the same access as the root group */
-   if (in_egroup_p(root_gid)) {
-   int mode = (table->mode >> 3) & 7;
-   return (mode << 3) | mode;
-   }
+
return table->mode;
 }
 
+static void net_ctl_set_ownership(struct ctl_table_header *head,
+ struct ctl_table *table,
+ kuid_t *uid, kgid_t *gid)
+{
+   struct net *net = container_of(head->set, struct net, sysctls);
+   kuid_t ns_root_uid;
+   kgid_t ns_root_gid;
+
+   ns_root_uid = make_kuid(net->user_ns, 0);
+   if (uid_valid(ns_root_uid))
+   *uid = ns_root_uid;
+
+   ns_root_gid = make_kgid(net->user_ns, 0);
+   if (gid_valid(ns_root_gid))
+   *gid = ns_root_gid;
+}
+
 static struct ctl_table_root net_sysctl_root = {
.lookup = net_ctl_header_lookup,
.permissions = net_ctl_permissions,
+   .set_ownership = net_ctl_set_ownership,
 };
 
 static int __net_init sysctl_net_init(struct net *net)
-- 
2.8.0.rc3.226.g39d4020



Re: [PATCH RESEND net-next 13/15] smc: receive data from RMBE

2016-08-10 Thread David Miller
From: Ursula Braun 
Date: Wed, 10 Aug 2016 15:44:00 +0200

> But there are still usages (and conn->rx_curs_confirmed is one of
> them), where I need an 8-byte cursor field to be read and written
> atomically, even though I do not care whether the write operation has
> been beaten or not. But I do care that reading the cursor does not
> return a partially updated cursor. Isn't xchg() a possible solution
> in this case?

Either the cpu supports 64-bit stores or it does not.

xchg() and atomicity have absolutely nothing to do with this.


Re: [PATCH v3 13/13] net: ethernet: ti: cpsw: move ale, cpts and drivers params under cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> The ale, cpts, version, rx_packet_max, bus_freq, interrupt pacing
> parameters are common per net device that uses the same h/w. So,
> move them to common driver structure.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH v3 10/13] net; ethernet: ti: cpsw: move irq stuff under cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> The irq data are common for net devs in dual_emac mode. So no need to
> hold these data in every priv struct, move them under cpsw_common.
> Also delete irq_num var, as after optimization it's not needed.
> Correct number of irqs to 2, as anyway, driver is using only 2,
> at least for now.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Jörn Engel
On Tue, Aug 09, 2016 at 04:51:04PM -0700, Jay Vosburgh wrote:
> 
>   This will cause balance-rr to add the slave to the bond if any
> device's dev_set_mac_address call fails.
> 
>   If a bond of regular Ethernet devices is connected to a static
> link aggregation (Etherchannel channel group), a set_mac failure would
> result in that slave having a different MAC address than the bond, which
> in turn would cause traffic inbound from the switch to that slave to be
> dropped (as the destination MAC would not pass the device MAC filters).
> 
>   The failure check for the set_mac call serves a legitimate
> purpose, and I don't believe we should bypass it without making the
> bypass an option that is explicitly enabled for those special cases that
> need it.
> 
>   E.g., something like the following (which I have not tested);
> this would also need documentation and iproute2 updates to go with it.
> This would be enabled with "fail_over_mac=keepmac".

Thank you!

Tested-by: Jörn Engel 

Having to set one more parameter is a bit annoying.  It would have to be
documented in a prominent place and people would still often miss it.
So I wonder if we can make the interface a little nicer.

Options:
- If there are no slaves yet and the first slave added is tun, we trust
  the users to know what they are doing.  Automatically set
  bond->params.fail_over_mac = BOND_FOM_KEEPMAC
  Maybe do a printk to inform the user in case of a mistake.
- If we get an error and the slave device is tun, do a printk giving the
  user enough information to find this parameter.

I'm leaning towards the former, but you probably know a reason why I am
wrong again.

Jörn

--
For a successful technology, reality must take precedence over public
relations, for nature cannot be fooled.
-- Richard Feynman


Re: [RFC PATCH 2/3] net: macb: Add support for 1588 for Zynq Ultrascale+ MPSoC

2016-08-10 Thread Andrei Pistirica

Hi Punnaiah,

cpts_match(...) has a way to parse frames, while ptp_classify_raw 
identifies the underlying protocol (in case the frames are parsed on 
data path), or tx/rxtstamp callbacks can be used with PTP events. But, 
there is a comment in ptp_classify.h which worries me.


Unfortunately,  I cannot access https://gitenterprise.xilinx.com.

Best regards,
Andrei

On 09.08.2016 18:56, Punnaiah Choudary Kalluri wrote:

Hi Nicolas,

 1588 implementation in cadence GEM IP we have in Zynq Ultrascale+ MPSoC is
different to the one in Zynq SOC.

In the earlier version, all timestamp values are stored in registers and there 
is no specific
mechanism to distinguish a received Ethernet frame that contains timestamp 
information
other than parsing the frame for the PTP packet type.

We have a basic implementation for the earlier version in our out-of-tree driver, 
which is going to be deprecated
soon. You could also check the below driver for 1588 support.
https://gitenterprise.xilinx.com/Linux/linux-xlnx/blob/master/drivers/net/ethernet/xilinx/xilinx_emacps.c


Regards,
Punnaiah


-Original Message-
From: Nicolas Ferre [mailto:nicolas.fe...@atmel.com]
Sent: Tuesday, August 09, 2016 10:10 PM
To: Harini Katakam ; Harini Katakam
; Andrei Pistirica 
Cc: da...@davemloft.net; Boris Brezillon ; alexandre.bell...@free-electrons.com;
netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
devicet...@vger.kernel.org; Punnaiah Choudary Kalluri
; Michal Simek ; Anirudha
Sarangi 
Subject: Re: [RFC PATCH 2/3] net: macb: Add support for 1588 for Zynq
Ultrascale+ MPSoC

Le 21/09/2015 à 19:49, Harini Katakam a écrit :

On Fri, Sep 11, 2015 at 1:27 PM, Harini Katakam
 wrote:

Cadence GEM in Zynq Ultrascale+ MPSoC supports 1588 and provides a
102 bit time counter with 48 bits for seconds, 30 bits for nsecs and
24 bits for sub-nsecs. The timestamp is made available to the SW through
registers as well as (more precisely) through upper two words in
an extended BD.

This patch does the following:
- Adds MACB_CAPS_TSU in zynqmp_config.
- Registers to ptp clock framework (after checking for timestamp support

in

  IP and capability in config).
- TX BD and RX BD control registers are written to populate timestamp in
  extended BD words.
- Timer initialization is done by writing time of day to the timer counter.
- ns increment register is programmed as NS_PER_SEC/TSU_CLK.
  For a 24 bit subns precision, the subns increment equals
  remainder of (NS_PER_SEC/TSU_CLK) * (2^24).
  TSU (Time stamp unit) clock is obtained by the  driver from devicetree.
- HW time stamp capabilities are advertised via ethtool and macb ioctl is
  updated accordingly.
- For all PTP event frames, nanoseconds and the lower 5 bits of seconds

are

  obtained from the BD. This offers a precise timestamp. The upper bits
  (which don't vary between consecutive packets) are obtained from the
  TX/RX PTP event/PEER registers. The timestamp obtained thus is

updated

  in skb for upper layers to access.
- The drivers register functions with ptp to perform time and frequency
  adjustment.
- Time adjustment is done by writing to the 1588_ADJUST register.
  The controller will read the delta in this register and update the timer
  counter register. Alternatively, for large time offset adjustments,
  the driver reads the secs and nsecs counter values, adds/subtracts the
  delta and updates the timer counter. In order to be as precise as

possible,

  nsecs counter is read again if secs has incremented during the counter

read.

- Frequency adjustment is not directly supported by this IP.
  addend is the initial value of the ns increment, and similarly addendsub.
  The ppb (parts per billion) provided is used as
  ns_incr = addend +/- (ppb/rate).
  Similarly the remainder of the above is used to populate subns

increment.

  In case the ppb requested is negative AND subns adjustment greater

than

  the addendsub, ns_incr is reduced by 1 and subns_incr is adjusted in
  positive accordingly.

Signed-off-by: Harini Katakam :
---
 drivers/net/ethernet/cadence/macb.c |  372

++-

 drivers/net/ethernet/cadence/macb.h |   64 ++
 2 files changed, 428 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c

b/drivers/net/ethernet/cadence/macb.c

index bb2932c..b531008 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -30,6 +30,8 @@
 #include 
 #include 


[..]


+   unsigned intns_incr;
+   unsigned intsubns_incr;
 };

 static inline bool macb_is_gem(struct macb *bp)
--
1.7.9.5


Ping

Thanks.


Harini,

I come back to this patch of last year and I'm sorry about being so late
answering you.

Andrei who is added to the discussion will have 

RE: [RFC PATCH v5 2/3] Documentation: DT: net: Add Xilinx gmiitorgmii converter device tree binding documentation

2016-08-10 Thread Appana Durga Kedareswara Rao
Hi Florian,

Thanks for the review...

> 
> On 08/09/2016 02:34 AM, Kedareswara rao Appana wrote:
> > Device-tree binding documentation for xilinx gmiitorgmii converter.
> >
> > Signed-off-by: Kedareswara rao Appana 
> > ---
> > Changes for v5:
> > ---> Fixed Indentation in the example as suggested by Michal.
> > Changes for v4:
> > --> Modified compatible as suggested by Rob.
> > --> Removed underscores from the converter node name as suggested by Rob.
> > Changes for v3:
> > --> None.
> > Changes for v2:
> > --> New patch.
> >
> >  .../devicetree/bindings/net/xilinx_gmii2rgmii.txt  | 38
> > ++
> >  1 file changed, 38 insertions(+)
> >  create mode 100644
> > Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
> >
> > diff --git
> > a/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
> > b/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
> > new file mode 100644
> > index 000..5f48793
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
> > @@ -0,0 +1,38 @@
> > +XILINX GMIITORGMII Converter Driver Device Tree Bindings
> > +
> > +
> > +The Gigabit Media Independent Interface (GMII) to Reduced Gigabit
> > +Media Independent Interface (RGMII) core provides the RGMII between
> > +RGMII-compliant Ethernet physical media devices (PHY) and the Gigabit
> Ethernet controller.
> > +This core can be used in all three modes of operation(10/100/1000 Mb/s).
> > +The Management Data Input/Output (MDIO) interface is used to
> > +configure the Speed of operation. This core can switch dynamically
> > +between the three Different speed modes by configuring the converter
> register through mdio write.
> > +
> > +The MDIO is a bus to which the PHY devices are connected.  For each
> > +device that exists on this bus, a child node should be created.  See
> > +the definition of the PHY node in booting-without-of.txt for an
> > +example of how to define a PHY.
> 
> I would skip this paragraph which does not really help with understanding, and
> just refer to Documentation/devicetree/bindings/net/phy.txt for examples.

Sure will fix in the next version...

> 
> 
> > +
> > +This converter sits between the ethernet MAC and the external phy.
> > +MAC <==> GMII2RGMII <==> RGMII_PHY
> > +
> > +Required properties:
> > +- compatible   : Should be "xlnx,gmii-to-rgmii-1.0"
> > +- reg  : The ID number for the phy, usually a small integer
> 
> You would want specify that "reg" property needs to match the one of the PHY
> (specified via phy-handle) you are converting to/from for this "proxy" piece 
> of
> hardware to work.
> 
> If these two have the same "reg" value, is not that going to lead to duplicate
> MDIO devices created on the bus, this may work, based on probing ordering, but
> seems unusual, you don't really need the "reg"
> property here it seems?

The converter Phy address is different from the external phy address.

Regards,
Kedar.


> 
> > +- phy-handle   : Should point to the external phy device.
> > + See ethernet.txt file in the same directory.
> > +
> > +Example:
> > +   mdio {
> > +   #address-cells = <1>;
> > +   #size-cells = <0>;
> > +   phy: ethernet-phy@0 {
> > +   ..
> > +   };
> > +   gmiitorgmii: gmiitorgmii@8 {
> > +   compatible = "xlnx,gmii-to-rgmii-1.0";
> > +   reg = <8>;
> > +   phy-handle = <>;
> > +   };
> > +   };
> >
> 
> 
> --
> Florian


Re: [PATCH v3 12/13] net: ethernet: ti: cpsw: move napi struct to cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> The napi structs are common for both net devices in dual_emac
> mode, In order to not hold duplicate links to them, move to
> cpsw_common.
> 
> Signed-off-by: Ivan Khoronzhuk 
> ---

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH] drivers: net: cpsw: fix kmemleak false-positive reports for sk buffers

2016-08-10 Thread Mugunthan V N
On Tuesday 09 August 2016 05:39 PM, Grygorii Strashko wrote:
> Kmemleak reports following false positive memory leaks for each sk
> buffers allocated by CPSW (__netdev_alloc_skb_ip_align()) in
> cpsw_ndo_open() and cpsw_rx_handler():
> 
> unreferenced object 0xea915000 (size 2048):
>   comm "systemd-network", pid 713, jiffies 4294938323 (age 102.180s)
>   hex dump (first 32 bytes):
> 00 58 91 ea ff ff ff ff ff ff ff ff ff ff ff ff  .X..
> ff ff ff ff ff ff fd 0f 00 00 00 00 00 00 00 00  
>   backtrace:
> [] __kmalloc_track_caller+0x1a4/0x230
> [] __alloc_skb+0x68/0x16c
> [] __netdev_alloc_skb+0x40/0x104
> [] cpsw_ndo_open+0x374/0x670 [ti_cpsw]
> [] __dev_open+0xb0/0x114
> [] __dev_change_flags+0x9c/0x14c
> [] dev_change_flags+0x20/0x50
> [] do_setlink+0x2cc/0x78c
> [] rtnl_setlink+0xcc/0x100
> [] rtnetlink_rcv_msg+0x184/0x224
> [] netlink_rcv_skb+0xa8/0xc4
> [] rtnetlink_rcv+0x2c/0x34
> [] netlink_unicast+0x16c/0x1f8
> [] netlink_sendmsg+0x334/0x348
> [] sock_sendmsg+0x1c/0x2c
> [] SyS_sendto+0xc0/0xe8
> 
> unreferenced object 0xec861780 (size 192):
>   comm "softirq", pid 0, jiffies 4294938759 (age 109.540s)
>   hex dump (first 32 bytes):
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> 00 00 00 00 00 b0 5a ed 00 00 00 00 00 00 00 00  ..Z.
>   backtrace:
> [] kmem_cache_alloc+0x190/0x208
> [] __build_skb+0x30/0x98
> [] __netdev_alloc_skb+0xb8/0x104
> [] cpsw_rx_handler+0x68/0x1e4 [ti_cpsw]
> [] __cpdma_chan_free+0xa8/0xc4 [davinci_cpdma]
> [] __cpdma_chan_process+0x14c/0x16c [davinci_cpdma]
> [] cpdma_chan_process+0x44/0x5c [davinci_cpdma]
> [] cpsw_rx_poll+0x1c/0x9c [ti_cpsw]
> [] net_rx_action+0x1f0/0x2ec
> [] __do_softirq+0x134/0x258
> [] do_softirq+0x68/0x70
> [] __local_bh_enable_ip+0xd4/0xe8
> [] _raw_spin_unlock_bh+0x30/0x34
> [] igmp6_group_added+0x4c/0x1bc
> [] ipv6_dev_mc_inc+0x398/0x434
> [] addrconf_dad_work+0x224/0x39c
> 
> This happens because CPSW allocates SK buffers and then passes
> pointers on them in CPDMA where they stored in internal CPPI RAM
> (SRAM) which belongs to DEV MMIO space. Kmemleak does not scan IO
> memory and so reports memory leaks.
> 
> Hence, mark allocated sk buffers as false positive explicitly.
> 
> Cc: Catalin Marinas 
> Signed-off-by: Grygorii Strashko 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH v3 00/13] net: ethernet: ti: cpsw: split driver data and per ndev data

2016-08-10 Thread Grygorii Strashko

On 08/10/2016 02:22 AM, Ivan Khoronzhuk wrote:

In dual_emac mode the driver can handle 2 network devices. Each of them can use
its own private data and common data/resources. This patchset splits common 
driver
data/resources and private per net device data.
It leads to:
- reduce memory usage
- increase code readability
- allows add a bunch of simplification
- create prerequisites to add multi-channel support,
  when channels are shared between net devices



Reviewed-by: Grygorii Strashko 

--
regards,
-grygorii


[PATCH v2] drivers: net: cpsw: fix kmemleak false-positive reports for sk buffers

2016-08-10 Thread Grygorii Strashko
Kmemleak reports following false positive memory leaks for each sk
buffers allocated by CPSW (__netdev_alloc_skb_ip_align()) in
cpsw_ndo_open() and cpsw_rx_handler():

unreferenced object 0xea915000 (size 2048):
  comm "systemd-network", pid 713, jiffies 4294938323 (age 102.180s)
  hex dump (first 32 bytes):
00 58 91 ea ff ff ff ff ff ff ff ff ff ff ff ff  .X..
ff ff ff ff ff ff fd 0f 00 00 00 00 00 00 00 00  
  backtrace:
[] __kmalloc_track_caller+0x1a4/0x230
[] __alloc_skb+0x68/0x16c
[] __netdev_alloc_skb+0x40/0x104
[] cpsw_ndo_open+0x374/0x670 [ti_cpsw]
[] __dev_open+0xb0/0x114
[] __dev_change_flags+0x9c/0x14c
[] dev_change_flags+0x20/0x50
[] do_setlink+0x2cc/0x78c
[] rtnl_setlink+0xcc/0x100
[] rtnetlink_rcv_msg+0x184/0x224
[] netlink_rcv_skb+0xa8/0xc4
[] rtnetlink_rcv+0x2c/0x34
[] netlink_unicast+0x16c/0x1f8
[] netlink_sendmsg+0x334/0x348
[] sock_sendmsg+0x1c/0x2c
[] SyS_sendto+0xc0/0xe8

unreferenced object 0xec861780 (size 192):
  comm "softirq", pid 0, jiffies 4294938759 (age 109.540s)
  hex dump (first 32 bytes):
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
00 00 00 00 00 b0 5a ed 00 00 00 00 00 00 00 00  ..Z.
  backtrace:
[] kmem_cache_alloc+0x190/0x208
[] __build_skb+0x30/0x98
[] __netdev_alloc_skb+0xb8/0x104
[] cpsw_rx_handler+0x68/0x1e4 [ti_cpsw]
[] __cpdma_chan_free+0xa8/0xc4 [davinci_cpdma]
[] __cpdma_chan_process+0x14c/0x16c [davinci_cpdma]
[] cpdma_chan_process+0x44/0x5c [davinci_cpdma]
[] cpsw_rx_poll+0x1c/0x9c [ti_cpsw]
[] net_rx_action+0x1f0/0x2ec
[] __do_softirq+0x134/0x258
[] do_softirq+0x68/0x70
[] __local_bh_enable_ip+0xd4/0xe8
[] _raw_spin_unlock_bh+0x30/0x34
[] igmp6_group_added+0x4c/0x1bc
[] ipv6_dev_mc_inc+0x398/0x434
[] addrconf_dad_work+0x224/0x39c

This happens because CPSW allocates SK buffers and then passes
pointers to them to CPDMA, where they are stored in internal CPPI RAM
(SRAM) which belongs to DEV MMIO space. Kmemleak does not scan IO
memory and so reports memory leaks.

Hence, mark allocated sk buffers as false positive explicitly.

Acked-by: Catalin Marinas 
Signed-off-by: Grygorii Strashko 
---
changes in v2:
 - comments added 

 drivers/net/ethernet/ti/cpsw.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 0805855..5caef77 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -732,6 +732,11 @@ static void cpsw_rx_handler(void *token, int len, int 
status)
netif_receive_skb(skb);
ndev->stats.rx_bytes += len;
ndev->stats.rx_packets++;
+   /* SKB pointer will be stored in CPPI RAM (SRAM) which belongs
+* to MMIO space, as result false positive memory leak report
+* will be generated.
+*/
+   kmemleak_not_leak(new_skb);
} else {
ndev->stats.rx_dropped++;
new_skb = skb;
@@ -1323,6 +1328,11 @@ static int cpsw_ndo_open(struct net_device *ndev)
kfree_skb(skb);
goto err_cleanup;
}
+   /* SKB pointer will be stored in CPPI RAM (SRAM) which
+* belongs to MMIO space, as result false positive
+* memory leak report will be generated.
+*/
+   kmemleak_not_leak(skb);
}
/* continue even if we didn't manage to submit all
 * receive descs
-- 
2.9.2



RE: [RFC PATCH v5 3/3] net: phy: Add gmiitorgmii converter support

2016-08-10 Thread Appana Durga Kedareswara Rao
Hi Florian,

Thanks for the review...

> >
> > This converter sits between the MAC and the external phy MAC <==>
> > GMII2RGMII <==> RGMII_PHY
> 
> This looks good, just a few things, see below:

Thanks...

> > +config XILINX_GMII2RGMII
> > +   tristate "Xilinx GMII2RGMII converter driver"
> > +   default y
> 
> Don't force that, or at least make the default based on the potential
> users/drivers here.

Ok sure will fix in the next version...

> 
> > +   ---help---
> > + This driver support xilinx GMII to RGMII IP core it provides
> > + the Reduced Gigabit Media Independent Interface(RGMII) between
> > + Ethernet physical media devices and the Gigabit Ethernet 
> > controller.
> > +
> >  endif # PHYLIB



> > +#define XILINX_GMII2RGMII_REG  0x10
> > +#define XILINX_GMII2RGMII_SPEED_MASK   0x2040
> 
> BMCR_SPEED1000 | BMCR_SPEED100 would be clearer here.

Sure will fix...

> 
> > +
> > +struct gmii2rgmii {
> > +   struct phy_device *phy_dev;
> > +   struct phy_driver *phy_drv;
> > +   struct phy_driver conv_phy_drv;
> > +   int addr;
> > +};
> > +
> > +static int xgmiitorgmii_read_status(struct phy_device *phydev) {
> > +   struct gmii2rgmii *priv = (struct gmii2rgmii *)phydev->priv;
> 
> Casting is not required here, priv is void *.

Ok will remove...

> 
> > +   u16 val = 0;
> > +
> > +   priv->phy_drv->read_status(phydev);
> > +
> > +   val = mdiobus_read(phydev->mdio.bus, priv->addr,
> XILINX_GMII2RGMII_REG);
> > +   val &= XILINX_GMII2RGMII_SPEED_MASK;
> > +
> > +   switch (phydev->speed) {
> > +   case SPEED_1000:
> > +   val |= BMCR_SPEED1000;
> 
> Is the fall through really intentional here? See genphy_setup_forced() for
> instance.

Ok will fix...

> 
> > +   case SPEED_100:
> > +   val |= BMCR_SPEED100;
> > +   case SPEED_10:
> > +   val |= BMCR_SPEED10;
> > +   }
> > +
> > +   mdiobus_write(phydev->mdio.bus, priv->addr,
> XILINX_GMII2RGMII_REG,
> > +val);
> > +
> > +   return 0;
> > +}
> [snip]
> 
> > +static int __init xgmiitorgmii_init(void) {
> > +   return mdio_driver_register(_driver);
> > +}
> > +module_init(xgmiitorgmii_init);
> > +
> > +static void __exit xgmiitorgmii_cleanup(void) {
> > +   mdio_driver_unregister(_driver);
> > +}
> > +module_exit(xgmiitorgmii_cleanup);
> 
> mdio_module_driver() does eliminate a bit of this boilerplate code.

Sure will fix in the next version...

Regards,
Kedar.
> --
> Florian


Re: [PATCH 2/2] ravb: add sleep PM suspend/resume support

2016-08-10 Thread David Miller
From: Sergei Shtylyov 
Date: Wed, 10 Aug 2016 13:47:26 +0300

>Ugh, I should have reviewed the patch earlier -- I was postponing this
>until I have the time to test it... Since the patch did have some
>visible issues, it should've been recast before applying. DaveM,
>please in the future could you ping me before merging?

Review patches in a timely manner.

If I can go through a 5 day email outage and still see this patch
rotting in patchwork, you could have reviewed it in time.


[PATCH 0/2] Convert qdisc linked list into a hashtable

2016-08-10 Thread Jiri Kosina
This is a respin of the v6 of the original patch [1], split into two-patch 
series as requested by davem; first patch fixes all symbol conflicts 
that'd happen once netdevice.h starts to include hashtable.h, the second 
one performs the actual switch to hashtable.

I've preserved Cong's Reviewed-by:, as code-wise this series is identical 
to the original v6 of the patch.

[1] lkml.kernel.org/r/alpine.lnx.2.00.1608011220580.22...@cbobk.fhfr.pm

-- 
Jiri Kosina
SUSE Labs



Re: qdisc hash table changes...

2016-08-10 Thread David Miller
From: Jiri Kosina 
Date: Tue, 9 Aug 2016 11:02:43 +0200 (CEST)

> Does that strictly have to be a show-stopper for the qdisc hash 
> conversion, given the fact that the whole tree is building properly?

I guess not.  Please submit that change as a series, first patches
that correct the build required hash symbol conflict fixes, then
the qdisc change itself.

Thanks.



Re: [PATCH] [v7] net: emac: emac gigabit ethernet controller driver

2016-08-10 Thread Florian Fainelli
On 08/10/2016 09:38 AM, Timur Tabi wrote:
> Florian Fainelli wrote:
>>> >Is there an easy way for me to stop the RX path before I set
>>> rxbuf_size?
>>> >  Some netif_xxx function I can call?
>> napi_disable() should take care of that.
> 
> It appears that if I call netif_stop_queue() *after* calling
> napi_disable(), I get a hang and/or TX timeout.  Since emac_mac_down()
> does this:
> 
> netif_stop_queue(netdev);
> napi_disable(>rx_q.napi);
> 
> I cannot call just napi_disable() in emac_change_mtu(), because when I
> then call emac_mac_down(), the first thing it does is call
> netif_stop_queue(), and that's when I timeout/hang.

Whatever emac_mac_down() does you can unroll it in the change_mtu
callback anyway, so if this is a problematic sequence you can work around it.

> 
> Unfortunately, I cannot even do this:
> 
> netif_stop_queue(netdev);
> napi_disable(>rx_q.napi);
> netif_stop_queue(netdev);
> napi_disable(>rx_q.napi);
> 
> Even though I've already called netif_stop_queue(), calling it again
> causes the timeout/hang.

But if this is really what you copy/pasted here, why do this twice anyway?

> 
> Is this expected?  I never understood why I needed to call
> netif_stop_queue() before napi_disable().  I do see some drivers do not
> call netif_stop_queue().  I even saw a driver that calls them in reverse
> order, so I don't understand why that sequence breaks for me but not him.
> 

Not clear how the two relate with each other here, must be specific to
your driver implementation somehow.
-- 
Florian


Re: [PATCHv2 3/4] pci: Determine actual VPD size on first access

2016-08-10 Thread Hannes Reinecke
On 08/09/2016 08:12 PM, Alexander Duyck wrote:
> On Tue, Aug 9, 2016 at 5:54 AM, Alexey Kardashevskiy  wrote:
>> On 10/02/16 08:04, Bjorn Helgaas wrote:
>>> On Wed, Jan 13, 2016 at 12:25:34PM +0100, Hannes Reinecke wrote:
 PCI-2.2 VPD entries have a maximum size of 32k, but might actually
 be smaller than that. To figure out the actual size one has to read
 the VPD area until the 'end marker' is reached.
 Trying to read VPD data beyond that marker results in 'interesting'
 effects, from simple read errors to crashing the card. And to make
 matters worse not every PCI card implements this properly, leaving
 us with no 'end' marker or even completely invalid data.
 This path tries to determine the size of the VPD data.
 If no valid data can be read an I/O error will be returned when
 reading the sysfs attribute.
>>
>>
>> I have a problem with this particular feature as today VFIO uses this
>> pci_vpd_ API to virtualize access to VPD and the existing code assumes
>> there is just one VPD block with 0x2 start and 0xf end. However I have at
>> least one device where this is not true - "10 Gigabit Ethernet-SR PCI
>> Express Adapter" - it has 2 blocks (made a script to read/parse it as
>> /sys/bus/pci/devices/0001\:03\:00.0/vpd shows it wrong):
> 
> The PCI spec is what essentially assumes that there is only one block.
> If I am not mistaken in the case of this device the second block here
> actually contains device configuration data, not actual VPD data.  The
> issue here is that the second block is being accessed as VPD when it
> isn't.
> 
>> # Large item 42 bytes; name 0x2 Identifier String
>> #002d Large item 74 bytes; name 0x10
>> #007a Small item 1 bytes; name 0xf End Tag
>> ---
>> #0c00 Large item 16 bytes; name 0x2 Identifier String
>> #0c13 Large item 234 bytes; name 0x10
>> #0d00 Large item 252 bytes; name 0x11
>> #0dff Small item 0 bytes; name 0xf End Tag
> 
> The second block here is driver proprietary setup bits.
> 
>> The cxgb3 driver is reading the second bit starting from 0xc00 but since
>> the size is wrongly detected as 0x7c, VFIO blocks access beyond it and the
>> guest driver fails to probe.
>>
>> I also cannot find a clause in the PCI 3.0 spec saying that there must be
>> just a single block, is it there?
> 
> The problem is we need to be able to parse it.  The spec defines a
> series of tags that can be used starting at offset 0.  That is how we
> are supposed to get around through the VPD data.  The problem is we
> can't have more than one end tag and what appears to be happening here
> is that we are defining a second block of data which uses the same
> formatting as VPD but is not VPD.
> 
>> What would the correct fix be? Scanning all 32k of VPD is not an option I
>> suppose as this is what this patch is trying to avoid. Thanks.
> 
> I am adding the current cxgb3 maintainer and netdev list to the Cc.  This
> is something that can probably be addressed via a PCI quirk as what
> needs to happen is that we need to extend the VPD in the case of this
> part in order to include this second block.  As long as we can read
> the VPD data all the way out to 0xdff odds are we could probably just
> have the size arbitrarily increased to 0xe00 via the quirk and then
> you would be able to access all of the VPD for the device.  We already
> have code making other modifications to drivers/pci/quirks.c for
> several Broadcom devices and probably just need something similar to
> allow extended access in the case of these devices.
> 
Yes, that's what I think, too.
The Broadcom quirk should work here, too.
(Didn't we do that already?)

Cheers,

Hannes
-- 
Dr. Hannes ReineckeTeamlead Storage & Networking
h...@suse.de   +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


Re: [Patch net 0/5] net_sched: tc action fixes and updates

2016-08-10 Thread Jamal Hadi Salim

On 16-08-08 04:46 PM, Cong Wang wrote:

This patchset fixes several regressions caused by the previous
code refactor. Thanks to Jamal for catching them!

Note, patch 3/5 and 4/5 are not strictly necessary, I just
want to carry them together.



Cong - there's good news and bad news.
The good news is that the oopses are fixed.
The bad news is you have now slowed down the system. It is noticeable
at high speed.
I narrowed it down to your use of flex arrays. In particular
tcf_exts_exec() call:
This is the fast path - was flexarray really necessary?
The conversion to list is slowing things down.

As hard as this is for me to say:
I am actually beginning to question this whole patch series.
Either you have a plan to fix this regression or let's just
pull this out for now to regain stability until we get our
act together. I think it would make a lot of sense to just pass
an array instead of a list.

cheers,
jamal




Re: [PATCH v3 09/13] net: ethernet: ti: cpsw: move cpdma resources to cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> Every net device private struct holds links to shared cpdma resources.
> No need to save and every time synchronize these resources per net dev.
> So, move it to common driver struct.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Jay Vosburgh
Ding Tianhong  wrote:

>On 2016/8/10 7:51, Jay Vosburgh wrote:
>> Jörn Engel  wrote:
>> 
>>> On Tue, Aug 09, 2016 at 12:06:36PM -0700, David Miller wrote:
> On Tue, Aug 09, 2016 at 09:28:45PM +0800, Ding Tianhong wrote:
>
> Simply not checking errors when setting the mac address solves the
> problem for me.  No new features needed.

 But it only works in certain modes.

 So the best we can do is enforce the MAC address setting in the
 modes that absolutely require it.  We cannot ignore the MAC
 address setting unilaterally.
>>>
>>> Something like this?
>>>
>>> [PATCH] bonding: Allow tun-interfaces as slaves in balance-rr mode
>>>
>>> Up until 00503b6f702e (part of 3.14-rc1), the bonding driver could be
>>> used to enslave tun-interfaces.  00503b6f702e broke that behaviour,
>>> afaics as an unintended side-effect.
>>>
>>> For the purpose of bond-over-tun in balance-rr mode, simply ignoring the
>>> error from dev_set_mac_address() is good enough.
>>>
>>> Signed-off-by: Joern Engel 
>>> ---
>>> drivers/net/bonding/bond_main.c | 3 ++-
>>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/bonding/bond_main.c 
>>> b/drivers/net/bonding/bond_main.c
>>> index 1f276fa30ba6..2f686bfe4304 100644
>>> --- a/drivers/net/bonding/bond_main.c
>>> +++ b/drivers/net/bonding/bond_main.c
>>> @@ -1490,7 +1490,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
>>> net_device *slave_dev)
>>> memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
>>> addr.sa_family = slave_dev->type;
>>> res = dev_set_mac_address(slave_dev, );
>>> -   if (res) {
>>> +   /* round-robin mode works fine without a mac address */
>>> +   if (res && BOND_MODE(bond) != BOND_MODE_ROUNDROBIN) {
>> 
>>  This will cause balance-rr to add the slave to the bond if any
>> device's dev_set_mac_address call fails.
>> 
>>  If a bond of regular Ethernet devices is connected to a static
>> link aggregation (Etherchannel channel group), a set_mac failure would
>> result in that slave having a different MAC address than the bond, which
>> in turn would cause traffic inbound from the switch to that slave to be
>> dropped (as the destination MAC would not pass the device MAC filters).
>> 
>>  The failure check for the set_mac call serves a legitimate
>> purpose, and I don't believe we should bypass it without making the
>> bypass an option that is explicitly enabled for those special cases that
>> need it.
>> 
>>  E.g., something like the following (which I have not tested);
>> this would also need documentation and iproute2 updates to go with it.
>> This would be enabled with "fail_over_mac=keepmac".
>> 
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 1f276fa30ba6..d2283fc23b16 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -1483,7 +1483,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
>> net_device *slave_dev)
>>  ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
>>  
>>  if (!bond->params.fail_over_mac ||
>> -BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
>> +(BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP &&
>> + bond->params.fail_over_mac != BOND_FOM_KEEPMAC)) {
>>  /* Set slave to master's mac address.  The application already
>>   * set the master's mac address to that of the first slave
>>   */
>> diff --git a/drivers/net/bonding/bond_options.c 
>> b/drivers/net/bonding/bond_options.c
>> index 577e57cad1dc..f9653fe4d622 100644
>> --- a/drivers/net/bonding/bond_options.c
>> +++ b/drivers/net/bonding/bond_options.c
>> @@ -125,6 +125,7 @@ static const struct bond_opt_value 
>> bond_fail_over_mac_tbl[] = {
>>  { "none",   BOND_FOM_NONE,   BOND_VALFLAG_DEFAULT},
>>  { "active", BOND_FOM_ACTIVE, 0},
>>  { "follow", BOND_FOM_FOLLOW, 0},
>> +{ "keepmac", BOND_FOM_KEEPMAC, 0},
>>  { NULL, -1,  0},
>>  };
>>  
>> diff --git a/include/net/bonding.h b/include/net/bonding.h
>> index 6360c259da6d..ec3442b3aa83 100644
>> --- a/include/net/bonding.h
>> +++ b/include/net/bonding.h
>> @@ -420,6 +420,7 @@ static inline bool bond_slave_can_tx(struct slave *slave)
>>  #define BOND_FOM_NONE   0
>>  #define BOND_FOM_ACTIVE 1
>>  #define BOND_FOM_FOLLOW 2
>> +#define BOND_FOM_KEEPMAC3
>>  
>>  #define BOND_ARP_TARGETS_ANY0
>>  #define BOND_ARP_TARGETS_ALL1
>> 
>> 
>>  -J
>> 
>Hi jorn:
>
>Could you please test this patch? I build this patch base on Jay's suggestion 
>and I think it could fix your problem.
>
>---
> drivers/net/bonding/bond_main.c| 24 +---
> drivers/net/bonding/bond_options.c |  3 ++-
> 

Re: [PATCH v3 11/13] net: ethernet: ti: cpsw: move platform data and slaves info to cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> These data are common for net devs in dual_emac mode. No need to hold
> it for every priv instance, so move them under cpsw_common.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH v3 06/13] net: ethernet: ti: cpsw: create common struct to hold shared driver data

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> This patch simply create holder for common data and as a start moves
> pdev var to it.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


[Patch net v2 1/5] net_sched: remove the leftover cleanup_a()

2016-08-10 Thread Cong Wang
After refactoring tc_action into tcf_common, we no
longer need to cleanup temporary "actions" in list,
they are permanently stored in the hashtable.

Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
Reported-by: Jamal Hadi Salim 
Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 net/sched/act_api.c | 22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index e4a5f26..cce6986 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -754,16 +754,6 @@ static struct tc_action *tcf_action_get_1(struct net *net, 
struct nlattr *nla,
return ERR_PTR(err);
 }
 
-static void cleanup_a(struct list_head *actions)
-{
-   struct tc_action *a, *tmp;
-
-   list_for_each_entry_safe(a, tmp, actions, list) {
-   list_del(>list);
-   kfree(a);
-   }
-}
-
 static int tca_action_flush(struct net *net, struct nlattr *nla,
struct nlmsghdr *n, u32 portid)
 {
@@ -905,7 +895,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct 
nlmsghdr *n,
return ret;
}
 err:
-   cleanup_a();
+   tcf_action_destroy(, 0);
return ret;
 }
 
@@ -942,15 +932,9 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct 
nlmsghdr *n,
 
ret = tcf_action_init(net, nla, NULL, NULL, ovr, 0, );
if (ret)
-   goto done;
+   return ret;
 
-   /* dump then free all the actions after update; inserted policy
-* stays intact
-*/
-   ret = tcf_add_notify(net, n, , portid);
-   cleanup_a();
-done:
-   return ret;
+   return tcf_add_notify(net, n, , portid);
 }
 
 static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n)
-- 
2.1.0



Re: [PATCH v4 1/1] Add timer to handle OOM situations

2016-08-10 Thread Stefan Hajnoczi
On Thu, Aug 04, 2016 at 04:09:11PM +0200, ggar...@abra.uab.cat wrote:
> From: Gerard Garcia 
> 
> Set up a rx timer to avoid packets being discarded when there is not 
> available memory in the host.
> 
> Signed-off-by: Gerard Garcia 
> 
> ---
> v4:
>  * Fix style.
> 
> v3:
>  * Avoid race condition when freeing timer.
> 
> v2:
> * Use of ERR_PTR/PTR_ERR/IS_ERR
> * Timer cleaned on device release.
> * Do not process more packets on error.
> 
>  drivers/vhost/vsock.c | 49 -
>  1 file changed, 40 insertions(+), 9 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


[PATCH 2/2] net: sched: convert qdisc linked list to hashtable

2016-08-10 Thread Jiri Kosina
From: Jiri Kosina 

Convert the per-device linked list into a hashtable. The primary 
motivation for this change is that currently, we're not tracking all the 
qdiscs in hierarchy (e.g. excluding default qdiscs), as the lookup 
performed over the linked list by qdisc_match_from_root() is rather 
expensive.

The ultimate goal is to get rid of hidden qdiscs completely, which will 
bring much more determinism in user experience.

Reviewed-by: Cong Wang 
Signed-off-by: Jiri Kosina 
---
 include/linux/netdevice.h |  4 
 include/net/pkt_sched.h   |  4 ++--
 include/net/sch_generic.h |  2 +-
 net/core/dev.c|  3 +++
 net/sched/sch_api.c   | 23 +--
 net/sched/sch_generic.c   |  8 +---
 net/sched/sch_mq.c|  2 +-
 net/sched/sch_mqprio.c|  2 +-
 8 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f45929c..17c6499 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -52,6 +52,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct netpoll_info;
 struct device;
@@ -1778,6 +1779,9 @@ struct net_device {
unsigned intnum_tx_queues;
unsigned intreal_num_tx_queues;
struct Qdisc*qdisc;
+#ifdef CONFIG_NET_SCHED
+   DECLARE_HASHTABLE   (qdisc_hash, 4);
+#endif
unsigned long   tx_queue_len;
spinlock_t  tx_global_lock;
int watchdog_timeo;
diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h
index fea53f4..8ba11b4 100644
--- a/include/net/pkt_sched.h
+++ b/include/net/pkt_sched.h
@@ -90,8 +90,8 @@ int unregister_qdisc(struct Qdisc_ops *qops);
 void qdisc_get_default(char *id, size_t len);
 int qdisc_set_default(const char *id);
 
-void qdisc_list_add(struct Qdisc *q);
-void qdisc_list_del(struct Qdisc *q);
+void qdisc_hash_add(struct Qdisc *q);
+void qdisc_hash_del(struct Qdisc *q);
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle);
 struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle);
 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 62d5531..26f5cb3 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -67,7 +67,7 @@ struct Qdisc {
u32 limit;
const struct Qdisc_ops  *ops;
struct qdisc_size_table __rcu *stab;
-   struct list_headlist;
+   struct hlist_node   hash;
u32 handle;
u32 parent;
int (*reshape_fail)(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index 904ff43..d3736d5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7511,6 +7511,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, 
const char *name,
INIT_LIST_HEAD(>all_adj_list.lower);
INIT_LIST_HEAD(>ptype_all);
INIT_LIST_HEAD(>ptype_specific);
+#ifdef CONFIG_NET_SCHED
+   hash_init(dev->qdisc_hash);
+#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
 
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index ddf047d..c093d32 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -265,33 +266,33 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc 
*root, u32 handle)
root->handle == handle)
return root;
 
-   list_for_each_entry_rcu(q, >list, list) {
+   hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, 
handle) {
if (q->handle == handle)
return q;
}
return NULL;
 }
 
-void qdisc_list_add(struct Qdisc *q)
+void qdisc_hash_add(struct Qdisc *q)
 {
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
struct Qdisc *root = qdisc_dev(q)->qdisc;
 
WARN_ON_ONCE(root == _qdisc);
ASSERT_RTNL();
-   list_add_tail_rcu(>list, >list);
+   hash_add_rcu(qdisc_dev(q)->qdisc_hash, >hash, q->handle);
}
 }
-EXPORT_SYMBOL(qdisc_list_add);
+EXPORT_SYMBOL(qdisc_hash_add);
 
-void qdisc_list_del(struct Qdisc *q)
+void qdisc_hash_del(struct Qdisc *q)
 {
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
ASSERT_RTNL();
-   list_del_rcu(>list);
+   hash_del_rcu(>hash);
}
 }
-EXPORT_SYMBOL(qdisc_list_del);
+EXPORT_SYMBOL(qdisc_hash_del);
 
 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
 {
@@ -1004,7 +1005,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue 
*dev_queue,
goto err_out4;
}
 
-   

Re: [PATCH RESEND] net: can: Introduce MEN 16Z192-00 CAN controller driver

2016-08-10 Thread Oliver Hartkopp

Hi Andreas,

On 08/09/2016 08:10 AM, Andreas Werner wrote:

On Mon, Aug 08, 2016 at 04:35:34PM +0200, Wolfgang Grandegger wrote:



You specify here one echo_skb but it's not used anywhere. Local loopback
seems not to be implemented.



Agree with you, will set it to "0".


No, the local loopback is mandatory!



Hm ok, but if i check alloc_candev() in drivers/net/can/dev.c
it is not mandatory.


It is.

Even those drivers that appear to use zero echo skbs in alloc_candev() 
implement the echo functionality correctly.


Just check 'git grep IFF_ECHO'. Even grcan.c and janz-ican3.c have 
IFF_ECHO set - but implement it in a different way without using the 
provided mechanism from dev.c.



In the Documentation/networking/can.txt
there is also a "should" and a fallback mechanism if the driver
does not support the local loopback.


But this fallback mechanism is bad - really bad!

E.g. the slcan.c driver sends a stream of CAN frames without knowing 
whether the frames ever hit the wire. The slcan driver is more or less for 
hobby users. The CAN frame echo with IFF_ECHO gives a correct 
representation of the traffic on the wire - including the correct 
timestamps.


You really want to know whether a CAN frame was sent correctly on the 
bus instead of getting some shortcut info from inside the network layer.

.


Well, s/driver/hardware/ ! Local loopback is the preferred mechanism.



Sure...


I'm currently ok with this fallback mechanism.


/me not.


Anyway I am not sure that the driver can handle the echo skb correctly.
If i understand it correctly, the can_get_echo_skb() is normally called
on a "TX done IRQ" to get the skb and loop it back.


ack.


I do not have such a "TX done IRQ" and have not implemented
and added the local loopback.


I'm not really sure how grcan.c and janz-ican3.c implemented the echo 
functionality but they must have faced a similar situation.


A local loopback inside the CAN controller which is generated after 
successful transmit is an excellent implementation with excellent 
timestamps. The only problem for you is to detect the looped CAN frames 
and match them to the skb pointer of the outgoing frame to 'receive' the 
correct echo skb.


When you send CAN frames to an unconnected CAN bus it can't be sent out 
due to the missing acknowledge from other nodes. So when you send frames 
and you get echo frames due to the fallback mode your cool CAN 
controller degrades to slcan level.


Regards,
Oliver

ps. Do you have any URL where one can get the MEN 16Z192 spec?


Re: [PATCH v2 1/1] Fix unbound rx buffer

2016-08-10 Thread Stefan Hajnoczi
On Thu, Aug 04, 2016 at 04:09:57PM +0200, ggar...@abra.uab.cat wrote:
> From: Gerard Garcia 
> 
> Reset connection and close rx socket when the sender is ignoring our 
> announced available buffer.
> 
> Signed-off-by: Gerard Garcia 
> 
> ---
> v2:
>  * Get vvs->rx_lock lock before checking if next packet is going to
> overflow the rx buffer.
> 
>  net/vmw_vsock/virtio_transport_common.c | 20 +++-
>  1 file changed, 15 insertions(+), 5 deletions(-)

Acked-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


[Patch net v2 3/5] net_sched: fix a typo in tc_for_each_action()

2016-08-10 Thread Cong Wang
It is harmless because all users pass 'a' to this macro.

Fixes: 00175aec941e ("net/sched: Macro instead of CONFIG_NET_CLS_ACT ifdef")
Cc: Amir Vadai 
Signed-off-by: Cong Wang 
---
 include/net/act_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 41e6a24..f53ee9d 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -193,7 +193,7 @@ int tcf_action_copy_stats(struct sk_buff *, struct 
tc_action *, int);
(list_empty(&(_exts)->actions))
 
 #define tc_for_each_action(_a, _exts) \
-   list_for_each_entry(a, &(_exts)->actions, list)
+   list_for_each_entry(_a, &(_exts)->actions, list)
 
 #define tc_single_action(_exts) \
(list_is_singular(&(_exts)->actions))
-- 
2.1.0



[Patch net v2 5/5] net_sched: convert tcf_exts from list to pointer array

2016-08-10 Thread Cong Wang
As pointed out by Jamal, an action could be shared by
multiple filters, so we can't use list to chain them
any more after we get rid of the original tc_action.
Instead, we could just save pointers to these actions
in tcf_exts, since they are refcount'ed, so convert
the list to an array of pointers.

The "ugly" part is the action API still accepts list
as a parameter, I just introduce a helper function to
convert the array of pointers to a list, instead of
relying on the C99 feature to iterate the array.

Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common")
Reported-by: Jamal Hadi Salim 
Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 --
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c  |  4 +-
 include/net/pkt_cls.h   | 42 +---
 net/sched/cls_api.c | 51 +
 5 files changed, 79 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 5418c69a..6ac1254 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8390,12 +8390,14 @@ static int parse_tc_actions(struct ixgbe_adapter 
*adapter,
struct tcf_exts *exts, u64 *action, u8 *queue)
 {
const struct tc_action *a;
+   LIST_HEAD(actions);
int err;
 
if (tc_no_actions(exts))
return -EINVAL;
 
-   tc_for_each_action(a, exts) {
+   tcf_exts_to_list(exts, );
+   list_for_each_entry(a, , list) {
 
/* Drop action */
if (is_tcf_gact_shot(a)) {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 0f19b01..dc8b1cb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -318,6 +318,7 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
u32 *action, u32 *flow_tag)
 {
const struct tc_action *a;
+   LIST_HEAD(actions);
 
if (tc_no_actions(exts))
return -EINVAL;
@@ -325,7 +326,8 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
*flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
*action = 0;
 
-   tc_for_each_action(a, exts) {
+   tcf_exts_to_list(exts, );
+   list_for_each_entry(a, , list) {
/* Only support a single action per rule */
if (*action)
return -EINVAL;
@@ -362,13 +364,15 @@ static int parse_tc_fdb_actions(struct mlx5e_priv *priv, 
struct tcf_exts *exts,
u32 *action, u32 *dest_vport)
 {
const struct tc_action *a;
+   LIST_HEAD(actions);
 
if (tc_no_actions(exts))
return -EINVAL;
 
*action = 0;
 
-   tc_for_each_action(a, exts) {
+   tcf_exts_to_list(exts, );
+   list_for_each_entry(a, , list) {
/* Only support a single action per rule */
if (*action)
return -EINVAL;
@@ -503,6 +507,7 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
struct mlx5e_tc_flow *flow;
struct tc_action *a;
struct mlx5_fc *counter;
+   LIST_HEAD(actions);
u64 bytes;
u64 packets;
u64 lastuse;
@@ -518,7 +523,8 @@ int mlx5e_stats_flower(struct mlx5e_priv *priv,
 
mlx5_fc_query_cached(counter, , , );
 
-   tc_for_each_action(a, f->exts)
+   tcf_exts_to_list(f->exts, );
+   list_for_each_entry(a, , list)
tcf_action_stats_update(a, bytes, packets, lastuse);
 
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index e1b8f62..9130cb2 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -1149,6 +1149,7 @@ static int mlxsw_sp_port_add_cls_matchall(struct 
mlxsw_sp_port *mlxsw_sp_port,
  bool ingress)
 {
const struct tc_action *a;
+   LIST_HEAD(actions);
int err;
 
if (!tc_single_action(cls->exts)) {
@@ -1156,7 +1157,8 @@ static int mlxsw_sp_port_add_cls_matchall(struct 
mlxsw_sp_port *mlxsw_sp_port,
return -ENOTSUPP;
}
 
-   tc_for_each_action(a, cls->exts) {
+   tcf_exts_to_list(cls->exts, );
+   list_for_each_entry(a, , list) {
if (!is_tcf_mirred_mirror(a) || protocol != htons(ETH_P_ALL))
return -ENOTSUPP;
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 00dd5c4..d882b36 100644
--- a/include/net/pkt_cls.h
+++ 

[Patch net v2 4/5] net_sched: move tc offload macros to pkt_cls.h

2016-08-10 Thread Cong Wang
struct tcf_exts belongs to filters, should not be visible
to plain tc actions.

Cc: Ido Schimmel 
Signed-off-by: Cong Wang 
---
 include/net/act_api.h | 19 +++
 include/net/pkt_cls.h | 19 +++
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index f53ee9d..870332f 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -189,30 +189,17 @@ int tcf_action_dump_old(struct sk_buff *skb, struct 
tc_action *a, int, int);
 int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
 int tcf_action_copy_stats(struct sk_buff *, struct tc_action *, int);
 
-#define tc_no_actions(_exts) \
-   (list_empty(&(_exts)->actions))
-
-#define tc_for_each_action(_a, _exts) \
-   list_for_each_entry(_a, &(_exts)->actions, list)
-
-#define tc_single_action(_exts) \
-   (list_is_singular(&(_exts)->actions))
+#endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
   u64 packets, u64 lastuse)
 {
+#ifdef CONFIG_NET_CLS_ACT
if (!a->ops->stats_update)
return;
 
a->ops->stats_update(a, bytes, packets, lastuse);
+#endif
 }
 
-#else /* CONFIG_NET_CLS_ACT */
-
-#define tc_no_actions(_exts) true
-#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
-#define tc_single_action(_exts) false
-#define tcf_action_stats_update(a, bytes, packets, lastuse)
-
-#endif /* CONFIG_NET_CLS_ACT */
 #endif
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6f8d653..00dd5c4 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -130,6 +130,25 @@ tcf_exts_exec(struct sk_buff *skb, struct tcf_exts *exts,
return 0;
 }
 
+#ifdef CONFIG_NET_CLS_ACT
+
+#define tc_no_actions(_exts) \
+   (list_empty(&(_exts)->actions))
+
+#define tc_for_each_action(_a, _exts) \
+   list_for_each_entry(_a, &(_exts)->actions, list)
+
+#define tc_single_action(_exts) \
+   (list_is_singular(&(_exts)->actions))
+
+#else /* CONFIG_NET_CLS_ACT */
+
+#define tc_no_actions(_exts) true
+#define tc_for_each_action(_a, _exts) while ((void)(_a), 0)
+#define tc_single_action(_exts) false
+
+#endif /* CONFIG_NET_CLS_ACT */
+
 int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
  struct nlattr **tb, struct nlattr *rate_tlv,
  struct tcf_exts *exts, bool ovr);
-- 
2.1.0



[Patch net v2 0/5] net_sched: tc action fixes and updates

2016-08-10 Thread Cong Wang
This patchset fixes a few regressions caused by the previous
code refactor. Thanks to Jamal for catching them!

Note, patch 3/5 and 4/5 are not strictly necessary for this patchset,
I just want to carry them together.

---
v2: replace flex_array with regular dynamic array
keep tcf_action_stats_update() in act_api.h
fix macro typos found by Amir

Cong Wang (5):
  net_sched: remove the leftover cleanup_a()
  net_sched: remove an unnecessary list_del()
  net_sched: fix a typo in tc_for_each_action()
  net_sched: move tc offload macros to pkt_cls.h
  net_sched: convert tcf_exts from list to pointer array

 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c   |  4 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 12 --
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c  |  4 +-
 include/net/act_api.h   | 19 ++---
 include/net/pkt_cls.h   | 43 ++---
 net/sched/act_api.c | 23 ++-
 net/sched/cls_api.c | 51 +
 7 files changed, 95 insertions(+), 61 deletions(-)

-- 
2.1.0



[Patch net v2 2/5] net_sched: remove an unnecessary list_del()

2016-08-10 Thread Cong Wang
This list_del() for tc action is not needed actually,
because we only use this list to chain bulk operations,
therefore should not be carried for later operations.

Fixes: ec0595cc4495 ("net_sched: get rid of struct tcf_common")
Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 net/sched/act_api.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index cce6986..b4c7be3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -64,7 +64,6 @@ int __tcf_hash_release(struct tc_action *p, bool bind, bool 
strict)
if (p->tcfa_bindcnt <= 0 && p->tcfa_refcnt <= 0) {
if (p->ops->cleanup)
p->ops->cleanup(p, bind);
-   list_del(>list);
tcf_hash_destroy(p->hinfo, p);
ret = ACT_P_DELETED;
}
-- 
2.1.0



Re: [PATCH RESEND net-next 13/15] smc: receive data from RMBE

2016-08-10 Thread Ursula Braun



On 08/09/2016 11:32 PM, David Miller wrote:

From: Ursula Braun 
Date: Tue,  9 Aug 2016 12:12:58 +0200


+   xchg(>rx_curs_confirmed.acurs,
+smc_curs_read(conn->local_tx_ctrl.cons.acurs));


Why in the world do you need to use xchg() in all of these places?

It makes no sense whatsoever, especially since you don't even check
the return value.

If you need the operation to be atomic, then you have to check the
return value and do something to recover if something else beat
you to the xchg() and put something else into the location.

Otherwise, you therefore don't need it be atomic and can avoid
this expensive operation and just store the value normally.

Reviewing my xchg() usages, I really detected some paranoid usages, that 
I am going to remove. But there are still usages (and 
conn->rx_curs_confirmed is one of them), where I need an 8-byte cursor 
field to be read and written atomically, even though I do not care 
whether the write operation has been beaten or not. But I do care that 
reading the cursor does not return a partially updated cursor. Isn't 
xchg() a possible solution in this case?




Re: [Patch net 0/5] net_sched: tc action fixes and updates

2016-08-10 Thread Cong Wang
On Wed, Aug 10, 2016 at 7:34 AM, Jamal Hadi Salim  wrote:
> On 16-08-08 04:46 PM, Cong Wang wrote:
>>
>> This patchset fixes several regressions caused by the previous
>> code refactor. Thanks to Jamal for catching them!
>>
>
> Cong,
>
> Good news: oops gone. I havent done more testing than I did
> before; but looks good so far.
>
> Bad news: You have introduced a performance regression which is
> noticeable at high speed.
>
> tcf_exts_exec() is the culprit - and conversion to from flexarray
> to linked list in the fast problem to be specific.

Ah, this reminds me that I don't have to use flex_array, initially
I thought the tcf_exts could hold as many actions as it wants,
but actually there is an upper bound, TCA_ACT_MAX_PRIO.
IOW, a regular dynamic array is just enough here.

I just replaced the flex_array with a regular one, it works fine
for me too, at least no crash with all of my test cases.

Please try v2, since you have more test cases than I do.
Or it would be great if you can share your test cases with
me or us.

Be patient, every big change could have regression. :)

Thanks.


Re: [PATCH 2/2] ravb: add sleep PM suspend/resume support

2016-08-10 Thread Niklas Söderlund
On 2016-08-10 13:40:51 +0300, Sergei Shtylyov wrote:
> Hello.
> 
> On 8/3/2016 4:56 PM, Niklas Söderlund wrote:
> 
> > The interface would not function after the system had been woken up
> > after a suspend (echo mem > /sys/power/state) cycle. The
> > reason for this is that all device registers have been reset to their
> > default values. This patch adds sleep suspend and resume functions that
> > detach the interface at suspend and restore the registers and reattach
> > the interface at resume.
> > 
> > Only the registers that are only configured at probe time need to be
> > explicitly restored by the resume handler. All other registers are
> > reconfigured by either reopening the device in the resume handler (if
> > the device was running when the system was suspended) or when the
> > interface is opened by a user at a later time.
> > 
> > Signed-off-by: Niklas Söderlund 
> > ---
> >  drivers/net/ethernet/renesas/ravb_main.c | 72 
> > 
> >  1 file changed, 64 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
> > b/drivers/net/ethernet/renesas/ravb_main.c
> > index da8da86..1de55c9 100644
> > --- a/drivers/net/ethernet/renesas/ravb_main.c
> > +++ b/drivers/net/ethernet/renesas/ravb_main.c
> [...]
> > @@ -2111,6 +2166,7 @@ static int ravb_runtime_nop(struct device *dev)
> >  }
> > 
> >  static const struct dev_pm_ops ravb_dev_pm_ops = {
> > +   SET_SYSTEM_SLEEP_PM_OPS(ravb_runtime_suspend, ravb_runtime_resume)
> 
>Why in the world you used runtime_ in the usual suspend/resume method
> names?! Since DaveM have already taken the patch, please send a follow-up
> patch to remove that infix.

Sorry about that, stupid error on my part. Will send a follow-up patch.  
Thanks for finding it.

> 
> > SET_RUNTIME_PM_OPS(ravb_runtime_nop, ravb_runtime_nop, NULL)
> >  };
> > 
> 
> MBR, Sergei
> 

-- 
Regards,
Niklas Söderlund


Re: [PATCHv2 3/4] pci: Determine actual VPD size on first access

2016-08-10 Thread Alexander Duyck
On Tue, Aug 9, 2016 at 5:03 PM, Benjamin Herrenschmidt
 wrote:
> On Tue, 2016-08-09 at 11:12 -0700, Alexander Duyck wrote:
>>
>> The PCI spec is what essentially assumes that there is only one block.
>> If I am not mistaken in the case of this device the second block here
>> actually contains device configuration data, not actual VPD data.  The
>> issue here is that the second block is being accessed as VPD when it
>> isn't.
>
> Devices do funny things with config space, film at 11. VFIO trying to
> be the middle man and intercept/interpret things is broken, cannot work,
> will never work, will just result in lots and lots of useless code, but
> I've been singing that song for too long and nobody seems to care...
>
>> > > # Large item 42 bytes; name 0x2 Identifier String
>> > #002d Large item 74 bytes; name 0x10
>> > #007a Small item 1 bytes; name 0xf End Tag
>> > ---
>> > #0c00 Large item 16 bytes; name 0x2 Identifier String
>> > #0c13 Large item 234 bytes; name 0x10
>> > #0d00 Large item 252 bytes; name 0x11
>> > #0dff Small item 0 bytes; name 0xf End Tag
>>
>> The second block here is driver proprietary setup bits.
>
> Right. They happen to be in VPD on this device. They can be elsewhere on
> other devices. In between capabilities on some, in vendor caps on others...
>
>> > > The cxgb3 driver is reading the second bit starting from 0xc00 but since
>> > the size is wrongly detected as 0x7c, VFIO blocks access beyond it and the
>> > guest driver fails to probe.
>> >
>> > I also cannot find a clause in the PCI 3.0 spec saying that there must be
>> > just a single block, is it there?
>>
>> > The problem is we need to be able to parse it.
>
> We can parse the standard part for generic stuff like inventory tools
> or lsvpd, but we shouldn't get in the way of the driver poking at its
> device.

If we add the quirk to the kernel that reports the VPD for this device
is the actual size of both blocks then we wouldn't be blocking the VPD
access like we currently are.

The problem is if we don't do this it becomes possible for a guest to
essentially cripple a device on the host by just accessing VPD regions
that aren't actually viable on many devices.  We are much better off
in terms of security and stability if we restrict access to what
should be accessible.  In this case what has happened is that the
vendor threw in an extra out-of-spec block and just expected it to
work.  In order to work around it we just need to add a small function
to drivers/pci/quirks.c that would update the VPD size reported so
that it matches what the hardware is actually providing instead of
what we can determine based on the VPD layout.

Really working around something like this is not much different than
what we would have to do if the vendor had stuffed the data in some
reserved section of their PCI configuration space.  We end up needing
to add special quirks any time a vendor goes out-of-spec for some
one-off configuration interface that only they are ever going to use.

- Alex


[PATCH 15/21] net: thunderx: Improvement for MBX interface debug messages

2016-08-10 Thread sunil . kovvuri
From: Radoslaw Biernacki 

Adding debug messages in case of NACK for a mailbox message, also
did small cleanups.

Signed-off-by: Radoslaw Biernacki 
Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/nic_main.c   | 16 ++--
 drivers/net/ethernet/cavium/thunder/nicvf_main.c |  8 ++--
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 8ed8ecd..668cfd9 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -831,7 +831,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
mbx_addr += sizeof(u64);
}
 
-   dev_dbg(>pdev->dev, "%s: Mailbox msg %d from VF%d\n",
+   dev_dbg(>pdev->dev, "%s: Mailbox msg 0x%02x from VF%d\n",
__func__, mbx.msg.msg, vf);
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
@@ -841,8 +841,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
nic->duplex[vf] = 0;
nic->speed[vf] = 0;
}
-   ret = 1;
-   break;
+   goto unlock;
case NIC_MBOX_MSG_QS_CFG:
reg_addr = NIC_PF_QSET_0_127_CFG |
   (mbx.qs.num << NIC_QS_ID_SHIFT);
@@ -891,8 +890,10 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
nic_tx_channel_cfg(nic, mbx.qs.num, );
break;
case NIC_MBOX_MSG_SET_MAC:
-   if (vf >= nic->num_vf_en)
+   if (vf >= nic->num_vf_en) {
+   ret = -1; /* NACK */
break;
+   }
lmac = mbx.mac.vf_id;
bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lmac]);
@@ -947,10 +948,13 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
break;
}
 
-   if (!ret)
+   if (!ret) {
nic_mbx_send_ack(nic, vf);
-   else if (mbx.msg.msg != NIC_MBOX_MSG_READY)
+   } else if (mbx.msg.msg != NIC_MBOX_MSG_READY) {
+   dev_err(>pdev->dev, "NACK for MBOX 0x%02x from VF %d\n",
+   mbx.msg.msg, vf);
nic_mbx_send_nack(nic, vf);
+   }
 unlock:
nic->mbx_lock[vf] = false;
 }
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index dbb61a8..300416a 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -144,15 +144,19 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx 
*mbx)
 
/* Wait for previous message to be acked, timeout 2sec */
while (!nic->pf_acked) {
-   if (nic->pf_nacked)
+   if (nic->pf_nacked) {
+   netdev_err(nic->netdev,
+  "PF NACK to mbox msg 0x%02x from VF%d\n",
+  (mbx->msg.msg & 0xFF), nic->vf_id);
return -EINVAL;
+   }
msleep(sleep);
if (nic->pf_acked)
break;
timeout -= sleep;
if (!timeout) {
netdev_err(nic->netdev,
-  "PF didn't ack to mbox msg %d from VF%d\n",
+  "PF didn't ACK to mbox msg 0x%02x from 
VF%d\n",
   (mbx->msg.msg & 0xFF), nic->vf_id);
return -EBUSY;
}
-- 
2.7.4



[PATCH v6 3/3] net: phy: Add gmiitorgmii converter support

2016-08-10 Thread Kedareswara rao Appana
This patch adds support for gmiitorgmii converter.

The GMII to RGMII IP core provides the Reduced Gigabit Media
Independent Interface (RGMII) between Ethernet physical media
devices and the Gigabit Ethernet controller. This core can
switch dynamically between the three different speed modes of
operation by configuring the converter register through mdio write.

MDIO interface is used to set operating speed of Ethernet MAC.

This converter sits between the MAC and the external phy
MAC <==> GMII2RGMII <==> RGMII_PHY

Signed-off-by: Kedareswara rao Appana 
---
Thanks a lot Andrew for your inputs.
Changes for v6:
--> Don't force phy to default enabled as suggested by Florian.
--> Fix Mask value as suggested by Florian.
--> Used mdio_module_driver as suggested by Florian.
--> Remove switch case and used if-else conditions for phy speed
checking as suggested by Florian.
Changes for v5:
--> Fixed return values in the probe as suggested by punnaiah.
--> Added a mask for the converter speed as suggested by punnaiah.
Changes for v4:
--> Updated phydev speed for all 3 speeds as suggested by zhuyj.
Changes for v3:
--> Updated the driver as suggested by Andrew.
Changes for v2:
--> Passed struct xphy pointer directly to the fix_mac_speed
API as suggested by the Florian.
--> Added checks for the phy-node fail case as suggested
by the Florian

 drivers/net/phy/Kconfig |   7 +++
 drivers/net/phy/Makefile|   1 +
 drivers/net/phy/xilinx_gmii2rgmii.c | 109 
 3 files changed, 117 insertions(+)
 create mode 100644 drivers/net/phy/xilinx_gmii2rgmii.c

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 1b534ea..d66133bf 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -312,6 +312,13 @@ config MICROSEMI_PHY
 ---help---
   Currently supports the VSC8531 and VSC8541 PHYs
 
+config XILINX_GMII2RGMII
+   tristate "Xilinx GMII2RGMII converter driver"
+   ---help---
+ This driver support xilinx GMII to RGMII IP core it provides
+ the Reduced Gigabit Media Independent Interface(RGMII) between
+ Ethernet physical media devices and the Gigabit Ethernet controller.
+
 endif # PHYLIB
 
 config MICREL_KS8995MA
diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
index a713bd4..73d65ce 100644
--- a/drivers/net/phy/Makefile
+++ b/drivers/net/phy/Makefile
@@ -50,3 +50,4 @@ obj-$(CONFIG_MDIO_BCM_IPROC)  += mdio-bcm-iproc.o
 obj-$(CONFIG_INTEL_XWAY_PHY)   += intel-xway.o
 obj-$(CONFIG_MDIO_HISI_FEMAC)  += mdio-hisi-femac.o
 obj-$(CONFIG_MDIO_XGENE)   += mdio-xgene.o
+obj-$(CONFIG_XILINX_GMII2RGMII) += xilinx_gmii2rgmii.o
diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c 
b/drivers/net/phy/xilinx_gmii2rgmii.c
new file mode 100644
index 000..8e980ad
--- /dev/null
+++ b/drivers/net/phy/xilinx_gmii2rgmii.c
@@ -0,0 +1,109 @@
+/* Xilinx GMII2RGMII Converter driver
+ *
+ * Copyright (C) 2016 Xilinx, Inc.
+ *
+ * Author: Kedareswara rao Appana 
+ *
+ * Description:
+ * This driver is developed for Xilinx GMII2RGMII Converter
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define XILINX_GMII2RGMII_REG  0x10
+#define XILINX_GMII2RGMII_SPEED_MASK   (BMCR_SPEED1000 | BMCR_SPEED100)
+
+struct gmii2rgmii {
+   struct phy_device *phy_dev;
+   struct phy_driver *phy_drv;
+   struct phy_driver conv_phy_drv;
+   int addr;
+};
+
+static int xgmiitorgmii_read_status(struct phy_device *phydev)
+{
+   struct gmii2rgmii *priv = phydev->priv;
+   u16 val = 0;
+
+   priv->phy_drv->read_status(phydev);
+
+   val = mdiobus_read(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG);
+   val &= XILINX_GMII2RGMII_SPEED_MASK;
+
+   if (phydev->speed == SPEED_1000)
+   val |= BMCR_SPEED1000;
+   else if (phydev->speed == SPEED_100)
+   val |= BMCR_SPEED100;
+   else
+   val |= BMCR_SPEED10;
+
+   mdiobus_write(phydev->mdio.bus, priv->addr, XILINX_GMII2RGMII_REG, val);
+
+   return 0;
+}
+
+int xgmiitorgmii_probe(struct mdio_device *mdiodev)
+{
+   struct device *dev = >dev;
+   struct device_node *np = dev->of_node, *phy_node;
+   struct gmii2rgmii *priv;
+
+   priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+   if (!priv)
+   return -ENOMEM;
+
+   phy_node = of_parse_phandle(np, "phy-handle", 0);
+   if 

[PATCH v6 2/3] Documentation: DT: net: Add Xilinx gmiitorgmii converter device tree binding documentation

2016-08-10 Thread Kedareswara rao Appana
Device-tree binding documentation for xilinx gmiitorgmii converter.

Signed-off-by: Kedareswara rao Appana 
---
Changes for v6:
---> Removed mdio description as suggested by Florian.
Changes for v5:
---> Fixed Indentation in the example as suggested by Michal.
Changes for v4:
--> Modified compatible as suggested by Rob.
--> Removed underscores from the converter node name as suggested by Rob.
Changes for v3:
--> None.
Changes for v2:
--> New patch

 .../devicetree/bindings/net/xilinx_gmii2rgmii.txt  | 35 ++
 1 file changed, 35 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt

diff --git a/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt 
b/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
new file mode 100644
index 000..038dda4
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/xilinx_gmii2rgmii.txt
@@ -0,0 +1,35 @@
+XILINX GMIITORGMII Converter Driver Device Tree Bindings
+
+
+The Gigabit Media Independent Interface (GMII) to Reduced Gigabit Media
+Independent Interface (RGMII) core provides the RGMII between RGMII-compliant
+Ethernet physical media devices (PHY) and the Gigabit Ethernet controller.
+This core can be used in all three modes of operation (10/100/1000 Mb/s).
+The Management Data Input/Output (MDIO) interface is used to configure the
+speed of operation. This core can switch dynamically between the three
+different speed modes by configuring the converter register through mdio write.
+
+This converter sits between the ethernet MAC and the external phy.
+MAC <==> GMII2RGMII <==> RGMII_PHY
+
+For more details about mdio please refer to the phy.txt file in the same directory.
+
+Required properties:
+- compatible   : Should be "xlnx,gmii-to-rgmii-1.0"
+- reg  : The ID number for the phy, usually a small integer
+- phy-handle   : Should point to the external phy device.
+ See ethernet.txt file in the same directory.
+
+Example:
+   mdio {
+   #address-cells = <1>;
+   #size-cells = <0>;
+   phy: ethernet-phy@0 {
+   ..
+   };
+   gmiitorgmii: gmiitorgmii@8 {
+   compatible = "xlnx,gmii-to-rgmii-1.0";
+   reg = <8>;
+   phy-handle = <&phy>;
+   };
+   };
-- 
2.1.2



[PATCH] net/xfrm_input: fix possible NULL deref of tunnel.ip6->parms.i_key

2016-08-10 Thread Alexey Kodanev
Running LTP 'icmp-uni-basic.sh -6 -p ipcomp -m tunnel' test over
openvswitch + veth can trigger kernel panic:

  BUG: unable to handle kernel NULL pointer dereference
  at 00e0 IP: [] xfrm_input+0x82/0x750
  ...
  [] xfrm6_rcv_spi+0x1e/0x20
  [] xfrm6_tunnel_rcv+0x42/0x50 [xfrm6_tunnel]
  [] tunnel6_rcv+0x3e/0x8c [tunnel6]
  [] ip6_input_finish+0xd5/0x430
  [] ip6_input+0x33/0x90
  [] ip6_rcv_finish+0xa5/0xb0
  ...

It seems that tunnel.ip6 can have garbage values and is also dereferenced
without a proper check; only tunnel.ip4 is being verified. Fix it by
adding one more if block for AF_INET6 and initialize tunnel.ip6 with NULL
inside xfrm6_rcv_spi() (which is similar to xfrm4_rcv_spi()).

Fixes: 049f8e2 ("xfrm: Override skb->mark with tunnel->parm.i_key in 
xfrm_input")

Signed-off-by: Alexey Kodanev 
---
 net/ipv6/xfrm6_input.c |1 +
 net/xfrm/xfrm_input.c  |   14 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 0eaab1f..00a2d40 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -23,6 +23,7 @@ int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff 
*skb)
 
 int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi)
 {
+   XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
return xfrm_input(skb, nexthdr, spi, 0);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 1c4ad47..6e3f025 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -207,15 +207,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
family = XFRM_SPI_SKB_CB(skb)->family;
 
/* if tunnel is present override skb->mark value with tunnel i_key */
-   if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) {
-   switch (family) {
-   case AF_INET:
+   switch (family) {
+   case AF_INET:
+   if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4)
mark = 
be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key);
-   break;
-   case AF_INET6:
+   break;
+   case AF_INET6:
+   if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6)
mark = 
be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key);
-   break;
-   }
+   break;
}
 
/* Allocate new secpath or COW existing one. */
-- 
1.7.1



[PATCH 1/1 linux-next] net: hns: fix typo in g_gmac_stats_string[]

2016-08-10 Thread Fabian Frederick
s/gamc/gmac/

Signed-off-by: Fabian Frederick 
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c 
b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
index 1235c7f..1e1eb92 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_gmac.c
@@ -17,7 +17,7 @@ static const struct mac_stats_string g_gmac_stats_string[] = {
{"gmac_rx_octets_total_ok", MAC_STATS_FIELD_OFF(rx_good_bytes)},
{"gmac_rx_octets_bad", MAC_STATS_FIELD_OFF(rx_bad_bytes)},
{"gmac_rx_uc_pkts", MAC_STATS_FIELD_OFF(rx_uc_pkts)},
-   {"gamc_rx_mc_pkts", MAC_STATS_FIELD_OFF(rx_mc_pkts)},
+   {"gmac_rx_mc_pkts", MAC_STATS_FIELD_OFF(rx_mc_pkts)},
{"gmac_rx_bc_pkts", MAC_STATS_FIELD_OFF(rx_bc_pkts)},
{"gmac_rx_pkts_64octets", MAC_STATS_FIELD_OFF(rx_64bytes)},
{"gmac_rx_pkts_65to127", MAC_STATS_FIELD_OFF(rx_65to127)},
-- 
2.8.1



[iproute PATCH] ip-route: Pretty-print expired routes

2016-08-10 Thread Phil Sutter
Instead of printing 'expires -23sec' for expired (but not yet garbage
collected) routes, print 'expired 23sec' instead.

Signed-off-by: Phil Sutter 
---
 ip/iproute.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index c52294d298210..a89a26d68be0f 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -305,6 +305,14 @@ static void print_rtax_features(FILE *fp, unsigned int 
features)
fprintf(fp, " 0x%x", of);
 }
 
+static void print_expires(FILE *fp, __s32 expires, int hz)
+{
+   if (expires > 0)
+   fprintf(fp, " expires %dsec", expires/hz);
+   else
+   fprintf(fp, " expired %dsec", -expires/hz);
+}
+
 int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 {
FILE *fp = (FILE *)arg;
@@ -502,7 +510,7 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (!hz)
hz = get_user_hz();
if (ci->rta_expires != 0)
-   fprintf(fp, " expires %dsec", 
ci->rta_expires/hz);
+   print_expires(fp, ci->rta_expires, hz);
if (ci->rta_error != 0)
fprintf(fp, " error %d", ci->rta_error);
if (show_stats) {
@@ -530,7 +538,7 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (r->rtm_flags & RTM_F_CLONED)
fprintf(fp, "%scache ", _SL_);
if (ci->rta_expires)
-   fprintf(fp, " expires %dsec", 
ci->rta_expires/hz);
+   print_expires(fp, ci->rta_expires, hz);
if (ci->rta_error != 0)
fprintf(fp, " error %d", ci->rta_error);
if (show_stats) {
-- 
2.8.2



Re: [PATCH v3 08/13] net: ethernet: ti: cpsw: move links on h/w registers to cpsw_common

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> The pointers on h/w registers are common for every cpsw_private
> instance, so no need to hold them for every ndev.
> 
> Signed-off-by: Ivan Khoronzhuk 


Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [PATCH v3 07/13] net: ethernet: ti: cpsw: replace pdev on dev

2016-08-10 Thread Mugunthan V N
On Wednesday 10 August 2016 04:52 AM, Ivan Khoronzhuk wrote:
> No need to hold pdev link when only dev is needed.
> This allows to simplify a bunch of cpsw->pdev->dev now and further.
> 
> Signed-off-by: Ivan Khoronzhuk 

Reviewed-by: Mugunthan V N 

Regards
Mugunthan V N


Re: [net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-10 Thread Daniel Borkmann

On 08/10/2016 06:40 AM, Sargun Dhillon wrote:

On Tue, Aug 09, 2016 at 08:52:01PM -0700, Alexei Starovoitov wrote:

On Tue, Aug 09, 2016 at 08:40:05PM -0700, Sargun Dhillon wrote:

On Tue, Aug 09, 2016 at 08:27:32PM -0700, Alexei Starovoitov wrote:

On Tue, Aug 09, 2016 at 06:26:37PM -0700, Sargun Dhillon wrote:

On Tue, Aug 09, 2016 at 06:02:34PM -0700, Alexei Starovoitov wrote:

On Tue, Aug 09, 2016 at 05:55:26PM -0700, Sargun Dhillon wrote:

On Tue, Aug 09, 2016 at 05:23:50PM -0700, Alexei Starovoitov wrote:

On Tue, Aug 09, 2016 at 05:00:12PM -0700, Sargun Dhillon wrote:

This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

This patch also genericizes some of the arraymap fetching logic between the
skb_in_cgroup helper and this new helper.

Signed-off-by: Sargun Dhillon 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
---
  include/linux/bpf.h  | 24 
  include/uapi/linux/bpf.h | 11 +++
  kernel/bpf/arraymap.c|  2 +-
  kernel/bpf/verifier.c|  4 +++-
  kernel/trace/bpf_trace.c | 34 ++
  net/core/filter.c| 11 ---
  6 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..9adf712 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,4 +319,28 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
  void bpf_user_rnd_init_once(void);
  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

+#ifdef CONFIG_CGROUPS
+/* Helper to fetch a cgroup pointer based on index.
+ * @map: a cgroup arraymap
+ * @idx: index of the item you want to fetch
+ *
+ * Returns pointer on success,
+ * Error code if item not found, or out-of-bounds access
+ */
+static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, int idx)
+{
+   struct cgroup *cgrp;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+   if (unlikely(idx >= array->map.max_entries))
+   return ERR_PTR(-E2BIG);
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return ERR_PTR(-EAGAIN);
+
+   return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
  #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..64b1a07 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,

+   /**
+* bpf_current_task_in_cgroup(map, index) - Check cgroup2 membership of 
current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*< 0 error
+*/
+   BPF_FUNC_current_task_in_cgroup,
+
__BPF_FUNC_MAX_ID,
  };

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
  }
  late_initcall(register_perf_event_array_map);

-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
  static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 struct file *map_file /* not used */,
 int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69..80efab8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
-   if (func_id != BPF_FUNC_skb_in_cgroup)
+   if (func_id != BPF_FUNC_skb_in_cgroup &&
+   func_id != BPF_FUNC_current_task_in_cgroup)
goto error;
break;
default:
@@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_current_task_in_cgroup:
case BPF_FUNC_skb_in_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 

[PATCH 07/21] net: thunderx: Support for different LMAC types within BGX

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

On 88xx all LMACs in a BGX will be in same mode but on 81xx
BGX can be split as two and there can be LMACs configured in
different modes.

These changes move lmac_type, lane2serdes fields into per lmac
struct from BGX struct. Got rid of qlm_mode field which has become
redundant with these changes. The number of valid LMACs is now read
from CSRs configured by low level firmware, and figuring out the
same based on QLM mode is discarded.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 224 ++
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |  10 -
 2 files changed, 98 insertions(+), 136 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c 
b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 63a39ac..4497427 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -28,6 +28,9 @@ struct lmac {
struct bgx  *bgx;
int dmac;
u8  mac[ETH_ALEN];
+   u8  lmac_type;
+   u8  lane_to_sds;
+   booluse_training;
boollink_up;
int lmacid; /* ID within BGX */
int lmacid_bd; /* ID on board */
@@ -43,12 +46,8 @@ struct lmac {
 
 struct bgx {
u8  bgx_id;
-   u8  qlm_mode;
struct  lmaclmac[MAX_LMAC_PER_BGX];
int lmac_count;
-   int lmac_type;
-   int lane_to_sds;
-   int use_training;
void __iomem*reg_base;
struct pci_dev  *pdev;
 };
@@ -418,9 +417,10 @@ static int bgx_lmac_sgmii_init(struct bgx *bgx, int lmacid)
return 0;
 }
 
-static int bgx_lmac_xaui_init(struct bgx *bgx, int lmacid, int lmac_type)
+static int bgx_lmac_xaui_init(struct bgx *bgx, struct lmac *lmac)
 {
u64 cfg;
+   int lmacid = lmac->lmacid;
 
/* Reset SPU */
bgx_reg_modify(bgx, lmacid, BGX_SPUX_CONTROL1, SPU_CTL_RESET);
@@ -436,7 +436,7 @@ static int bgx_lmac_xaui_init(struct bgx *bgx, int lmacid, 
int lmac_type)
 
bgx_reg_modify(bgx, lmacid, BGX_SPUX_CONTROL1, SPU_CTL_LOW_POWER);
/* Set interleaved running disparity for RXAUI */
-   if (bgx->lmac_type != BGX_MODE_RXAUI)
+   if (lmac->lmac_type != BGX_MODE_RXAUI)
bgx_reg_modify(bgx, lmacid,
   BGX_SPUX_MISC_CONTROL, SPU_MISC_CTL_RX_DIS);
else
@@ -451,7 +451,7 @@ static int bgx_lmac_xaui_init(struct bgx *bgx, int lmacid, 
int lmac_type)
cfg = bgx_reg_read(bgx, lmacid, BGX_SPUX_INT);
bgx_reg_write(bgx, lmacid, BGX_SPUX_INT, cfg);
 
-   if (bgx->use_training) {
+   if (lmac->use_training) {
bgx_reg_write(bgx, lmacid, BGX_SPUX_BR_PMD_LP_CUP, 0x00);
bgx_reg_write(bgx, lmacid, BGX_SPUX_BR_PMD_LD_CUP, 0x00);
bgx_reg_write(bgx, lmacid, BGX_SPUX_BR_PMD_LD_REP, 0x00);
@@ -474,9 +474,9 @@ static int bgx_lmac_xaui_init(struct bgx *bgx, int lmacid, 
int lmac_type)
bgx_reg_write(bgx, lmacid, BGX_SPUX_AN_CONTROL, cfg);
 
cfg = bgx_reg_read(bgx, lmacid, BGX_SPUX_AN_ADV);
-   if (bgx->lmac_type == BGX_MODE_10G_KR)
+   if (lmac->lmac_type == BGX_MODE_10G_KR)
cfg |= (1 << 23);
-   else if (bgx->lmac_type == BGX_MODE_40G_KR)
+   else if (lmac->lmac_type == BGX_MODE_40G_KR)
cfg |= (1 << 24);
else
cfg &= ~((1 << 23) | (1 << 24));
@@ -511,11 +511,11 @@ static int bgx_xaui_check_link(struct lmac *lmac)
 {
struct bgx *bgx = lmac->bgx;
int lmacid = lmac->lmacid;
-   int lmac_type = bgx->lmac_type;
+   int lmac_type = lmac->lmac_type;
u64 cfg;
 
bgx_reg_modify(bgx, lmacid, BGX_SPUX_MISC_CONTROL, SPU_MISC_CTL_RX_DIS);
-   if (bgx->use_training) {
+   if (lmac->use_training) {
cfg = bgx_reg_read(bgx, lmacid, BGX_SPUX_INT);
if (!(cfg & (1ull << 13))) {
cfg = (1ull << 13) | (1ull << 14);
@@ -556,7 +556,7 @@ static int bgx_xaui_check_link(struct lmac *lmac)
   BGX_SPUX_STATUS2, SPU_STATUS2_RCVFLT);
if (bgx_reg_read(bgx, lmacid, BGX_SPUX_STATUS2) & SPU_STATUS2_RCVFLT) {
dev_err(>pdev->dev, "Receive fault, retry training\n");
-   if (bgx->use_training) {
+   if (lmac->use_training) {
cfg = bgx_reg_read(bgx, lmacid, BGX_SPUX_INT);
if (!(cfg & (1ull << 13))) {
cfg = (1ull << 13) | (1ull << 14);
@@ -599,7 +599,7 @@ static int bgx_xaui_check_link(struct lmac *lmac)
/* Rx 

Re: [PATCH] bonding: Allow tun-interfaces as slaves

2016-08-10 Thread Ding Tianhong
On 2016/8/10 7:51, Jay Vosburgh wrote:
> Jörn Engel  wrote:
> 
>> On Tue, Aug 09, 2016 at 12:06:36PM -0700, David Miller wrote:
 On Tue, Aug 09, 2016 at 09:28:45PM +0800, Ding Tianhong wrote:

 Simply not checking errors when setting the mac address solves the
 problem for me.  No new features needed.
>>>
>>> But it only works in certain modes.
>>>
>>> So the best we can do is enforce the MAC address setting in the
>>> modes that absolutely require it.  We cannot ignore the MAC
>>> address setting unilaterally.
>>
>> Something like this?
>>
>> [PATCH] bonding: Allow tun-interfaces as slaves in balance-rr mode
>>
>> Up until 00503b6f702e (part of 3.14-rc1), the bonding driver could be
>> used to enslave tun-interfaces.  00503b6f702e broke that behaviour,
>> afaics as an unintended side-effect.
>>
>> For the purpose of bond-over-tun in balance-rr mode, simply ignoring the
>> error from dev_set_mac_address() is good enough.
>>
>> Signed-off-by: Joern Engel 
>> ---
>> drivers/net/bonding/bond_main.c | 3 ++-
>> 1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 1f276fa30ba6..2f686bfe4304 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -1490,7 +1490,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
>> net_device *slave_dev)
>>  memcpy(addr.sa_data, bond_dev->dev_addr, bond_dev->addr_len);
>>  addr.sa_family = slave_dev->type;
>>  res = dev_set_mac_address(slave_dev, &addr);
>> -if (res) {
>> +/* round-robin mode works fine without a mac address */
>> +if (res && BOND_MODE(bond) != BOND_MODE_ROUNDROBIN) {
> 
>   This will cause balance-rr to add the slave to the bond if any
> device's dev_set_mac_address call fails.
> 
>   If a bond of regular Ethernet devices is connected to a static
> link aggregation (Etherchannel channel group), a set_mac failure would
> result in that slave having a different MAC address than the bond, which
> in turn would cause traffic inbound from the switch to that slave to be
> dropped (as the destination MAC would not pass the device MAC filters).
> 
>   The failure check for the set_mac call serves a legitimate
> purpose, and I don't believe we should bypass it without making the
> bypass an option that is explicitly enabled for those special cases that
> need it.
> 
>   E.g., something like the following (which I have not tested);
> this would also need documentation and iproute2 updates to go with it.
> This would be enabled with "fail_over_mac=keepmac".
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 1f276fa30ba6..d2283fc23b16 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -1483,7 +1483,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
> net_device *slave_dev)
>   ether_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr);
>  
>   if (!bond->params.fail_over_mac ||
> - BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
> + (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP &&
> +  bond->params.fail_over_mac != BOND_FOM_KEEPMAC)) {
>   /* Set slave to master's mac address.  The application already
>* set the master's mac address to that of the first slave
>*/
> diff --git a/drivers/net/bonding/bond_options.c 
> b/drivers/net/bonding/bond_options.c
> index 577e57cad1dc..f9653fe4d622 100644
> --- a/drivers/net/bonding/bond_options.c
> +++ b/drivers/net/bonding/bond_options.c
> @@ -125,6 +125,7 @@ static const struct bond_opt_value 
> bond_fail_over_mac_tbl[] = {
>   { "none",   BOND_FOM_NONE,   BOND_VALFLAG_DEFAULT},
>   { "active", BOND_FOM_ACTIVE, 0},
>   { "follow", BOND_FOM_FOLLOW, 0},
> + { "keepmac", BOND_FOM_KEEPMAC, 0},
>   { NULL, -1,  0},
>  };
>  
> diff --git a/include/net/bonding.h b/include/net/bonding.h
> index 6360c259da6d..ec3442b3aa83 100644
> --- a/include/net/bonding.h
> +++ b/include/net/bonding.h
> @@ -420,6 +420,7 @@ static inline bool bond_slave_can_tx(struct slave *slave)
>  #define BOND_FOM_NONE0
>  #define BOND_FOM_ACTIVE  1
>  #define BOND_FOM_FOLLOW  2
> +#define BOND_FOM_KEEPMAC 3
>  
>  #define BOND_ARP_TARGETS_ANY 0
>  #define BOND_ARP_TARGETS_ALL 1
> 
> 
>   -J
> 
Hi jorn:

Could you please test this patch? I build this patch base on Jay's suggestion 
and I think it could fix your problem.

---
 drivers/net/bonding/bond_main.c| 24 +---
 drivers/net/bonding/bond_options.c |  3 ++-
 include/net/bonding.h  |  1 +
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c 

Re: [Patch net 0/5] net_sched: tc action fixes and updates

2016-08-10 Thread Jamal Hadi Salim

On 16-08-08 04:46 PM, Cong Wang wrote:

This patchset fixes several regressions caused by the previous
code refactor. Thanks to Jamal for catching them!



Cong,

Good news: oops gone. I havent done more testing than I did
before; but looks good so far.

Bad news: You have introduced a performance regression which is
noticeable at high speed.

tcf_exts_exec() is the culprit - and the conversion from flexarray
to linked list in the fast path, to be specific.
The regression is problematic (and unacceptable). Two options:
a) You fix the regressions - which i think may require changing
what gets passed around and executed on as an array instead of a
list.
b) I am worried #a will take some work. So the second option is
to back out the patch since there are known stability options;
get regression issues resolved and then go back and submit.

cheers,
jamal


[PATCH net v1 1/1] tipc: fix variable dereference before NULL check

2016-08-10 Thread Parthasarathy Bhuvaragan
In commit cf6f7e1d5109 ("tipc: dump monitor attributes"),
I dereferenced a pointer before checking if it's valid.
This is reported by static check Smatch as:
net/tipc/monitor.c:733 tipc_nl_add_monitor_peer()
 warn: variable dereferenced before check 'mon' (see line 731)

In this commit, we check for a valid monitor before proceeding
with any other operation.

Fixes: cf6f7e1d5109 ("tipc: dump monitor attributes")
Reported-by: Dan Carpenter 
Signed-off-by: Parthasarathy Bhuvaragan 
---
 net/tipc/monitor.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index b62caa1c770c..ed97a5876ebe 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -728,12 +728,13 @@ int tipc_nl_add_monitor_peer(struct net *net, struct 
tipc_nl_msg *msg,
 u32 bearer_id, u32 *prev_node)
 {
struct tipc_monitor *mon = tipc_monitor(net, bearer_id);
-   struct tipc_peer *peer = mon->self;
+   struct tipc_peer *peer;
 
if (!mon)
return -EINVAL;
 
read_lock_bh(&mon->lock);
+   peer = mon->self;
do {
if (*prev_node) {
if (peer->addr == *prev_node)
-- 
2.1.4



Re: [RFC PATCH 2/3] net: macb: Add support for 1588 for Zynq Ultrascale+ MPSoC

2016-08-10 Thread Michal Simek
Hi Nicolas,

just a note: Here is the link to public Linux repo
https://github.com/Xilinx/linux-xlnx

Thanks,
Michal


On 9.8.2016 18:56, Punnaiah Choudary Kalluri wrote:
> Hi Nicolas,
> 
>  1588 implementation in cadence GEM IP we have in Zynq Ultrascale+ MPSoC is
> Different to the one in Zynq SOC.
> 
> In earlier version, all timestamp values will be stored in registers and 
> there is no specific
> Mechanism to distinguish the received ethernet frame that contains time stamp 
> information
> Other than parsing the frame for PTP packet type.
> 
> We have basic implementation for earlier version in our out of tree driver, 
> which is going to be deprecated
> Soon. You could also check the below driver for 1588 support.
> https://gitenterprise.xilinx.com/Linux/linux-xlnx/blob/master/drivers/net/ethernet/xilinx/xilinx_emacps.c
> 
> 
> Regards,
> Punnaiah
> 
>> -Original Message-
>> From: Nicolas Ferre [mailto:nicolas.fe...@atmel.com]
>> Sent: Tuesday, August 09, 2016 10:10 PM
>> To: Harini Katakam ; Harini Katakam
>> ; Andrei Pistirica 
>> Cc: da...@davemloft.net; Boris Brezillon > electrons.com>; alexandre.bell...@free-electrons.com;
>> netdev@vger.kernel.org; linux-ker...@vger.kernel.org;
>> devicet...@vger.kernel.org; Punnaiah Choudary Kalluri
>> ; Michal Simek ; Anirudha
>> Sarangi 
>> Subject: Re: [RFC PATCH 2/3] net: macb: Add support for 1588 for Zynq
>> Ultrascale+ MPSoC
>>
>> Le 21/09/2015 à 19:49, Harini Katakam a écrit :
>>> On Fri, Sep 11, 2015 at 1:27 PM, Harini Katakam
>>>  wrote:
 Cadence GEM in Zynq Ultrascale+ MPSoC supports 1588 and provides a
 102 bit time counter with 48 bits for seconds, 30 bits for nsecs and
 24 bits for sub-nsecs. The timestamp is made available to the SW through
 registers as well as (more precisely) through upper two words in
 an extended BD.

 This patch does the following:
 - Adds MACB_CAPS_TSU in zynqmp_config.
 - Registers to ptp clock framework (after checking for timestamp support
>> in
   IP and capability in config).
 - TX BD and RX BD control registers are written to populate timestamp in
   extended BD words.
 - Timer initialization is done by writing time of day to the timer counter.
 - ns increment register is programmed as NS_PER_SEC/TSU_CLK.
   For a 24 bit subns precision, the subns increment equals
   remainder of (NS_PER_SEC/TSU_CLK) * (2^24).
   TSU (Time stamp unit) clock is obtained by the  driver from devicetree.
 - HW time stamp capabilities are advertised via ethtool and macb ioctl is
   updated accordingly.
 - For all PTP event frames, nanoseconds and the lower 5 bits of seconds
>> are
   obtained from the BD. This offers a precise timestamp. The upper bits
   (which dont vary between consecutive packets) are obtained from the
   TX/RX PTP event/PEER registers. The timestamp obtained thus is
>> updated
   in skb for upper layers to access.
 - The drivers register functions with ptp to perform time and frequency
   adjustment.
 - Time adjustment is done by writing to the 1558_ADJUST register.
   The controller will read the delta in this register and update the timer
   counter register. Alternatively, for large time offset adjustments,
   the driver reads the secs and nsecs counter values, adds/subtracts the
   delta and updates the timer counter. In order to be as precise as
>> possible,
   nsecs counter is read again if secs has incremented during the counter
>> read.
 - Frequency adjustment is not directly supported by this IP.
   addend is the initial value ns increment and similarly addendesub.
   The ppb (parts per billion) provided is used as
   ns_incr = addend +/- (ppb/rate).
   Similarly the remainder of the above is used to populate subns
>> increment.
   In case the ppb requested is negative AND subns adjustment greater
>> than
   the addendsub, ns_incr is reduced by 1 and subns_incr is adjusted in
   positive accordingly.

 Signed-off-by: Harini Katakam :
 ---
  drivers/net/ethernet/cadence/macb.c |  372
>> ++-
  drivers/net/ethernet/cadence/macb.h |   64 ++
  2 files changed, 428 insertions(+), 8 deletions(-)

 diff --git a/drivers/net/ethernet/cadence/macb.c
>> b/drivers/net/ethernet/cadence/macb.c
 index bb2932c..b531008 100644
 --- a/drivers/net/ethernet/cadence/macb.c
 +++ b/drivers/net/ethernet/cadence/macb.c
 @@ -30,6 +30,8 @@
  #include 
  #include 
>>
>> [..]
>>
 +   unsigned intns_incr;
 +   unsigned intsubns_incr;
  };

  static inline bool macb_is_gem(struct macb *bp)
 --
 1.7.9.5
>>>
>>> Ping

[PATCH 18/21] net: thunderx: Use napi_consume_skb for bulk free

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

This patch enables bulk freeing on the Tx side.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 4a084ab..e73e6df 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -521,7 +521,8 @@ static int nicvf_init_resources(struct nicvf *nic)
 
 static void nicvf_snd_pkt_handler(struct net_device *netdev,
  struct cmp_queue *cq,
- struct cqe_send_t *cqe_tx, int cqe_type)
+ struct cqe_send_t *cqe_tx,
+ int cqe_type, int budget)
 {
struct sk_buff *skb = NULL;
struct nicvf *nic = netdev_priv(netdev);
@@ -545,7 +546,7 @@ static void nicvf_snd_pkt_handler(struct net_device *netdev,
if (skb) {
nicvf_put_sq_desc(sq, hdr->subdesc_cnt + 1);
prefetch(skb);
-   dev_consume_skb_any(skb);
+   napi_consume_skb(skb, budget);
sq->skbuff[cqe_tx->sqe_ptr] = (u64)NULL;
} else {
/* In case of HW TSO, HW sends a CQE for each segment of a TSO
@@ -700,7 +701,8 @@ loop:
break;
case CQE_TYPE_SEND:
nicvf_snd_pkt_handler(netdev, cq,
- (void *)cq_desc, CQE_TYPE_SEND);
+ (void *)cq_desc, CQE_TYPE_SEND,
+ budget);
tx_done++;
break;
case CQE_TYPE_INVALID:
-- 
2.7.4



Fwd: Re: [PATCH RESEND net-next 13/15] smc: receive data from RMBE

2016-08-10 Thread Ursula Braun

Dave,

sorry, forget my previous mail from today. I now realize that xchg() 
does not help on 32-bit architectures. I have to think about 
alternatives here.


 Forwarded Message 
Subject: Re: [PATCH RESEND net-next 13/15] smc: receive data from RMBE
Date: Wed, 10 Aug 2016 15:44:00 +0200
From: Ursula Braun 
To: David Miller 
CC: netdev@vger.kernel.org, linux-s...@vger.kernel.org, 
schwidef...@de.ibm.com, heiko.carst...@de.ibm.com, utz.bac...@de.ibm.com




On 08/09/2016 11:32 PM, David Miller wrote:

From: Ursula Braun 
Date: Tue,  9 Aug 2016 12:12:58 +0200


+   xchg(>rx_curs_confirmed.acurs,
+smc_curs_read(conn->local_tx_ctrl.cons.acurs));


Why in the world do you need to use xchg() in all of these places?

It makes no sense whatsoever, especially since you don't even check
the return value.

If you need the operation to be atomic, then you have to check the
return value and do something to recover if something else beat
you to the xchg() and put something else into the location.

Otherwise, you therefore don't need it be atomic and can avoid
this expensive operation and just store the value normally.

Reviewing my xchg() usages, I really detected some paranoid usages, that 
I am going to remove. But there are still usages (and 
conn->rx_curs_confirmed is one of them), where I need an 8-byte cursor 
field to be read and written atomically, even though I do not care 
whether the write operation has been beaten or not. But I do care that 
reading the cursor does not return a partially updated cursor. Isn't 
xchg() a possible solution in this case?




Re: [PATCH 1/2] net: ethernet: renesas: sh_eth: use phydev from struct net_device

2016-08-10 Thread Simon Horman
[CC linux-sh as some of those boards use this driver]

On Wed, Aug 10, 2016 at 12:04:48AM +0200, Philippe Reynes wrote:
> The private structure contains a pointer to phydev, but the structure
> net_device already contains such a pointer. So we can remove the pointer
> phy_dev in the private structure, and update the driver to use the
> one contained in struct net_device.
> 
> Signed-off-by: Philippe Reynes 

I have looked over this and it seems good to me.
I have also tested it on the r8a7790/Lager board and it appears to work.

Tested-by: Simon Horman 


[PATCH 05/21] net: thunderx: Enable CQE_RX desc's extension fields

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

Unlike 88xx, CQE_RX descriptor's tunnelling extension i.e. CQE_RX2_S
is always enabled on 81xx/83xx and HW does insert these fields into
CQE_RX. As a result receive buffer addresses will now be present at
7th word of CQE_RX instead of 6th.

Enable CQE_RX2_S on 88xx pass 2.x as well.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/nic.h  |  9 -
 drivers/net/ethernet/cavium/thunder/nic_main.c |  7 +++
 drivers/net/ethernet/cavium/thunder/nic_reg.h  |  1 +
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 12 +++-
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index 6b0b240..136db2a 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -493,7 +493,14 @@ static inline int nic_get_node_id(struct pci_dev *pdev)
 
 static inline bool pass1_silicon(struct pci_dev *pdev)
 {
-   return pdev->revision < 8;
+   return (pdev->revision < 8) &&
+   (pdev->subsystem_device == PCI_SUBSYS_DEVID_88XX_NIC_PF);
+}
+
+static inline bool pass2_silicon(struct pci_dev *pdev)
+{
+   return (pdev->revision >= 8) &&
+   (pdev->subsystem_device == PCI_SUBSYS_DEVID_88XX_NIC_PF);
 }
 
 int nicvf_set_real_num_queues(struct net_device *netdev,
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 0d81117..3f52b36 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -799,6 +799,13 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
   (mbx.rq.qs_num << NIC_QS_ID_SHIFT) |
   (mbx.rq.rq_num << NIC_Q_NUM_SHIFT);
nic_reg_write(nic, reg_addr, mbx.rq.cfg);
+   /* Enable CQE_RX2_S extension in CQE_RX descriptor.
+* This gets appended by default on 81xx/83xx chips,
+* for consistency enabling the same on 88xx pass2
+* where this is introduced.
+*/
+   if (pass2_silicon(nic->pdev))
+   nic_reg_write(nic, NIC_PF_RX_CFG, 0x01);
break;
case NIC_MBOX_MSG_RQ_BP_CFG:
reg_addr = NIC_PF_QSET_0_127_RQ_0_7_BP_CFG |
diff --git a/drivers/net/ethernet/cavium/thunder/nic_reg.h 
b/drivers/net/ethernet/cavium/thunder/nic_reg.h
index 833cf3d..b4a7953 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_reg.h
+++ b/drivers/net/ethernet/cavium/thunder/nic_reg.h
@@ -36,6 +36,7 @@
 #define   NIC_PF_MAILBOX_ENA_W1C   (0x0450)
 #define   NIC_PF_MAILBOX_ENA_W1S   (0x0470)
 #define   NIC_PF_RX_ETYPE_0_7  (0x0500)
+#define   NIC_PF_RX_CFG(0x05D0)
 #define   NIC_PF_PKIND_0_15_CFG(0x0600)
 #define   NIC_PF_ECC0_FLIP0(0x1000)
 #define   NIC_PF_ECC1_FLIP0(0x1008)
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index e521a94..ca223aa 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -1190,7 +1190,17 @@ struct sk_buff *nicvf_get_rcv_skb(struct nicvf *nic, 
struct cqe_rx_t *cqe_rx)
u64 *rb_ptrs = NULL;
 
rb_lens = (void *)cqe_rx + (3 * sizeof(u64));
-   rb_ptrs = (void *)cqe_rx + (6 * sizeof(u64));
+   /* Except 88xx pass1 on all other chips CQE_RX2_S is added to
+* CQE_RX at word6, hence buffer pointers move by word
+*
+* Use existing 'hw_tso' flag which will be set for all chips
+* except 88xx pass1 instead of an additional cache line
+* access (or miss) by using pci dev's revision.
+*/
+   if (!nic->hw_tso)
+   rb_ptrs = (void *)cqe_rx + (6 * sizeof(u64));
+   else
+   rb_ptrs = (void *)cqe_rx + (7 * sizeof(u64));
 
netdev_dbg(nic->netdev, "%s rb_cnt %d rb0_ptr %llx rb0_sz %d\n",
   __func__, cqe_rx->rb_cnt, cqe_rx->rb0_ptr, cqe_rx->rb0_sz);
-- 
2.7.4



[PATCH 04/21] net: thunderx: Set queue count based on number of CPUs

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

81xx has only 4 CPUs, so it doesn't make sense to initialize
an entire Qset, i.e. 8 queues, by default. Change queue
initialization to set up as many queues as there are CPUs, or
8 queues, whichever is smaller. This also applies to
VMs with a VNIC VF attached and fewer VCPUs.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/nic_main.c | 6 ++
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 7 +++
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 8 
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 5 +
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 4974923..0d81117 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -1009,6 +1009,12 @@ static int nic_num_sqs_en(struct nicpf *nic, int vf_en)
int pos, sqs_per_vf = MAX_SQS_PER_VF_SINGLE_NODE;
u16 total_vf;
 
+   /* Secondary Qsets are needed only if CPU count is
+* more than MAX_QUEUES_PER_QSET.
+*/
+   if (num_online_cpus() <= MAX_QUEUES_PER_QSET)
+   return 0;
+
/* Check if its a multi-node environment */
if (nr_node_ids > 1)
sqs_per_vf = MAX_SQS_PER_VF;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 0c10635..af04d9f 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -1537,14 +1537,13 @@ static int nicvf_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
goto err_release_regions;
}
 
-   qcount = MAX_CMP_QUEUES_PER_QS;
+   qcount = min_t(int, MAX_CMP_QUEUES_PER_QS, num_online_cpus());
 
/* Restrict multiqset support only for host bound VFs */
if (pdev->is_virtfn) {
/* Set max number of queues per VF */
-   qcount = roundup(num_online_cpus(), MAX_CMP_QUEUES_PER_QS);
-   qcount = min(qcount,
-(MAX_SQS_PER_VF + 1) * MAX_CMP_QUEUES_PER_QS);
+   qcount = min_t(int, num_online_cpus(),
+  (MAX_SQS_PER_VF + 1) * MAX_CMP_QUEUES_PER_QS);
}
 
netdev = alloc_etherdev_mqs(sizeof(struct nicvf), qcount, qcount);
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 0ff8e60..e521a94 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -762,10 +762,10 @@ int nicvf_set_qset_resources(struct nicvf *nic)
nic->qs = qs;
 
/* Set count of each queue */
-   qs->rbdr_cnt = RBDR_CNT;
-   qs->rq_cnt = RCV_QUEUE_CNT;
-   qs->sq_cnt = SND_QUEUE_CNT;
-   qs->cq_cnt = CMP_QUEUE_CNT;
+   qs->rbdr_cnt = DEFAULT_RBDR_CNT;
+   qs->rq_cnt = min_t(u8, MAX_RCV_QUEUES_PER_QS, num_online_cpus());
+   qs->sq_cnt = min_t(u8, MAX_SND_QUEUES_PER_QS, num_online_cpus());
+   qs->cq_cnt = max_t(u8, qs->rq_cnt, qs->sq_cnt);
 
/* Set queue lengths */
qs->rbdr_len = RCV_BUF_COUNT;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
index 6673e11..869f338 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.h
@@ -57,10 +57,7 @@
 #define CMP_QUEUE_SIZE66ULL /* 64K entries */
 
 /* Default queue count per QS, its lengths and threshold values */
-#define RBDR_CNT   1
-#define RCV_QUEUE_CNT  8
-#define SND_QUEUE_CNT  8
-#define CMP_QUEUE_CNT  8 /* Max of RCV and SND qcount */
+#define DEFAULT_RBDR_CNT   1
 
 #define SND_QSIZE  SND_QUEUE_SIZE2
 #define SND_QUEUE_LEN  (1ULL << (SND_QSIZE + 10))
-- 
2.7.4



RE: [PATCH net] bnx2x: don't reset chip on cleanup if PCI function is offline

2016-08-10 Thread Yuval Mintz
> > Why would the published resume()  from pci_error_handlers be called in this
> scenario?
> 
> It isn't. That's why I specifically commented on commit message: "There are 
> two
> cases though that another path is taken on the code".
> 
> The code path reach bnx2x_chip_cleanup() on device removal from the system,
> as seen in the below call trace:
> 
> bnx2x_chip_cleanup+0x3c0/0x910 [bnx2x]
> bnx2x_nic_unload+0x268/0xaf0 [bnx2x]
> bnx2x_close+0x34/0x50 [bnx2x]
> __dev_close_many+0xd4/0x150
> dev_close_many+0xa8/0x160
> rollback_registered_many+0x174/0x3f0
> rollback_registered+0x40/0x70
> unregister_netdevice_queue+0x98/0x110
> unregister_netdev+0x34/0x50
> __bnx2x_remove+0xa8/0x3a0 [bnx2x]
> pci_device_remove+0x70/0x110

Makes sense.

> >> Also, we avoid the MCP information dump in case of non-recoverable
> >> PCI error (when adapter is about to be removed), since it will certainly 
> >> fail.
> >
> > We should probably avoid several things here; Why specifically only this?
> 
> For example, we shouldn't execute bnx2x_timer() in this scenario. But I 
> thought
> it'd be too much to check every call of a timer function against PCI channel 
> state
> just to avoid its execution in this scenario, so I just let it execute, 
> since it seems
> harmless.
> 
> >> +  /* Reset the chip, unless PCI function is offline. If we reach this
> >> +   * point following a PCI error handling, it means device is really
> >> +   * in a bad state and we're about to remove it, so reset the chip
> >> +   * is not a good idea.
> >> +   */
> >> +  if (!pci_channel_offline(bp->pdev)) {
> >> +  rc = bnx2x_reset_hw(bp, reset_code);
> >> +  if (rc)
> >> +  BNX2X_ERR("HW_RESET failed\n");
> >> +  }
> >
> > Why not simply check this at the beginning of the function?
> 
> Because I wasn't sure if I could drop the entire execution of chip_cleanup(). 
> I
> tried to keep the most of this function aiming to shutdown the module in a
> gentle way, like cleaning MAC, stopping queues...but again, I'm open to
> suggestions and gladly will change this in v2 if you think it's for the best.

Problem is I won't be able to have a more thorough review of this in the next
couple of days - and other than code-review I won't have a reasonable way
of testing this [I can use aer_inject, but I don't have your magical EEH
error injections, and I'm not at all certain it would suffice for a good 
testing ].

I agree that even as-is, what you're suggesting is an improvement to the
existing flow - so it's basically up to dave, i.e., whether to take a half fix
or wait for a more thorough one.




[PATCH net-next 1/4] flow_dissector: Get vlan info from skb->vlan_tci instead of skb->data

2016-08-10 Thread Hadar Hen Zion
Early in the datapath the skb_vlan_untag function is called; it strips
the vlan from the skb and sets the skb->vlan_tci and skb->vlan_proto fields.

The current dissection doesn't handle vlan packets correctly: the vlan
header no longer exists in skb->data when flow dissection is applied to
the skb. Fix that.

Fixes: 0744dd00c1b1 ('net: introduce skb_flow_dissect()')
Signed-off-by: Hadar Hen Zion 
---
 net/core/flow_dissector.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 61ad43f..6060fc2 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -122,7 +122,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
if (!data) {
data = skb->data;
-   proto = skb->protocol;
+   proto = skb_vlan_tag_present(skb) ?
+skb->vlan_proto : skb->protocol;
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
}
@@ -240,13 +241,6 @@ ipv6:
}
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
-   const struct vlan_hdr *vlan;
-   struct vlan_hdr _vlan;
-
-   vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, 
hlen, &_vlan);
-   if (!vlan)
-   goto out_bad;
-
if (dissector_uses_key(flow_dissector,
   FLOW_DISSECTOR_KEY_VLANID)) {
key_tags = skb_flow_dissector_target(flow_dissector,
@@ -256,8 +250,7 @@ ipv6:
key_tags->vlan_id = skb_vlan_tag_get_id(skb);
}
 
-   proto = vlan->h_vlan_encapsulated_proto;
-   nhoff += sizeof(*vlan);
+   proto = skb->protocol;
goto again;
}
case htons(ETH_P_PPP_SES): {
-- 
1.8.3.1



[PATCH net-next 3/4] net_sched: flower: Add vlan support

2016-08-10 Thread Hadar Hen Zion
Enhance flower to support 802.1Q vlan protocol classification.
Currently, the supported fields are vlan_id and vlan_priority.

Example:

# add a flower filter with vlan id and priority classification
tc filter add dev ens4f0 protocol 802.1Q parent : \
flower \
indev ens4f0 \
vlan_ethtype ipv4 \
vlan_id 100 \
vlan_prio 3 \
action vlan pop

Signed-off-by: Hadar Hen Zion 
---
 include/uapi/linux/pkt_cls.h |  3 ++
 net/sched/cls_flower.c   | 69 ++--
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index d1c1cca..51b5b24 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -428,6 +428,9 @@ enum {
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
TCA_FLOWER_FLAGS,
+   TCA_FLOWER_KEY_VLAN_ID,
+   TCA_FLOWER_KEY_VLAN_PRIO,
+   TCA_FLOWER_KEY_VLAN_ETH_TYPE,
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 5060801..4e249be 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -28,6 +28,7 @@ struct fl_flow_key {
struct flow_dissector_key_control control;
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
+   struct flow_dissector_key_vlan vlan;
struct flow_dissector_key_addrs ipaddrs;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
@@ -293,6 +294,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_TCP_DST]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_SRC]= { .type = NLA_U16 },
[TCA_FLOWER_KEY_UDP_DST]= { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_VLAN_ID]= { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_VLAN_PRIO]  = { .type = NLA_U8 },
+   [TCA_FLOWER_KEY_VLAN_ETH_TYPE]  = { .type = NLA_U16 },
+
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -308,9 +313,28 @@ static void fl_set_key_val(struct nlattr **tb,
memcpy(mask, nla_data(tb[mask_type]), len);
 }
 
+static void fl_set_key_vlan(struct nlattr **tb,
+   struct flow_dissector_key_vlan *key_val,
+   struct flow_dissector_key_vlan *key_mask)
+{
+#define VLAN_PRIORITY_MASK 0x7
+
+   if (tb[TCA_FLOWER_KEY_VLAN_ID]) {
+   key_val->vlan_id =
+   nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK;
+   key_mask->vlan_id = VLAN_VID_MASK;
+   }
+   if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) {
+   key_val->vlan_priority =
+   nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & 
VLAN_PRIORITY_MASK;
+   key_mask->vlan_priority = VLAN_PRIORITY_MASK;
+   }
+}
+
 static int fl_set_key(struct net *net, struct nlattr **tb,
  struct fl_flow_key *key, struct fl_flow_key *mask)
 {
+   __be16 ethertype;
 #ifdef CONFIG_NET_CLS_IND
if (tb[TCA_FLOWER_INDEV]) {
int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]);
@@ -328,9 +352,19 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
   mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
   sizeof(key->eth.src));
 
-   fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
-  &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
-  sizeof(key->basic.n_proto));
+   if (tb[TCA_FLOWER_KEY_ETH_TYPE])
+   ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]);
+
+   if (ethertype == htons(ETH_P_8021Q)) {
+   fl_set_key_vlan(tb, &key->vlan, &mask->vlan);
+   fl_set_key_val(tb, &key->basic.n_proto,
+  TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+  &mask->basic.n_proto, TCA_FLOWER_UNSPEC,
+  sizeof(key->basic.n_proto));
+   } else {
+   key->basic.n_proto = ethertype;
+   mask->basic.n_proto = cpu_to_be16(~0);
+   }
 
if (key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) {
@@ -440,6 +474,8 @@ static void fl_init_dissector(struct cls_fl_head *head,
   FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
   FLOW_DISSECTOR_KEY_PORTS, tp);
+   FL_KEY_SET_IF_IN_RANGE(mask, keys, cnt,
+  FLOW_DISSECTOR_KEY_VLAN, vlan);
 
skb_flow_dissector_init(&head->dissector, keys, cnt);
 }
@@ -668,6 +704,29 @@ static int fl_dump_key_val(struct sk_buff *skb,
return 0;
 }
 
+static int fl_dump_key_vlan(struct sk_buff *skb,
+   struct flow_dissector_key_vlan *vlan_key,
+   struct 

[PATCH net-next 0/4] net_sched, flow_dissector, flower: Introduce vlan tag support

2016-08-10 Thread Hadar Hen Zion
This patchset introduces vlan tag support to the flower classifier and the flow
dissector, and also adds vlan priority to act vlan.

The first 2 patches are dealing with the flow dissector:
 - The first patch is a fix, vlan id value should be taken from skb->vlan_tci
   and not from skb->data.
 - The second patch adds support for vlan priority.

The third patch adds vlan tag support to the flower classifier, user space
patches will be sent later to complete it.
The last patch adds vlan priority to act vlan since only vlan id is currently 
supported.

Hadar Hen Zion (4):
  flow_dissector: Get vlan info from skb->vlan_tci instead of skb->data
  flow_dissector: Get vlan priority in addition to vlan id
  net_sched: flower: Add vlan support
  net_sched: act_vlan: Add priority option

 include/linux/if_vlan.h |  1 +
 include/net/flow_dissector.h| 11 --
 include/net/tc_act/tc_vlan.h|  1 +
 include/uapi/linux/pkt_cls.h|  3 ++
 include/uapi/linux/tc_act/tc_vlan.h |  1 +
 net/core/flow_dissector.c   | 28 +++
 net/sched/act_vlan.c| 13 +--
 net/sched/cls_flower.c  | 69 +++--
 8 files changed, 103 insertions(+), 24 deletions(-)

-- 
1.8.3.1



[PATCH 00/21] net: thunderx: Support for newer chips and miscellaneous patches

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

This patch series adds support for VNIC on 81xx and 83xx SOCs.
81xx/83xx differ from 88xx in terms of capabilities and the new types
of interfaces supported (e.g. QSGMII, RGMII), and have DLMs instead of 
QLMs, which allows a single BGX to have interfaces of different LMAC types.

Also included some patches which are common for all 88xx/81xx/83xx
SOCs like using netdev's name while registering irqs, reset receive
queue stats and some changes to use standard API for split buffer Rx 
packets, generating RSS key e.t.c

PS: Most of the patches were submitted earlier under different series but
for some reason were not picked up by patchwork. Since new patches have been
added in the meantime, resubmitting all as a new patchset.

Jerin Jacob (1):
  net: thunderx: Reset RXQ HW stats when interface is brought down

Radoslaw Biernacki (1):
  net: thunderx: Improvement for MBX interface debug messages

Sunil Goutham (18):
  net: thunderx: Moved HW capability info from macros to structure
  net: thunderx: Add VNIC's PCI devid on future chips
  net: thunderx: Add support for 81xx and 83xx chips
  net: thunderx: Set queue count based on number of CPUs
  net: thunderx: Enable CQE_RX desc's extension fields
  net: thunderx: Enable mailbox interrupts on 81xx/83xx
  net: thunderx: Support for different LMAC types within BGX
  net: thunderx: Add 81xx support to BGX driver
  net: thunderx: Add QSGMII interface type support
  net: thunderx: Add RGMII interface type support
  net: thunderx: Add support for 16 LMACs of 83xx
  net: thunderx: Support for 83xx mixed QLM/DLM config
  net: thunderx: Use netdev's name for naming VF's interrupts
  net: thunderx: Use skb_add_rx_frag() for split buffer Rx pkts
  net: thunderx: Don't set mac address for secondary Qset VFs
  net: thunderx: Use napi_consume_skb for bulk free
  net: thunderx: Use netdev_rss_key_fill() helper
  net: thunderx: Don't set RX_PACKET_DIS while initializing

Zyta Szpak (1):
  net: thunderx: Configure tunnelling protocol parsing

 drivers/net/ethernet/cavium/Kconfig|  10 +
 drivers/net/ethernet/cavium/thunder/Makefile   |   1 +
 drivers/net/ethernet/cavium/thunder/nic.h  |  85 ++--
 drivers/net/ethernet/cavium/thunder/nic_main.c | 433 +++
 drivers/net/ethernet/cavium/thunder/nic_reg.h  |  15 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |  67 +--
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  59 +--
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |   5 +-
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c  | 460 ++---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h  |  33 +-
 drivers/net/ethernet/cavium/thunder/thunder_xcv.c  | 237 +++
 11 files changed, 1059 insertions(+), 346 deletions(-)
 create mode 100644 drivers/net/ethernet/cavium/thunder/thunder_xcv.c

-- 
2.7.4



[PATCH 09/21] net: thunderx: Add QSGMII interface type support

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds support for QSGMII interface type to
the BGX driver. This type of interface is supported by
81xx SOC.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 65 ++-
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |  1 +
 2 files changed, 54 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c 
b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
index 9c3c273..0bf8d24 100644
--- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
+++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c
@@ -379,8 +379,9 @@ void bgx_lmac_internal_loopback(int node, int bgx_idx,
 }
 EXPORT_SYMBOL(bgx_lmac_internal_loopback);
 
-static int bgx_lmac_sgmii_init(struct bgx *bgx, int lmacid)
+static int bgx_lmac_sgmii_init(struct bgx *bgx, struct lmac *lmac)
 {
+   int lmacid = lmac->lmacid;
u64 cfg;
 
bgx_reg_modify(bgx, lmacid, BGX_GMP_GMI_TXX_THRESH, 0x30);
@@ -409,6 +410,14 @@ static int bgx_lmac_sgmii_init(struct bgx *bgx, int lmacid)
cfg |= (PCS_MRX_CTL_RST_AN | PCS_MRX_CTL_AN_EN);
bgx_reg_write(bgx, lmacid, BGX_GMP_PCS_MRX_CTL, cfg);
 
+   if (lmac->lmac_type == BGX_MODE_QSGMII) {
+   /* Disable disparity check for QSGMII */
+   cfg = bgx_reg_read(bgx, lmacid, BGX_GMP_PCS_MISCX_CTL);
+   cfg &= ~PCS_MISC_CTL_DISP_EN;
+   bgx_reg_write(bgx, lmacid, BGX_GMP_PCS_MISCX_CTL, cfg);
+   return 0;
+   }
+
if (bgx_poll_reg(bgx, lmacid, BGX_GMP_PCS_MRX_STATUS,
 PCS_MRX_STATUS_AN_CPT, false)) {
dev_err(&bgx->pdev->dev, "BGX AN_CPT not completed\n");
@@ -650,6 +659,14 @@ static void bgx_poll_for_link(struct work_struct *work)
queue_delayed_work(lmac->check_link, &lmac->dwork, HZ * 2);
 }
 
+static int phy_interface_mode(u8 lmac_type)
+{
+   if (lmac_type == BGX_MODE_QSGMII)
+   return PHY_INTERFACE_MODE_QSGMII;
+
+   return PHY_INTERFACE_MODE_SGMII;
+}
+
 static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid)
 {
struct lmac *lmac;
@@ -658,9 +675,10 @@ static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid)
lmac = &bgx->lmac[lmacid];
lmac->bgx = bgx;
 
-   if (lmac->lmac_type == BGX_MODE_SGMII) {
+   if ((lmac->lmac_type == BGX_MODE_SGMII) ||
+   (lmac->lmac_type == BGX_MODE_QSGMII)) {
lmac->is_sgmii = 1;
-   if (bgx_lmac_sgmii_init(bgx, lmacid))
+   if (bgx_lmac_sgmii_init(bgx, lmac))
return -1;
} else {
lmac->is_sgmii = 0;
@@ -697,7 +715,7 @@ static int bgx_lmac_enable(struct bgx *bgx, u8 lmacid)
 
if (phy_connect_direct(&lmac->netdev, lmac->phydev,
   bgx_lmac_handler,
-  PHY_INTERFACE_MODE_SGMII))
+  phy_interface_mode(lmac->lmac_type)))
return -ENODEV;
 
phy_start_aneg(lmac->phydev);
@@ -799,6 +817,11 @@ static void bgx_init_hw(struct bgx *bgx)
bgx_reg_write(bgx, 0, BGX_CMR_RX_STREERING + (i * 8), 0x00);
 }
 
+static u8 bgx_get_lane2sds_cfg(struct bgx *bgx, struct lmac *lmac)
+{
+   return (u8)(bgx_reg_read(bgx, lmac->lmacid, BGX_CMRX_CFG) & 0xFF);
+}
+
 static void bgx_print_qlm_mode(struct bgx *bgx, u8 lmacid)
 {
struct device *dev = &bgx->pdev->dev;
@@ -838,12 +861,22 @@ static void bgx_print_qlm_mode(struct bgx *bgx, u8 lmacid)
else
dev_info(dev, "%s: 40G_KR4\n", (char *)str);
break;
-   default:
-   dev_info(dev, "%s: INVALID\n", (char *)str);
+   case BGX_MODE_QSGMII:
+   if ((lmacid == 0) &&
+   (bgx_get_lane2sds_cfg(bgx, lmac) != lmacid))
+   return;
+   if ((lmacid == 2) &&
+   (bgx_get_lane2sds_cfg(bgx, lmac) == lmacid))
+   return;
+   dev_info(dev, "%s: QSGMII\n", (char *)str);
+   break;
+   case BGX_MODE_INVALID:
+   /* Nothing to do */
+   break;
}
 }
 
-static void lmac_set_lane2sds(struct lmac *lmac)
+static void lmac_set_lane2sds(struct bgx *bgx, struct lmac *lmac)
 {
switch (lmac->lmac_type) {
case BGX_MODE_SGMII:
@@ -857,6 +890,14 @@ static void lmac_set_lane2sds(struct lmac *lmac)
case BGX_MODE_RXAUI:
lmac->lane_to_sds = (lmac->lmacid) ? 0xE : 0x4;
break;
+   case BGX_MODE_QSGMII:
+   /* There is no way to determine if DLM0/2 is QSGMII or
+* DLM1/3 is configured to QSGMII as bootloader will
+* configure all LMACs, so take whatever is configured
+* by low level firmware.
+*/
+   lmac->lane_to_sds = 

[PATCH net-next 2/4] flow_dissector: Get vlan priority in addition to vlan id

2016-08-10 Thread Hadar Hen Zion
Add vlan priority check to the flow dissector by adding new flow
dissector struct, flow_dissector_key_vlan which includes vlan tag
fields.

vlan_id and flow_label fields were under the same struct
(flow_dissector_key_tags). It was a convenient setting since struct
flow_dissector_key_tags is used by struct flow_keys and by setting
vlan_id and flow_label under the same struct, we get precisely 24 or 48
bytes in flow_keys from flow_dissector_key_basic.

Now, when adding vlan priority support, the code will be cleaner if
flow_label and vlan tag won't be under the same struct anymore.

Signed-off-by: Hadar Hen Zion 
---
 include/linux/if_vlan.h  |  1 +
 include/net/flow_dissector.h | 11 ---
 net/core/flow_dissector.c| 15 +--
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index a5f6ce6..49d4aef 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -81,6 +81,7 @@ static inline bool is_vlan_dev(const struct net_device *dev)
 #define skb_vlan_tag_present(__skb)((__skb)->vlan_tci & VLAN_TAG_PRESENT)
 #define skb_vlan_tag_get(__skb)((__skb)->vlan_tci & 
~VLAN_TAG_PRESENT)
 #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK)
+#define skb_vlan_tag_get_prio(__skb)   ((__skb)->vlan_tci & VLAN_PRIO_MASK)
 
 /**
  * struct vlan_pcpu_stats - VLAN percpu rx/tx stats
diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index d3d60dc..3781f18 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -32,8 +32,12 @@ struct flow_dissector_key_basic {
 };
 
 struct flow_dissector_key_tags {
-   u32 vlan_id:12,
-   flow_label:20;
+   u32 flow_label:20;
+};
+
+struct flow_dissector_key_vlan {
+   u16 vlan_id:12,
+   vlan_priority:3;
 };
 
 struct flow_dissector_key_keyid {
@@ -119,7 +123,7 @@ enum flow_dissector_key_id {
FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
FLOW_DISSECTOR_KEY_TIPC_ADDRS, /* struct flow_dissector_key_tipc_addrs 
*/
-   FLOW_DISSECTOR_KEY_VLANID, /* struct flow_dissector_key_flow_tags */
+   FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_flow_vlan */
FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_flow_tags */
FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
@@ -148,6 +152,7 @@ struct flow_keys {
 #define FLOW_KEYS_HASH_START_FIELD basic
struct flow_dissector_key_basic basic;
struct flow_dissector_key_tags tags;
+   struct flow_dissector_key_vlan vlan;
struct flow_dissector_key_keyid keyid;
struct flow_dissector_key_ports ports;
struct flow_dissector_key_addrs addrs;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 6060fc2..6dfcb10 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -116,6 +116,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_addrs *key_addrs;
struct flow_dissector_key_ports *key_ports;
struct flow_dissector_key_tags *key_tags;
+   struct flow_dissector_key_vlan *key_vlan;
struct flow_dissector_key_keyid *key_keyid;
u8 ip_proto = 0;
bool ret = false;
@@ -242,12 +243,14 @@ ipv6:
case htons(ETH_P_8021AD):
case htons(ETH_P_8021Q): {
if (dissector_uses_key(flow_dissector,
-  FLOW_DISSECTOR_KEY_VLANID)) {
-   key_tags = skb_flow_dissector_target(flow_dissector,
-
FLOW_DISSECTOR_KEY_VLANID,
+  FLOW_DISSECTOR_KEY_VLAN)) {
+   key_vlan = skb_flow_dissector_target(flow_dissector,
+
FLOW_DISSECTOR_KEY_VLAN,
 target_container);
 
-   key_tags->vlan_id = skb_vlan_tag_get_id(skb);
+   key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
+   key_vlan->vlan_priority =
+   (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
}
 
proto = skb->protocol;
@@ -865,8 +868,8 @@ static const struct flow_dissector_key 
flow_keys_dissector_keys[] = {
.offset = offsetof(struct flow_keys, ports),
},
{
-   .key_id = FLOW_DISSECTOR_KEY_VLANID,
-   .offset = offsetof(struct flow_keys, tags),
+   .key_id = FLOW_DISSECTOR_KEY_VLAN,
+   .offset = offsetof(struct flow_keys, vlan),
},
{
.key_id = 

[PATCH 03/21] net: thunderx: Add support for 81xx and 83xx chips

2016-08-10 Thread sunil . kovvuri
From: Sunil Goutham 

This patch adds info on HW maximums of 81xx/83xx and also
configures receive and transmit datapaths accordingly.

Signed-off-by: Sunil Goutham 
---
 drivers/net/ethernet/cavium/thunder/nic_main.c| 87 ++-
 drivers/net/ethernet/cavium/thunder/nic_reg.h |  1 +
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |  2 +
 3 files changed, 73 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index dc845a0..4974923 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -24,6 +24,8 @@ struct hw_info {
u8  bgx_cnt;
u8  chans_per_lmac;
u8  chans_per_bgx; /* Rx/Tx chans */
+   u8  chans_per_rgx;
+   u8  chans_per_lbk;
u16 cpi_cnt;
u16 rssi_cnt;
u16 rss_ind_tbl_size;
@@ -332,6 +334,33 @@ static void nic_get_hw_info(struct nicpf *nic)
hw->tl1_cnt = 2;
hw->tl1_per_bgx = true;
break;
+   case PCI_SUBSYS_DEVID_81XX_NIC_PF:
+   hw->bgx_cnt = MAX_BGX_PER_CN81XX;
+   hw->chans_per_lmac = 8;
+   hw->chans_per_bgx = 32;
+   hw->chans_per_rgx = 8;
+   hw->chans_per_lbk = 24;
+   hw->cpi_cnt = 512;
+   hw->rssi_cnt = 256;
+   hw->rss_ind_tbl_size = 32; /* Max RSSI / Max interfaces */
+   hw->tl3_cnt = 64;
+   hw->tl2_cnt = 16;
+   hw->tl1_cnt = 10;
+   hw->tl1_per_bgx = false;
+   break;
+   case PCI_SUBSYS_DEVID_83XX_NIC_PF:
+   hw->bgx_cnt = MAX_BGX_PER_CN83XX;
+   hw->chans_per_lmac = 8;
+   hw->chans_per_bgx = 32;
+   hw->chans_per_lbk = 64;
+   hw->cpi_cnt = 2048;
+   hw->rssi_cnt = 1024;
+   hw->rss_ind_tbl_size = 64; /* Max RSSI / Max interfaces */
+   hw->tl3_cnt = 256;
+   hw->tl2_cnt = 64;
+   hw->tl1_cnt = 18;
+   hw->tl1_per_bgx = false;
+   break;
}
hw->tl4_cnt = MAX_QUEUES_PER_QSET * pci_sriov_get_totalvfs(nic->pdev);
 }
@@ -353,11 +382,15 @@ static void nic_init_hw(struct nicpf *nic)
/* Enable backpressure */
nic_reg_write(nic, NIC_PF_BP_CFG, (1ULL << 6) | 0x03);
 
-   /* Disable TNS mode on both interfaces */
-   nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
- (NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
-   nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
- (NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
+   /* TNS and TNS bypass modes are present only on 88xx */
+   if (nic->pdev->subsystem_device == PCI_SUBSYS_DEVID_88XX_NIC_PF) {
+   /* Disable TNS mode on both interfaces */
+   nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG,
+ (NIC_TNS_BYPASS_MODE << 7) | BGX0_BLOCK);
+   nic_reg_write(nic, NIC_PF_INTF_0_1_SEND_CFG | (1 << 8),
+ (NIC_TNS_BYPASS_MODE << 7) | BGX1_BLOCK);
+   }
+
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG,
  (1ULL << 63) | BGX0_BLOCK);
nic_reg_write(nic, NIC_PF_INTF_0_1_BP_CFG + (1 << 8),
@@ -525,7 +558,7 @@ static void nic_config_rss(struct nicpf *nic, struct 
rss_cfg_msg *cfg)
 /* 4 level transmit side scheduler configutation
  * for TNS bypass mode
  *
- * Sample configuration for SQ0
+ * Sample configuration for SQ0 on 88xx
  * VNIC0-SQ0 -> TL4(0)   -> TL3[0]   -> TL2[0]  -> TL1[0] -> BGX0
  * VNIC1-SQ0 -> TL4(8)   -> TL3[2]   -> TL2[0]  -> TL1[0] -> BGX0
  * VNIC2-SQ0 -> TL4(16)  -> TL3[4]   -> TL2[1]  -> TL1[0] -> BGX0
@@ -560,17 +593,21 @@ static void nic_tx_channel_cfg(struct nicpf *nic, u8 vnic,
/* For 88xx 0-511 TL4 transmits via BGX0 and
 * 512-1023 TL4s transmit via BGX1.
 */
-   tl4 = bgx * (hw->tl4_cnt / hw->bgx_cnt);
-   if (!sq->sqs_mode) {
-   tl4 += (lmac * MAX_QUEUES_PER_QSET);
-   } else {
-   for (svf = 0; svf < MAX_SQS_PER_VF; svf++) {
-   if (nic->vf_sqs[pqs_vnic][svf] == vnic)
-   break;
+   if (hw->tl1_per_bgx) {
+   tl4 = bgx * (hw->tl4_cnt / hw->bgx_cnt);
+   if (!sq->sqs_mode) {
+   tl4 += (lmac * MAX_QUEUES_PER_QSET);
+   } else {
+   for (svf = 0; svf < MAX_SQS_PER_VF; svf++) {
+   if (nic->vf_sqs[pqs_vnic][svf] == vnic)
+   break;
+   }
+   tl4 += (MAX_LMAC_PER_BGX * MAX_QUEUES_PER_QSET);
+   tl4 += (lmac * 

  1   2   >