[PATCH net-next 02/13] liquidio: Vlan offloads changes

2016-06-21 Thread Raghu Vatsavayi
This patch adds support for VLAN offloads to the driver, and the
receive header structures are modified accordingly. Also, the
requestID will no longer be used in the receive header.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 26 +++-
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 46 +++---
 2 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 5a0977f..4b95dbf 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1849,6 +1849,7 @@ liquidio_push_packet(u32 octeon_id,
struct sk_buff *skb = (struct sk_buff *)skbuff;
struct skb_shared_hwtstamps *shhwtstamps;
u64 ns;
+   u16 vtag = 0;
struct net_device *netdev = (struct net_device *)arg;
struct octeon_droq *droq = container_of(param, struct octeon_droq,
napi);
@@ -1925,6 +1926,16 @@ liquidio_push_packet(u32 octeon_id,
else
skb->ip_summed = CHECKSUM_NONE;
 
+   /* inbound VLAN tag */
+   if ((netdev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
+   (rh->r_dh.vlan != 0)) {
+   u16 vid = rh->r_dh.vlan;
+   u16 priority = rh->r_dh.priority;
+
+   vtag = priority << 13 | vid;
+   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vtag);
+   }
+
packet_was_received = napi_gro_receive(napi, skb) != GRO_DROP;
 
if (packet_was_received) {
@@ -2900,6 +2911,11 @@ static int liquidio_xmit(struct sk_buff *skb, struct 
net_device *netdev)
tx_info->s.gso_size = skb_shinfo(skb)->gso_size;
tx_info->s.gso_segs = skb_shinfo(skb)->gso_segs;
}
+   /* HW insert VLAN tag */
+   if (skb_vlan_tag_present(skb)) {
+   irh->priority = skb_vlan_tag_get(skb) >> 13;
+   irh->vlan = skb_vlan_tag_get(skb) & 0xfff;
+   }
 
xmit_more = skb->xmit_more;
 
@@ -3301,11 +3317,17 @@ static int setup_nic_devices(struct octeon_device 
*octeon_dev)
| NETIF_F_LRO;
netif_set_gso_max_size(netdev, OCTNIC_GSO_MAX_SIZE);
 
-   netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
-
netdev->vlan_features = lio->dev_capability;
+   /* Add any unchangeable hw features */
+   lio->dev_capability |=  NETIF_F_HW_VLAN_CTAG_RX |
+   NETIF_F_HW_VLAN_CTAG_TX;
+
+   netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
 
netdev->hw_features = lio->dev_capability;
+   /*HW_VLAN_RX and HW_VLAN_FILTER is always on*/
+   netdev->hw_features = netdev->hw_features &
+   ~NETIF_F_HW_VLAN_CTAG_RX;
 
/* Point to the  properties for octeon device to which this
 * interface belongs.
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 2179691..c86421f 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -482,15 +482,15 @@ struct octeon_instr_irh {
u64 opcode:4;
u64 rflag:1;
u64 subcode:7;
-   u64 len:3;
-   u64 rid:13;
-   u64 reserved:4;
+   u64 vlan:12;
+   u64 priority:3;
+   u64 reserved:5;
u64 ossp:32; /* opcode/subcode specific parameters */
 #else
u64 ossp:32; /* opcode/subcode specific parameters */
-   u64 reserved:4;
-   u64 rid:13;
-   u64 len:3;
+   u64 reserved:5;
+   u64 priority:3;
+   u64 vlan:12;
u64 subcode:7;
u64 rflag:1;
u64 opcode:4;
@@ -517,28 +517,27 @@ union octeon_rh {
struct {
u64 opcode:4;
u64 subcode:8;
-   u64 len:3;   /** additional 64-bit words */
-   u64 rid:13;  /** request id in response to pkt sent by host 
*/
-   u64 reserved:4;
-   u64 ossp:32; /** opcode/subcode specific parameters */
+   u64 len:3; /** additional 64-bit words */
+   u64 reserved:17;
+   u64 ossp:32;   /** opcode/subcode specific parameters */
} r;
struct {
u64 opcode:4;
u64 subcode:8;
-   u64 

[PATCH net-next 06/13] liquidio: New unload state

2016-06-21 Thread Raghu Vatsavayi
This patch adds a new state so that ctrl packets are not sent
to firmware during unload time and only rx packets are allowed.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c  |  4 
 drivers/net/ethernet/cavium/liquidio/octeon_device.h |  6 ++
 drivers/net/ethernet/cavium/liquidio/octeon_nic.c| 20 ++--
 .../net/ethernet/cavium/liquidio/response_manager.c  |  2 ++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 5fb1b79..4440086 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1327,6 +1327,10 @@ static int liquidio_stop_nic_module(struct octeon_device 
*oct)
return 1;
}
 
+   spin_lock_bh(>cmd_resp_wqlock);
+   oct->cmd_resp_state = OCT_DRV_OFFLINE;
+   spin_unlock_bh(>cmd_resp_wqlock);
+
for (i = 0; i < oct->ifcount; i++) {
lio = GET_LIO(oct->props[i].netdev);
for (j = 0; j < lio->linfo.num_rxpciq; j++)
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
index abfc0d6..ceb905d 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_device.h
@@ -383,6 +383,10 @@ struct octeon_device {
 
struct cavium_wq dma_comp_wq;
 
+   /** Lock for dma response list */
+   spinlock_t cmd_resp_wqlock;
+   u32 cmd_resp_state;
+
struct cavium_wq check_db_wq[MAX_POSSIBLE_OCTEON_INSTR_QUEUES];
 
struct cavium_wk nic_poll_work;
@@ -392,6 +396,8 @@ struct octeon_device {
void *priv;
 };
 
+#define  OCT_DRV_ONLINE 1
+#define  OCT_DRV_OFFLINE 2
 #define  OCTEON_CN6XXX(oct)   ((oct->chip_id == OCTEON_CN66XX) || \
   (oct->chip_id == OCTEON_CN68XX))
 #define CHIP_FIELD(oct, TYPE, field) \
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
index 7843b8a..36f1970 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_nic.c
@@ -171,20 +171,36 @@ octnet_send_nic_ctrl_pkt(struct octeon_device *oct,
int retval;
struct octeon_soft_command *sc = NULL;
 
+   spin_lock_bh(>cmd_resp_wqlock);
+   /* Allow only rx ctrl command to stop traffic on the chip
+* during offline operations
+*/
+   if ((oct->cmd_resp_state == OCT_DRV_OFFLINE) &&
+   (nctrl->ncmd.s.cmd != OCTNET_CMD_RX_CTL)) {
+   spin_unlock_bh(>cmd_resp_wqlock);
+   dev_err(>pci_dev->dev,
+   "%s cmd:%d not processed since driver offline\n",
+   __func__, nctrl->ncmd.s.cmd);
+   return -1;
+   }
+
sc = octnic_alloc_ctrl_pkt_sc(oct, nctrl);
if (!sc) {
dev_err(>pci_dev->dev, "%s soft command alloc failed\n",
__func__);
+   spin_unlock_bh(>cmd_resp_wqlock);
return -1;
}
 
retval = octeon_send_soft_command(oct, sc);
if (retval == IQ_SEND_FAILED) {
octeon_free_soft_command(oct, sc);
-   dev_err(>pci_dev->dev, "%s soft command send failed 
status: %x\n",
-   __func__, retval);
+   dev_err(>pci_dev->dev, "%s soft command:%d send failed 
status: %x\n",
+   __func__, nctrl->ncmd.s.cmd, retval);
+   spin_unlock_bh(>cmd_resp_wqlock);
return -1;
}
 
+   spin_unlock_bh(>cmd_resp_wqlock);
return retval;
 }
diff --git a/drivers/net/ethernet/cavium/liquidio/response_manager.c 
b/drivers/net/ethernet/cavium/liquidio/response_manager.c
index e2e9103..c93210f 100644
--- a/drivers/net/ethernet/cavium/liquidio/response_manager.c
+++ b/drivers/net/ethernet/cavium/liquidio/response_manager.c
@@ -54,6 +54,7 @@ int octeon_setup_response_list(struct octeon_device *oct)
spin_lock_init(>response_list[i].lock);
atomic_set(>response_list[i].pending_req_count, 0);
}
+   spin_lock_init(>cmd_resp_wqlock);
 
oct->dma_comp_wq.wq = alloc_workqueue("dma-comp", WQ_MEM_RECLAIM, 0);
if (!oct->dma_comp_wq.wq) {
@@ -64,6 +65,7 @@ int octeon_setup_response_list(struct octeon_device *oct)
cwq = >dma_comp_wq;
INIT_DELAYED_WORK(>wk.work, oct_poll_req_completion);
cwq->wk.ctxptr = oct;
+   oct->cmd_resp_state = 

[PATCH net-next 07/13] liquidio: chip reset changes

2016-06-21 Thread Raghu Vatsavayi
This patch fixes the order of chip reset while destroying
the resources by postponing the soft reset in the destroy resources
function until all queues have been removed properly.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 4440086..56b1d67 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1180,12 +1180,6 @@ static void octeon_destroy_resources(struct 
octeon_device *oct)
if (oct->flags & LIO_FLAG_MSI_ENABLED)
pci_disable_msi(oct->pci_dev);
 
-   /* Soft reset the octeon device before exiting */
-   oct->fn_list.soft_reset(oct);
-
-   /* Disable the device, releasing the PCI INT */
-   pci_disable_device(oct->pci_dev);
-
/* fallthrough */
case OCT_DEV_IN_RESET:
case OCT_DEV_DROQ_INIT_DONE:
@@ -1232,11 +1226,18 @@ static void octeon_destroy_resources(struct 
octeon_device *oct)
 
/* fallthrough */
case OCT_DEV_PCI_MAP_DONE:
+
+   /* Soft reset the octeon device before exiting */
+   oct->fn_list.soft_reset(oct);
+
octeon_unmap_pci_barx(oct, 0);
octeon_unmap_pci_barx(oct, 1);
 
/* fallthrough */
case OCT_DEV_BEGIN_STATE:
+   /* Disable the device, releasing the PCI INT */
+   pci_disable_device(oct->pci_dev);
+
/* Nothing to be done here either */
break;
}   /* end switch(oct->status) */
-- 
1.8.3.1



[PATCH net-next 04/13] liquidio: Napi rx/tx traffic

2016-06-21 Thread Raghu Vatsavayi
This patch adds tx buffer handling to NAPI along with RX
traffic. Also, separate spinlocks are introduced for handling
iq posting and buffer reclaim so that the tx path and the tx interrupt
do not compete against each other.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 .../net/ethernet/cavium/liquidio/cn66xx_device.c   |   3 +-
 .../net/ethernet/cavium/liquidio/cn66xx_device.h   |   3 +-
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 150 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   4 +-
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |  12 +-
 .../net/ethernet/cavium/liquidio/request_manager.c | 110 +--
 6 files changed, 177 insertions(+), 105 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c 
b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
index c577559..d35864a 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.c
@@ -496,8 +496,7 @@ u32 lio_cn6xxx_bar1_idx_read(struct octeon_device *oct, u32 
idx)
 }
 
 u32
-lio_cn6xxx_update_read_index(struct octeon_device *oct __attribute__((unused)),
-struct octeon_instr_queue *iq)
+lio_cn6xxx_update_read_index(struct octeon_instr_queue *iq)
 {
u32 new_idx = readl(iq->inst_cnt_reg);
 
diff --git a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h 
b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
index f779187..fe2932c 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn66xx_device.h
@@ -91,8 +91,7 @@ void lio_cn6xxx_bar1_idx_setup(struct octeon_device *oct, u64 
core_addr,
 void lio_cn6xxx_bar1_idx_write(struct octeon_device *oct, u32 idx, u32 mask);
 u32 lio_cn6xxx_bar1_idx_read(struct octeon_device *oct, u32 idx);
 u32
-lio_cn6xxx_update_read_index(struct octeon_device *oct __attribute__((unused)),
-struct octeon_instr_queue *iq);
+lio_cn6xxx_update_read_index(struct octeon_instr_queue *iq);
 void lio_cn6xxx_enable_interrupt(void *chip);
 void lio_cn6xxx_disable_interrupt(void *chip);
 void cn6xxx_get_pcie_qlmport(struct octeon_device *oct);
diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 9f3a93b..8310eb8 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -365,7 +365,7 @@ static int wait_for_pending_requests(struct octeon_device 
*oct)
[OCTEON_ORDERED_SC_LIST].pending_req_count);
if (pcount)
schedule_timeout_uninterruptible(HZ / 10);
-else
+   else
break;
}
 
@@ -409,7 +409,7 @@ static inline void pcierror_quiesce_device(struct 
octeon_device *oct)
iq->octeon_read_index = iq->host_write_index;
iq->stats.instr_processed +=
atomic_read(>instr_pending);
-   lio_process_iq_request_list(oct, iq);
+   lio_process_iq_request_list(oct, iq, 0);
spin_unlock_bh(>lock);
}
}
@@ -959,6 +959,36 @@ static inline void update_link_status(struct net_device 
*netdev,
}
 }
 
+/* Runs in interrupt context. */
+static void update_txq_status(struct octeon_device *oct, int iq_num)
+{
+   struct net_device *netdev;
+   struct lio *lio;
+   struct octeon_instr_queue *iq = oct->instr_queue[iq_num];
+
+   /*octeon_update_iq_read_idx(oct, iq);*/
+
+   netdev = oct->props[iq->ifidx].netdev;
+
+   /* This is needed because the first IQ does not have
+* a netdev associated with it.
+*/
+   if (!netdev)
+   return;
+
+   lio = GET_LIO(netdev);
+   if (netif_is_multiqueue(netdev)) {
+   if (__netif_subqueue_stopped(netdev, iq->q_index) &&
+   lio->linfo.link.s.link_up &&
+   (!octnet_iq_is_full(oct, iq_num))) {
+   netif_wake_subqueue(netdev, iq->q_index);
+   } else {
+   if (!octnet_iq_is_full(oct, lio->txq))
+   wake_q(netdev, lio->txq);
+   }
+   }
+}
+
 /**
  * \brief Droq packet processor sceduler
  * @param oct octeon device
@@ -1246,6 +1276,7 @@ static void liquidio_destroy_nic_device(struct 
octeon_device *oct, int ifidx)
 {
struct net_device *netdev = oct->props[ifidx].netdev;
struct lio *lio;
+   struct napi_struct *napi, *n;
 
if (!netdev) {
  

[PATCH net-next 09/13] liquidio: New statistics support

2016-06-21 Thread Raghu Vatsavayi
This patch adds extensive support of statistics for data path,
control path and firmware.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 574 ++---
 drivers/net/ethernet/cavium/liquidio/lio_main.c|  48 +-
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  10 +
 .../net/ethernet/cavium/liquidio/octeon_device.h   |   6 +
 drivers/net/ethernet/cavium/liquidio/octeon_iq.h   |   4 +
 .../net/ethernet/cavium/liquidio/octeon_network.h  |  11 +
 6 files changed, 588 insertions(+), 65 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c 
b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 9c6b58a..2b03095 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -40,6 +40,8 @@
 #include "cn68xx_device.h"
 #include "liquidio_image.h"
 
+static int octnet_get_link_stats(struct net_device *netdev);
+
 struct oct_mdio_cmd_context {
int octeon_id;
wait_queue_head_t wc;
@@ -77,28 +79,109 @@ enum {
 #define OCT_ETHTOOL_REGDUMP_LEN  4096
 #define OCT_ETHTOOL_REGSVER  1
 
+/* statistics of PF */
+static const char oct_stats_strings[][ETH_GSTRING_LEN] = {
+   "rx_packets",
+   "tx_packets",
+   "rx_bytes",
+   "tx_bytes",
+   "rx_errors",/*jabber_err+l2_err+frame_err */
+   "tx_errors",/*fw_err_pko+fw_err_link+fw_err_drop */
+   "rx_dropped",   /*st->fromwire.total_rcvd - st->fromwire.fw_total_rcvd
+   *+st->fromwire.dmac_drop + st->fromwire.fw_err_drop
+   */
+   "tx_dropped",
+
+   "tx_total_sent",
+   "tx_total_fwd",
+   "tx_err_pko",
+   "tx_err_link",
+   "tx_err_drop",
+
+   "tx_tso",
+   "tx_tso_packets",
+   "tx_tso_err",
+
+   "mac_tx_total_pkts",
+   "mac_tx_total_bytes",
+   "mac_tx_mcast_pkts",
+   "mac_tx_bcast_pkts",
+   "mac_tx_ctl_packets",   /*oct->link_stats.fromhost.ctl_sent */
+   "mac_tx_total_collisions",
+   "mac_tx_one_collision",
+   "mac_tx_multi_collison",
+   "mac_tx_max_collision_fail",
+   "mac_tx_max_deferal_fail",
+   "mac_tx_fifo_err",
+   "mac_tx_runts",
+
+   "rx_total_rcvd",
+   "rx_total_fwd",
+   "rx_jabber_err",
+   "rx_l2_err",
+   "rx_frame_err",
+   "rx_err_pko",
+   "rx_err_link",
+   "rx_err_drop",
+
+   "rx_lro_pkts",
+   "rx_lro_bytes",
+   "rx_total_lro",
+
+   "rx_lro_aborts",
+   "rx_lro_aborts_port",
+   "rx_lro_aborts_seq",
+   "rx_lro_aborts_tsval",
+   "rx_lro_aborts_timer",
+   "rx_fwd_rate",
+
+   "mac_rx_total_rcvd",
+   "mac_rx_bytes",
+   "mac_rx_total_bcst",
+   "mac_rx_total_mcst",
+   "mac_rx_runts",
+   "mac_rx_ctl_packets",
+   "mac_rx_fifo_err",
+   "mac_rx_dma_drop",
+   "mac_rx_fcs_err",
+
+   "link_state_changes",
+};
+
+/* statistics of host tx queue */
 static const char oct_iq_stats_strings[][ETH_GSTRING_LEN] = {
-   "Instr posted",
-   "Instr processed",
-   "Instr dropped",
-   "Bytes Sent",
-   "Sgentry_sent",
-   "Inst cntreg",
-   "Tx done",
-   "Tx Iq busy",
-   "Tx dropped",
-   "Tx bytes",
+   "packets",  /*oct->instr_queue[iq_no]->stats.tx_done*/
+   "bytes",/*oct->instr_queue[iq_no]->stats.tx_tot_bytes*/
+   "dropped",
+   "iq_busy",
+   "sgentry_sent",
+
+   "fw_instr_posted",
+   "fw_instr_processed",
+   "fw_instr_dropped",
+   "fw_bytes_sent",
+
+   "tso",
+   "txq_restart",
 };
 
+/* statistics of host rx queue */
 static const char oct_droq_stats_strings[][ETH_GSTRING_LEN] = {
-   "OQ Pkts Received",
-   "OQ Bytes Received",
-   "Dropped no dispatch",
-   "Dropped nomem",
-   "Dropped toomany",
-   "Stack RX cnt",
-   "Stack RX Bytes",
-   "RX dropped",
+   "packets",  /*oct->droq[oq_no]->stats.rx_pkts_received */
+   "bytes",/*oct->droq[oq_no]->stats.rx_bytes_received */
+   "dropped",  /*oct->droq[oq_no]->stats.rx_dropped+
+*oct->droq[oq_no]->stats.dropped_nodispatch+
+*oct->droq[oq_no]->stats.dropped_toomany+
+*oct->droq[oq_no]->stats.dropped_nomem
+*/
+   "dropped_nomem",
+   "dropped_toomany",
+   "fw_dropped",
+   "fw_pkts_received",
+   "fw_bytes_received",
+   "fw_dropped_nodispatch",
+
+   "buffer_alloc_failure",
 };
 
 #define OCTNIC_NCMD_AUTONEG_ON  

[PATCH net-next 08/13] liquidio: tx rx interrupt moderation

2016-06-21 Thread Raghu Vatsavayi
This patch has new tx/rx interrupt moderation defaults of
count/timer for better throughput and utilisation.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_ethtool.c | 270 -
 drivers/net/ethernet/cavium/liquidio/lio_main.c|  22 +-
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  49 ++--
 3 files changed, 200 insertions(+), 141 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c 
b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
index 56f465b..9c6b58a 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_ethtool.c
@@ -616,50 +616,50 @@ static int lio_get_intr_coalesce(struct net_device 
*netdev,
 {
struct lio *lio = GET_LIO(netdev);
struct octeon_device *oct = lio->oct_dev;
-   struct octeon_cn6xxx *cn6xxx = (struct octeon_cn6xxx *)oct->chip;
struct octeon_instr_queue *iq;
struct oct_intrmod_cfg *intrmod_cfg;
 
intrmod_cfg = >intrmod;
 
switch (oct->chip_id) {
-   /* case OCTEON_CN73XX: Todo */
-   /*  break; */
case OCTEON_CN68XX:
-   case OCTEON_CN66XX:
-   if (!intrmod_cfg->intrmod_enable) {
+   case OCTEON_CN66XX: {
+   struct octeon_cn6xxx *cn6xxx =
+   (struct octeon_cn6xxx *)oct->chip;
+
+   if (!intrmod_cfg->rx_enable) {
intr_coal->rx_coalesce_usecs =
CFG_GET_OQ_INTR_TIME(cn6xxx->conf);
intr_coal->rx_max_coalesced_frames =
CFG_GET_OQ_INTR_PKT(cn6xxx->conf);
-   } else {
-   intr_coal->use_adaptive_rx_coalesce =
-   intrmod_cfg->intrmod_enable;
-   intr_coal->rate_sample_interval =
-   intrmod_cfg->intrmod_check_intrvl;
-   intr_coal->pkt_rate_high =
-   intrmod_cfg->intrmod_maxpkt_ratethr;
-   intr_coal->pkt_rate_low =
-   intrmod_cfg->intrmod_minpkt_ratethr;
-   intr_coal->rx_max_coalesced_frames_high =
-   intrmod_cfg->intrmod_maxcnt_trigger;
-   intr_coal->rx_coalesce_usecs_high =
-   intrmod_cfg->intrmod_maxtmr_trigger;
-   intr_coal->rx_coalesce_usecs_low =
-   intrmod_cfg->intrmod_mintmr_trigger;
-   intr_coal->rx_max_coalesced_frames_low =
-   intrmod_cfg->intrmod_mincnt_trigger;
}
 
iq = oct->instr_queue[lio->linfo.txpciq[0].s.q_no];
intr_coal->tx_max_coalesced_frames = iq->fill_threshold;
break;
-
+   }
default:
netif_info(lio, drv, lio->netdev, "Unknown Chip !!\n");
return -EINVAL;
}
-
+   if (intrmod_cfg->rx_enable) {
+   intr_coal->use_adaptive_rx_coalesce =
+   intrmod_cfg->rx_enable;
+   intr_coal->rate_sample_interval =
+   intrmod_cfg->check_intrvl;
+   intr_coal->pkt_rate_high =
+   intrmod_cfg->maxpkt_ratethr;
+   intr_coal->pkt_rate_low =
+   intrmod_cfg->minpkt_ratethr;
+   intr_coal->rx_max_coalesced_frames_high =
+   intrmod_cfg->rx_maxcnt_trigger;
+   intr_coal->rx_coalesce_usecs_high =
+   intrmod_cfg->rx_maxtmr_trigger;
+   intr_coal->rx_coalesce_usecs_low =
+   intrmod_cfg->rx_mintmr_trigger;
+   intr_coal->rx_max_coalesced_frames_low =
+   intrmod_cfg->rx_mincnt_trigger;
+   }
return 0;
 }
 
@@ -679,19 +679,20 @@ static void octnet_intrmod_callback(struct octeon_device 
*oct_dev,
else
dev_info(_dev->pci_dev->dev,
 "Rx-Adaptive Interrupt moderation enabled:%llx\n",
-oct_dev->intrmod.intrmod_enable);
+oct_dev->intrmod.rx_enable);
 
octeon_free_soft_command(oct_dev, sc);
 }
 
 /*  Configure interrupt moderation parameters */
-static int octnet_set_intrmod_cfg(void *oct, struct oct_intrmod_cfg *intr_cfg)
+static int octnet_set_intrmod_cfg(struct lio *lio,
+ struct oct_intrmod_cfg *intr_cfg)
 {
struct octeon_soft_command *sc;
struct oct_intrmod_cmd *cmd;
struct 

[PATCH net-next 05/13] liquidio: Firmware image download

2016-06-21 Thread Raghu Vatsavayi
This patch has firmware image related changes for: firmware
release upon failure, support latest firmware version and
firmware download in 4MB chunks.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 18 -
 .../net/ethernet/cavium/liquidio/liquidio_common.h | 10 +--
 .../net/ethernet/cavium/liquidio/octeon_device.c   | 79 --
 3 files changed, 65 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 8310eb8..5fb1b79 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -1375,6 +1375,7 @@ static int octeon_chip_specific_setup(struct 
octeon_device *oct)
 {
u32 dev_id, rev_id;
int ret = 1;
+   char *s;
 
pci_read_config_dword(oct->pci_dev, 0, _id);
pci_read_config_dword(oct->pci_dev, 8, _id);
@@ -1384,22 +1385,27 @@ static int octeon_chip_specific_setup(struct 
octeon_device *oct)
case OCTEON_CN68XX_PCIID:
oct->chip_id = OCTEON_CN68XX;
ret = lio_setup_cn68xx_octeon_device(oct);
+   s = "CN68XX";
break;
 
case OCTEON_CN66XX_PCIID:
oct->chip_id = OCTEON_CN66XX;
ret = lio_setup_cn66xx_octeon_device(oct);
+   s = "CN66XX";
break;
+
default:
+   s = "?";
dev_err(>pci_dev->dev, "Unknown device found (dev_id: 
%x)\n",
dev_id);
}
 
if (!ret)
-   dev_info(>pci_dev->dev, "CN68XX PASS%d.%d %s\n",
+   dev_info(>pci_dev->dev, "%s PASS%d.%d %s Version: %s\n", s,
 OCTEON_MAJOR_REV(oct),
 OCTEON_MINOR_REV(oct),
-octeon_get_conf(oct)->card_name);
+octeon_get_conf(oct)->card_name,
+LIQUIDIO_VERSION);
 
return ret;
 }
@@ -1772,6 +1778,7 @@ static int load_firmware(struct octeon_device *oct)
if (ret) {
dev_err(>pci_dev->dev, "Request firmware failed. Could not 
find file %s.\n.",
fw_name);
+   release_firmware(fw);
return ret;
}
 
@@ -1841,6 +1848,9 @@ static void if_cfg_callback(struct octeon_device *oct,
CVM_CAST64(resp->status));
ACCESS_ONCE(ctx->cond) = 1;
 
+   snprintf(oct->fw_info.liquidio_firmware_version, 32, "%s",
+resp->cfg_info.liquidio_firmware_version);
+
/* This barrier is required to be sure that the response has been
 * written fully before waking up the handler
 */
@@ -3635,6 +3645,7 @@ static void nic_starter(struct work_struct *work)
 static int octeon_device_init(struct octeon_device *octeon_dev)
 {
int j, ret;
+   char bootcmd[] = "\n";
struct octeon_device_priv *oct_priv =
(struct octeon_device_priv *)octeon_dev->priv;
atomic_set(_dev->status, OCT_DEV_BEGIN_STATE);
@@ -3767,6 +3778,9 @@ static int octeon_device_init(struct octeon_device 
*octeon_dev)
return 1;
}
 
+   /* Divert uboot to take commands from host instead. */
+   ret = octeon_console_send_cmd(octeon_dev, bootcmd, 50);
+
dev_dbg(_dev->pci_dev->dev, "Initializing consoles\n");
ret = octeon_init_consoles(octeon_dev);
if (ret) {
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 3738877..1ef9001 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -30,11 +30,10 @@
 
 #include "octeon_config.h"
 
-#define LIQUIDIO_VERSION"1.1.9"
-#define LIQUIDIO_MAJOR_VERSION  1
-#define LIQUIDIO_MINOR_VERSION  1
-#define LIQUIDIO_MICRO_VERSION  9
-
+#define LIQUIDIO_BASE_VERSION   "1.4"
+#define LIQUIDIO_MICRO_VERSION  ".1"
+#define LIQUIDIO_PACKAGE ""
+#define LIQUIDIO_VERSION  "1.4.1"
 #define CONTROL_IQ 0
 /** Tag types used by Octeon cores in its work. */
 enum octeon_tag_type {
@@ -712,6 +711,7 @@ struct liquidio_if_cfg_info {
u64 iqmask; /** mask for IQs enabled for  the port */
u64 oqmask; /** mask for OQs enabled for the port */
struct oct_link_info linfo; /** initial link information */
+   char   liquidio_firmware_version[32];
 };
 
 /** Stats for each NIC port in RX direction. */
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_device.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_device.c

[PATCH net-next 03/13] liquidio: Vlan filtering

2016-06-21 Thread Raghu Vatsavayi
This patch adds support for VLAN filtering to the liquidio driver.

Signed-off-by: Derek Chickles 
Signed-off-by: Satanand Burla 
Signed-off-by: Felix Manlunas 
Signed-off-by: Raghu Vatsavayi 
Signed-off-by: Raghu Vatsavayi 
---
 drivers/net/ethernet/cavium/liquidio/lio_main.c| 82 +-
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  4 ++
 2 files changed, 83 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_main.c
index 4b95dbf..9f3a93b 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
@@ -2310,6 +2310,21 @@ void liquidio_link_ctrl_cmd_completion(void *nctrl_ptr)
 netdev->name);
break;
 
+   case OCTNET_CMD_ENABLE_VLAN_FILTER:
+   dev_info(>pci_dev->dev, "%s VLAN filter enabled\n",
+netdev->name);
+   break;
+
+   case OCTNET_CMD_ADD_VLAN_FILTER:
+   dev_info(>pci_dev->dev, "%s VLAN filter %d added\n",
+netdev->name, nctrl->ncmd.s.param1);
+   break;
+
+   case OCTNET_CMD_DEL_VLAN_FILTER:
+   dev_info(>pci_dev->dev, "%s VLAN filter %d removed\n",
+netdev->name, nctrl->ncmd.s.param1);
+   break;
+
case OCTNET_CMD_SET_SETTINGS:
dev_info(>pci_dev->dev, "%s settings changed\n",
 netdev->name);
@@ -2965,6 +2980,61 @@ static void liquidio_tx_timeout(struct net_device 
*netdev)
txqs_wake(netdev);
 }
 
+static int liquidio_vlan_rx_add_vid(struct net_device *netdev,
+   __be16 proto __attribute__((unused)),
+   u16 vid)
+{
+   struct lio *lio = GET_LIO(netdev);
+   struct octeon_device *oct = lio->oct_dev;
+   struct octnic_ctrl_pkt nctrl;
+   int ret = 0;
+
+   memset(, 0, sizeof(struct octnic_ctrl_pkt));
+
+   nctrl.ncmd.u64 = 0;
+   nctrl.ncmd.s.cmd = OCTNET_CMD_ADD_VLAN_FILTER;
+   nctrl.ncmd.s.param1 = vid;
+   nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+   nctrl.wait_time = 100;
+   nctrl.netpndev = (u64)netdev;
+   nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+   ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, );
+   if (ret < 0) {
+   dev_err(>pci_dev->dev, "Add VLAN filter failed in core 
(ret: 0x%x)\n",
+   ret);
+   }
+
+   return ret;
+}
+
+static int liquidio_vlan_rx_kill_vid(struct net_device *netdev,
+__be16 proto __attribute__((unused)),
+u16 vid)
+{
+   struct lio *lio = GET_LIO(netdev);
+   struct octeon_device *oct = lio->oct_dev;
+   struct octnic_ctrl_pkt nctrl;
+   int ret = 0;
+
+   memset(, 0, sizeof(struct octnic_ctrl_pkt));
+
+   nctrl.ncmd.u64 = 0;
+   nctrl.ncmd.s.cmd = OCTNET_CMD_DEL_VLAN_FILTER;
+   nctrl.ncmd.s.param1 = vid;
+   nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+   nctrl.wait_time = 100;
+   nctrl.netpndev = (u64)netdev;
+   nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+   ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, );
+   if (ret < 0) {
+   dev_err(>pci_dev->dev, "Add VLAN filter failed in core 
(ret: 0x%x)\n",
+   ret);
+   }
+   return ret;
+}
+
 int liquidio_set_feature(struct net_device *netdev, int cmd, u16 param1)
 {
struct lio *lio = GET_LIO(netdev);
@@ -3056,6 +3126,9 @@ static struct net_device_ops lionetdevops = {
.ndo_set_mac_address= liquidio_set_mac,
.ndo_set_rx_mode= liquidio_set_mcast_list,
.ndo_tx_timeout = liquidio_tx_timeout,
+
+   .ndo_vlan_rx_add_vid= liquidio_vlan_rx_add_vid,
+   .ndo_vlan_rx_kill_vid   = liquidio_vlan_rx_kill_vid,
.ndo_change_mtu = liquidio_change_mtu,
.ndo_do_ioctl   = liquidio_ioctl,
.ndo_fix_features   = liquidio_fix_features,
@@ -3319,7 +3392,8 @@ static int setup_nic_devices(struct octeon_device 
*octeon_dev)
 
netdev->vlan_features = lio->dev_capability;
/* Add any unchangeable hw features */
-   lio->dev_capability |=  NETIF_F_HW_VLAN_CTAG_RX |
+   lio->dev_capability |=  NETIF_F_HW_VLAN_CTAG_FILTER |
+   NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_CTAG_TX;
 
netdev->features = (lio->dev_capability & ~NETIF_F_LRO);
@@ -3377,9 +3451,11 @@ static int setup_nic_devices(struct octeon_device 
*octeon_dev)
liquidio_set_feature(netdev, 

Re: [PATCH net-next v3] tcp: use RFC6298 compliant TCP RTO calculation

2016-06-21 Thread Yuchung Cheng
On Fri, Jun 17, 2016 at 11:56 AM, Yuchung Cheng  wrote:
>
> On Fri, Jun 17, 2016 at 11:32 AM, David Miller  wrote:
> >
> > From: Daniel Metz 
> > Date: Wed, 15 Jun 2016 20:00:03 +0200
> >
> > > This patch adjusts Linux RTO calculation to be RFC6298 Standard
> > > compliant. MinRTO is no longer added to the computed RTO, RTO damping
> > > and overestimation are decreased.
> >  ...
> >
> > Yuchung, I assume I am waiting for you to do the testing you said
> > you would do for this patch, right?
> Yes I spent the last two days resolving some unrelated glitches to
> start my testing on Web servers. I should be able to get some results
> over the weekend.
>
> I will test
> 0) current Linux
> 1) this patch
> 2) RFC6298 with min_RTO=1sec
> 3) RFC6298 with minimum RTTVAR of 200ms (so it is more like current
> Linux style of min RTO which only applies to RTTVAR)
>
> and collect the TCP latency (how long to send an HTTP response) and
> (spurious) timeout & retransmission stats.
>
Thanks for the patience. I've collected data from some Google Web
servers. They serve both a mix of US and SouthAm users using
HTTP1 and HTTP2. The traffic is Web browsing (e.g., search, maps,
gmails, etc but not Youtube videos). The mean RTT is about 100ms.

The user connections were split into 4 groups of different TCP RTO
configs. Each group has many millions of connections but the
size variation among groups is well under 1%.

B: baseline Linux
D: this patch
R: change RTTVAR averaging as in D, but bound RTO to 1sec per RFC6298
Y: change RTTVAR averaging as in D, but bound RTTVAR to 200ms instead (like B)

For mean TCP latency of HTTP responses (first byte sent to last byte
acked), B < R < Y < D. But the differences are so insignificant (<1%).
The median, 95pctl, and 99pctl show similarly insignificant differences. In
summary there's hardly any visible impact on latency. I also looked at only
responses smaller than 4KB but do not see a different picture.

The main difference is the retransmission rate where R =~ Y < B =~D.
R and Y are ~20% lower than B and D. Parsing the SNMP stats reveals
more interesting details. The table shows the deltas in percentage to
the baseline B.

D  R Y
--
Timeout  +12%   -16%  -16%
TailLossProb +28%-7%   -7%
DSACK_rcvd   +37%-7%   -7%
Cwnd-undo+16%   -29%  -29%

RTO change affects TLP because TLP will use the min of RTO and TLP
timer value to arm the probe timer.

The stats indicate that the main culprit of spurious timeouts / rtx is
the RTO lower-bound. But they also show the RFC RTTVAR averaging is as
good as current Linux approach.

Given that I would recommend we revise this patch to use the RFC
averaging but keep existing lower-bound (of RTTVAR to 200ms). We can
further experiment with the lower-bound and change that in a separate
patch.


Re: [PATCH] ppc: Fix BPF JIT for ABIv2

2016-06-21 Thread Michael Ellerman
On Tue, 2016-06-21 at 14:28 +0530, Naveen N. Rao wrote:
> On 2016/06/20 03:56PM, Thadeu Lima de Souza Cascardo wrote:
> > On Sun, Jun 19, 2016 at 11:19:14PM +0530, Naveen N. Rao wrote:
> > > On 2016/06/17 10:00AM, Thadeu Lima de Souza Cascardo wrote:
> > > > 
> > > > Hi, Michael and Naveen.
> > > > 
> > > > I noticed independently that there is a problem with BPF JIT and ABIv2, 
> > > > and
> > > > worked out the patch below before I noticed Naveen's patchset and the 
> > > > latest
> > > > changes in ppc tree for a better way to check for ABI versions.
> > > > 
> > > > However, since the issue described below affect mainline and stable 
> > > > kernels,
> > > > would you consider applying it before merging your two patchsets, so 
> > > > that we can
> > > > more easily backport the fix?
> > > 
> > > Hi Cascardo,
> > > Given that this has been broken on ABIv2 since forever, I didn't bother 
> > > fixing it. But, I can see why this would be a good thing to have for 
> > > -stable and existing distros. However, while your patch below may fix 
> > > the crash you're seeing on ppc64le, it is not sufficient -- you'll need 
> > > changes in bpf_jit_asm.S as well.
> > 
> > Hi, Naveen.
> > 
> > Any tips on how to exercise possible issues there? Or what changes you think
> > would be sufficient?
> 
> The calling convention is different with ABIv2 and so we'll need changes 
> in bpf_slow_path_common() and sk_negative_common().
> 
> However, rather than enabling classic JIT for ppc64le, are we better off 
> just disabling it?
> 
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -128,7 +128,7 @@ config PPC
> select IRQ_FORCED_THREADING
> select HAVE_RCU_TABLE_FREE if SMP
> select HAVE_SYSCALL_TRACEPOINTS
> -   select HAVE_CBPF_JIT
> +   select HAVE_CBPF_JIT if CPU_BIG_ENDIAN
> select HAVE_ARCH_JUMP_LABEL
> select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select ARCH_HAS_GCOV_PROFILE_ALL
> 
> 
> Michael,
> Let me know your thoughts on whether you intend to take this patch or 
> Cascardo's patch for -stable before the eBPF patches. I can redo my 
> patches accordingly.

Can one of you send me a proper version of this patch, with change log and
sign-off etc.

cheers



Re: [PATCH] ppc: Fix BPF JIT for ABIv2

2016-06-21 Thread Michael Ellerman
On Fri, 2016-06-17 at 10:00 -0300, Thadeu Lima de Souza Cascardo wrote:
> From a984dc02b6317a1d3a3c2302385adba5227be5bd Mon Sep 17 00:00:00 2001
> From: Thadeu Lima de Souza Cascardo 
> Date: Wed, 15 Jun 2016 13:22:12 -0300
> Subject: [PATCH] ppc: Fix BPF JIT for ABIv2
> 
> ABIv2 used for ppc64le does not use function descriptors. Without this patch,
> whenever BPF JIT is enabled, we get a crash as below.
> 
...

> diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
> index 889fd19..28b89ed 100644
> --- a/arch/powerpc/net/bpf_jit.h
> +++ b/arch/powerpc/net/bpf_jit.h
> @@ -70,7 +70,7 @@ DECLARE_LOAD_FUNC(sk_load_half);
>  DECLARE_LOAD_FUNC(sk_load_byte);
>  DECLARE_LOAD_FUNC(sk_load_byte_msh);
>  
> -#ifdef CONFIG_PPC64
> +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2)
>  #define FUNCTION_DESCR_SIZE  24
>  #else
>  #define FUNCTION_DESCR_SIZE  0
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 2d66a84..035b887 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -664,7 +664,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>  
>   if (image) {
>   bpf_flush_icache(code_base, code_base + (proglen/4));
> -#ifdef CONFIG_PPC64
> +#if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2)
>   /* Function descriptor nastiness: Address + TOC */
>   ((u64 *)image)[0] = (u64)code_base;
>   ((u64 *)image)[1] = local_paca->kernel_toc;


Confirmed that even with this patch we still crash:

  # echo 1 > /proc/sys/net/core/bpf_jit_enable
  # modprobe test_bpf
  BPF filter opcode 0020 (@3) unsupported
  BPF filter opcode 0020 (@2) unsupported
  BPF filter opcode 0020 (@0) unsupported
  Unable to handle kernel paging request for data at address 0xd54f65e8
  Faulting instruction address: 0xc08765f8
  cpu 0x0: Vector: 300 (Data Access) at [c34f3480]
  pc: c08765f8: skb_copy_bits+0x158/0x330
  lr: c008fb7c: bpf_slow_path_byte+0x28/0x54
  sp: c34f3700
 msr: 80010280b033
 dar: d54f65e8
   dsisr: 4000
current = 0xc001f857d8d0
paca= 0xc7b8 softe: 0irq_happened: 0x01
  pid   = 2993, comm = modprobe
  Linux version 4.7.0-rc3-00055-g9497a1c1c5b4-dirty 
(mich...@ka3.ozlabs.ibm.com) () #30 SMP Wed Jun 22 15:06:58 AEST 2016
  enter ? for help
  [c34f3770] c008fb7c bpf_slow_path_byte+0x28/0x54
  [c34f37e0] d7bb004c
  [c34f3900] d5331668 test_bpf_init+0x5fc/0x7f8 [test_bpf]
  [c34f3a30] c000b628 do_one_initcall+0x68/0x1d0
  [c34f3af0] c09beb24 do_init_module+0x90/0x240
  [c34f3b80] c01642bc load_module+0x206c/0x22f0
  [c34f3d30] c01648b0 SyS_finit_module+0x120/0x180
  [c34f3e30] c0009260 system_call+0x38/0x108
  --- Exception: c01 (System Call) at 3fff7ffa2db4


cheers



Re: [net-next] samples/bpf: set max locked memory to ulimited

2016-06-21 Thread Alexei Starovoitov
On Tue, Jun 21, 2016 at 09:05:58PM -0700, William Tu wrote:
> Signed-off-by: William Tu 

Acked-by: Alexei Starovoitov 



Re: [PATCH v10 06/22] IB/hns: Add initial cmd operation

2016-06-21 Thread Leon Romanovsky
On Tue, Jun 21, 2016 at 09:01:57PM +0800, Wei Hu (Xavier) wrote:
> 
> 
> On 2016/6/21 19:28, Leon Romanovsky wrote:
> >On Tue, Jun 21, 2016 at 06:50:51PM +0800, Wei Hu (Xavier) wrote:
> >>
> >>On 2016/6/20 21:33, Leon Romanovsky wrote:
> >>>On Thu, Jun 16, 2016 at 10:35:14PM +0800, Lijun Ou wrote:
> This patch added the operation for cmd, and added some functions
> for initializing eq table and selecting cmd mode.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9/v8/v7/v6:
> - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>    in PATCH v4
> ---
> >>><...>
> >>>
> +#define CMD_MAX_NUM  32
> +
> +int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
> +{
> + struct device *dev = &hr_dev->pdev->dev;
> +
> + mutex_init(&hr_dev->cmd.hcr_mutex);
> + sema_init(&hr_dev->cmd.poll_sem, 1);
> + hr_dev->cmd.use_events = 0;
> + hr_dev->cmd.toggle = 1;
> + hr_dev->cmd.max_cmds = CMD_MAX_NUM;
> >>><...>
> >>>
> + for (hr_cmd->token_mask = 1; hr_cmd->token_mask < hr_cmd->max_cmds;
> +  hr_cmd->token_mask <<= 1)
> + ;
> + --hr_cmd->token_mask;
> >>>It doesn't look that you dynamically change max_cmds supported.
> >>>Why do you need to calculate token_mask dynamically?
> >>Hi, Leon
> >>
> >> 1. The four lines above are in the function named
> >>hns_roce_cmd_use_events.
> >>  and now this function is only called once in hns_roce_probe.
> >> 2. In hns_roce_cmd_use_events,
> >> we use these 4 lines to achieve the value of hr_cmd->token_mask
> >>according to hr_cmd->max_cmds dynamically,
> >> then we only define one macro for hr_cmd->max_cmds as below:
> >>
> >>#define CMD_MAX_NUM 32
> >>
> >>And it looks more flexible.
> >It is called over engineering.
> >I would recommend to you to remove it.
> >
> >We don't need over complicated code which is executed
> >once with need to maintain with zero benefit.
> >
> >The other places need such simplification too.
> Hi, Leon
> 
> We will modify this place as below:
> In hns_roce_hw_v1.c(for hip06 soc) file:
> 
> void hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
> {
> 
> caps->max_cmds = 32;
> 
> }
> 
> In hns_roce_cmd.c file:
> 
> int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
> {
>
>hr_dev->cmd.max_cmds = hr_dev->caps->max_cmds;
>   
>   }
> 
>Can you give more suggestions?

I would be happy to do it if I had enough time to review this code.

General suggestion will be to ask yourself, if value is going to be
changed during the runtime. In case the answer is no, there is no room
to additional logic which translate constant to different value which
will be other constant.

You should do it across all the patchset.

So, in this specific case, the proposed change is not enough, you are
not solving an issue, but hiding it.

Thanks

> 
> 
> Regards
> Wei Hu
> >>Regards
> >>Wei Hu
> >>
> >>
> >>
> 
> 


signature.asc
Description: Digital signature


Re: [PATCH] ppc: Fix BPF JIT for ABIv2

2016-06-21 Thread Michael Ellerman
On Tue, 2016-06-21 at 08:45 -0700, Alexei Starovoitov wrote:
> On 6/21/16 7:47 AM, Thadeu Lima de Souza Cascardo wrote:
> > > > 
> > > > The calling convention is different with ABIv2 and so we'll need changes
> > > > in bpf_slow_path_common() and sk_negative_common().
> > > 
> > > How big would those changes be? Do we know?
> > > 
> > > How come no one reported this was broken previously? This is the first 
> > > I've
> > > heard of it being broken.
> > > 
> > 
> > I just heard of it less than two weeks ago, and only could investigate it 
> > last
> > week, when I realized mainline was also affected.
> > 
> > It looks like the little-endian support for classic JIT were done before the
> > conversion to ABIv2. And as JIT is disabled by default, no one seems to have
> > exercised it.
> 
> it's not a surprise unfortunately. The JITs that were written before
> test_bpf.ko was developed were missing corner cases. Typical tcpdump
> would be fine, but fragmented packets, negative offsets and
> out-of-bounds wouldn't be handled correctly.
> I'd suggest to validate the stable backport with test_bpf as well.
 
OK thanks.

I have been running seltests/net/test_bpf, but I realise now it doesn't enable
the JIT.

cheers



[PATCH net-next] cxgb4vf: Synchronize access to mailbox

2016-06-21 Thread Hariprasad Shenai
The issue comes when there are multiple threads attempting to use the
mailbox facility at the same time. The issue is that for the Virtual
Function Driver, the only way to get the Virtual Interface statistics
is to issue mailbox commands to ask the firmware for the VI Stats.
And, because the VI Stats command can only retrieve a smallish number of
stats per mailbox command, we have to issue three mailbox commands in quick
succession. When ethtool or netstat command to get interface stats and
interface up/down is run in a loop for every 0.1 sec, we observed
mailbox collisions. And out of the two commands one would fail with
the present code, since we don't queue the second command.

To overcome the above issue, added a queue to access the mailbox.
Whenever a mailbox command is issued, add it to the queue. If it's at the
head, issue the mailbox command, else wait for the existing command to
complete. Usually command takes less than a milli-second to complete.
Also timeout from the loop, if the command under execution takes
long time to run.

In reality, the number of mailbox access collisions is going to be very
rare since no one runs such abusive script.

Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4vf/adapter.h |  8 
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c|  2 +
 drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c | 55 ++
 3 files changed, 65 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h 
b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
index 734dd776c22f..109bc630408b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/adapter.h
@@ -353,6 +353,10 @@ struct hash_mac_addr {
u8 addr[ETH_ALEN];
 };
 
+struct mbox_list {
+   struct list_head list;
+};
+
 /*
  * Per-"adapter" (Virtual Function) information.
  */
@@ -387,6 +391,10 @@ struct adapter {
/* various locks */
spinlock_t stats_lock;
 
+   /* lock for mailbox cmd list */
+   spinlock_t mbox_lock;
+   struct mbox_list mlist;
+
/* support for mailbox command/reply logging */
 #define T4VF_OS_LOG_MBOX_CMDS 256
struct mbox_cmd_log *mbox_log;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c 
b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 8d9b2cb74aa2..9f5526478d2f 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -2774,6 +2774,8 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev,
 * Initialize SMP data synchronization resources.
 */
spin_lock_init(>stats_lock);
+   spin_lock_init(>mbox_lock);
+   INIT_LIST_HEAD(>mlist.list);
 
/*
 * Map our I/O registers in BAR0.
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
index 955ff7c61f1b..61bfe86da86d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/t4vf_hw.c
@@ -139,6 +139,7 @@ int t4vf_wr_mbox_core(struct adapter *adapter, const void 
*cmd, int size,
u32 mbox_ctl = T4VF_CIM_BASE_ADDR + CIM_VF_EXT_MAILBOX_CTRL;
u32 cmd_op = FW_CMD_OP_G(be32_to_cpu(((struct fw_cmd_hdr *)cmd)->hi));
__be64 cmd_rpl[MBOX_LEN / 8];
+   struct mbox_list entry;
 
/* In T6, mailbox size is changed to 128 bytes to avoid
 * invalidating the entire prefetch buffer.
@@ -156,6 +157,51 @@ int t4vf_wr_mbox_core(struct adapter *adapter, const void 
*cmd, int size,
size > NUM_CIM_VF_MAILBOX_DATA_INSTANCES * 4)
return -EINVAL;
 
+   /* Queue ourselves onto the mailbox access list.  When our entry is at
+* the front of the list, we have rights to access the mailbox.  So we
+* wait [for a while] till we're at the front [or bail out with an
+* EBUSY] ...
+*/
+   spin_lock(&adapter->mbox_lock);
+   list_add_tail(&entry.list, &adapter->mlist.list);
+   spin_unlock(&adapter->mbox_lock);
+
+   delay_idx = 0;
+   ms = delay[0];
+
+   for (i = 0; ; i += ms) {
+   /* If we've waited too long, return a busy indication.  This
+* really ought to be based on our initial position in the
+* mailbox access list but this is a start.  We very rarely
+* contend on access to the mailbox ...
+*/
+   if (i > FW_CMD_MAX_TIMEOUT) {
+   spin_lock(&adapter->mbox_lock);
+   list_del(&entry.list);
+   spin_unlock(&adapter->mbox_lock);
+   ret = -EBUSY;
+   t4vf_record_mbox(adapter, cmd, size, access, ret);
+   return ret;
+   }
+
+   /* If we're at the head, break out and start the mailbox
+* protocol.
+*/
+   if (list_first_entry(>mlist.list, struct 

[net-next] samples/bpf: set max locked memory to ulimited

2016-06-21 Thread William Tu
Signed-off-by: William Tu 
---
 samples/bpf/sockex2_user.c | 3 +++
 samples/bpf/sockex3_user.c | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
index 29a276d..8a4085c 100644
--- a/samples/bpf/sockex2_user.c
+++ b/samples/bpf/sockex2_user.c
@@ -5,6 +5,7 @@
 #include "bpf_load.h"
 #include 
 #include 
+#include <sys/resource.h>
 
 struct pair {
__u64 packets;
@@ -13,11 +14,13 @@ struct pair {
 
 int main(int ac, char **argv)
 {
+   struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
FILE *f;
int i, sock;
 
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+   setrlimit(RLIMIT_MEMLOCK, &r);
 
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 2617772..d4184ab 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -5,6 +5,7 @@
 #include "bpf_load.h"
 #include 
 #include 
+#include <sys/resource.h>
 
 struct flow_keys {
__be32 src;
@@ -23,11 +24,13 @@ struct pair {
 
 int main(int argc, char **argv)
 {
+   struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
FILE *f;
int i, sock;
 
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+   setrlimit(RLIMIT_MEMLOCK, &r);
 
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
-- 
2.5.0



Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-21 Thread David Ahern

On 6/21/16 9:42 PM, Jerry Chu wrote:

Yes TOU may lower the bar for random hacks by Joe Random. But I'd argue
no large organization would seriously consider or dare deploy a TCP stack
with random hacks.


There are userspace network stacks that have been around for years and 
widely deployed on devices that basically use Linux as the boot OS.


Re: [PATCH] ibmvnic: fix to use list_for_each_safe() when delete items

2016-06-21 Thread Wei Yongjun

Hi  Thomas Falcon,

Thanks for finding this. I will send a new patch that includes your changes.

Regards,
Yongjun Wei

On 06/22/2016 12:01 AM, Thomas Falcon wrote:

On 06/20/2016 10:50 AM, Thomas Falcon wrote:

On 06/17/2016 09:53 PM, weiyj...@163.com wrote:

From: Wei Yongjun 

Since we will remove items off the list using list_del() we need
to use a safe version of the list_for_each() macro aptly named
list_for_each_safe().

Signed-off-by: Wei Yongjun 
---
  drivers/net/ethernet/ibm/ibmvnic.c | 10 +-
  1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 864cb21..0b6a922 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -3141,14 +3141,14 @@ static void handle_request_ras_comp_num_rsp(union 
ibmvnic_crq *crq,
  
  static void ibmvnic_free_inflight(struct ibmvnic_adapter *adapter)

  {
-   struct ibmvnic_inflight_cmd *inflight_cmd;
+   struct ibmvnic_inflight_cmd *inflight_cmd, *tmp1;
struct device *dev = &adapter->vdev->dev;
-   struct ibmvnic_error_buff *error_buff;
+   struct ibmvnic_error_buff *error_buff, *tmp2;
unsigned long flags;
unsigned long flags2;
  
  	spin_lock_irqsave(>inflight_lock, flags);

-   list_for_each_entry(inflight_cmd, >inflight, list) {
+   list_for_each_entry_safe(inflight_cmd, tmp1, >inflight, list) {
switch (inflight_cmd->crq.generic.cmd) {
case LOGIN:
dma_unmap_single(dev, adapter->login_buf_token,
@@ -3165,8 +3165,8 @@ static void ibmvnic_free_inflight(struct ibmvnic_adapter 
*adapter)
break;
case REQUEST_ERROR_INFO:
spin_lock_irqsave(>error_list_lock, flags2);
-   list_for_each_entry(error_buff, >errors,
-   list) {
+   list_for_each_entry_safe(error_buff, tmp2,
+>errors, list) {
dma_unmap_single(dev, error_buff->dma,
 error_buff->len,
 DMA_FROM_DEVICE);


Thanks!

Acked-by: Thomas Falcon 

Hello, I apologize for prematurely ack'ing this.  There is another situation 
where you could use list_for_each_entry_safe in the function 
handle_error_info_rsp.  Could you include this in your patch, please?

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 864cb21..e9968d9 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2121,7 +2121,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
   struct ibmvnic_adapter *adapter)
  {
 struct device *dev = >vdev->dev;
-   struct ibmvnic_error_buff *error_buff;
+   struct ibmvnic_error_buff *error_buff, *tmp;
 unsigned long flags;
 bool found = false;
 int i;
@@ -2133,7 +2133,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
 }
  
 spin_lock_irqsave(>error_list_lock, flags);

-   list_for_each_entry(error_buff, >errors, list)
+   list_for_each_entry_safe(error_buff, tmp, >errors, list)
 if (error_buff->error_id == crq->request_error_rsp.error_id) {
 found = true;
                 list_del(&error_buff->list);



___
Linuxppc-dev mailing list
linuxppc-...@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev







Re: [PATCH v10 08/22] IB/hns: Add icm support

2016-06-21 Thread Wei Hu (Xavier)



On 2016/6/21 19:55, Leon Romanovsky wrote:

On Tue, Jun 21, 2016 at 12:37:39PM +0800, Wei Hu (Xavier) wrote:


On 2016/6/20 21:04, Leon Romanovsky wrote:

On Mon, Jun 20, 2016 at 05:48:15PM +0800, Wei Hu (Xavier) wrote:

On 2016/6/20 17:27, Leon Romanovsky wrote:

On Mon, Jun 20, 2016 at 03:49:24PM +0800, Wei Hu (Xavier) wrote:

On 2016/6/20 14:06, Leon Romanovsky wrote:

On Mon, Jun 20, 2016 at 12:37:40PM +0800, Wei Hu (Xavier) wrote:

On 2016/6/17 17:58, Leon Romanovsky wrote:

On Thu, Jun 16, 2016 at 10:35:16PM +0800, Lijun Ou wrote:

This patch mainly added icm support for RoCE. It initializes icm
which manages the relevant memory blocks for RoCE. The data
structures of RoCE will be located in it. For example, CQ table,
QP table and MTPT table and so on.

Signed-off-by: Wei Hu 
Signed-off-by: Nenglong Zhao 
Signed-off-by: Lijun Ou 
---

<...>


+

Another question which you didn't answer [1].

"I wonder if you have the same needs for ICM as it is in mlx4 device.
Do you have firmware?"

[1] http://marc.info/?l=linux-rdma&m=146545553104913&w=2

Hi, Leon
 Now we haven't firmware.
 But hardware still need memory for QPC\CQC\MTPT\mtt etc.

ICM stands for InfiniHost (Interconnect) Context Memory is a specific
memory place to share between host <-> FW and host <-> HW if HW is
aware of specific structures.

I assume that in your case, it is enough to allocate memory region and
supply it to HW. Am I right?

For Our hardware,
1. ICM has a memory management method, It's very good for QPC\CQC\MTPT\mtt
etc. we need it.

You need special HW to leverage it. AFAIK it is Mellanox specific.

For our hardware, we use ICM to memory management, the memory shared with
host and HW.
QPC\CQC\MTPT\mtt has specific memory requirement.
QPC\CQC\MTPT need continuous memory. we use ICM to management the block of
memory. It's very good!

I wasn't convinced why do you need to copy whole ICM logic which is
specific to Mellanox. Your requirements can be implemented by standard CMA
and/or DMA.

Hi, Leon

In hip06 soc,
Hardware need multiple memory blocks for QPC\CQC\MTPT, every block has 
continuous memory xxKbyte (like 128Kbyte),

We need to configure the first address of 128Kbyte to hardware.

For example:
//
example 1:
In create qp,
1. If the xx Kbyte memory that include QPC related with qpn, has not 
been allocated, do step 2.

   else do step 3.
2. dma_alloc xx Kbyte memory for QPC,  and configure the first address 
of xx Kbyte to hardware.

3. find the QPC memory in xx Kbyte, get the dma_addr.
4. send mailbox command to hardware to create QP.

In step 2, we call xx_table_get function as below to perform logic.
int hns_roce_table_get(struct hns_roce_dev *hr_dev,
   struct hns_roce_icm_table *table, unsigned long obj)
{

//dma_alloc_coherent 128Kbyte memory
hns_roce_alloc_icm(hr_dev,
  HNS_ROCE_TABLE_CHUNK_SIZE >> PAGE_SHIFT, );

/*configure the first address of xx Kbyte to hardware*/
hns_roce_map_icm(hr_dev, table, obj);

}

In step 3, we call xx_table_find function to perform logic.
void *hns_roce_table_find(struct hns_roce_icm_table *table, unsigned 
long obj,

  dma_addr_t *dma_handle);


example 2:
In modify qp:
1. find the QPC memory,  get the virtual addr.
2. modify the fields of QPC.
3. send mailbox command to hardware to modify QP.

In step 1,  we call xx_table_find function to perform logic.
//--


so, now we haven't a firmware, but ICM algorithm still suitable for 
hip06 soc perfectly.


Regards
Wei Hu

2. The memory for QPC\CQC\MTPT\mtt only used for RoCE hardware and driver,
we don't want use MR.

I didn't mean Infiniband MR, but memory region returned from standard
allocation functions (kmalloc, ...).


3. Now we haven't firmware, maybe we need it next version.

You are always invited to add support once it will be needed, no need to
add it in advance.

Thanks







Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-21 Thread Jerry Chu
On Tue, Jun 21, 2016 at 1:29 AM, David Miller  wrote:
> From: Tom Herbert 
> Date: Mon, 20 Jun 2016 08:13:48 -0700
>
>> Routing around the problem is already being done.
>
> QUIC, a new protocol used for specific purposes and implemented in
> userspace from the start is significantly different from making the
> kernel's _TCP_ implementation bypassed into a userspace one just by
> UDP encapsulating it.
>
> That is a major and conscious change in our mentality.
>
> The consequences are far and wide, and I'm having a very hard time
> seeing the benefits you cite being larger than the negatives here.

I don't believe TOU will lead to a proliferation of TCP implementations in
the userland - getting a solid TCP implementation is hard. Yes any smart CS
student in the networking field can write one over a weekend, to get 3WHS
to work, and may even include graceful shutdown. But creating one from
scratch that is both high quality, compliant, highly inter-operable, and highly
performing is really hard. Just look at how much work folks on the list have
to continue to pour in to maintain the Linux TCP stack as the best on the
planet.

Yes TOU may lower the bar for random hacks by Joe Random. But I'd argue
no large organization would seriously consider or dare deploy a TCP stack
with random hacks. I know we have a very high bar to pass at Google. This
should limit the impact of bad TCP stacks on the Internet. If we continue
to keep up and make timely improvements to the Linux TCP stack, and
better yet, to continue to improve technology like UML and LKL to make it
easy for folks to access great technologies in the Linux kernel stack and
deploy them in the userland, it will probably take away all the motivations
for people to do their own random hacks.

Best,

Jerry


[PATCH iproute2 1/3] ss: Refactor inet_show_sock

2016-06-21 Thread David Ahern
Extract parsing of sockstat and filter from inet_show_sock.
While moving run_ssfilter into callers of inet_show_sock enable
userspace filtering before the kill.

Signed-off-by: David Ahern 
---
 misc/ss.c | 68 ---
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 02be7e7407df..a22cfebadfa2 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -2038,42 +2038,47 @@ static void tcp_show_info(const struct nlmsghdr *nlh, 
struct inet_diag_msg *r,
}
 }
 
-static int inet_show_sock(struct nlmsghdr *nlh, struct filter *f, int protocol)
+static void parse_diag_msg(struct nlmsghdr *nlh, struct sockstat *s)
 {
struct rtattr *tb[INET_DIAG_MAX+1];
struct inet_diag_msg *r = NLMSG_DATA(nlh);
-   struct sockstat s = {};
 
parse_rtattr(tb, INET_DIAG_MAX, (struct rtattr *)(r+1),
 nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
 
-   s.state = r->idiag_state;
-   s.local.family  = s.remote.family = r->idiag_family;
-   s.lport = ntohs(r->id.idiag_sport);
-   s.rport = ntohs(r->id.idiag_dport);
-   s.wq= r->idiag_wqueue;
-   s.rq= r->idiag_rqueue;
-   s.ino   = r->idiag_inode;
-   s.uid   = r->idiag_uid;
-   s.iface = r->id.idiag_if;
-   s.sk= cookie_sk_get(>id.idiag_cookie[0]);
-
-   if (s.local.family == AF_INET) {
-   s.local.bytelen = s.remote.bytelen = 4;
+   s->state= r->idiag_state;
+   s->local.family = s->remote.family = r->idiag_family;
+   s->lport= ntohs(r->id.idiag_sport);
+   s->rport= ntohs(r->id.idiag_dport);
+   s->wq   = r->idiag_wqueue;
+   s->rq   = r->idiag_rqueue;
+   s->ino  = r->idiag_inode;
+   s->uid  = r->idiag_uid;
+   s->iface= r->id.idiag_if;
+   s->sk   = cookie_sk_get(&r->id.idiag_cookie[0]);
+
+   if (s->local.family == AF_INET) {
+   s->local.bytelen = s->remote.bytelen = 4;
} else {
-   s.local.bytelen = s.remote.bytelen = 16;
+   s->local.bytelen = s->remote.bytelen = 16;
}
 
-   memcpy(s.local.data, r->id.idiag_src, s.local.bytelen);
-   memcpy(s.remote.data, r->id.idiag_dst, s.local.bytelen);
+   memcpy(s->local.data, r->id.idiag_src, s->local.bytelen);
+   memcpy(s->remote.data, r->id.idiag_dst, s->local.bytelen);
+}
 
-   if (f && f->f && run_ssfilter(f->f, ) == 0)
-   return 0;
+static int inet_show_sock(struct nlmsghdr *nlh, struct sockstat *s, int 
protocol)
+{
+   struct rtattr *tb[INET_DIAG_MAX+1];
+   struct inet_diag_msg *r = NLMSG_DATA(nlh);
+
+   parse_rtattr(tb, INET_DIAG_MAX, (struct rtattr *)(r+1),
+nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
 
if (tb[INET_DIAG_PROTOCOL])
protocol = *(__u8 *)RTA_DATA(tb[INET_DIAG_PROTOCOL]);
 
-   inet_stats_print(, protocol);
+   inet_stats_print(s, protocol);
 
if (show_options) {
struct tcpstat t = {};
@@ -2085,8 +2090,8 @@ static int inet_show_sock(struct nlmsghdr *nlh, struct 
filter *f, int protocol)
}
 
if (show_details) {
-   sock_details_print();
-   if (s.local.family == AF_INET6 && tb[INET_DIAG_SKV6ONLY]) {
+   sock_details_print(s);
+   if (s->local.family == AF_INET6 && tb[INET_DIAG_SKV6ONLY]) {
unsigned char v6only;
 
v6only = *(__u8 *)RTA_DATA(tb[INET_DIAG_SKV6ONLY]);
@@ -2268,9 +2273,16 @@ static int show_one_inet_sock(const struct sockaddr_nl 
*addr,
int err;
struct inet_diag_arg *diag_arg = arg;
struct inet_diag_msg *r = NLMSG_DATA(h);
+   struct sockstat s = {};
 
if (!(diag_arg->f->families & (1 << r->idiag_family)))
return 0;
+
+   parse_diag_msg(h, &s);
+
+   if (diag_arg->f->f && run_ssfilter(diag_arg->f->f, &s) == 0)
+   return 0;
+
if (diag_arg->f->kill && kill_inet_sock(h, arg) != 0) {
if (errno == EOPNOTSUPP || errno == ENOENT) {
/* Socket can't be closed, or is already closed. */
@@ -2280,7 +2292,7 @@ static int show_one_inet_sock(const struct sockaddr_nl 
*addr,
return -1;
}
}
-   if ((err = inet_show_sock(h, diag_arg->f, diag_arg->protocol)) < 0)
+   if ((err = inet_show_sock(h, &s, diag_arg->protocol)) < 0)
return err;
 
return 0;
@@ -2345,6 +2357,7 @@ static int tcp_show_netlink_file(struct filter *f)
while (1) {
int status, err;
struct nlmsghdr *h = (struct nlmsghdr *)buf;
+   struct sockstat s = {};
 
status = fread(buf, 1, sizeof(*h), fp);

[PATCH iproute2 0/3] ss: Add support to filter by device

2016-06-21 Thread David Ahern
Add support for specifying device name in the filter to ss.
The kernel does not provide support for iface filtering, so if
the user specifies 'dev == NAME' or 'dev != NAME' all filtering
is done in userspace.

I will send a patch to add support for iface filtering in the kernel,
but the reality is that ss will need to accommodate both (ie., lack of
kernel support) for some time - which this set provides.

David Ahern (3):
  ss: Refactor inet_show_sock
  ss: Allow ssfilter_bytecompile to return 0
  ss: Add support to filter on device

 misc/ss.c   | 152 +---
 misc/ssfilter.h |   2 +
 misc/ssfilter.y |  22 +++-
 3 files changed, 135 insertions(+), 41 deletions(-)

-- 
2.1.4



[PATCH iproute2 3/3] ss: Add support to filter on device

2016-06-21 Thread David Ahern
Add support for device names in the filter. Example:

root@kenny:~# ss -t  'sport == :22 && dev == red'
State  Recv-Q Send-Q Local Address:Port  Peer Address:Port
ESTAB  0  0  10.100.1.2%red:ssh  10.100.1.254:47814
ESTAB  0  0   2100:1::2%red:ssh2100:1::64:49406

Since the kernel does not support iface in the filter, specifying a
device name means all filtering is done in userspace.

Signed-off-by: David Ahern 
---
 misc/ss.c   | 32 
 misc/ssfilter.h |  2 ++
 misc/ssfilter.y | 22 +-
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/misc/ss.c b/misc/ss.c
index 3419a88c33be..6f0ad0295918 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1043,6 +1043,7 @@ static void inet_addr_print(const inet_prefix *a, int 
port, unsigned int ifindex
 struct aafilter {
inet_prefix addr;
int port;
+   unsigned intiface;
struct aafilter *next;
 };
 
@@ -1157,7 +1158,12 @@ static int run_ssfilter(struct ssfilter *f, struct 
sockstat *s)
 
return s->lport <= a->port;
}
+   case SSF_DEVCOND:
+   {
+   struct aafilter *a = (void *)f->pred;
 
+   return s->iface == a->iface;
+   }
/* Yup. It is recursion. Sorry. */
case SSF_AND:
return run_ssfilter(f->pred, s) && run_ssfilter(f->post, s);
@@ -1328,6 +1334,11 @@ static int ssfilter_bytecompile(struct ssfilter *f, char 
**bytecode)
*bytecode = a;
return l1+4;
}
+   case SSF_DEVCOND:
+   {
+   /* bytecompile for SSF_DEVCOND not supported yet */
+return 0;
+   }
default:
abort();
}
@@ -1416,6 +1427,27 @@ static int xll_name_to_index(const char *dev)
return ll_name_to_index(dev);
 }
 
+void *parse_devcond(char *name)
+{
+   struct aafilter a = { .iface = 0 };
+   struct aafilter *res;
+
+   a.iface = xll_name_to_index(name);
+   if (a.iface == 0) {
+   char *end;
+   unsigned long res;
+
+   res = strtoul(name, , 0);
+   if (!end || end == name || *end || res > UINT_MAX)
+   return NULL;
+   }
+
+   res = malloc(sizeof(*res));
+   *res = a;
+
+   return res;
+}
+
 void *parse_hostcond(char *addr, bool is_port)
 {
char *port = NULL;
diff --git a/misc/ssfilter.h b/misc/ssfilter.h
index 53922a844457..c7db8eee9578 100644
--- a/misc/ssfilter.h
+++ b/misc/ssfilter.h
@@ -8,6 +8,7 @@
 #define SSF_S_GE  7
 #define SSF_S_LE  8
 #define SSF_S_AUTO  9
+#define SSF_DEVCOND 10
 
 #include 
 
@@ -20,3 +21,4 @@ struct ssfilter
 
 int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp);
 void *parse_hostcond(char *addr, bool is_port);
+void *parse_devcond(char *name);
diff --git a/misc/ssfilter.y b/misc/ssfilter.y
index a258d04b85d7..14bf9817f2c3 100644
--- a/misc/ssfilter.y
+++ b/misc/ssfilter.y
@@ -36,7 +36,7 @@ static void yyerror(char *s)
 
 %}
 
-%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND
+%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND DEVCOND DEVNAME
 %left '|'
 %left '&'
 %nonassoc '!'
@@ -108,6 +108,14 @@ expr:  DCOND HOSTCOND
 {
$$ = alloc_node(SSF_NOT, alloc_node(SSF_SCOND, $3));
 }
+| DEVNAME '=' DEVCOND
+{
+   $$ = alloc_node(SSF_DEVCOND, $3);
+}
+| DEVNAME NEQ DEVCOND
+{
+   $$ = alloc_node(SSF_NOT, alloc_node(SSF_DEVCOND, $3));
+}
 
 | AUTOBOUND
 {
@@ -237,6 +245,10 @@ int yylex(void)
tok_type = SPORT;
return SPORT;
}
+   if (strcmp(curtok, "dev") == 0) {
+   tok_type = DEVNAME;
+   return DEVNAME;
+   }
if (strcmp(curtok, ">=") == 0 ||
strcmp(curtok, "ge") == 0 ||
strcmp(curtok, "geq") == 0)
@@ -263,6 +275,14 @@ int yylex(void)
tok_type = AUTOBOUND;
return AUTOBOUND;
}
+   if (tok_type == DEVNAME) {
+   yylval = (void*)parse_devcond(curtok);
+   if (yylval == NULL) {
+   fprintf(stderr, "Cannot parse device.\n");
+   exit(1);
+   }
+   return DEVCOND;
+   }
yylval = (void*)parse_hostcond(curtok, tok_type == SPORT || tok_type == 
DPORT);
if (yylval == NULL) {
fprintf(stderr, "Cannot parse dst/src address.\n");
-- 
2.1.4



[PATCH iproute2 2/3] ss: Allow ssfilter_bytecompile to return 0

2016-06-21 Thread David Ahern
Allow ssfilter_bytecompile to return 0 for filter ops the kernel
does not support. If such an op is in the filter string then all
filtering is done in userspace.

Signed-off-by: David Ahern 
---
 misc/ss.c | 52 +---
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index a22cfebadfa2..3419a88c33be 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1273,11 +1273,16 @@ static int ssfilter_bytecompile(struct ssfilter *f, 
char **bytecode)
 
case SSF_AND:
{
-   char *a1, *a2, *a;
+   char *a1 = NULL, *a2 = NULL, *a;
int l1, l2;
 
l1 = ssfilter_bytecompile(f->pred, );
l2 = ssfilter_bytecompile(f->post, );
+   if (!l1 || !l2) {
+   free(a1);
+   free(a2);
+   return 0;
+   }
if (!(a = malloc(l1+l2))) abort();
memcpy(a, a1, l1);
memcpy(a+l1, a2, l2);
@@ -1288,11 +1293,16 @@ static int ssfilter_bytecompile(struct ssfilter *f, 
char **bytecode)
}
case SSF_OR:
{
-   char *a1, *a2, *a;
+   char *a1 = NULL, *a2 = NULL, *a;
int l1, l2;
 
l1 = ssfilter_bytecompile(f->pred, );
l2 = ssfilter_bytecompile(f->post, );
+   if (!l1 || !l2) {
+   free(a1);
+   free(a2);
+   return 0;
+   }
if (!(a = malloc(l1+l2+4))) abort();
memcpy(a, a1, l1);
memcpy(a+l1+4, a2, l2);
@@ -1303,10 +1313,14 @@ static int ssfilter_bytecompile(struct ssfilter *f, 
char **bytecode)
}
case SSF_NOT:
{
-   char *a1, *a;
+   char *a1 = NULL, *a;
int l1;
 
l1 = ssfilter_bytecompile(f->pred, );
+   if (!l1) {
+   free(a1);
+   return 0;
+   }
if (!(a = malloc(l1+4))) abort();
memcpy(a, a1, l1);
free(a1);
@@ -2126,6 +2140,7 @@ static int tcpdiag_send(int fd, int protocol, struct 
filter *f)
struct msghdr msg;
struct rtattr rta;
struct iovec iov[3];
+   int iovlen = 1;
 
if (protocol == IPPROTO_UDP)
return -1;
@@ -2161,18 +2176,21 @@ static int tcpdiag_send(int fd, int protocol, struct 
filter *f)
};
if (f->f) {
bclen = ssfilter_bytecompile(f->f, );
-   rta.rta_type = INET_DIAG_REQ_BYTECODE;
-   rta.rta_len = RTA_LENGTH(bclen);
-   iov[1] = (struct iovec){ , sizeof(rta) };
-   iov[2] = (struct iovec){ bc, bclen };
-   req.nlh.nlmsg_len += RTA_LENGTH(bclen);
+   if (bclen) {
+   rta.rta_type = INET_DIAG_REQ_BYTECODE;
+   rta.rta_len = RTA_LENGTH(bclen);
+   iov[1] = (struct iovec){ , sizeof(rta) };
+   iov[2] = (struct iovec){ bc, bclen };
+   req.nlh.nlmsg_len += RTA_LENGTH(bclen);
+   iovlen = 3;
+   }
}
 
msg = (struct msghdr) {
.msg_name = (void *),
.msg_namelen = sizeof(nladdr),
.msg_iov = iov,
-   .msg_iovlen = f->f ? 3 : 1,
+   .msg_iovlen = iovlen,
};
 
if (sendmsg(fd, , 0) < 0) {
@@ -2193,6 +2211,7 @@ static int sockdiag_send(int family, int fd, int 
protocol, struct filter *f)
struct msghdr msg;
struct rtattr rta;
struct iovec iov[3];
+   int iovlen = 1;
 
if (family == PF_UNSPEC)
return tcpdiag_send(fd, protocol, f);
@@ -2221,18 +2240,21 @@ static int sockdiag_send(int family, int fd, int 
protocol, struct filter *f)
};
if (f->f) {
bclen = ssfilter_bytecompile(f->f, );
-   rta.rta_type = INET_DIAG_REQ_BYTECODE;
-   rta.rta_len = RTA_LENGTH(bclen);
-   iov[1] = (struct iovec){ , sizeof(rta) };
-   iov[2] = (struct iovec){ bc, bclen };
-   req.nlh.nlmsg_len += RTA_LENGTH(bclen);
+   if (bclen) {
+   rta.rta_type = INET_DIAG_REQ_BYTECODE;
+   rta.rta_len = RTA_LENGTH(bclen);
+   iov[1] = (struct iovec){ , sizeof(rta) };
+   iov[2] = (struct iovec){ bc, bclen };
+   req.nlh.nlmsg_len += RTA_LENGTH(bclen);
+   iovlen = 3;
+   }
}
 
msg = (struct msghdr) {
.msg_name = (void *),
.msg_namelen = sizeof(nladdr),
.msg_iov = iov,
-   .msg_iovlen = f->f ? 3 : 1,
+   

[PATCH v2] ibmvnic: fix to use list_for_each_safe() when delete items

2016-06-21 Thread Wei Yongjun

Since we will remove items off the list using list_del() we need
to use a safe version of the list_for_each() macro aptly named
list_for_each_safe().

Signed-off-by: Wei Yongjun 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 864cb21..ecdb685 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2121,7 +2121,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
  struct ibmvnic_adapter *adapter)
 {
struct device *dev = >vdev->dev;
-   struct ibmvnic_error_buff *error_buff;
+   struct ibmvnic_error_buff *error_buff, *tmp;
unsigned long flags;
bool found = false;
int i;
@@ -2133,7 +2133,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
}
 
 	spin_lock_irqsave(>error_list_lock, flags);

-   list_for_each_entry(error_buff, >errors, list)
+   list_for_each_entry_safe(error_buff, tmp, >errors, list)
if (error_buff->error_id == crq->request_error_rsp.error_id) {
found = true;
list_del(_buff->list);
@@ -3141,14 +3141,14 @@ static void handle_request_ras_comp_num_rsp(union 
ibmvnic_crq *crq,
 
 static void ibmvnic_free_inflight(struct ibmvnic_adapter *adapter)

 {
-   struct ibmvnic_inflight_cmd *inflight_cmd;
+   struct ibmvnic_inflight_cmd *inflight_cmd, *tmp1;
struct device *dev = >vdev->dev;
-   struct ibmvnic_error_buff *error_buff;
+   struct ibmvnic_error_buff *error_buff, *tmp2;
unsigned long flags;
unsigned long flags2;
 
 	spin_lock_irqsave(>inflight_lock, flags);

-   list_for_each_entry(inflight_cmd, >inflight, list) {
+   list_for_each_entry_safe(inflight_cmd, tmp1, >inflight, list) {
switch (inflight_cmd->crq.generic.cmd) {
case LOGIN:
dma_unmap_single(dev, adapter->login_buf_token,
@@ -3165,8 +3165,8 @@ static void ibmvnic_free_inflight(struct ibmvnic_adapter 
*adapter)
break;
case REQUEST_ERROR_INFO:
spin_lock_irqsave(>error_list_lock, flags2);
-   list_for_each_entry(error_buff, >errors,
-   list) {
+   list_for_each_entry_safe(error_buff, tmp2,
+>errors, list) {
dma_unmap_single(dev, error_buff->dma,
 error_buff->len,
 DMA_FROM_DEVICE);





Re: [PATCH -next 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-21 Thread kbuild test robot
Hi,

[auto build test ERROR on next-20160621]

url:
https://github.com/0day-ci/linux/commits/Martin-KaFai-Lau/cgroup-bpf-cgroup2-membership-test-on-skb/20160622-082800
config: m68k-sun3_defconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=m68k 

All error/warnings (new ones prefixed by >>):

   kernel/bpf/arraymap.c: In function 'cgroup_fd_array_get_ptr':
>> kernel/bpf/arraymap.c:547:2: error: implicit declaration of function 
>> 'cgroup_get_from_fd' [-Werror=implicit-function-declaration]
 return cgroup_get_from_fd(fd);
 ^
>> kernel/bpf/arraymap.c:547:2: warning: return makes pointer from integer 
>> without a cast
   kernel/bpf/arraymap.c: In function 'cgroup_fd_array_put_ptr':
>> kernel/bpf/arraymap.c:553:2: error: implicit declaration of function 
>> 'cgroup_put' [-Werror=implicit-function-declaration]
 cgroup_put(ptr);
 ^
   cc1: some warnings being treated as errors

vim +/cgroup_get_from_fd +547 kernel/bpf/arraymap.c

   541  late_initcall(register_perf_event_array_map);
   542  
   543  static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
   544   struct file *map_file /* not used 
*/,
   545   int fd)
   546  {
 > 547  return cgroup_get_from_fd(fd);
   548  }
   549  
   550  static void cgroup_fd_array_put_ptr(void *ptr)
   551  {
   552  /* cgroup_put free cgrp after a rcu grace period */
 > 553  cgroup_put(ptr);
   554  }
   555  
   556  static void cgroup_fd_array_free(struct bpf_map *map)

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data


Re: [PATCH -next 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-21 Thread kbuild test robot
Hi,

[auto build test ERROR on next-20160621]

url:
https://github.com/0day-ci/linux/commits/Martin-KaFai-Lau/cgroup-bpf-cgroup2-membership-test-on-skb/20160622-082800
config: sh-titan_defconfig (attached as .config)
compiler: sh4-linux-gnu-gcc (Debian 5.3.1-8) 5.3.1 20160205
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=sh 

All errors (new ones prefixed by >>):

   net/core/filter.c: In function 'bpf_skb_in_cgroup':
>> net/core/filter.c:2049:9: error: implicit declaration of function 
>> 'cgroup_is_descendant' [-Werror=implicit-function-declaration]
 return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), cgrp);
^
   net/core/filter.c:2049:30: error: implicit declaration of function 
'sock_cgroup_ptr' [-Werror=implicit-function-declaration]
 return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), cgrp);
 ^
   cc1: some warnings being treated as errors

vim +/cgroup_is_descendant +2049 net/core/filter.c

  2043  return -E2BIG;
  2044  
  2045  cgrp = READ_ONCE(array->ptrs[i]);
  2046  if (unlikely(!cgrp))
  2047  return -ENOENT;
  2048  
> 2049  return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), 
> cgrp);
  2050  }
  2051  
  2052  static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data


Re: [PATCH net-next 16/19] net: hns: fix bug that alloc skb fail lead to port unavailable

2016-06-21 Thread Yisen Zhuang


在 2016/6/21 21:25, Sergei Shtylyov 写道:
> Hello.
> 
> On 6/21/2016 6:56 AM, Yisen Zhuang wrote:
> 
>> From: Jun He 
>>
>> When hns_nic_poll_rx_skb alloc skb fail, it will break receive cycle and
>> read new fbd_num to start new receive cycle. It recomputes cycle num is
>> fbd_num minus clean_count, actually this cycle num is too big because
>> it drop out receive cycle. It brings about the port unavailable.
>>
>> So we will goto out when alloc skb fail to fix this bug.
>>
>> Signed-off-by: Jun He 
>> Signed-off-by: Ding Tianhong 
>> Signed-off-by: Yisen Zhuang 
>> ---
>>  drivers/net/ethernet/hisilicon/hns/hns_enet.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_enet.c 
>> b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
>> index f49246d..c0ce37b 100644
>> --- a/drivers/net/ethernet/hisilicon/hns/hns_enet.c
>> +++ b/drivers/net/ethernet/hisilicon/hns/hns_enet.c
>> @@ -768,10 +768,10 @@ recv:
>>  clean_count = 0;
>>  }
>>
>> -/* poll one pkg*/
>> +/* poll one pkt*/

I will fix it with a new patch.

Thanks,

Yisen

> 
>How about adding a space before */?
> 
> [...]
> 
> MBR, Sergei
> 
> 
> .
> 



Re: [PATCH net-next 10/19] net: hns: bugfix about pfc pause frame statistics

2016-06-21 Thread Yisen Zhuang


在 2016/6/21 18:32, Andy Shevchenko 写道:
> On Tue, 2016-06-21 at 11:56 +0800, Yisen Zhuang wrote:
>> From: Daode Huang 
>>
>> For SoC hip06, PFC pause handled in dsaf, while hip05 in XGMAC,
>> so change the statistics of pfc pause in dsaf and remove the old
>> pfc pause frame statistics.
>>
> 
> 
>> +static char *hns_dsaf_get_node_stats_strings(char *data, int node,
>> + struct dsaf_device
>> *dsaf_dev)
>>  {
>>  char *buff = data;
>> +int i;
>> +bool is_ver1 = AE_IS_VER1(dsaf_dev->dsaf_ver);
>>  
>>  snprintf(buff, ETH_GSTRING_LEN, "innod%d_pad_drop_pkts",
>> node);
>>  buff = buff + ETH_GSTRING_LEN;
>> @@ -2502,6 +2530,18 @@ static char
>> *hns_dsaf_get_node_stats_strings(char *data, int node)
>>  buff = buff + ETH_GSTRING_LEN;
>>  snprintf(buff, ETH_GSTRING_LEN, "innod%d_stp_drop_pkts",
>> node);
>>  buff = buff + ETH_GSTRING_LEN;
>> +if ((node < DSAF_SERVICE_NW_NUM) && (!is_ver1)) {
> 
> Redundant parens.
> 
>> +for (i = 0; i < DSAF_PRIO_NR; i++) {
>> +snprintf(buff, ETH_GSTRING_LEN,
>> + "inod%d_pfc_prio%d_pkts", node, i);
>> +buff = buff + ETH_GSTRING_LEN;
> 
> buff += ...
> 
>> +}
>> +for (i = 0; i < DSAF_PRIO_NR; i++) {
>> +snprintf(buff, ETH_GSTRING_LEN,
>> + "onod%d_pfc_prio%d_pkts", node, i);
>> +buff = buff + ETH_GSTRING_LEN;
> 
> Ditto.
> 
>>  {
>>  u64 *p = data;
>> +int i;
>>  struct dsaf_hw_stats *hw_stats = >hw_stats[node_num];
>> +bool is_ver1 = AE_IS_VER1(ddev->dsaf_ver);
>>  
>>  p[0] = hw_stats->pad_drop;
>>  p[1] = hw_stats->man_pkts;
>> @@ -2527,8 +2569,16 @@ static u64 *hns_dsaf_get_node_stats(struct
>> dsaf_device *ddev, u64 *data,
>>  p[10] = hw_stats->local_addr_false;
>>  p[11] = hw_stats->vlan_drop;
>>  p[12] = hw_stats->stp_drop;
>> -p[13] = hw_stats->tx_pkts;
>> +if ((node_num < DSAF_SERVICE_NW_NUM) && (!is_ver1)) {
>> +for (i = 0; i < DSAF_PRIO_NR; i++) {
>> +p[13 + i] = hw_stats->rx_pfc[i];
>> +p[13 + i + DSAF_PRIO_NR] = hw_stats-
>>> tx_pfc[i];
>> +}
> 
> Two different approaches how to assign data. Above uses 2 for-loops,
> here you put everything to one.

The above can't be merged into one for-loop, because the length of the string is
unknowable.

And here we put everything into one loop to reduce code.

I will generate a new patch to fix other comments.

Thanks,

Yisen

> 
>> +p[29] = hw_stats->tx_pkts;
>> +return [30];
>> +}
>>  
>> +p[13] = hw_stats->tx_pkts;
>>  return [14];
>>  }
> 



Re: [PATCH net-next 01/19] net: hns: bug fix of ge reset sequence

2016-06-21 Thread Yisen Zhuang


在 2016/6/21 18:35, Andy Shevchenko 写道:
> On Tue, 2016-06-21 at 11:56 +0800, Yisen Zhuang wrote:
>> From: Qianqian Xie 
>>
>> The bit fields of PPE reset register are different between HNS v1 and
>> HNS v2, but the current procedure just only match HNS v1. Here is a
>> patch to fix it.
>>
>> Signed-off-by: Kejian Yan 
>> Signed-off-by: Qianqian Xie 
>> Signed-off-by: Yisen Zhuang 
>> ---
>>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
>> b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
>> index 96cb628..09e60d6 100644
>> --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
>> +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c
>> @@ -271,7 +271,11 @@ static void hns_dsaf_ge_srst_by_port(struct
>> dsaf_device *dsaf_dev, u32 port,
>>  }
>>  } else {
>>  reg_val_1 = 0x15540 << dsaf_dev->reset_offset;
>> -reg_val_2 = 0x100 << dsaf_dev->reset_offset;
>> +
>> +if (AE_IS_VER1(dsaf_dev->dsaf_ver))
>> +reg_val_2 = 0x100 << dsaf_dev->reset_offset;
>> +else
>> +reg_val_2 = 0x40 << dsaf_dev->reset_offset;
> 
> reg_val_1 = 0x15540;
> reg_val_2 = AE_IS_VER1(dsaf_dev->dsaf_ver) ? 0x100 : 0x40;
> 
> reg_val_1 <<= dsaf_dev->reset_offset;
> reg_val_2 <<= dsaf_dev-

I will fix it with a new patch.

Thanks,

Yisen

>> reset_offset;
> 
> 
>>  
>>  if (!dereset) {
>>  dsaf_write_sub(dsaf_dev,
>> DSAF_SUB_SC_GE_RESET_REQ1_REG,
> 



Re: [PATCH -next 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-21 Thread kbuild test robot
Hi,

[auto build test ERROR on next-20160621]

url:
https://github.com/0day-ci/linux/commits/Martin-KaFai-Lau/cgroup-bpf-cgroup2-membership-test-on-skb/20160622-082800
config: i386-randconfig-s1-201625 (attached as .config)
compiler: gcc-6 (Debian 6.1.1-1) 6.1.1 20160430
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   net/core/filter.c: In function 'bpf_skb_in_cgroup':
>> net/core/filter.c:2049:30: error: implicit declaration of function 
>> 'sock_cgroup_ptr' [-Werror=implicit-function-declaration]
 return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), cgrp);
 ^~~
>> net/core/filter.c:2049:30: warning: passing argument 1 of 
>> 'cgroup_is_descendant' makes pointer from integer without a cast 
>> [-Wint-conversion]
   In file included from include/net/netprio_cgroup.h:17:0,
from include/linux/netdevice.h:48,
from net/core/filter.c:31:
   include/linux/cgroup.h:492:20: note: expected 'struct cgroup *' but argument 
is of type 'int'
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
   ^~~~
   cc1: some warnings being treated as errors

vim +/sock_cgroup_ptr +2049 net/core/filter.c

  2043  return -E2BIG;
  2044  
  2045  cgrp = READ_ONCE(array->ptrs[i]);
  2046  if (unlikely(!cgrp))
  2047  return -ENOENT;
  2048  
> 2049  return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), 
> cgrp);
  2050  }
  2051  
  2052  static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data


Re: [PATCH -next 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-21 Thread Alexei Starovoitov
On Tue, Jun 21, 2016 at 05:23:21PM -0700, Martin KaFai Lau wrote:
> Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk
> belongs to a descendant of a cgroup2.  It is similar to the
> feature added in netfilter:
> commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")
> 
> The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY
> which will be used by the bpf_skb_in_cgroup.
> 
> Modifications to the bpf verifier are to ensure BPF_MAP_TYPE_CGROUP_ARRAY
> and bpf_skb_in_cgroup() are always used together.
> 
> Signed-off-by: Martin KaFai Lau 
> Cc: Alexei Starovoitov 
> Cc: Daniel Borkmann 
> Cc: Tejun Heo 
> ---
>  include/uapi/linux/bpf.h |  1 +
>  kernel/bpf/verifier.c|  8 
>  net/core/filter.c| 36 
>  3 files changed, 45 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index ef4e386..a91714bd 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -314,6 +314,7 @@ enum bpf_func_id {
>*/
>   BPF_FUNC_skb_get_tunnel_opt,
>   BPF_FUNC_skb_set_tunnel_opt,
> + BPF_FUNC_skb_in_cgroup,
...
> +static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
> +{
...
> + if (unlikely(!cgrp))
> + return -ENOENT;
> +
> + return cgroup_is_descendant(sock_cgroup_ptr(>sk_cgrp_data), cgrp);

if you'd need to respin the patch for other reasons please add kdoc
to bpf.h for this new helper similar to other helpers.
To say that 0 or 1 return values is indication of cg2 descendant relation
and < 0 in case of error.

Acked-by: Alexei Starovoitov 



Re: [PATCH -next 4/4] cgroup: bpf: Add an example to do cgroup checking in BPF

2016-06-21 Thread Alexei Starovoitov
On Tue, Jun 21, 2016 at 05:23:22PM -0700, Martin KaFai Lau wrote:
> test_cgrp2_array_pin.c:
> A userland program that creates a bpf_map (BPF_MAP_TYPE_CGROUP_ARRAY),
> populates/updates it with a cgroup2's backed fd and pins it to a
> bpf-fs's file.  The pinned file can be loaded by tc and then used
> by the bpf prog later.  This program can also update an existing pinned
> array and it could be useful for debugging/testing purpose.
> 
> test_cgrp2_tc_kern.c:
> A bpf prog which should be loaded by tc.  It is to demonstrate
> the usage of bpf_skb_in_cgroup.
> 
> test_cgrp2_tc.sh:
> A script that glues the test_cgrp2_array_pin.c and
> test_cgrp2_tc_kern.c together.  The idea is like:
> 1. Use test_cgrp2_array_pin.c to populate a BPF_MAP_TYPE_CGROUP_ARRAY
>with a cgroup fd
> 2. Load the test_cgrp2_tc_kern.o by tc
> 3. Do a 'ping -6 ff02::1%ve' to ensure the packet has been
>dropped because of a match on the cgroup
> 
> Most of the lines in test_cgrp2_tc.sh are the boilerplate
> to setup the cgroup/bpf-fs/net-devices/netns...etc.  It is
> not bulletproof on errors but should work well enough and
> give enough debug info if things did not go well.
> 
> Signed-off-by: Martin KaFai Lau 
> Cc: Alexei Starovoitov 
> Cc: Daniel Borkmann 
> Cc: Tejun Heo 
> ---
>  samples/bpf/Makefile   |   3 +
>  samples/bpf/bpf_helpers.h  |   2 +
>  samples/bpf/test_cgrp2_array_pin.c | 109 +
>  samples/bpf/test_cgrp2_tc.sh   | 189 
> +
>  samples/bpf/test_cgrp2_tc_kern.c   |  71 ++
>  5 files changed, 374 insertions(+)
...
> +struct bpf_elf_map SEC("maps") test_cgrp2_array_pin = {
> + .type   = BPF_MAP_TYPE_CGROUP_ARRAY,
> + .size_key   = sizeof(uint32_t),
> + .size_value = sizeof(uint32_t),
> + .pinning= PIN_GLOBAL_NS,
> + .max_elem   = 1,
> +};
> +
> +SEC("filter")
> +int handle_egress(struct __sk_buff *skb)
> +{
> + void *data = (void *)(long)skb->data;
> + struct eth_hdr *eth = data;
> + struct ipv6hdr *ip6h = data + sizeof(*eth);
> + void *data_end = (void *)(long)skb->data_end;
> + char dont_care_msg[] = "dont care %04x %d\n";
> + char pass_msg[] = "pass\n";
> + char reject_msg[] = "reject\n";
> +
> + /* single length check */
> + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end)
> + return TC_ACT_OK;

love the test case.
It's using tc + clsact + cls_bpf in da mode + bpffs + direct packet access
and new cgroup helper.
All the most recent features I can think of :)

Acked-by: Alexei Starovoitov 



Re: [PATCH -next 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-21 Thread Alexei Starovoitov
On Tue, Jun 21, 2016 at 05:23:20PM -0700, Martin KaFai Lau wrote:
> Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations.
> To update an element, the caller is expected to obtain a cgroup2 backed
> fd by open(cgroup2_dir) and then update the array with that fd.
> 
> Signed-off-by: Martin KaFai Lau 
> Cc: Alexei Starovoitov 
> Cc: Daniel Borkmann 
> Cc: Tejun Heo 

Acked-by: Alexei Starovoitov 


[PATCH iproute2] man: ip-link: Add vrf type

2016-06-21 Thread David Ahern
Add description for vrf type to ip-link man page.

Signed-off-by: David Ahern 
---
 man/man8/ip-link.8.in | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index d5673639d9dd..97042beaf4cc 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -65,7 +65,8 @@ ip-link \- network device configuration
 .BR nlmon " |"
 .BR ipvlan " |"
 .BR lowpan " |"
-.BR geneve " ]"
+.BR geneve " |"
+.BR vrf " ]"
 
 .ti -8
 .BR "ip link delete " {
@@ -263,6 +264,9 @@ specifies the type of the new device.
 .sp
 .BR macsec
 - Interface for IEEE 802.1AE MAC Security (MACsec)
+.sp
+.BR vrf
+- Interface for L3 VRF domains
 .in -8
 
 .TP
@@ -966,6 +970,20 @@ For a link of type
 
 .in -8
 
+.TP
+VRF Type Support
+For a link of type
+.I VRF
+the following additional arguments are supported:
+
+.BI "ip link add " DEVICE " type vrf table " TABLE
+
+.in +8
+.sp
+.BR table " table id associated with VRF device"
+
+.in -8
+
 .SS ip link delete - delete virtual link
 
 .TP
-- 
2.1.4



Re: 802.3ad bonding aggregator reselection

2016-06-21 Thread Jay Vosburgh

Veli-Matti Lintu  wrote:
[...]
>>>The ports are configured in switch settings (HP Procurve 2530-48G) in
>>>same trunk group (TrkX) and trunk group type is set as LACP.
>>>/proc/net/bonding/bond0 also shows that the three ports belong to same
>>>aggregator and bandwidth tests also support this. In my understanding
>>>Procurve's trunk group is pretty much the same as etherchannel in
>>>Cisco's terminology. The bonded link comes always up properly, but
>>>handling of links going down is the problem. Are there known
>>>differences between different vendors there?
>>
>> I did the original LACP reselection testing on a Cisco switch,
>> but I have an HP 2530 now; I'll test it later today or tomorrow and see
>> if it behaves properly, and whether your proposed patch is needed.
>
>Thanks for taking a look at this. Here are some more details about the
>setup as Zhu Yanjun also requested.

Summary (because anything involving a standard tends to get long
winded):

This is not a switch problem.  Bonding appears to be following
the standard in this case.  I've identified when this behavior changed,
and I think we should violate the standard in this case for ad_select
set to "bandwidth" or "count," neither of which is the default value.

Long winded version:

I've reproduced the issue locally, and it does not appear to be
anything particular to the switch.  It appears to be due to changes from

commit 7bb11dc9f59ddcb33ee317da77b235235aaa582a
Author: Mahesh Bandewar 
Date:   Sat Oct 31 12:45:06 2015 -0700

bonding: unify all places where actor-oper key needs to be updated.

Specifically this block:

 void bond_3ad_handle_link_change(struct slave *slave, char link)
[...]
-   /* there is no need to reselect a new aggregator, just signal the
-* state machines to reinitialize
-*/
-   port->sm_vars |= AD_PORT_BEGIN;

Previously, setting BEGIN would cause the port in question to be
reinitialized, which in turn would trigger reselection.

I'm not sure that adding this section back is the correct fix
from the point of view of the standard, however, as 802.1AX 5.2.3.1.2
defines BEGIN as:

A Boolean variable that is set to TRUE when the System is
initialized or reinitialized, and is set to FALSE when
(re-)initialization has completed.

and in this case we're not reinitializing the System (i.e., the
bond).

Further, 802.1AX 5.4.12 says:

If the port becomes inoperable and a BEGIN event has not
occurred, the state machine enters the PORT_DISABLED
state. Partner_Oper_Port_State.Synchronization is set to
FALSE. This state allows the current Selection state to remain
undisturbed, so that, in the event that the port is still
connected to the same Partner and Partner port when it becomes
operable again, there will be no disturbance caused to higher
layers by unnecessary re-configuration.

At the moment, bonding is doing what 5.4.12 specifies, by
placing the port into PORT_DISABLED state.  bond_3ad_handle_link_change
clears port->is_enabled, which causes ad_rx_machine to clear
AD_PORT_MATCHED but leave AD_PORT_SELECTED set.  This in turn causes the
selection logic to skip this port, resulting in the observed behavior
(that the port is link down, but stays in the aggregator).

Bonding will still remove the slave from the bond->slave_arr, so
it won't actually try to send on this slave.  I'll further note that
802.1AX 5.4.7 defines port_enabled as:

A variable indicating that the physical layer has indicated that
the link has been established and the port is operable.
Value: Boolean
TRUE if the physical layer has indicated that the port is operable.
FALSE otherwise.

So, it appears that bonding is in conformance with the standard
in this case.

I don't see an issue with the above behavior when ad_select is
set to the default value of "stable"; bonding does reselect a new
aggregator when all links fail, and it appears to follow the standard.

I think a reasonable compromise here is to utilize a modified
version of your patch that clears SELECTED (to trigger reselection) when
a link goes down, but only if ad_select is not "stable", for example:

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index b9304a295f86..1ee5a3a5e658 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2458,6 +2458,8 @@ void bond_3ad_handle_link_change(struct slave *slave, 
char link)
/* link has failed */
port->is_enabled = false;
ad_update_actor_keys(port, true);
+   if (__get_agg_selection_mode(port) != BOND_AD_STABLE)
+   port->sm_vars &= ~AD_PORT_SELECTED;
}

Re: [ovs-dev] [RFC PATCH net v2 2/2] openvswitch: Only set mark and labels with a commit flag.

2016-06-21 Thread Joe Stringer
On 21 June 2016 at 14:59, Jarno Rajahalme  wrote:
> Only set conntrack mark or labels when the commit flag is specified.
> This makes sure we can not set them before the connection has been
> persisted, as in that case the mark and labels would be lost in an
> event of an userspace upcall.
>
> OVS userspace already requires the commit flag to accept setting
> ct_mark and/or ct_labels.  Validate for this in the kernel API.
>
> Signed-off-by: Jarno Rajahalme 

As this is walling off an inconsistent corner of the ct action, and
OVS userspace already enforces this constraint, this looks OK to me.


[PATCH -next 0/4] cgroup: bpf: cgroup2 membership test on skb

2016-06-21 Thread Martin KaFai Lau
This series is to implement a bpf-way to
check the cgroup2 membership of a skb (sk_buff).

It is similar to the feature added in netfilter:
c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")

The current target is the tc-like usage.



[PATCH -next 2/4] cgroup: bpf: Add BPF_MAP_TYPE_CGROUP_ARRAY

2016-06-21 Thread Martin KaFai Lau
Add a BPF_MAP_TYPE_CGROUP_ARRAY and its bpf_map_ops's implementations.
To update an element, the caller is expected to obtain a cgroup2 backed
fd by open(cgroup2_dir) and then update the array with that fd.

Signed-off-by: Martin KaFai Lau 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Tejun Heo 
---
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/arraymap.c| 41 +
 kernel/bpf/syscall.c |  3 ++-
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 406459b..ef4e386 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -84,6 +84,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_HASH,
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
+   BPF_MAP_TYPE_CGROUP_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5af3073..5e279ec 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -539,3 +539,44 @@ static int __init register_perf_event_array_map(void)
return 0;
 }
 late_initcall(register_perf_event_array_map);
+
+static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
+struct file *map_file /* not used */,
+int fd)
+{
+   return cgroup_get_from_fd(fd);
+}
+
+static void cgroup_fd_array_put_ptr(void *ptr)
+{
+   /* cgroup_put free cgrp after a rcu grace period */
+   cgroup_put(ptr);
+}
+
+static void cgroup_fd_array_free(struct bpf_map *map)
+{
+   bpf_fd_array_map_clear(map);
+   fd_array_map_free(map);
+}
+
+static const struct bpf_map_ops cgroup_array_ops = {
+   .map_alloc = fd_array_map_alloc,
+   .map_free = cgroup_fd_array_free,
+   .map_get_next_key = array_map_get_next_key,
+   .map_lookup_elem = fd_array_map_lookup_elem,
+   .map_delete_elem = fd_array_map_delete_elem,
+   .map_fd_get_ptr = cgroup_fd_array_get_ptr,
+   .map_fd_put_ptr = cgroup_fd_array_put_ptr,
+};
+
+static struct bpf_map_type_list cgroup_array_type __read_mostly = {
+   .ops = &cgroup_array_ops,
+   .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+};
+
+static int __init register_cgroup_array_map(void)
+{
+   bpf_register_map_type(&cgroup_array_type);
+   return 0;
+}
+late_initcall(register_cgroup_array_map);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c23a4e93..cac13f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -393,7 +393,8 @@ static int map_update_elem(union bpf_attr *attr)
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY ||
-  map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
+  map->map_type == BPF_MAP_TYPE_PROG_ARRAY ||
+  map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
   attr->flags);
-- 
2.5.1



[PATCH -next 3/4] cgroup: bpf: Add bpf_skb_in_cgroup_proto

2016-06-21 Thread Martin KaFai Lau
Adds a bpf helper, bpf_skb_in_cgroup, to decide if a skb->sk
belongs to a descendant of a cgroup2.  It is similar to the
feature added in netfilter:
commit c38c4597e4bf ("netfilter: implement xt_cgroup cgroup2 path match")

The user is expected to populate a BPF_MAP_TYPE_CGROUP_ARRAY
which will be used by the bpf_skb_in_cgroup.

Modifications to the bpf verifier are to ensure BPF_MAP_TYPE_CGROUP_ARRAY
and bpf_skb_in_cgroup() are always used together.

Signed-off-by: Martin KaFai Lau 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Tejun Heo 
---
 include/uapi/linux/bpf.h |  1 +
 kernel/bpf/verifier.c|  8 
 net/core/filter.c| 36 
 3 files changed, 45 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef4e386..a91714bd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -314,6 +314,7 @@ enum bpf_func_id {
 */
BPF_FUNC_skb_get_tunnel_opt,
BPF_FUNC_skb_set_tunnel_opt,
+   BPF_FUNC_skb_in_cgroup,
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 668e079..68753e0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1062,6 +1062,10 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
if (func_id != BPF_FUNC_get_stackid)
goto error;
break;
+   case BPF_MAP_TYPE_CGROUP_ARRAY:
+   if (func_id != BPF_FUNC_skb_in_cgroup)
+   goto error;
+   break;
default:
break;
}
@@ -1081,6 +1085,10 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_skb_in_cgroup:
+   if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
+   goto error;
+   break;
default:
break;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index df6860c..410da89 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2024,6 +2024,40 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
}
 }
 
+static u64 bpf_skb_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct sk_buff *skb = (struct sk_buff *)(long)r1;
+   struct bpf_map *map = (struct bpf_map *)(long)r2;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+   u32 i = (u32)r3;
+   struct cgroup *cgrp;
+   struct sock *sk;
+
+   WARN_ON_ONCE(!rcu_read_lock_held());
+
+   sk = skb->sk;
+   if (!sk || !sk_fullsock(sk))
+   return -ENOENT;
+
+   if (unlikely(i >= array->map.max_entries))
+   return -E2BIG;
+
+   cgrp = READ_ONCE(array->ptrs[i]);
+   if (unlikely(!cgrp))
+   return -ENOENT;
+
+   return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), cgrp);
+}
+
+static const struct bpf_func_proto bpf_skb_in_cgroup_proto = {
+   .func   = bpf_skb_in_cgroup,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_CONST_MAP_PTR,
+   .arg3_type  = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
@@ -2086,6 +2120,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
return &bpf_get_route_realm_proto;
case BPF_FUNC_perf_event_output:
return bpf_get_event_output_proto();
+   case BPF_FUNC_skb_in_cgroup:
+   return &bpf_skb_in_cgroup_proto;
default:
return sk_filter_func_proto(func_id);
}
-- 
2.5.1



[PATCH -next 1/4] cgroup: Add cgroup_get_from_fd

2016-06-21 Thread Martin KaFai Lau
Add a helper function to get a cgroup2 from a fd.  It will be
stored in a bpf array (BPF_MAP_TYPE_CGROUP_ARRAY) which will
be introduced in the later patch.

Signed-off-by: Martin KaFai Lau 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Tejun Heo 
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c| 26 ++
 2 files changed, 27 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a20320c..984f73b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -87,6 +87,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct 
dentry *dentry,
   struct cgroup_subsys 
*ss);
 
 struct cgroup *cgroup_get_from_path(const char *path);
+struct cgroup *cgroup_get_from_fd(int fd);
 
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 86cb5c6..616c751 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <linux/file.h>
 #include 
 
 /*
@@ -6205,6 +6206,31 @@ struct cgroup *cgroup_get_from_path(const char *path)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+   struct cgroup_subsys_state *css;
+   struct cgroup *cgrp;
+   struct file *f;
+
+   f = fget_raw(fd);
+   if (!f)
+   return NULL;
+
+   css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+   fput(f);
+   if (IS_ERR(css))
+   return ERR_CAST(css);
+
+   cgrp = css->cgroup;
+   if (!cgroup_on_dfl(cgrp)) {
+   cgroup_put(cgrp);
+   return ERR_PTR(-EINVAL);
+   }
+
+   return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
-- 
2.5.1



[PATCH -next 4/4] cgroup: bpf: Add an example to do cgroup checking in BPF

2016-06-21 Thread Martin KaFai Lau
test_cgrp2_array_pin.c:
A userland program that creates a bpf_map (BPF_MAP_TYPE_CGROUP_ARRAY),
populates/updates it with a cgroup2-backed fd and pins it to a
bpf-fs's file.  The pinned file can be loaded by tc and then used
by the bpf prog later.  This program can also update an existing pinned
array and it could be useful for debugging/testing purpose.

test_cgrp2_tc_kern.c:
A bpf prog which should be loaded by tc.  It is to demonstrate
the usage of bpf_skb_in_cgroup.

test_cgrp2_tc.sh:
A script that glues the test_cgrp2_array_pin.c and
test_cgrp2_tc_kern.c together.  The idea is like:
1. Use test_cgrp2_array_pin.c to populate a BPF_MAP_TYPE_CGROUP_ARRAY
   with a cgroup fd
2. Load the test_cgrp2_tc_kern.o by tc
3. Do a 'ping -6 ff02::1%ve' to ensure the packet has been
   dropped because of a match on the cgroup

Most of the lines in test_cgrp2_tc.sh are boilerplate
to setup the cgroup/bpf-fs/net-devices/netns...etc.  It is
not bulletproof on errors but should work well enough and
give enough debug info if things did not go well.

Signed-off-by: Martin KaFai Lau 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Tejun Heo 
---
 samples/bpf/Makefile   |   3 +
 samples/bpf/bpf_helpers.h  |   2 +
 samples/bpf/test_cgrp2_array_pin.c | 109 +
 samples/bpf/test_cgrp2_tc.sh   | 189 +
 samples/bpf/test_cgrp2_tc_kern.c   |  71 ++
 5 files changed, 374 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_array_pin.c
 create mode 100755 samples/bpf/test_cgrp2_tc.sh
 create mode 100644 samples/bpf/test_cgrp2_tc_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0bf2478..a98b780 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -20,6 +20,7 @@ hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
+hostprogs-y += test_cgrp2_array_pin
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -40,6 +41,7 @@ offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
+test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -61,6 +63,7 @@ always += map_perf_test_kern.o
 always += test_overhead_tp_kern.o
 always += test_overhead_kprobe_kern.o
 always += parse_varlen.o parse_simple.o parse_ldabs.o
+always += test_cgrp2_tc_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 7904a2a..84e3fd9 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -70,6 +70,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int 
from, int to, int flag
(void *) BPF_FUNC_l3_csum_replace;
 static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int 
flags) =
(void *) BPF_FUNC_l4_csum_replace;
+static int (*bpf_skb_in_cgroup)(void *ctx, void *map, int index) =
+   (void *) BPF_FUNC_skb_in_cgroup;
 
 #if defined(__x86_64__)
 
diff --git a/samples/bpf/test_cgrp2_array_pin.c 
b/samples/bpf/test_cgrp2_array_pin.c
new file mode 100644
index 000..70e86f7
--- /dev/null
+++ b/samples/bpf/test_cgrp2_array_pin.c
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+
+static void usage(void)
+{
+   printf("Usage: test_cgrp2_array_pin [...]\n");
+   printf("   -F <file>   File to pin an BPF cgroup array\n");
+   printf("   -U <file>   Update an already pinned BPF cgroup array\n");
+   printf("   -v <value>  Full path of the cgroup2\n");
+   printf("   -h          Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+   const char *pinned_file = NULL, *cg2 = NULL;
+   int create_array = 1;
+   int array_key = 0;
+   int array_fd = -1;
+   int cg2_fd = -1;
+   int ret = -1;
+   int opt;
+
+   while ((opt = getopt(argc, argv, "F:U:v:")) != -1) {
+   switch (opt) {
+   /* General args */
+   case 'F':
+   pinned_file = optarg;
+   break;
+   case 'U':
+   pinned_file = optarg;
+   create_array = 0;
+   break;
+   case 'v':
+   cg2 = optarg;
+   break;
+   default:
+   usage();
+

Re: [ovs-dev] [PATCH net v2 1/2] openvswitch: Set mark and labels before confirming.

2016-06-21 Thread Joe Stringer
On 21 June 2016 at 14:59, Jarno Rajahalme  wrote:
> Set conntrack mark and labels right before committing so that
> the initial conntrack NEW event has the mark and labels.
>
> Signed-off-by: Jarno Rajahalme 

Acked-by: Joe Stringer 


Re: r8169 regression: UDP packets dropped intermittantly

2016-06-21 Thread Jonathan Woithe
On Wed, Jun 22, 2016 at 01:09:57AM +0200, Francois Romieu wrote:
> Jonathan Woithe  :
> [...]
> > Is there any chance that this regression can be resolved?  It's been 6
> > months since the last contact was received from the list in relation to this
> > issue.  If the r8169 driver is to remain broken with respect to UDP traffic
> > then we will have no choice but to factor in a change in our standard
> > hardware for future systems.  Unfortunately this also means that dozens of
> > systems in the field cannot be upgraded to recent kernels since doing so
> > will trigger the regression.[1]
> 
> If I understood correctly (2015/11/21) you had a working system with a stock
> 4.2 or 4.3 kernel and the r8169.c from 
> 1e874e041fc7c222cbd85b20c4406070be1f687a
> (i.e. da78dbff2e05630921c551dbbc70a4b7981a8fff "r8169: remove work from irq
> handler." parent) patched with the snippet below, right ?

Thanks for your response.

You are correct: that combination worked just fine.  From the response given
at the time it seemed that further follow-up work was on the way, so I was
puzzled when this didn't eventuate.

> If so, while not perfect, it should at least mitigate the "can't upgrade
> kernel" part.

Yes, it does do that.  I was under the impression that mainline could be
subsequently patched to fix the regression based on understandings gained
from the outcome of this workaround, but it seems that hasn't been possible
for some reason.  We would obviously prefer to ship unpatched kernels with
our systems since, among other things, it means we don't need to maintain an
out-of-tree patch (something which will only get more complex as internal
APIs evolve).  In addition, this approach does peg the r8169 driver in the
resulting system to 1e874e041fc7c222cbd85b20c4406070be1f68, which could have
implications for future debugging, other bug fixes and the like which come
later.

However, you are right in that with the cited patch we have a way to migrate
to something based on either 4.2 or 4.3 (at this point I haven't tested the
workaround combination with anything later).

> > If the decision has been made to leave the regression unfixed, please
> > let me know.
> 
> No such decision that I know of.

Does this mean that a patch to fix the regression will eventually be applied
to mainline (in which case I'll keep watching out for it)?  Or is the
out-of-tree workaround mentioned above considered to be the long term
fix for those who encounter the problem?

Regards
  jonathan


Re: [PATCH net-next] tcp: reduce cpu usage when SO_SNDBUF is set

2016-06-21 Thread Eric Dumazet
On Tue, 2016-06-21 at 11:24 -0400, Jason Baron wrote:

> in tcp_check_space() with something like:
> 
> sk->sk_flags &= ~((1UL << SOCK_QUEUE_SHRUNK) | (1UL << SOCK_SHORT_WRITE));
> 
> Since we are already writing to sk_flags there this should have very
> minimal overhead. And then remove the clear in sk_stream_write_space().

Interesting. You added in 3c7151275c0c9 a smp_mb__after_atomic()
in tcp_check_space(), but there is no atomic operation to begin with ;)





[PATCH iproute2] Enable use of extra debugging information

2016-06-21 Thread David Ahern
Add -g flag to builds if DEBUG parameter is set. Improves
debugging with gdb.

Signed-off-by: David Ahern 
---
 Makefile | 4 
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index 15c81ecfdca3..8e006759079d 100644
--- a/Makefile
+++ b/Makefile
@@ -39,7 +39,11 @@ HOSTCC = gcc
 DEFINES += -D_GNU_SOURCE
 # Turn on transparent support for LFS
 DEFINES += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE
+ifdef DEBUG
+CCOPTS = -g
+else
 CCOPTS = -O2
+endif
 WFLAGS := -Wall -Wstrict-prototypes  -Wmissing-prototypes
 WFLAGS += -Wmissing-declarations -Wold-style-definition -Wformat=2
 
-- 
2.1.4



[PATCH V2 2/2] ath6kl: replace semaphore with mutex

2016-06-21 Thread Chaehyun Lim
It replaces struct semaphore sem with struct mutex mutex

Reported-by: kbuild test robot 
Signed-off-by: Chaehyun Lim 
---
V2: fix build failure reported by kbuild test robot

 drivers/net/wireless/ath/ath6kl/cfg80211.c | 30 +++---
 drivers/net/wireless/ath/ath6kl/core.c |  2 +-
 drivers/net/wireless/ath/ath6kl/core.h |  2 +-
 drivers/net/wireless/ath/ath6kl/debug.c| 12 ++--
 drivers/net/wireless/ath/ath6kl/init.c |  6 +++---
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c 
b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index f6b5390..d4eb066 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -480,14 +480,14 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
return -EINVAL;
}
 
-   if (down_interruptible(&ar->sem)) {
+   if (mutex_lock_interruptible(&ar->mutex)) {
ath6kl_err("busy, couldn't get access\n");
return -ERESTARTSYS;
}
 
if (test_bit(DESTROY_IN_PROGRESS, &ar->flag)) {
ath6kl_err("busy, destroy in progress\n");
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return -EBUSY;
}
 
@@ -500,14 +500,14 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
 WMI_TIMEOUT);
if (signal_pending(current)) {
ath6kl_err("cmd queue drain timeout\n");
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return -EINTR;
}
}
 
status = ath6kl_set_assoc_req_ies(vif, sme->ie, sme->ie_len);
if (status) {
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return status;
}
 
@@ -522,7 +522,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
  vif->req_bssid,
  vif->ch_hint);
 
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
if (status) {
ath6kl_err("wmi_reconnect_cmd failed\n");
return -EIO;
@@ -548,7 +548,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
 
status = ath6kl_set_auth_type(vif, sme->auth_type);
if (status) {
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return status;
}
 
@@ -570,7 +570,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
if (sme->key_idx > WMI_MAX_KEY_INDEX) {
ath6kl_err("key index %d out of bounds\n",
   sme->key_idx);
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return -ENOENT;
}
 
@@ -594,7 +594,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
if (ath6kl_wmi_bssfilter_cmd(ar->wmi, vif->fw_vif_idx,
 ALL_BSS_FILTER, 0) != 0) {
ath6kl_err("couldn't set bss filtering\n");
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return -EIO;
}
}
@@ -626,7 +626,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
   0);
if (status) {
ath6kl_err("couldn't set listen intervel\n");
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
return status;
}
}
@@ -651,7 +651,7 @@ static int ath6kl_cfg80211_connect(struct wiphy *wiphy, 
struct net_device *dev,
ath6kl_wmi_scanparams_cmd(ar->wmi, vif->fw_vif_idx, 0, 0,
  sme->bg_scan_period, 0, 0, 0, 3, 0, 0, 0);
 
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
 
if (status == -EINVAL) {
memset(vif->ssid, 0, sizeof(vif->ssid));
@@ -832,7 +832,7 @@ static int ath6kl_cfg80211_disconnect(struct wiphy *wiphy,
return -EBUSY;
}
 
-   if (down_interruptible(&ar->sem)) {
+   if (mutex_lock_interruptible(&ar->mutex)) {
ath6kl_err("busy, couldn't get access\n");
return -ERESTARTSYS;
}
@@ -845,7 +845,7 @@ static int ath6kl_cfg80211_disconnect(struct wiphy *wiphy,
if (!test_bit(SKIP_SCAN, &ar->flag))
memset(vif->req_bssid, 0, sizeof(vif->req_bssid));
 
-   up(&ar->sem);
+   mutex_unlock(&ar->mutex);
 
vif->sme_state = SME_DISCONNECTED;
 
@@ -1775,7 +1775,7 @@ 

Re: r8169 regression: UDP packets dropped intermittantly

2016-06-21 Thread Francois Romieu
Jonathan Woithe  :
[...]
> Is there any chance that this regression can be resolved?  It's been 6
> months since the last contact was received from the list in relation to this
> issue.  If the r8169 driver is to remain broken with respect to UDP traffic
> then we will have no choice but to factor in a change in our standard
> hardware for future systems.  Unfortunately this also means that dozens of
> systems in the field cannot be upgraded to recent kernels since doing so
> will trigger the regression.[1]

If I understood correctly (2015/11/21) you had a working system with a stock
4.2 or 4.3 kernel and the r8169.c from 1e874e041fc7c222cbd85b20c4406070be1f687a
(i.e. da78dbff2e05630921c551dbbc70a4b7981a8fff "r8169: remove work from irq
handler." parent) patched with the snippet below, right ?

If so, while not perfect, it should at least mitigate the "can't upgrade
kernel" part.

--- r8169.c 2015-11-21 23:02:10.435275753 +0100
+++ r8169.c 2015-11-21 23:21:49.429554012 +0100
@@ -29,7 +29,6 @@
 #include 
 #include 
 
-#include 
 #include 
 #include 
 
@@ -1616,7 +1615,7 @@ static int rtl8169_set_features(struct n
else
tp->cp_cmd &= ~RxChkSum;
 
-   if (dev->features & NETIF_F_HW_VLAN_RX)
+   if (dev->features & NETIF_F_HW_VLAN_CTAG_RX)
tp->cp_cmd |= RxVlan;
else
tp->cp_cmd &= ~RxVlan;
@@ -1632,8 +1631,8 @@ static int rtl8169_set_features(struct n
 static inline u32 rtl8169_tx_vlan_tag(struct rtl8169_private *tp,
  struct sk_buff *skb)
 {
-   return (vlan_tx_tag_present(skb)) ?
-   TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00;
+   return (skb_vlan_tag_present(skb)) ?
+   TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00;
 }
 
 static void rtl8169_rx_vlan_tag(struct RxDesc *desc, struct sk_buff *skb)
@@ -1641,7 +1640,7 @@ static void rtl8169_rx_vlan_tag(struct R
u32 opts2 = le32_to_cpu(desc->opts2);
 
if (opts2 & RxVlanTag)
-   __vlan_hwaccel_put_tag(skb, swab16(opts2 & 0xffff));
+   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), swab16(opts2 & 
0xffff));
 
desc->opts2 = 0;
 }
@@ -3508,7 +3507,7 @@ static const struct net_device_ops rtl81
 
 };
 
-static void __devinit rtl_init_mdio_ops(struct rtl8169_private *tp)
+static void rtl_init_mdio_ops(struct rtl8169_private *tp)
 {
struct mdio_ops *ops = &tp->mdio_ops;
 
@@ -3725,7 +3724,7 @@ static void rtl_pll_power_up(struct rtl8
rtl_generic_op(tp, tp->pll_power_ops.up);
 }
 
-static void __devinit rtl_init_pll_power_ops(struct rtl8169_private *tp)
+static void rtl_init_pll_power_ops(struct rtl8169_private *tp)
 {
struct pll_power_ops *ops = &tp->pll_power_ops;
 
@@ -3905,7 +3904,7 @@ static void r8168b_1_hw_jumbo_disable(st
RTL_W8(Config4, RTL_R8(Config4) & ~(1 << 0));
 }
 
-static void __devinit rtl_init_jumbo_ops(struct rtl8169_private *tp)
+static void rtl_init_jumbo_ops(struct rtl8169_private *tp)
 {
struct jumbo_ops *ops = &tp->jumbo_ops;
 
@@ -3971,7 +3970,7 @@ static void rtl_hw_reset(struct rtl8169_
}
 }
 
-static int __devinit
+static int
 rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
const struct rtl_cfg_info *cfg = rtl_cfg_infos + ent->driver_data;
@@ -4137,7 +4136,7 @@ rtl8169_init_one(struct pci_dev *pdev, c
dev->dev_addr[i] = RTL_R8(MAC0 + i);
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 
-   SET_ETHTOOL_OPS(dev, &rtl8169_ethtool_ops);
+   dev->ethtool_ops = &rtl8169_ethtool_ops;
dev->watchdog_timeo = RTL8169_TX_TIMEOUT;
dev->irq = pdev->irq;
dev->base_addr = (unsigned long) ioaddr;
@@ -4147,16 +4146,16 @@ rtl8169_init_one(struct pci_dev *pdev, c
/* don't enable SG, IP_CSUM and TSO by default - it might not work
 * properly for all devices */
dev->features |= NETIF_F_RXCSUM |
-   NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
+   NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
 
dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
-   NETIF_F_RXCSUM | NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
+   NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX | 
NETIF_F_HW_VLAN_CTAG_RX;
dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
NETIF_F_HIGHDMA;
 
if (tp->mac_version == RTL_GIGA_MAC_VER_05)
/* 8110SCd requires hardware Rx VLAN - disallow toggling */
-   dev->hw_features &= ~NETIF_F_HW_VLAN_RX;
+   dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;
 
tp->intr_mask = 0xffff;
tp->hw_start = cfg->hw_start;
@@ -4217,7 +4216,7 @@ err_out_free_dev_1:
goto out;
 }
 
-static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
+static void rtl8169_remove_one(struct pci_dev *pdev)
 {
struct net_device *dev = pci_get_drvdata(pdev);

[PATCH 2/2] net: ethernet: macb: use phy_ethtool_{get|set}_link_ksettings

2016-06-21 Thread Philippe Reynes
There are two generic functions phy_ethtool_{get|set}_link_ksettings,
so we can use them instead of defining the same code in the driver.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/cadence/macb.c |   30 --
 1 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c 
b/drivers/net/ethernet/cadence/macb.c
index 090463f..89c0cfa 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -2091,28 +2091,6 @@ static struct net_device_stats *macb_get_stats(struct 
net_device *dev)
return nstat;
 }
 
-static int macb_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-   struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = dev->phydev;
-
-   if (!phydev)
-   return -ENODEV;
-
-   return phy_ethtool_gset(phydev, cmd);
-}
-
-static int macb_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
-{
-   struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = dev->phydev;
-
-   if (!phydev)
-   return -ENODEV;
-
-   return phy_ethtool_sset(phydev, cmd);
-}
-
 static int macb_get_regs_len(struct net_device *netdev)
 {
return MACB_GREGS_NBR * sizeof(u32);
@@ -2185,19 +2163,17 @@ static int macb_set_wol(struct net_device *netdev, 
struct ethtool_wolinfo *wol)
 }
 
 static const struct ethtool_ops macb_ethtool_ops = {
-   .get_settings   = macb_get_settings,
-   .set_settings   = macb_set_settings,
.get_regs_len   = macb_get_regs_len,
.get_regs   = macb_get_regs,
.get_link   = ethtool_op_get_link,
.get_ts_info= ethtool_op_get_ts_info,
.get_wol= macb_get_wol,
.set_wol= macb_set_wol,
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
+   .set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
 
 static const struct ethtool_ops gem_ethtool_ops = {
-   .get_settings   = macb_get_settings,
-   .set_settings   = macb_set_settings,
.get_regs_len   = macb_get_regs_len,
.get_regs   = macb_get_regs,
.get_link   = ethtool_op_get_link,
@@ -2205,6 +2181,8 @@ static const struct ethtool_ops gem_ethtool_ops = {
.get_ethtool_stats  = gem_get_ethtool_stats,
.get_strings= gem_get_ethtool_strings,
.get_sset_count = gem_get_sset_count,
+   .get_link_ksettings = phy_ethtool_get_link_ksettings,
+   .set_link_ksettings = phy_ethtool_set_link_ksettings,
 };
 
 static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-- 
1.7.4.4



[PATCH 1/2] net: ethernet: macb: use phydev from struct net_device

2016-06-21 Thread Philippe Reynes
The private structure contains a pointer to phydev, but the structure
net_device already contains such a pointer. So we can remove the pointer
phydev in the private structure, and update the driver to use the
one contained in struct net_device.

Signed-off-by: Philippe Reynes 
---
 drivers/net/ethernet/cadence/macb.c |   28 +---
 drivers/net/ethernet/cadence/macb.h |1 -
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c 
b/drivers/net/ethernet/cadence/macb.c
index cb07d95..090463f 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -304,7 +304,7 @@ static void macb_set_tx_clk(struct clk *clk, int speed, 
struct net_device *dev)
 static void macb_handle_link_change(struct net_device *dev)
 {
struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
unsigned long flags;
int status_change = 0;
 
@@ -414,7 +414,6 @@ static int macb_mii_probe(struct net_device *dev)
bp->link = 0;
bp->speed = 0;
bp->duplex = -1;
-   bp->phy_dev = phydev;
 
return 0;
 }
@@ -1886,7 +1885,7 @@ static int macb_open(struct net_device *dev)
netif_carrier_off(dev);
 
/* if the phy is not yet register, retry later*/
-   if (!bp->phy_dev)
+   if (!dev->phydev)
return -EAGAIN;
 
/* RX buffers initialization */
@@ -1905,7 +1904,7 @@ static int macb_open(struct net_device *dev)
macb_init_hw(bp);
 
/* schedule a link state check */
-   phy_start(bp->phy_dev);
+   phy_start(dev->phydev);
 
netif_tx_start_all_queues(dev);
 
@@ -1920,8 +1919,8 @@ static int macb_close(struct net_device *dev)
netif_tx_stop_all_queues(dev);
napi_disable(&bp->napi);
 
-   if (bp->phy_dev)
-   phy_stop(bp->phy_dev);
+   if (dev->phydev)
+   phy_stop(dev->phydev);
 
spin_lock_irqsave(&bp->lock, flags);
macb_reset_hw(bp);
@@ -2095,7 +2094,7 @@ static struct net_device_stats *macb_get_stats(struct 
net_device *dev)
 static int macb_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!phydev)
return -ENODEV;
@@ -2106,7 +2105,7 @@ static int macb_get_settings(struct net_device *dev, 
struct ethtool_cmd *cmd)
 static int macb_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!phydev)
return -ENODEV;
@@ -2210,8 +2209,7 @@ static const struct ethtool_ops gem_ethtool_ops = {
 
 static int macb_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
-   struct macb *bp = netdev_priv(dev);
-   struct phy_device *phydev = bp->phy_dev;
+   struct phy_device *phydev = dev->phydev;
 
if (!netif_running(dev))
return -EINVAL;
@@ -2570,7 +2568,7 @@ static int at91ether_open(struct net_device *dev)
 MACB_BIT(HRESP));
 
/* schedule a link state check */
-   phy_start(lp->phy_dev);
+   phy_start(dev->phydev);
 
netif_start_queue(dev);
 
@@ -3010,7 +3008,7 @@ static int macb_probe(struct platform_device *pdev)
if (err)
goto err_out_free_netdev;
 
-   phydev = bp->phy_dev;
+   phydev = dev->phydev;
 
netif_carrier_off(dev);
 
@@ -3029,7 +3027,7 @@ static int macb_probe(struct platform_device *pdev)
return 0;
 
 err_out_unregister_mdio:
-   phy_disconnect(bp->phy_dev);
+   phy_disconnect(dev->phydev);
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
 
@@ -3057,8 +3055,8 @@ static int macb_remove(struct platform_device *pdev)
 
if (dev) {
bp = netdev_priv(dev);
-   if (bp->phy_dev)
-   phy_disconnect(bp->phy_dev);
+   if (dev->phydev)
+   phy_disconnect(dev->phydev);
mdiobus_unregister(bp->mii_bus);
mdiobus_free(bp->mii_bus);
 
diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index 8a13824..36893d8 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -823,7 +823,6 @@ struct macb {
struct macb_or_gem_ops  macbgem_ops;
 
struct mii_bus  *mii_bus;
-   struct phy_device   *phy_dev;
int link;
int speed;
int duplex;
-- 
1.7.4.4



Re: [PATCH v3 0/6] Introduce pci_(request|release)_(mem|io)_regions

2016-06-21 Thread Bjorn Helgaas
On Tue, Jun 07, 2016 at 09:44:00AM +0200, Johannes Thumshirn wrote:
> The first patch in this series introduces the following 4 helper functions to
> the PCI core:
> 
> * pci_request_mem_regions()
> * pci_request_io_regions()
> * pci_release_mem_regions()
> * pci_release_io_regions()
> 
> which encapsulate the request and release of a PCI device's memory or I/O
> bars.
> 
> The subsequent patches convert the drivers, which use the
> pci_request_selected_regions(pdev, 
>   pci_select_bars(pdev, IORESOURCE_MEM), name); 
> and similar pattern to use the new interface.
> 
> This was suggested by Christoph Hellwig in
> http://lists.infradead.org/pipermail/linux-nvme/2016-May/004570.html and
> tested on kernel v4.6 with NVMe.

I applied all six of these to pci/resources for v4.8, thanks, Johannes.

> Johannes Thumshirn (6):
>   PCI: Add helpers to request/release memory and I/O regions
>   NVMe: Use pci_(request|release)_mem_regions
>   lpfc: Use pci_(request|release)_mem_regions
>   GenWQE: Use pci_(request|release)_mem_regions
>   ethernet/intel: Use pci_(request|release)_mem_regions
>   alx: Use pci_(request|release)_mem_regions
> 
>  drivers/misc/genwqe/card_base.c   | 13 +
>  drivers/net/ethernet/atheros/alx/main.c   | 12 +---
>  drivers/net/ethernet/intel/e1000e/netdev.c|  6 ++
>  drivers/net/ethernet/intel/fm10k/fm10k_pci.c  | 11 +++
>  drivers/net/ethernet/intel/i40e/i40e_main.c   |  9 +++--
>  drivers/net/ethernet/intel/igb/igb_main.c | 10 +++---
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  9 +++--
>  drivers/nvme/host/pci.c   | 10 +++---
>  drivers/scsi/lpfc/lpfc_init.c | 15 --
>  include/linux/pci.h   | 28 
> +++


Re: [PATCH v2] net: stmmac: dwmac-rk: add rk3228-specific data

2016-06-21 Thread Rob Herring
On Tue, Jun 21, 2016 at 08:33:28PM +0800, Xing Zheng wrote:
> Add constants and callback functions for the dwmac on rk3228/rk3229 socs.
> As can be seen, the base structure is the same, only registers and the
> bits in them moved slightly.
> 
> Signed-off-by: Xing Zheng 
> ---
> 
> Changes in v2:
> - the "rk322x" is not clear to SoC description, rename it to "rk3228"
> 
>  .../devicetree/bindings/net/rockchip-dwmac.txt |3 +-
>  drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c |  117 
> 
>  2 files changed, 119 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt 
> b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
> index 93eac7c..cccd945 100644
> --- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
> +++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
> @@ -3,7 +3,8 @@ Rockchip SoC RK3288 10/100/1000 Ethernet driver(GMAC)
>  The device node has following properties.
>  
>  Required properties:
> - - compatible: Can be one of "rockchip,rk3288-gmac", "rockchip,rk3368-gmac"
> + - compatible: Can be one of "rockchip,rk3228-gmac", "rockchip,rk3288-gmac",
> + "rockchip,rk3368-gmac"

Not worth reposting just for this, but 1 per line is preferred.

Acked-by: Rob Herring 

>   - reg: addresses and length of the register sets for the device.
>   - interrupts: Should contain the GMAC interrupts.
>   - interrupt-names: Should contain the interrupt names "macirq".


Re: [ovs-dev] [PATCH net-next] openvswitch: Only set mark and labels when commiting a connection.

2016-06-21 Thread Jarno Rajahalme
Thanks for the review!

> On Jun 21, 2016, at 1:57 PM, Joe Stringer  wrote:
> 
> On 20 June 2016 at 17:19, Jarno Rajahalme  wrote:
>> Only allow setting conntrack mark or labels when the commit flag is
>> specified.  This makes sure we can not set them before the connection
>> has been persisted, as in that case the mark and labels would be lost
>> in an event of an userspace upcall.
>> 
>> OVS userspace already requires the commit flag to accept setting
>> ct_mark and/or ct_labels.  Validate for this on the kernel API.
>> 
>> Finally, set conntrack mark and labels right before committing so that
>> the initial conntrack NEW event has the mark and labels.
>> 
>> Signed-off-by: Jarno Rajahalme 
> 
> The structure of this commit message suggests there are multiple
> changes trying to be addressed in one patch. I suggest splitting them
> out.
> 

Done for v2 I just sent for net.

> In terms of applying the mark and labels before committing the
> connection, that's actually the behaviour I would expect if you were
> to execute ct(mark=foo,commit). The NEW event should include these
> pieces, and should have all along.

Right, the v2 patch 1/2 does this.

> 
>> @@ -1145,6 +1155,20 @@ static int parse_ct(const struct nlattr *attr, struct 
>> ovs_conntrack_info *info,
>>}
>>}
>> 
>> +#ifdef CONFIG_NF_CONNTRACK_MARK
>> +   if (!info->commit && info->mark.mask) {
>> +   OVS_NLERR(log,
>> + "Setting conntrack mark requires 'commit' flag.");
>> +   return -EINVAL;
>> +   }
>> +#endif
>> +#ifdef CONFIG_NF_CONNTRACK_LABELS
>> +   if (!info->commit && labels_nonzero(&info->labels.mask)) {
>> +   OVS_NLERR(log,
>> + "Setting conntrack labels requires 'commit' 
>> flag.");
>> +   return -EINVAL;
>> +   }
>> +#endif
> 
> I'm of mixed minds about this, but I lean towards agreeing with it. On
> one hand, it's applying more restrictions on an otherwise fairly loose
> interface and if anyone is relying on this behaviour then it would be
> surprising to have this restriction introduced. On the other hand, it
> doesn't make a lot of sense to set a label/mark but not to commit the
> connection. As you say, the behaviour isn't exactly consistent in that
> case today anyway: If there was a flow with
> actions=ct(mark=foo),recirc() followed by a userspace upcall, then the
> mark would be reflected in the flow key but not saved to any persisted
> connection. A subsequent ct(commit) after upcall wouldn't persist it,
> either. However if there were two flows already in the datapath to do
> this, then it /would/ be persisted. Restricting the mark/labels
> modification to only if you have the "commit" flag would address that
> consistency issue. The OVS userspace enforcing this constraint also
> hints that this was an unintentional omission from kernel validation.

I separated this out to the v2 patch 2/2.

  Jarno



[PATCH net v2 1/2] openvswitch: Set mark and labels before confirming.

2016-06-21 Thread Jarno Rajahalme
Set conntrack mark and labels right before committing so that
the initial conntrack NEW event has the mark and labels.

Signed-off-by: Jarno Rajahalme 
---
v2: Separate Kernel API change to an RFC patch (2/2).

 net/openvswitch/conntrack.c | 33 ++---
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 3d5feed..23fd4fb 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -824,23 +824,6 @@ static int ovs_ct_lookup(struct net *net, struct 
sw_flow_key *key,
return 0;
 }
 
-/* Lookup connection and confirm if unconfirmed. */
-static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
-const struct ovs_conntrack_info *info,
-struct sk_buff *skb)
-{
-   int err;
-
-   err = __ovs_ct_lookup(net, key, info, skb);
-   if (err)
-   return err;
-   /* This is a no-op if the connection has already been confirmed. */
-   if (nf_conntrack_confirm(skb) != NF_ACCEPT)
-   return -EINVAL;
-
-   return 0;
-}
-
 static bool labels_nonzero(const struct ovs_key_ct_labels *labels)
 {
size_t i;
@@ -873,21 +856,33 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
}
 
if (info->commit)
-   err = ovs_ct_commit(net, key, info, skb);
+   err = __ovs_ct_lookup(net, key, info, skb);
else
err = ovs_ct_lookup(net, key, info, skb);
if (err)
goto err;
 
+   /* Apply changes before confirming the connection so that the initial
+* conntrack NEW netlink event carries the values given in the CT
+* action.
+*/
if (info->mark.mask) {
err = ovs_ct_set_mark(skb, key, info->mark.value,
  info->mark.mask);
if (err)
goto err;
}
-   if (labels_nonzero(&info->labels.mask))
+   if (labels_nonzero(&info->labels.mask)) {
err = ovs_ct_set_labels(skb, key, &info->labels.value,
&info->labels.mask);
+   if (err)
+   goto err;
+   }
+   /* This will take care of sending queued events even if the connection
+* is already confirmed.
+*/
+   if (info->commit && nf_conntrack_confirm(skb) != NF_ACCEPT)
+   err = -EINVAL;
 err:
skb_push(skb, nh_ofs);
if (err)
-- 
2.1.4



[RFC PATCH net v2 2/2] openvswitch: Only set mark and labels with a commit flag.

2016-06-21 Thread Jarno Rajahalme
Only set conntrack mark or labels when the commit flag is specified.
This makes sure we can not set them before the connection has been
persisted, as in that case the mark and labels would be lost in the
event of a userspace upcall.

OVS userspace already requires the commit flag to accept setting
ct_mark and/or ct_labels.  Validate for this in the kernel API.

Signed-off-by: Jarno Rajahalme 
---
 net/openvswitch/conntrack.c | 76 ++---
 1 file changed, 51 insertions(+), 25 deletions(-)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 23fd4fb..52f3b9b 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -835,6 +835,42 @@ static bool labels_nonzero(const struct ovs_key_ct_labels 
*labels)
return false;
 }
 
+/* Lookup connection and confirm if unconfirmed. */
+static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
+const struct ovs_conntrack_info *info,
+struct sk_buff *skb)
+{
+   int err;
+
+   err = __ovs_ct_lookup(net, key, info, skb);
+   if (err)
+   return err;
+
+   /* Apply changes before confirming the connection so that the initial
+* conntrack NEW netlink event carries the values given in the CT
+* action.
+*/
+   if (info->mark.mask) {
+   err = ovs_ct_set_mark(skb, key, info->mark.value,
+ info->mark.mask);
+   if (err)
+   return err;
+   }
+   if (labels_nonzero(&info->labels.mask)) {
+   err = ovs_ct_set_labels(skb, key, &info->labels.value,
+   &info->labels.mask);
+   if (err)
+   return err;
+   }
+   /* This will take care of sending queued events even if the connection
+* is already confirmed.
+*/
+   if (nf_conntrack_confirm(skb) != NF_ACCEPT)
+   return -EINVAL;
+
+   return 0;
+}
+
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
  * value if 'skb' is freed.
  */
@@ -856,34 +892,10 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
}
 
if (info->commit)
-   err = __ovs_ct_lookup(net, key, info, skb);
+   err = ovs_ct_commit(net, key, info, skb);
else
err = ovs_ct_lookup(net, key, info, skb);
-   if (err)
-   goto err;
 
-   /* Apply changes before confirming the connection so that the initial
-* conntrack NEW netlink event carries the values given in the CT
-* action.
-*/
-   if (info->mark.mask) {
-   err = ovs_ct_set_mark(skb, key, info->mark.value,
- info->mark.mask);
-   if (err)
-   goto err;
-   }
-   if (labels_nonzero(&info->labels.mask)) {
-   err = ovs_ct_set_labels(skb, key, &info->labels.value,
-   &info->labels.mask);
-   if (err)
-   goto err;
-   }
-   /* This will take care of sending queued events even if the connection
-* is already confirmed.
-*/
-   if (info->commit && nf_conntrack_confirm(skb) != NF_ACCEPT)
-   err = -EINVAL;
-err:
skb_push(skb, nh_ofs);
if (err)
kfree_skb(skb);
@@ -1140,6 +1152,20 @@ static int parse_ct(const struct nlattr *attr, struct 
ovs_conntrack_info *info,
}
}
 
+#ifdef CONFIG_NF_CONNTRACK_MARK
+   if (!info->commit && info->mark.mask) {
+   OVS_NLERR(log,
+ "Setting conntrack mark requires 'commit' flag.");
+   return -EINVAL;
+   }
+#endif
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+   if (!info->commit && labels_nonzero(&info->labels.mask)) {
+   OVS_NLERR(log,
+ "Setting conntrack labels requires 'commit' flag.");
+   return -EINVAL;
+   }
+#endif
if (rem > 0) {
OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem);
return -EINVAL;
-- 
2.1.4



Re: [PATCHv3] wlcore: spi: add wl18xx support

2016-06-21 Thread Rob Herring
On Tue, Jun 21, 2016 at 12:02:06PM +, Reizer, Eyal wrote:
> Add support for using with both wl12xx and wl18xx.
> 
> - all wilink family needs special init command for entering wspi mode.
>   extra clock cycles should be sent after the spi init command while the
>   cs pin is high.
> - Use inverted chip select for sending a dummy 4 bytes command that
>   completes the init stage and puts the wilink chip into wspi mode.
> 
> Signed-off-by: Eyal Reizer 
> ---
> v1->v2:update device tree bindings configuration
> v2->v3:revert from manual gpio manipulation. use inverted chip select instead
> for sending the extra init cycle, which achieves the same hardware purpose.
> update device tree bindings documentation accordingly
> 
>  .../bindings/net/wireless/ti,wlcore,spi.txt|  47 ++--
>  drivers/net/wireless/ti/wlcore/spi.c   | 124 
> +
>  2 files changed, 145 insertions(+), 26 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt 
> b/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
> index 9180724..35467cf 100644
> --- a/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
> +++ b/Documentation/devicetree/bindings/net/wireless/ti,wlcore,spi.txt
> @@ -1,19 +1,30 @@
> -* Texas Instruments wl1271 wireless lan controller
> +* Texas Instruments wl12xx/wl18xx wireless lan controller
>  
> -The wl1271 chip can be connected via SPI or via SDIO. This
> +The wl12xx/wl18xx chips can be connected via SPI or via SDIO. This
>  document describes the binding for the SPI connected chip.
>  
>  Required properties:
> -- compatible :  Should be "ti,wl1271"
> +- compatible :  Should be one of the following:
> +* "ti,wl1271"
> +* "ti,wl1273"
> +* "ti,wl1281"
> +* "ti,wl1283"
> +* "ti,wl1801"
> +* "ti,wl1805"
> +* "ti,wl1807"
> +* "ti,wl1831"
> +* "ti,wl1835"
> +* "ti,wl1837"
>  - reg : Chip select address of device
>  - spi-max-frequency :   Maximum SPI clocking speed of device in Hz
> -- ref-clock-frequency : Reference clock frequency
>  - interrupt-parent, interrupts :
>  Should contain parameters for 1 interrupt line.
>  Interrupt parameters: parent, line number, type.
> -- vwlan-supply :Point the node of the regulator that powers/enable 
> the wl1271 chip
> +- vwlan-supply :Point the node of the regulator that powers/enable 
> the
> +wl12xx/wl18xx chip
>  
>  Optional properties:
> +- ref-clock-frequency : Reference clock frequency (should be set for wl12xx)
>  - clock-xtal :  boolean, clock is generated from XTAL
>  
>  - Please consult Documentation/devicetree/bindings/spi/spi-bus.txt
> @@ -21,10 +32,15 @@ Optional properties:
>  
>  Examples:
>  
> +For wl12xx family:
>   {
> - wl1271@1 {
> + status = "okay";
> + pinctrl-names = "default";
> + pinctrl-0 = <_pins>;
> + #address-cells = <1>;
> + #size-cells = <0>;

None of this is really relevant to this binding.

> + wlcore: wlcore@0 {

Now your unit-address and reg value don't match.

>   compatible = "ti,wl1271";
> -
>   reg = <1>;
>   spi-max-frequency = <4800>;
>   clock-xtal;
> @@ -34,3 +50,20 @@ Examples:
>   vwlan-supply = <_fixed>;
>   };
>  };
> +
> +For wl18xx family:
> +{
> + status = "okay";
> + pinctrl-names = "default";
> + pinctrl-0 = <_pins>;
> + #address-cells = <1>;
> + #size-cells = <0>;
> + wlcore: wlcore@0 {
> + compatible = "ti,wl1835";
> + vwlan-supply = <_en_reg>;
> + spi-max-frequency = <4800>;
> + reg = <0>;
> + interrupt-parent = <>;
> + interrupts = <27 IRQ_TYPE_EDGE_RISING>;
> + };
> +};


Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Hannes Frederic Sowa
On 21.06.2016 11:42, Tom Herbert wrote:
>> > There is also some argument to be had for theory versus application.
>> > Arguably it is the customers that are leading to some of the dirty
>> > hacks as I think vendors are building NICs based on customer use cases
>> > versus following any specifications.  In most data centers the tunnel
>> > underlays will be deployed throughout the network and UDP will likely
>> > be blocked for anything that isn't being used explicitly for
>> > tunneling.  As such we seem to be seeing a lot of NICs that are only
>> > supporting one port for things like this instead of designing them to
>> > handle whatever we can throw at them.
>> >
> Actually, I don't believe that's true. It is not typical to deploy
> firewalls within a data center fabric, and nor do we restrict
> applications from binding to any UDP ports and they can pretty much
> transmit to any port on any host without cost using an unconnected UDP
> socket. I think it's more likely that NIC (and switch vendors) simply
> assumed that port numbers can be treated as global values. That's
> expedient and at small scale we can probably get away with it, but at
> large scale this will eventually bite someone.

I do have access to relatively normal expensive switches that can
basically be used to realize a scenario like the one Alex described. No
firewalls necessary. If you can guarantee that your customers never have
access to your hypervisors or container management namespace, this is
actually a pretty solid assumption.

Bye,
Hannes



Re: [alsa-devel] [very-RFC 0/8] TSN driver for the kernel

2016-06-21 Thread Richard Cochran
On Tue, Jun 21, 2016 at 10:45:18AM -0700, Pierre-Louis Bossart wrote:
> You can experiment with the 'dma' and 'link' timestamps today on any
> HDaudio-based device. Like I said the synchronized part has not been
> upstreamed yet (delays + dependency on ART-to-TSC conversions that made it
> in the kernel recently)

Can you point me to any open source apps using the dma/link
timestamps?

Thanks,
Richard


Re: [ovs-dev] [PATCH net-next] openvswitch: Only set mark and labels when commiting a connection.

2016-06-21 Thread Joe Stringer
On 20 June 2016 at 17:19, Jarno Rajahalme  wrote:
> Only allow setting conntrack mark or labels when the commit flag is
> specified.  This makes sure we can not set them before the connection
> has been persisted, as in that case the mark and labels would be lost
> in an event of an userspace upcall.
>
> OVS userspace already requires the commit flag to accept setting
> ct_mark and/or ct_labels.  Validate for this on the kernel API.
>
> Finally, set conntrack mark and labels right before committing so that
> the initial conntrack NEW event has the mark and labels.
>
> Signed-off-by: Jarno Rajahalme 

The structure of this commit message suggests there are multiple
changes trying to be addressed in one patch. I suggest splitting them
out.

In terms of applying the mark and labels before committing the
connection, that's actually the behaviour I would expect if you were
to execute ct(mark=foo,commit). The NEW event should include these
pieces, and should have all along.

> @@ -1145,6 +1155,20 @@ static int parse_ct(const struct nlattr *attr, struct 
> ovs_conntrack_info *info,
> }
> }
>
> +#ifdef CONFIG_NF_CONNTRACK_MARK
> +   if (!info->commit && info->mark.mask) {
> +   OVS_NLERR(log,
> + "Setting conntrack mark requires 'commit' flag.");
> +   return -EINVAL;
> +   }
> +#endif
> +#ifdef CONFIG_NF_CONNTRACK_LABELS
> +   if (!info->commit && labels_nonzero(&info->labels.mask)) {
> +   OVS_NLERR(log,
> + "Setting conntrack labels requires 'commit' flag.");
> +   return -EINVAL;
> +   }
> +#endif

I'm of mixed minds about this, but I lean towards agreeing with it. On
one hand, it's applying more restrictions on an otherwise fairly loose
interface and if anyone is relying on this behaviour then it would be
surprising to have this restriction introduced. On the other hand, it
doesn't make a lot of sense to set a label/mark but not to commit the
connection. As you say, the behaviour isn't exactly consistent in that
case today anyway: If there was a flow with
actions=ct(mark=foo),recirc() followed by a userspace upcall, then the
mark would be reflected in the flow key but not saved to any persisted
connection. A subsequent ct(commit) after upcall wouldn't persist it,
either. However if there were two flows already in the datapath to do
this, then it /would/ be persisted. Restricting the mark/labels
modification to only if you have the "commit" flag would address that
consistency issue. The OVS userspace enforcing this constraint also
hints that this was an unintentional omission from kernel validation.


Re: 802.3ad bonding aggregator reselection

2016-06-21 Thread Veli-Matti Lintu
2016-06-21 18:46 GMT+03:00 Jay Vosburgh :
> Veli-Matti Lintu  wrote:
>
>>2016-06-20 17:11 GMT+03:00 zhuyj :
>>> 5. Switch Configuration
>>> ===
>>>
>>> For this section, "switch" refers to whatever system the
>>> bonded devices are directly connected to (i.e., where the other end of
>>> the cable plugs into).  This may be an actual dedicated switch device,
>>> or it may be another regular system (e.g., another computer running
>>> Linux),
>>>
>>> The active-backup, balance-tlb and balance-alb modes do not
>>> require any specific configuration of the switch.
>>>
>>> The 802.3ad mode requires that the switch have the appropriate
>>> ports configured as an 802.3ad aggregation.  The precise method used
>>> to configure this varies from switch to switch, but, for example, a
>>> Cisco 3550 series switch requires that the appropriate ports first be
>>> grouped together in a single etherchannel instance, then that
>>> etherchannel is set to mode "lacp" to enable 802.3ad (instead of
>>> standard EtherChannel).
>>
>>The ports are configured in switch settings (HP Procurve 2530-48G) in
>>same trunk group (TrkX) and trunk group type is set as LACP.
>>/proc/net/bonding/bond0 also shows that the three ports belong to same
>>aggregator and bandwidth tests also support this. In my understanding
>>Procurve's trunk group is pretty much the same as etherchannel in
>>Cisco's terminology. The bonded link comes always up properly, but
>>handling of links going down is the problem. Are there known
>>differences between different vendors there?
>
> I did the original LACP reselection testing on a Cisco switch,
> but I have an HP 2530 now; I'll test it later today or tomorrow and see
> if it behaves properly, and whether your proposed patch is needed.

Thanks for taking a look at this. Here are some more details about the
setup as Zhu Yanjun also requested.

The server in question has two internal 10Gbps ports (using ixgbe) and
two Intel I350 T2 dual-1Gbps PCIe-cards (using igb). All ports are
using 1Gbps connections.

05:00.0 Ethernet controller: Intel Corporation Ethernet Controller
10-Gigabit X540-AT2 (rev 01)
05:00.1 Ethernet controller: Intel Corporation Ethernet Controller
10-Gigabit X540-AT2 (rev 01)
81:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network
Connection (rev 01)
81:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network
Connection (rev 01)
82:00.0 Ethernet controller: Intel Corporation I350 Gigabit Network
Connection (rev 01)
82:00.1 Ethernet controller: Intel Corporation I350 Gigabit Network
Connection (rev 01)

In the test setup the bonds are setup as:

05:00.0 + 81:00.0 + 82:00.0 and
05:00.1 + 81:00.1 + 82:00.1

So each bond uses one port using ixgbe and two ports using igb.

When testing, I have disabled the port in the switch configuration
that brings down the link and also miimon sees the link going down on
the server. This should be the same as unplugging the cable, so
there's nothing coming through the wire to the server.

Veli-Matti


Re: [PATCH] ibmvnic: fix to use list_for_each_safe() when delete items

2016-06-21 Thread Thomas Falcon
On 06/20/2016 10:50 AM, Thomas Falcon wrote:
> On 06/17/2016 09:53 PM, weiyj...@163.com wrote:
>> From: Wei Yongjun 
>>
>> Since we will remove items off the list using list_del() we need
>> to use a safe version of the list_for_each() macro aptly named
>> list_for_each_safe().
>>
>> Signed-off-by: Wei Yongjun 
>> ---
>>  drivers/net/ethernet/ibm/ibmvnic.c | 10 +-
>>  1 file changed, 5 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
>> b/drivers/net/ethernet/ibm/ibmvnic.c
>> index 864cb21..0b6a922 100644
>> --- a/drivers/net/ethernet/ibm/ibmvnic.c
>> +++ b/drivers/net/ethernet/ibm/ibmvnic.c
>> @@ -3141,14 +3141,14 @@ static void handle_request_ras_comp_num_rsp(union 
>> ibmvnic_crq *crq,
>>  
>>  static void ibmvnic_free_inflight(struct ibmvnic_adapter *adapter)
>>  {
>> -struct ibmvnic_inflight_cmd *inflight_cmd;
>> +struct ibmvnic_inflight_cmd *inflight_cmd, *tmp1;
>>  struct device *dev = &adapter->vdev->dev;
>> -struct ibmvnic_error_buff *error_buff;
>> +struct ibmvnic_error_buff *error_buff, *tmp2;
>>  unsigned long flags;
>>  unsigned long flags2;
>>  
>>  spin_lock_irqsave(&adapter->inflight_lock, flags);
>> -list_for_each_entry(inflight_cmd, &adapter->inflight, list) {
>> +list_for_each_entry_safe(inflight_cmd, tmp1, &adapter->inflight, list) {
>>  switch (inflight_cmd->crq.generic.cmd) {
>>  case LOGIN:
>>  dma_unmap_single(dev, adapter->login_buf_token,
>> @@ -3165,8 +3165,8 @@ static void ibmvnic_free_inflight(struct 
>> ibmvnic_adapter *adapter)
>>  break;
>>  case REQUEST_ERROR_INFO:
>>  spin_lock_irqsave(&adapter->error_list_lock, flags2);
>> -list_for_each_entry(error_buff, &adapter->errors,
>> -list) {
>> +list_for_each_entry_safe(error_buff, tmp2,
>> + &adapter->errors, list) {
>>  dma_unmap_single(dev, error_buff->dma,
>>   error_buff->len,
>>   DMA_FROM_DEVICE);
>>
> Thanks!
>
> Acked-by: Thomas Falcon 

Hello, I apologize for prematurely ack'ing this.  There is another situation 
where you could use list_for_each_entry_safe in the function 
handle_error_info_rsp.  Could you include this in your patch, please?

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 864cb21..e9968d9 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -2121,7 +2121,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
  struct ibmvnic_adapter *adapter)
 {
struct device *dev = &adapter->vdev->dev;
-   struct ibmvnic_error_buff *error_buff;
+   struct ibmvnic_error_buff *error_buff, *tmp;
unsigned long flags;
bool found = false;
int i;
@@ -2133,7 +2133,7 @@ static void handle_error_info_rsp(union ibmvnic_crq *crq,
}
 
spin_lock_irqsave(&adapter->error_list_lock, flags);
-   list_for_each_entry(error_buff, &adapter->errors, list)
+   list_for_each_entry_safe(error_buff, tmp, &adapter->errors, list)
if (error_buff->error_id == crq->request_error_rsp.error_id) {
found = true;
list_del(&error_buff->list);

>>
>>
>> ___
>> Linuxppc-dev mailing list
>> linuxppc-...@lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev



Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread David Ahern

On 6/21/16 11:03 AM, Phil Sutter wrote:

I downloaded CentOS 5 and 6. iproute2 fails to compile on CentOS 5.11;
ip command builds on 6.8 but with a flurry of redefinition errors
(BUILD_BUG_ON), but fails at tc.


What's the exact error message please? Maybe some incompatibility in
kernel headers? Although that shouldn't be ...


lib
CC   libgenl.o
CC   ll_map.o
CC   libnetlink.o
AR   libnetlink.a
CC   utils.o
In file included from utils.c:35:
../include/utils.h:212:1: warning: "BUILD_BUG_ON" redefined
In file included from ../include/linux/netlink.h:4,
 from ../include/linux/if_link.h:5,
 from ../include/linux/netdevice.h:31,
 from ../include/linux/if_arp.h:26,
 from utils.c:28:
/usr/include/linux/kernel.h:29:1: warning: this is the location of the 
previous definition

...


CC   tc_bpf.o
tc_bpf.c:41:26: error: linux/if_alg.h: No such file or directory
In file included from tc_bpf.c:45:
../include/utils.h:212:1: warning: "BUILD_BUG_ON" redefined
In file included from ../include/linux/netlink.h:4,
 from ../include/libnetlink.h:7,
 from ../include/utils.h:10,
 from tc_bpf.c:45:
/usr/include/linux/kernel.h:29:1: warning: this is the location of the 
previous definition

make[1]: *** [tc_bpf.o] Error 1
make: *** [all] Error 2




Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Tom Herbert
On Tue, Jun 21, 2016 at 11:17 AM, Alexander Duyck
 wrote:
> On Tue, Jun 21, 2016 at 10:40 AM, Hannes Frederic Sowa
>  wrote:
>> On 21.06.2016 10:27, Edward Cree wrote:
>>> On 21/06/16 18:05, Alexander Duyck wrote:
 On Tue, Jun 21, 2016 at 1:22 AM, David Miller  wrote:
> But anyways, the vastness of the key is why we want to keep "sockets"
> out of network cards, because proper support of "sockets" requires
> access to information the card simply does not and should not have.
 Right.  Really what I would like to see for most of these devices is a
 2 tuple filter where you specify the UDP port number, and the PF/VF ID
 that the traffic is received on.
>>> But that doesn't make sense - the traffic is received on a physical network
>>> port, and it's the headers (i.e. flow) at that point that determine whether
>>> the traffic is encap or not.  After all, those headers are all that can
>>> determine which PF or VF it's sent to; and if it's multicast and goes to
>>> more than one of them, it seems odd for one to treat it as encap and the
>>> other to treat it as normal UDP - one of them must be misinterpreting it
>>> (unless the UDP is going to a userspace tunnel endpoint, but I'm ignoring
>>> that complication for now).
>>
>> Disabling offloading of packets is never going to cause data corruptions
>> or misinterpretations. In some cases we can hint the network card to do
>> even more (RSS+checksumming). We always have a safe choice, namely not
>> doing hw offloading.
>
> Agreed.  Also we need to keep in mind that in many cases things like
> RSS and checksumming can be very easily made port specific since what
> we are talking about is just what is reported in the Rx descriptor and
> not any sort of change to the packet data.
>
>> Multicast is often scoped, in some cases we have different multicast
>> scopes but the same addresses. In case of scoped traffic, we must verify
>> the device as well and can't install the same flow on every NIC.
>
> Right.  Hopefully the NIC vendors are thinking ahead and testing to
> validate such cases where multicast or broadcast traffic doesn't do
> anything weird to their NICs in terms of offloads.
>
>>> At a given physical point in the network, a given UDP flow either is or is
>>> not carrying encapsulated traffic, and if it tries to be both then things
>>> are certain to break, just as much as if two different applications try to
>>> use the same UDP flow for two different application protocols.
>>
>> I think the example Tom was hinting at initially is like that:
>>
>> A net namespace acts as a router and has a vxlan endpoint active. The
>> vxlan endpoint enables vxlan offloading on all net_devices in the same
>> namespace. Because we only identify the tunnel endpoint by UDP port
>> number, traffic which should actually just be forwarded and should never
>> be processed locally suddenly can become processed by the offloading hw
>> units. Because UDP ports only form a contract between the end points and
>> not with the router in between it would be illegal to treat those not
>> locally designated packets as vxlan by the router.
>
> Yes.  The problem is I am sure there are some vendors out there
> wanting to tout their product as being excellent at routing VXLAN
> traffic so they are probably exploiting this to try and claim
> performance gains.
>
> There is also some argument to be had for theory versus application.
> Arguably it is the customers that are leading to some of the dirty
> hacks as I think vendors are building NICs based on customer use cases
> versus following any specifications.  In most data centers the tunnel
> underlays will be deployed throughout the network and UDP will likely
> be blocked for anything that isn't being used explicitly for
> tunneling.  As such we seem to be seeing a lot of NICs that are only
> supporting one port for things like this instead of designing them to
> handle whatever we can throw at them.
>
Actually, I don't believe that's true. It is not typical to deploy
firewalls within a data center fabric, and nor do we restrict
applications from binding to any UDP ports and they can pretty much
transmit to any port on any host without cost using an unconnected UDP
socket. I think it's more likely that NIC (and switch vendors) simply
assumed that port numbers can be treated as global values. That's
expedient and at small scale we can probably get away with it, but at
large scale this will eventually bite someone.

> I really think it may be a few more years before we hit the point
> where the vendors start to catch a clue about the fact that they need
> to have a generic approach that works in all cases versus what we have
> now where they are supporting whatever the buzzword of the day is and
> not looking much further down the road than that.  The fact is in a
> few years time we might even have to start dealing with
> tunnel-in-tunnel type 

Re: [PATCH v4 00/19] CALIPSO Implementation

2016-06-21 Thread Paul Moore
On Tue, Jun 21, 2016 at 5:55 AM, Huw Davies  wrote:
> On Tue, Jun 21, 2016 at 05:39:28AM -0400, David Miller wrote:
>> From: Huw Davies 
>> Date: Mon, 20 Jun 2016 14:36:40 +0100
>>
>> > This patch series implements RFC 5570 - Common Architecture Label IPv6
>> > Security Option (CALIPSO).  Its goal is to set MLS sensitivity labels
>> > on IPv6 packets using a hop-by-hop option.  CALIPSO is very similar to
>> > its IPv4 cousin CIPSO and much of this series is based on that code.
>>
>> What tree do you expect to integrate this?
>
> My understanding is that Paul Moore is happy to take them
> in via the SELinux tree.  However, these patches do touch
> some core networking code, such as the IPv6 option handling
> code (in a similar manner to the way CIPSO touched the IPv4
> option code), so if you have any comments on those aspects
> that would be good to hear.

Huw is correct.  I haven't yet gone through this latest patchset,
although I've reviewed the previous versions and provided feedback;
the v3 revision looked pretty good to me and I'm assuming I won't find
any showstoppers in the v4 revision.

I'm happy to push this upstream via the SELinux tree, but only with an
implicit ACK from DaveM since it does touch some of the core stack (in ways
comparable to what we did for CIPSO and IPv4).  If DaveM would prefer
to merge these patches via the netdev tree, that's fine too, I'll take
a closer look later this week and send my ACKs.

DaveM, let me know how you want to proceed with this patchset.

-- 
paul moore
www.paul-moore.com


[PATCH iproute2 net-next] bridge: man: fix "brige" typo

2016-06-21 Thread Vivien Didelot
Signed-off-by: Vivien Didelot 
---
 man/man8/bridge.8 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/man/man8/bridge.8 b/man/man8/bridge.8
index 1818542..ac42118 100644
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@@ -234,7 +234,7 @@ error.
 .sp
 
 .B 1
-- STP LISTENING state. Only valid if STP is enabled on the brige. In this
+- STP LISTENING state. Only valid if STP is enabled on the bridge. In this
 state the port for list for STP BPDUs and drop all other traffic.
 .sp
 
-- 
2.9.0



[PATCH iproute2 net-next] bridge: vlan: fix a few "fdb" typos in vlan doc

2016-06-21 Thread Vivien Didelot
Signed-off-by: Vivien Didelot 
---
 bridge/vlan.c | 2 +-
 man/man8/bridge.8 | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bridge/vlan.c b/bridge/vlan.c
index 717025a..a8a2e1d 100644
--- a/bridge/vlan.c
+++ b/bridge/vlan.c
@@ -299,6 +299,6 @@ int do_vlan(int argc, char **argv)
} else
return vlan_show(0, NULL);
 
-   fprintf(stderr, "Command \"%s\" is unknown, try \"bridge fdb 
help\".\n", *argv);
+   fprintf(stderr, "Command \"%s\" is unknown, try \"bridge vlan 
help\".\n", *argv);
exit(-1);
 }
diff --git a/man/man8/bridge.8 b/man/man8/bridge.8
index 08e8a5b..1818542 100644
--- a/man/man8/bridge.8
+++ b/man/man8/bridge.8
@@ -551,8 +551,8 @@ device is the bridge device.
 .BI master
 the vlan is configured on the software bridge (default).
 
-.SS bridge vlan delete - delete a forwarding database entry
-This command removes an existing fdb entry.
+.SS bridge vlan delete - delete a vlan filter entry
+This command removes an existing vlan filter entry.
 
 .PP
 The arguments are the same as with
-- 
2.9.0



[PATCH v2 1/2] netfilter/nflog: nflog-range does not truncate packets

2016-06-21 Thread Vishwanath Pai
netfilter/nflog: nflog-range does not truncate packets

li->u.ulog.copy_len is currently ignored by the kernel, we should truncate
the packet to either li->u.ulog.copy_len (if set) or copy_range before
sending it to userspace. 0 is a valid input for copy_len, so add a new
flag to indicate whether this option was specified by the user or not.

Add two flags to indicate whether nflog-size/copy_len was set or not.
XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log

On the userspace side, this was initially represented by the option
nflog-range, this will be replaced by --nflog-size now. --nflog-range would
still exist but does not do anything.

Reported-by: Joe Dollard 
Reviewed-by: Josh Hunt 
Signed-off-by: Vishwanath Pai 

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 57639fc..83d855b 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -12,6 +12,9 @@
 #define NF_LOG_UID 0x08/* Log UID owning local socket */
 #define NF_LOG_MASK0x0f
 
+/* This flag indicates that copy_len field in nf_loginfo is set */
+#define NF_LOG_F_COPY_LEN  0x1
+
 enum nf_log_type {
NF_LOG_TYPE_LOG = 0,
NF_LOG_TYPE_ULOG,
@@ -22,9 +25,13 @@ struct nf_loginfo {
u_int8_t type;
union {
struct {
+   /* copy_len will be used iff you set
+* NF_LOG_F_COPY_LEN in flags
+*/
u_int32_t copy_len;
u_int16_t group;
u_int16_t qthreshold;
+   u_int16_t flags;
} ulog;
struct {
u_int8_t level;
diff --git a/include/uapi/linux/netfilter/xt_NFLOG.h 
b/include/uapi/linux/netfilter/xt_NFLOG.h
index 87b5831..f330707 100644
--- a/include/uapi/linux/netfilter/xt_NFLOG.h
+++ b/include/uapi/linux/netfilter/xt_NFLOG.h
@@ -6,9 +6,13 @@
 #define XT_NFLOG_DEFAULT_GROUP 0x1
 #define XT_NFLOG_DEFAULT_THRESHOLD 0
 
-#define XT_NFLOG_MASK  0x0
+#define XT_NFLOG_MASK  0x1
+
+/* This flag indicates that 'len' field in xt_nflog_info is set*/
+#define XT_NFLOG_F_COPY_LEN0x1
 
 struct xt_nflog_info {
+   /* 'len' will be used iff you set XT_NFLOG_F_COPY_LEN in flags */
__u32   len;
__u16   group;
__u16   threshold;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 11f81c8..cbcfdfb 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net,
break;
 
case NFULNL_COPY_PACKET:
-   if (inst->copy_range > skb->len)
+   data_len = inst->copy_range;
+   if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) &&
+   (li->u.ulog.copy_len < data_len))
+   data_len = li->u.ulog.copy_len;
+
+   if (data_len > skb->len)
data_len = skb->len;
-   else
-   data_len = inst->copy_range;
 
size += nla_total_size(data_len);
break;
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a1fa2c8..018eed7 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param 
*par)
li.u.ulog.group  = info->group;
li.u.ulog.qthreshold = info->threshold;
 
+   if (info->flags & XT_NFLOG_F_COPY_LEN)
+   li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
+
nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
  par->out, &li, info->prefix);
return XT_CONTINUE;


[PATCH v2 2/2] netfilter/nflog: nflog-range does not truncate packets (userspace)

2016-06-21 Thread Vishwanath Pai
netfilter/nflog: nflog-range does not truncate packets

The option --nflog-range has never worked, but we cannot just fix this
because users might be using this feature option and their behavior would
change. Instead add a new option --nflog-size. This option works the same
way nflog-range should have, and both of them are mutually exclusive. When
someone uses --nflog-range we print a warning message informing them that
this feature has no effect.

To indicate to the kernel that the user has set --nflog-size we have to pass a
new flag XT_NFLOG_F_COPY_LEN.

Also updated the man page to reflect this.

Reported-by: Joe Dollard 
Reviewed-by: Josh Hunt 
Signed-off-by: Vishwanath Pai 

diff --git a/extensions/libxt_NFLOG.c b/extensions/libxt_NFLOG.c
index f611631..8c564a2 100644
--- a/extensions/libxt_NFLOG.c
+++ b/extensions/libxt_NFLOG.c
@@ -12,7 +12,10 @@ enum {
O_GROUP = 0,
O_PREFIX,
O_RANGE,
+   O_SIZE,
O_THRESHOLD,
+   F_RANGE = 1 << O_RANGE,
+   F_SIZE = 1 << O_SIZE,
 };
 
 #define s struct xt_nflog_info
@@ -22,7 +25,9 @@ static const struct xt_option_entry NFLOG_opts[] = {
{.name = "nflog-prefix", .id = O_PREFIX, .type = XTTYPE_STRING,
 .min = 1, .flags = XTOPT_PUT, XTOPT_POINTER(s, prefix)},
{.name = "nflog-range", .id = O_RANGE, .type = XTTYPE_UINT32,
-.flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+.excl = F_SIZE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+   {.name = "nflog-size", .id = O_SIZE, .type = XTTYPE_UINT32,
+.excl = F_RANGE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
{.name = "nflog-threshold", .id = O_THRESHOLD, .type = XTTYPE_UINT16,
 .flags = XTOPT_PUT, XTOPT_POINTER(s, threshold)},
XTOPT_TABLEEND,
@@ -33,7 +38,8 @@ static void NFLOG_help(void)
 {
printf("NFLOG target options:\n"
   " --nflog-group NUM  NETLINK group used for 
logging\n"
-  " --nflog-range NUM  Number of byte to copy\n"
+  " --nflog-range NUM  This option has no effect, use 
--nflog-size\n"
+  " --nflog-size NUM   Number of bytes to copy\n"
   " --nflog-threshold NUM  Message threshold of in-kernel 
queue\n"
   " --nflog-prefix STRING  Prefix string for log 
messages\n");
 }
@@ -57,6 +63,18 @@ static void NFLOG_parse(struct xt_option_call *cb)
}
 }
 
+static void NFLOG_check(struct xt_fcheck_call *cb)
+{
+   struct xt_nflog_info *info = cb->data;
+
+   if (cb->xflags & F_RANGE)
+   fprintf(stderr, "warn: --nflog-range has never worked and is no"
+   " longer supported, please use --nflog-size insted\n");
+
+   if (cb->xflags & F_SIZE)
+   info->flags |= XT_NFLOG_F_COPY_LEN;
+}
+
 static void nflog_print(const struct xt_nflog_info *info, char *prefix)
 {
if (info->prefix[0] != '\0') {
@@ -65,7 +83,9 @@ static void nflog_print(const struct xt_nflog_info *info, 
char *prefix)
}
if (info->group)
printf(" %snflog-group %u", prefix, info->group);
-   if (info->len)
+   if (info->len && info->flags & XT_NFLOG_F_COPY_LEN)
+   printf(" %snflog-size %u", prefix, info->len);
+   else if (info->len)
printf(" %snflog-range %u", prefix, info->len);
if (info->threshold != XT_NFLOG_DEFAULT_THRESHOLD)
printf(" %snflog-threshold %u", prefix, info->threshold);
@@ -117,6 +137,7 @@ static struct xtables_target nflog_target = {
.userspacesize  = XT_ALIGN(sizeof(struct xt_nflog_info)),
.help   = NFLOG_help,
.init   = NFLOG_init,
+   .x6_fcheck  = NFLOG_check,
.x6_parse   = NFLOG_parse,
.print  = NFLOG_print,
.save   = NFLOG_save,
diff --git a/extensions/libxt_NFLOG.man b/extensions/libxt_NFLOG.man
index 1b6dbf1..318e630 100644
--- a/extensions/libxt_NFLOG.man
+++ b/extensions/libxt_NFLOG.man
@@ -17,6 +17,9 @@ A prefix string to include in the log message, up to 64 
characters
 long, useful for distinguishing messages in the logs.
 .TP
 \fB\-\-nflog\-range\fP \fIsize\fP
+This option has never worked, use --nflog-size instead
+.TP
+\fB\-\-nflog\-size\fP \fIsize\fP
 The number of bytes to be copied to userspace (only applicable for
 nfnetlink_log). nfnetlink_log instances may specify their own
 range, this option overrides it.
diff --git a/include/linux/netfilter/xt_NFLOG.h 
b/include/linux/netfilter/xt_NFLOG.h
index 87b5831..f330707 100644
--- a/include/linux/netfilter/xt_NFLOG.h
+++ b/include/linux/netfilter/xt_NFLOG.h
@@ -6,9 +6,13 @@
 #define XT_NFLOG_DEFAULT_GROUP 0x1
 #define XT_NFLOG_DEFAULT_THRESHOLD 0
 
-#define XT_NFLOG_MASK  0x0
+#define XT_NFLOG_MASK  0x1
+
+/* This flag indicates that 'len' field in xt_nflog_info is set*/

Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Alexander Duyck
On Tue, Jun 21, 2016 at 10:40 AM, Hannes Frederic Sowa
 wrote:
> On 21.06.2016 10:27, Edward Cree wrote:
>> On 21/06/16 18:05, Alexander Duyck wrote:
>>> On Tue, Jun 21, 2016 at 1:22 AM, David Miller  wrote:
>>>> But anyways, the vastness of the key is why we want to keep "sockets"
>>>> out of network cards, because proper support of "sockets" requires
>>>> access to information the card simply does not and should not have.
>>> Right.  Really what I would like to see for most of these devices is a
>>> 2 tuple filter where you specify the UDP port number, and the PF/VF ID
>>> that the traffic is received on.
>> But that doesn't make sense - the traffic is received on a physical network
>> port, and it's the headers (i.e. flow) at that point that determine whether
>> the traffic is encap or not.  After all, those headers are all that can
>> determine which PF or VF it's sent to; and if it's multicast and goes to
>> more than one of them, it seems odd for one to treat it as encap and the
>> other to treat it as normal UDP - one of them must be misinterpreting it
>> (unless the UDP is going to a userspace tunnel endpoint, but I'm ignoring
>> that complication for now).
>
> Disabling offloading of packets is never going to cause data corruptions
> or misinterpretations. In some cases we can hint the network card to do
> even more (RSS+checksumming). We always have a safe choice, namely not
> doing hw offloading.

Agreed.  Also we need to keep in mind that in many cases things like
RSS and checksumming can be very easily made port specific since what
we are talking about is just what is reported in the Rx descriptor and
not any sort of change to the packet data.

> Multicast is often scoped, in some cases we have different multicast
> scopes but the same addresses. In case of scoped traffic, we must verify
> the device as well and can't install the same flow on every NIC.

Right.  Hopefully the NIC vendors are thinking ahead and testing to
validate such cases where multicast or broadcast traffic doesn't do
anything weird to their NICs in terms of offloads.

>> At a given physical point in the network, a given UDP flow either is or is
>> not carrying encapsulated traffic, and if it tries to be both then things
>> are certain to break, just as much as if two different applications try to
>> use the same UDP flow for two different application protocols.
>
> I think the example Tom was hinting at initially is like that:
>
> A net namespace acts as a router and has a vxlan endpoint active. The
> vxlan endpoint enables vxlan offloading on all net_devices in the same
> namespace. Because we only identify the tunnel endpoint by UDP port
> number, traffic which should actually just be forwarded and should never
> be processed locally suddenly can become processed by the offloading hw
> units. Because UDP ports only form a contract between the end points and
> not with the router in between it would be illegal to treat those not
> locally designated packets as vxlan by the router.

Yes.  The problem is I am sure there are some vendors out there
wanting to tout their product as being excellent at routing VXLAN
traffic so they are probably exploiting this to try and claim
performance gains.

There is also some argument to be had for theory versus application.
Arguably it is the customers that are leading to some of the dirty
hacks as I think vendors are building NICs based on customer use cases
versus following any specifications.  In most data centers the tunnel
underlays will be deployed throughout the network and UDP will likely
be blocked for anything that isn't being used explicitly for
tunneling.  As such we seem to be seeing a lot of NICs that are only
supporting one port for things like this instead of designing them to
handle whatever we can throw at them.

I really think it may be a few more years before we hit the point
where the vendors start to catch a clue about the fact that they need
to have a generic approach that works in all cases versus what we have
now where they are supporting whatever the buzzword of the day is and
not looking much further down the road than that.  The fact is in a
few years time we might even have to start dealing with
tunnel-in-tunnel type workloads to address the use of containers
inside of KVM guests.  I'm pretty sure we don't have support for
recursive tunnel offloads in hardware and likely never will.  To that
end all I would really need is support for CHECKSUM_COMPLETE or outer
Rx checksums enabled, RSS based on the outer source port assuming the
destination port is recognized as a tunnel, the ability to have DF bit
set for any of the inner tunnel headers, and GSO partial extended to
support tunnel-in-tunnel scenarios.

- Alex


Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread Stephen Hemminger
On Tue, 21 Jun 2016 19:17:31 +0200
Phil Sutter  wrote:

> On Tue, Jun 21, 2016 at 11:13:11AM -0600, David Ahern wrote:
> > On 6/21/16 11:03 AM, Phil Sutter wrote:
> > >> I downloaded CentOS 5 and 6. iproute2 fails to compile on CentOS 5.11;
> > >> ip command builds on 6.8 but with a flurry of redefinition errors
> > >> (BUILD_BUG_ON), but fails at tc.
> > >
> > > What's the exact error message please? Maybe some incompatibility in
> > > kernel headers? Although that shouldn't be ...
> [...]
> >  CC   tc_bpf.o
> > tc_bpf.c:41:26: error: linux/if_alg.h: No such file or directory
> 
> Ah! Looks like this header is missing in iproute2's copy of kernel
> headers.
> 
> Stephen, would you import the missing one?

I ran a script and imported all the headers that were linux/if_*.h and used
by current source.


Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Edward Cree
On 21/06/16 18:40, Hannes Frederic Sowa wrote:
> On 21.06.2016 10:27, Edward Cree wrote:
>> At a given physical point in the network, a given UDP flow either is or is
>> not carrying encapsulated traffic, and if it tries to be both then things
>> are certain to break, just as much as if two different applications try to
>> use the same UDP flow for two different application protocols.
> I think the example Tom was hinting at initially is like that:
>
> A net namespace acts as a router and has a vxlan endpoint active. The
> vxlan endpoint enables vxlan offloading on all net_devices in the same
> namespace. Because we only identify the tunnel endpoint by UDP port
> number, traffic which should actually just be forwarded and should never
> be processed locally suddenly can become processed by the offloading hw
> units. Because UDP ports only form a contract between the end points and
> not with the router in between it would be illegal to treat those not
> locally designated packets as vxlan by the router.
Oh indeed, what we currently do is broken.  We would have to identify, for
each interface, which (if any) UDP flows on that interface correspond to
our vxlan endpoints, rather than (as now) saying that any UDP port that
matches any endpoint must be vxlan on all interfaces.
But as long as a vxlan endpoint is a device built on top of another device,
it can just ask that device to enable offloads for the corresponding flow;
and if that device is also a software device, it might modify the flow spec
before passing on the request to _its_ underlying device, or it might just
drop the request because it doesn't support it.
The problem is, AIUI the device is currently only used for transmitting;
anything received on any device that makes it through the IP stack to the
vxlan UDP socket is treated as vxlan.  And determining which interfaces'
traffic will get delivered locally and which will get routed is not
necessarily trivial.  Perhaps vxlan devices need to only receive traffic
that came through their underlying device?  Then the mapping to offload
becomes much simpler.

-Ed
> Also multicast traffic is always scoped, so the flow has to include the
> ifindex at least to allow differentiation between different scopes.


[PATCH iproute2 net-next v2] bridge: vlan: add support to display per-vlan statistics

2016-06-21 Thread Nikolay Aleksandrov
This patch adds support for the stats argument to the bridge
vlan command which will display the per-vlan statistics and the bridge
device each vlan belongs to. The supported command filtering options are
dev and vid. Also the man page is updated to explain the new option.
This patch uses the new RTM_GETSTATS interface with a filter_mask to dump
only the bridge vlans. Later we can add support for using the per-device
dump and filter it in the kernel instead.

Example:
$ bridge vlan stats
port vlan id
br0   1
RX: 34816114 bytes 495195 packets
TX: 68501306 bytes 987149 packets
  100
RX: 0 bytes 0 packets
TX: 0 bytes 0 packets
  200
RX: 0 bytes 0 packets
TX: 0 bytes 0 packets
  300
RX: 0 bytes 0 packets
TX: 0 bytes 0 packets
  301
RX: 169562135 bytes 790877 packets
TX: 169550926 bytes 790824 packets
br1   1
RX: 0 bytes 0 packets
TX: 0 bytes 0 packets

Note that it will print the per-vlan statistics for all vlans in a bridge
even if the vlan is only added to ports. Later when we add per-port
per-vlan statistics support, we'll be able to print the exact ports each
vlan belongs to, not only the bridge.

Signed-off-by: Nikolay Aleksandrov 
---
v2: Change the output format as per Stephen's comment and change the -s use
to a subcommand called stats in order to have a different format than show,
update the man page appropriately.

 bridge/vlan.c| 117 +++
 include/libnetlink.h |   8 
 lib/libnetlink.c |  20 +
 man/man8/bridge.8|  23 +-
 4 files changed, 150 insertions(+), 18 deletions(-)

diff --git a/bridge/vlan.c b/bridge/vlan.c
index 717025ae6eec..e251a5cae917 100644
--- a/bridge/vlan.c
+++ b/bridge/vlan.c
@@ -14,12 +14,14 @@
 #include "utils.h"
 
 static unsigned int filter_index, filter_vlan;
+static int last_ifidx = -1;
 
 static void usage(void)
 {
-   fprintf(stderr, "Usage: bridge vlan { add | del } vid VLAN_ID dev DEV [ 
pvid] [ untagged ]\n");
+   fprintf(stderr, "Usage: bridge vlan { add | del } vid VLAN_ID dev DEV [ 
pvid ] [ untagged ]\n");
fprintf(stderr, " [ 
self ] [ master ]\n");
fprintf(stderr, "   bridge vlan { show } [ dev DEV ] [ vid VLAN_ID 
]\n");
+   fprintf(stderr, "   bridge vlan { stats } [ dev DEV ] [ vid VLAN_ID 
]\n");
exit(-1);
 }
 
@@ -236,7 +238,67 @@ static int print_vlan(const struct sockaddr_nl *who,
return 0;
 }
 
-static int vlan_show(int argc, char **argv)
+static void print_one_vlan_stats(FILE *fp,
+const struct bridge_vlan_xstats *vstats,
+int ifindex)
+{
+   const char *ifname = "";
+
+   if (filter_vlan && filter_vlan != vstats->vid)
+   return;
+   if (last_ifidx != ifindex) {
+   ifname = ll_index_to_name(ifindex);
+   last_ifidx = ifindex;
+   }
+   fprintf(fp, "%-16s  %hu\n", ifname, vstats->vid);
+   fprintf(fp, "%-16sRX: %llu bytes %llu packets\n",
+   "", vstats->rx_bytes, vstats->rx_packets);
+   fprintf(fp, "%-16sTX: %llu bytes %llu packets\n",
+   "", vstats->tx_bytes, vstats->tx_packets);
+}
+
+static int print_vlan_stats(const struct sockaddr_nl *who,
+   struct nlmsghdr *n,
+   void *arg)
+{
+   struct rtattr *tb[IFLA_STATS_MAX+1], *brtb[LINK_XSTATS_TYPE_MAX+1];
+   struct if_stats_msg *ifsm = NLMSG_DATA(n);
+   struct rtattr *i, *list;
+   int len = n->nlmsg_len;
+   FILE *fp = arg;
+   int rem;
+
+   len -= NLMSG_LENGTH(sizeof(*ifsm));
+   if (len < 0) {
+   fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+   return -1;
+   }
+
+   if (filter_index && filter_index != ifsm->ifindex)
+   return 0;
+
+   parse_rtattr(tb, IFLA_STATS_MAX, IFLA_STATS_RTA(ifsm), len);
+   if (!tb[IFLA_STATS_LINK_XSTATS])
+   return 0;
+
+   parse_rtattr(brtb, LINK_XSTATS_TYPE_MAX,
+RTA_DATA(tb[IFLA_STATS_LINK_XSTATS]),
+RTA_PAYLOAD(tb[IFLA_STATS_LINK_XSTATS]));
+   if (!brtb[LINK_XSTATS_TYPE_BRIDGE])
+   return 0;
+
+   list = brtb[LINK_XSTATS_TYPE_BRIDGE];
+   rem = RTA_PAYLOAD(list);
+   for (i = RTA_DATA(list); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
+   if (i->rta_type != BRIDGE_XSTATS_VLAN)
+   continue;
+   print_one_vlan_stats(fp, RTA_DATA(i), ifsm->ifindex);
+   }
+   fflush(fp);
+   return 0;
+}
+

Re: [PATCH iproute2 net-next v2] bridge: vlan: add support to display per-vlan statistics

2016-06-21 Thread Nikolay Aleksandrov
On 21/06/16 20:07, Nikolay Aleksandrov wrote:
> This patch adds support for the stats argument to the bridge
> vlan command which will display the per-vlan statistics and the bridge
> device each vlan belongs to. The supported command filtering options are
> dev and vid. Also the man page is updated to explain the new option.
> This patch uses the new RTM_GETSTATS interface with a filter_mask to dump
> only the bridge vlans. Later we can add support for using the per-device
> dump and filter it in the kernel instead.
> 
> Example:
> $ bridge vlan stats
> port vlan id
> br0   1
> RX: 34816114 bytes 495195 packets
> TX: 68501306 bytes 987149 packets
>   100
> RX: 0 bytes 0 packets
> TX: 0 bytes 0 packets
>   200
> RX: 0 bytes 0 packets
> TX: 0 bytes 0 packets
>   300
> RX: 0 bytes 0 packets
> TX: 0 bytes 0 packets
>   301
> RX: 169562135 bytes 790877 packets
> TX: 169550926 bytes 790824 packets
> br1   1
> RX: 0 bytes 0 packets
> TX: 0 bytes 0 packets
> 
> Note that it will print the per-vlan statistics for all vlans in a bridge
> even if the vlan is only added to ports. Later when we add per-port
> per-vlan statistics support, we'll be able to print the exact ports each
> vlan belongs to, not only the bridge.
> 
> Signed-off-by: Nikolay Aleksandrov 
> ---
> v2: Change the output format as per Stephen's comment and change the -s use
> to a subcommand called stats in order to have a different format than show,
> update the man page appropriately.
> 

I forgot to add - I also tested this in a small VM window and it fits
well. :-)

Cheers,
 Nik




Re: [alsa-devel] [very-RFC 0/8] TSN driver for the kernel

2016-06-21 Thread Pierre-Louis Bossart

On 6/20/16 5:18 AM, Richard Cochran wrote:

On Mon, Jun 20, 2016 at 01:08:27PM +0200, Pierre-Louis Bossart wrote:

The ALSA API provides support for 'audio' timestamps (playback/capture rate
defined by audio subsystem) and 'system' timestamps (typically linked to
TSC/ART) with one option to take synchronized timestamps should the hardware
support them.


Thanks for the info.  I just skimmed Documentation/sound/alsa/timestamping.txt.

That is fairly new, only since v4.1.  Are there any apps in the wild
that I can look at?  AFAICT, OpenAVB, gstreamer, etc, don't use the
new API.


The ALSA API supports a generic .get_time_info callback, its 
implementation is for now limited to a regular 'DMA' or 'link' timestamp 
for HDaudio - the difference being which counters are used and how close 
they are to the link serializer. The synchronized part is still WIP but 
should come 'soon'





The intent was that the 'audio' timestamps are translated to a shared time
reference managed in userspace by gPTP, which in turn would define if
(adaptive) audio sample rate conversion is needed. There is no support at
the moment for a 'play_at' function in ALSA, only means to control a
feedback loop.


Documentation/sound/alsa/timestamping.txt says:

  If supported in hardware, the absolute link time could also be used
  to define a precise start time (patches WIP)

Two questions:

1. Where are the patches?  (If some are coming, I would appreciate
   being on CC!)

2. Can you mention specific HW that would support this?


You can experiment with the 'dma' and 'link' timestamps today on any 
HDaudio-based device. Like I said the synchronized part has not been 
upstreamed yet (delays + dependency on ART-to-TSC conversions that made 
it in the kernel recently)




Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Hannes Frederic Sowa
On 21.06.2016 10:27, Edward Cree wrote:
> On 21/06/16 18:05, Alexander Duyck wrote:
>> On Tue, Jun 21, 2016 at 1:22 AM, David Miller  wrote:
>>> But anyways, the vastness of the key is why we want to keep "sockets"
>>> out of network cards, because proper support of "sockets" requires
>>> access to information the card simply does not and should not have.
>> Right.  Really what I would like to see for most of these devices is a
>> 2 tuple filter where you specify the UDP port number, and the PF/VF ID
>> that the traffic is received on.
> But that doesn't make sense - the traffic is received on a physical network
> port, and it's the headers (i.e. flow) at that point that determine whether
> the traffic is encap or not.  After all, those headers are all that can
> determine which PF or VF it's sent to; and if it's multicast and goes to
> more than one of them, it seems odd for one to treat it as encap and the
> other to treat it as normal UDP - one of them must be misinterpreting it
> (unless the UDP is going to a userspace tunnel endpoint, but I'm ignoring
> that complication for now).

Disabling offloading of packets is never going to cause data corruptions
or misinterpretations. In some cases we can hint the network card to do
even more (RSS+checksumming). We always have a safe choice, namely not
doing hw offloading.

Multicast is often scoped, in some cases we have different multicast
scopes but the same addresses. In case of scoped traffic, we must verify
the device as well and can't install the same flow on every NIC.

> At a given physical point in the network, a given UDP flow either is or is
> not carrying encapsulated traffic, and if it tries to be both then things
> are certain to break, just as much as if two different applications try to
> use the same UDP flow for two different application protocols.

I think the example Tom was hinting at initially is like that:

A net namespace acts as a router and has a vxlan endpoint active. The
vxlan endpoint enables vxlan offloading on all net_devices in the same
namespace. Because we only identify the tunnel endpoint by UDP port
number, traffic which should actually just be forwarded and should never
be processed locally suddenly can become processed by the offloading hw
units. Because UDP ports only form a contract between the end points and
not with the router in between it would be illegal to treat those not
locally designated packets as vxlan by the router.

Also multicast traffic is always scoped, so the flow has to include the
ifindex at least to allow differentiation between different scopes.

Bye,
Hannes




Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Edward Cree
On 21/06/16 18:05, Alexander Duyck wrote:
> On Tue, Jun 21, 2016 at 1:22 AM, David Miller  wrote:
>> But anyways, the vastness of the key is why we want to keep "sockets"
>> out of network cards, because proper support of "sockets" requires
>> access to information the card simply does not and should not have.
> Right.  Really what I would like to see for most of these devices is a
> 2 tuple filter where you specify the UDP port number, and the PF/VF ID
> that the traffic is received on.
But that doesn't make sense - the traffic is received on a physical network
port, and it's the headers (i.e. flow) at that point that determine whether
the traffic is encap or not.  After all, those headers are all that can
determine which PF or VF it's sent to; and if it's multicast and goes to
more than one of them, it seems odd for one to treat it as encap and the
other to treat it as normal UDP - one of them must be misinterpreting it
(unless the UDP is going to a userspace tunnel endpoint, but I'm ignoring
that complication for now).
At a given physical point in the network, a given UDP flow either is or is
not carrying encapsulated traffic, and if it tries to be both then things
are certain to break, just as much as if two different applications try to
use the same UDP flow for two different application protocols.

-Ed


Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-21 Thread Tom Herbert
On Tue, Jun 21, 2016 at 10:11 AM, Hannes Frederic Sowa
 wrote:
> On 17.06.2016 20:52, Tom Herbert wrote:
>>
>>> > Rather, I think people are going to start adding rules to block TOU
>>> > tunnels entirely because they cannot inspect nor conditionally
>>> > filter/rewrite the contents.  This is even more likely if Joe Random
>>> > and so easily can do their own userland TCP stack over TOU.
>>> >
>> Unfortunately, encryption is the only proven solution to protocol
>> ossification. If the network doesn't see it, it can't ossify it.
>
> DTLS carries still a lot of information, both in its handshake, as well
> as in the actual framing. The protocol is basically only TLS on top of
> datagrams and as such implements connection establishment and tear down
> of connections, which middle boxes can certainly track. It will just be
> a matter of time until middle boxes and security appliances will be able
> to track those connections, maybe not being able to inspect the content
> but at least see the certificates in clear-text and as such also have
> the common names and other addressing information at hand. The meta-data
> might certainly be trackable.
>
> Because of replay protection you actually can infer the number of bytes
> transferred and someone can end up building congestion control on a
> middle box based on that, infer retransmissions etc.
>
Right, it's probably impossible to completely eliminate track-ability.
But hopefully we can keep the plain text information to the absolute
minimum needed to send the packet over the network and decrypt it at
the receiver.

One interesting characteristic of disassociated location is that we
could purposely try to manipulate ECMP so that every packet for a flow
take different paths so no single device (assuming multi-path) can
reconstruct the whole communication (kind of like spread spectrum for
the Internet). I imagine there might be some environments
where paranoids might want to do this.

Tom

> Bye,
> Hannes
>


Re: [alsa-devel] [very-RFC 0/8] TSN driver for the kernel

2016-06-21 Thread Pierre-Louis Bossart

On 6/20/16 5:31 AM, Richard Cochran wrote:

On Mon, Jun 20, 2016 at 02:18:38PM +0200, Richard Cochran wrote:

Documentation/sound/alsa/timestamping.txt says:


   Examples of typestamping with HDaudio:

   1. DMA timestamp, no compensation for DMA+analog delay
   $ ./audio_time  -p --ts_type=1

Where is this "audio_time" program of which you speak?


alsa-lib/test



Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread Phil Sutter
On Tue, Jun 21, 2016 at 11:13:11AM -0600, David Ahern wrote:
> On 6/21/16 11:03 AM, Phil Sutter wrote:
> >> I downloaded CentOS 5 and 6. iproute2 fails to compile on CentOS 5.11;
> >> ip command builds on 6.8 but with a flurry of redefinition errors
> >> (BUILD_BUG_ON), but fails at tc.
> >
> > What's the exact error message please? Maybe some incompatibility in
> > kernel headers? Although that shouldn't be ...
[...]
>  CC   tc_bpf.o
> tc_bpf.c:41:26: error: linux/if_alg.h: No such file or directory

Ah! Looks like this header is missing in iproute2's copy of kernel
headers.

Stephen, would you import the missing one?

Thanks, Phil


Re: [iproute PATCH v2 7/7] ip/tcp_metrics: Simplify process_msg a bit

2016-06-21 Thread Stephen Hemminger
On Tue, 21 Jun 2016 18:18:41 +0200
Phil Sutter  wrote:

> By combining the attribute extraction and check for existence, the
> additional indentation level in the 'else' clause can be avoided.
> 
> In addition to that, common actions for 'daddr' are combined since the
> function returns if neither of the branches are taken.
> 
> Signed-off-by: Phil Sutter 
> ---
>  ip/tcp_metrics.c | 45 ++---
>  1 file changed, 18 insertions(+), 27 deletions(-)
> 
> diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
> index f82604f458ada..899830c127bcb 100644
> --- a/ip/tcp_metrics.c
> +++ b/ip/tcp_metrics.c
> @@ -112,47 +112,38 @@ static int process_msg(const struct sockaddr_nl *who, 
> struct nlmsghdr *n,
>   parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
>len);
>  
> - a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
> - if (a) {
> + if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV4])) {

NAK, plus it fails checkpatch


Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-21 Thread Hannes Frederic Sowa
On 17.06.2016 20:52, Tom Herbert wrote:
> 
>> > Rather, I think people are going to start adding rules to block TOU
>> > tunnels entirely because they cannot inspect nor conditionally
>> > filter/rewrite the contents.  This is even more likely if Joe Random
>> > and so easily can do their own userland TCP stack over TOU.
>> >
> Unfortunately, encryption is the only proven solution to protocol
> ossification. If the network doesn't see it, it can't ossify it.

DTLS carries still a lot of information, both in its handshake, as well
as in the actual framing. The protocol is basically only TLS on top of
datagrams and as such implements connection establishment and tear down
of connections, which middle boxes can certainly track. It will just be
a matter of time until middle boxes and security appliances will be able
to track those connections, maybe not being able to inspect the content
but at least see the certificates in clear-text and as such also have
the common names and other addressing information at hand. The meta-data
might certainly be trackable.

Because of replay protection you actually can infer the number of bytes
transferred and someone can end up building congestion control on a
middle box based on that, infer retransmissions etc.

Bye,
Hannes



Re: [iproute PATCH v2 7/7] ip/tcp_metrics: Simplify process_msg a bit

2016-06-21 Thread Phil Sutter
On Tue, Jun 21, 2016 at 09:53:43AM -0700, Stephen Hemminger wrote:
> On Tue, 21 Jun 2016 18:18:41 +0200
> Phil Sutter  wrote:
> 
> > By combining the attribute extraction and check for existence, the
> > additional indentation level in the 'else' clause can be avoided.
> > 
> > In addition to that, common actions for 'daddr' are combined since the
> > function returns if neither of the branches are taken.
> > 
> > Signed-off-by: Phil Sutter 
> > ---
> >  ip/tcp_metrics.c | 45 ++---
> >  1 file changed, 18 insertions(+), 27 deletions(-)
> > 
> > diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
> > index f82604f458ada..899830c127bcb 100644
> > --- a/ip/tcp_metrics.c
> > +++ b/ip/tcp_metrics.c
> > @@ -112,47 +112,38 @@ static int process_msg(const struct sockaddr_nl *who, 
> > struct nlmsghdr *n,
> > parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
> >  len);
> >  
> > -   a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
> > -   if (a) {
> > +   if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV4])) {
> 
> NAK, plus it fails checkpatch

Oh well, I'll drop it then and get rid of the remaining checkpatch
warnings before resending.

Thanks, Phil


Re: [net-next PATCH v3 00/17] Future-proof tunnel offload handlers

2016-06-21 Thread Alexander Duyck
On Tue, Jun 21, 2016 at 1:22 AM, David Miller  wrote:
> From: Tom Herbert 
> Date: Mon, 20 Jun 2016 10:05:01 -0700
>
>> Generally, this means it needs to at least match by local addresses
>> and port for an unconnected/unbound socket, the source address for
>> an unconnected/bound socket, and the full 4-tuple for a connected
>> socket.
>
> These lookup keys are all insufficient.
>
> At the very least the network namespace must be in the lookup key as
> well if you want to match "sockets".  And this is just the tip of the
> iceberg in my opinion.
>
> The namespace bypassing to me is the biggest flaw in the UDP tunnel
> offloads.  That is creating real dangers right now.

I agree.  Fortunately this only really becomes an issue if SR-IOV is
enabled.  Otherwise the port based offloads only affect the PF as long
as no VFs are present.

> But anyways, the vastness of the key is why we want to keep "sockets"
> out of network cards, because proper support of "sockets" requires
> access to information the card simply does not and should not have.

Right.  Really what I would like to see for most of these devices is a
2 tuple filter where you specify the UDP port number, and the PF/VF ID
that the traffic is received on.  In order to get that we wouldn't
need any additional information from the API.  Then we at least have
indirect namespace isolation, and if someone really wanted to they
could do offloads on the VFs for different traffic.


Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread Phil Sutter
On Tue, Jun 21, 2016 at 10:24:37AM -0600, David Ahern wrote:
> On 6/21/16 10:18 AM, Phil Sutter wrote:
> > This big patch was compiled by vimgrepping for memset calls and changing
> > to C99 initializer if applicable. One notable exception is the
> > initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
> > for older gcc versions (at least <=3.4.6).
> >
> > Calls to memset for struct rtattr pointer fields for parse_rtattr*()
> > were just dropped since they are not needed.
> >
> > The changes here allowed the compiler to discover some unused variables,
> > so get rid of them, too.
> >
> > Signed-off-by: Phil Sutter 
> > ---
> > Changes since v1:
> > - Dropped former changes to tc/tc_bpf.c as they are incompatible to older
> >   gcc versions (at least <=3.4.6).
> 
> 
> What OS versions have you compiled iproute2 against?

Tested on Gentoo with old compiler installed, so apart from gcc the
system is up to date.

> I downloaded CentOS 5 and 6. iproute2 fails to compile on CentOS 5.11; 
> ip command builds on 6.8 but with a flurry of redefinition errors 
> (BUILD_BUG_ON), but fails at tc.

What's the exact error message please? Maybe some incompatibility in
kernel headers? Although that shouldn't be ...

Cheers, Phil


Re: [PATCH iproute2 net-next v3 1/5] json_writer: allow base json data type to be array or object

2016-06-21 Thread Stephen Hemminger
On Tue, 21 Jun 2016 09:24:50 -0700
Anuradha Karuppiah  wrote:

> On Tue, Jun 21, 2016 at 9:12 AM, Stephen Hemminger
>  wrote:
> > On Mon, 20 Jun 2016 23:39:43 -0700
> > Roopa Prabhu  wrote:
> >
> >> From: Anuradha Karuppiah 
> >>
> >> This patch adds a type qualifier to json_writer. Type can be a
> >> json object or array. This can be extended to other types like
> >> json-string, json-number etc in the future.
> >>
> >> Signed-off-by: Anuradha Karuppiah 
> >
> > Since json writer is not used in many places yet, why not just
> > get rid of the automatic object in the constructor.
> 
> I wanted to force the external api to start with an json-object or
> json-array. It reduces the chance of mistakes vs. a typeless
> constructor. With a typeless constructor you can accidentally end up
> with a json output that doesn't pass json lint; especially if optional
> params are being suppressed at different places.

Still, this is not how jsonwriter works in .NET, Android, or Java.
It is easily confusing to developers if similar API's behave
differently.  Kind of like if printf() always appended a new line
on some platforms.




Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-21 Thread Hannes Frederic Sowa
On 17.06.2016 09:51, Tom Herbert wrote:
> On Thu, Jun 16, 2016 at 4:15 PM, Hannes Frederic Sowa
>  wrote:
>> On 16.06.2016 19:51, Tom Herbert wrote:
>>> Transports over UDP is intended to encapsulate TCP and other transport
>>> protocols directly and securely in UDP.
>>>
>>> The goal of this work is twofold:
>>>
>>> 1) Allow applications to run their own transport layer stack (i.e.from
>>>userspace). This eliminates dependencies on the OS (e.g. solves a
>>>major dependency issue for Facebook on clients).
>>>
>>> 2) Make transport layer headers (all of L4) invisible to the network
>>>so that they can't do intrusive actions at L4. This will be enforced
>>>with DTLS in use.
>>>
>>> Note that #1 is really about running a transport stack in userspace
>>> applications in clients, not necessarily servers. For servers we
>>> intend to modify the kernel stack in order to leverage the existing
>>> implementation for building scalable servers (hence these patches).
>>>
>>> This is described in more detail in the Internet Draft:
>>> https://tools.ietf.org/html/draft-herbert-transports-over-udp-00
>>>
>>> In Part I we implement a straightforward encapsulation of TCP in GUE.
>>> This implements the basic mechanics of TOU encapsulation for TCP,
>>> but does not yet implement the IP addressing interactions, so
>>> it is not robust to use in the presence of NAT.
>>> TOU is enabled per socket with a new socket option. This
>>> implementation includes GSO, GRO, and RCO support.
>>>
>>> These patches also establish the baseline performance of TOU
>>> and isolate the performance cost of UDP encapsulation. Performance
>>> results are below.
>>>
>>> Tested: Various cases of TOU with IPv4, IPv6 using TCP_STREAM and
>>> TCP_RR. Also, tested IPIP for comparing TOU encapsulation to IP
>>> tunneling.
>>
>> Thinking about middleboxes again:
>>
>> E.g. https://tools.ietf.org/html/rfc6347#section-4.2.3 states that DTLS
>> packets are not allowed to be fragmented. Because of this and
>> furthermore because of the impossibility of clamp-mss-to-pmtu to work
>> anymore, do you have any idea on how reliable this can work?
>>
>> Or is your plan to use a smaller MSS on all paths by default?
>>
> Normal PMTU discovery mechanisms are applicable to prevent
> fragmentation. The overhead is accounted for in the MSS (similar to
> overhead of TCP options of IPv6 extension headers). Besides that,
> RFC6347 describes how fragmentation should be avoided, it does not
> explicitly forbid fragmentation, no IP protocol can outright forbid
> it. At most they could try to require DF bit is always set but that
> won't always be obeyed like when packets are tunneled in the network.

I agree, the specification is a bit unclear of what to do, but in terms
of not causing fragmentation it seems pretty clear to me:

"
   Each DTLS record MUST fit within a single datagram.  In order to
   avoid IP fragmentation, clients of the DTLS record layer SHOULD
   attempt to size records so that they fit within any PMTU estimates
   obtained from the record layer.
"

DTLS has invented its own fragmentation just to make sure that the
handshake actually doesn't depend on IP layer fragmentation.

Bye,
Hannes



Re: [RFC PATCH V3 0/3] basic device IOTLB support

2016-06-21 Thread Michael S. Tsirkin
On Tue, May 24, 2016 at 05:36:22PM +0800, Jason Wang wrote:
> This patch tries to implement a device IOTLB for vhost. This could be
> used in co-operation with a userspace IOMMU implementation (qemu)
> for a secure DMA environment (DMAR) in guest.
> 
> The idea is simple. When vhost meets an IOTLB miss, it will request
> the assistance of userspace to do the translation, this is done
> through:
> 
> - when there's a IOTLB miss, it will notify userspace through
>   vhost_net fd and then userspace read the fault address, size and
>   access from vhost fd.
> - userspace write the translation result back to vhost fd, vhost can
>   then update its IOTLB.
> 
> The codes were optimized for fixed mapping users e.g dpdk in guest. It
> will be slow if dynamic mappings were used in guest. We could do
> optimizations on top.
> 
> The codes were designed to be architecture independent. It should be
> easily ported to any architecture.
> 
> Stress tested with l2fwd/vfio in guest with 4K/2M/1G page size. On 1G
> hugepage case, 100% TLB hit rate were noticed.
> 
> Changes from V2:
> - introduce memory accessors for vhost
> - switch from ioctls to ordinary file read/write for iotlb miss and
>   updating
> - do not assume virtqueues are virtually mapped contiguously, all
>   virtqueue accesses are done through IOTLB
> - verify memory access during IOTLB update and fail early
> - introduce a module parameter for the size of IOTLB
> 
> Changes from V1:
> - support any size/range of updating and invalidation through
>   introducing the interval tree.
> - convert from per device iotlb request to per virtqueue iotlb
>   request, this solves the possible deadlock in V1.
> - read/write permission check support.
> 
> Please review.

Nice, this looks good to me. Can you post a non-rfc please?


> Jason Wang (3):
>   vhost: introduce vhost memory accessors
>   vhost: convert pre sorted vhost memory array to interval tree
>   vhost: device IOTLB API
> 
>  drivers/vhost/net.c|  63 +++-
>  drivers/vhost/vhost.c  | 760 
> ++---
>  drivers/vhost/vhost.h  |  60 +++-
>  include/uapi/linux/vhost.h |  28 ++
>  4 files changed, 790 insertions(+), 121 deletions(-)
> 
> -- 
> 2.7.4


RE: [iproute PATCH v2 7/7] ip/tcp_metrics: Simplify process_msg a bit

2016-06-21 Thread David Laight
From: Of Phil Sutter
> Sent: 21 June 2016 17:19
> By combining the attribute extraction and check for existence, the
> additional indentation level in the 'else' clause can be avoided.
> 
> In addition to that, common actions for 'daddr' are combined since the
> function returns if neither of the branches are taken.
> 
> Signed-off-by: Phil Sutter 
> ---
>  ip/tcp_metrics.c | 45 ++---
>  1 file changed, 18 insertions(+), 27 deletions(-)
> 
> diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
> index f82604f458ada..899830c127bcb 100644
> --- a/ip/tcp_metrics.c
> +++ b/ip/tcp_metrics.c
> @@ -112,47 +112,38 @@ static int process_msg(const struct sockaddr_nl *who, 
> struct nlmsghdr *n,
>   parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
>len);
> 
> - a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
> - if (a) {
> + if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV4])) {

horrid



[iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread Phil Sutter
This big patch was compiled by vimgrepping for memset calls and changing
to C99 initializer if applicable. One notable exception is the
initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
for older gcc versions (at least <=3.4.6).

Calls to memset for struct rtattr pointer fields for parse_rtattr*()
were just dropped since they are not needed.

The changes here allowed the compiler to discover some unused variables,
so get rid of them, too.

Signed-off-by: Phil Sutter 
---
Changes since v1:
- Dropped former changes to tc/tc_bpf.c as they are incompatible to older
  gcc versions (at least <=3.4.6).
---
 bridge/fdb.c |  29 +++--
 bridge/link.c|  16 +++
 bridge/mdb.c |  19 -
 bridge/vlan.c|  19 -
 genl/ctrl.c  |  48 +
 ip/ip6tunnel.c   |  10 ++---
 ip/ipaddress.c   |  31 ++
 ip/ipaddrlabel.c |  23 +-
 ip/iplink.c  |  67 ++---
 ip/iplink_can.c  |   4 +-
 ip/ipmaddr.c |  27 +---
 ip/ipmroute.c|   8 +---
 ip/ipneigh.c |  36 
 ip/ipnetconf.c   |  12 +++---
 ip/ipnetns.c |  45 ++-
 ip/ipntable.c|  27 +---
 ip/iproute.c |  85 
 ip/iprule.c  |  26 +--
 ip/iptoken.c |  21 -
 ip/iptunnel.c|  31 --
 ip/ipxfrm.c  |  26 +++
 ip/link_gre.c|  22 +-
 ip/link_gre6.c   |  22 +-
 ip/link_ip6tnl.c |  29 ++---
 ip/link_iptnl.c  |  26 +--
 ip/link_vti.c|  22 +-
 ip/link_vti6.c   |  22 +-
 ip/xfrm_policy.c | 110 ++-
 ip/xfrm_state.c  | 128 +++
 lib/libnetlink.c |  74 ++--
 lib/ll_map.c |   1 -
 misc/arpd.c  |  68 ++---
 misc/ss.c|  41 --
 tc/e_bpf.c   |   7 +--
 tc/em_cmp.c  |   4 +-
 tc/em_ipset.c|   4 +-
 tc/em_meta.c |   4 +-
 tc/em_nbyte.c|   4 +-
 tc/em_u32.c  |   4 +-
 tc/f_flow.c  |   3 --
 tc/f_flower.c|   3 +-
 tc/f_fw.c|   6 +--
 tc/f_route.c |   3 --
 tc/f_rsvp.c  |   6 +--
 tc/f_u32.c   |  12 ++
 tc/m_bpf.c   |   5 +--
 tc/m_csum.c  |   4 +-
 tc/m_ematch.c|   4 +-
 tc/m_gact.c  |   5 +--
 tc/m_ife.c   |   5 +--
 tc/m_mirred.c|   7 +--
 tc/m_nat.c   |   4 +-
 tc/m_pedit.c |   8 +---
 tc/m_police.c|   5 +--
 tc/q_atm.c   |   3 +-
 tc/q_cbq.c   |  22 +++---
 tc/q_choke.c |   4 +-
 tc/q_codel.c |   3 +-
 tc/q_dsmark.c|   1 -
 tc/q_fifo.c  |   4 +-
 tc/q_fq_codel.c  |   3 +-
 tc/q_hfsc.c  |  13 ++
 tc/q_htb.c   |  15 +++
 tc/q_netem.c |  16 +++
 tc/q_red.c   |   4 +-
 tc/q_sfb.c   |  17 
 tc/q_sfq.c   |   4 +-
 tc/q_tbf.c   |   4 +-
 tc/tc_bpf.c  |  47 +---
 tc/tc_class.c|  33 ++
 tc/tc_exec.c |   3 +-
 tc/tc_filter.c   |  35 ++-
 tc/tc_qdisc.c|  35 ++-
 tc/tc_stab.c |   4 +-
 tc/tc_util.c |   3 +-
 75 files changed, 657 insertions(+), 898 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index be849f980a802..a59d6a9c13018 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -177,16 +177,15 @@ static int fdb_show(int argc, char **argv)
struct nlmsghdr n;
struct ifinfomsgifm;
charbuf[256];
-   } req;
+   } req = {
+   .n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+   .ifm.ifi_family = PF_BRIDGE
+   };
 
char *filter_dev = NULL;
char *br = NULL;
int msg_size = sizeof(struct ifinfomsg);
 
-   memset(&req, 0, sizeof(req));
-   req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
-   req.ifm.ifi_family = PF_BRIDGE;
-
while (argc > 0) {
if ((strcmp(*argv, "brport") == 0) || strcmp(*argv, "dev") == 
0) {
NEXT_ARG();
@@ -247,7 +246,17 @@ static int fdb_modify(int cmd, int flags, int argc, char 
**argv)
struct nlmsghdr n;
struct ndmsgndm;
charbuf[256];
-   } req;
+   } req = {
+   .n = {
+   .nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg)),
+   .nlmsg_flags = NLM_F_REQUEST | flags,
+   .nlmsg_type = cmd
+   },
+   .ndm = {
+   .ndm_family = PF_BRIDGE,
+   .ndm_state = NUD_NOARP
+   }
+   };
char *addr = NULL;
char *d = NULL;
char abuf[ETH_ALEN];
@@ -259,14 +268,6 @@ static int fdb_modify(int cmd, int flags, int argc, char 
**argv)
char *endptr;
short vid = -1;
 
-   memset(, 0, 

[iproute PATCH v2 7/7] ip/tcp_metrics: Simplify process_msg a bit

2016-06-21 Thread Phil Sutter
By combining the attribute extraction and check for existence, the
additional indentation level in the 'else' clause can be avoided.

In addition to that, common actions for 'daddr' are combined since the
function returns if neither of the branches are taken.

Signed-off-by: Phil Sutter 
---
 ip/tcp_metrics.c | 45 ++---
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/ip/tcp_metrics.c b/ip/tcp_metrics.c
index f82604f458ada..899830c127bcb 100644
--- a/ip/tcp_metrics.c
+++ b/ip/tcp_metrics.c
@@ -112,47 +112,38 @@ static int process_msg(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
parse_rtattr(attrs, TCP_METRICS_ATTR_MAX, (void *) ghdr + GENL_HDRLEN,
 len);
 
-   a = attrs[TCP_METRICS_ATTR_ADDR_IPV4];
-   if (a) {
+   if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV4])) {
if (f.daddr.family && f.daddr.family != AF_INET)
return 0;
-   memcpy(, RTA_DATA(a), 4);
daddr.bytelen = 4;
family = AF_INET;
atype = TCP_METRICS_ATTR_ADDR_IPV4;
-   dlen = RTA_PAYLOAD(a);
-   } else {
-   a = attrs[TCP_METRICS_ATTR_ADDR_IPV6];
-   if (a) {
-   if (f.daddr.family && f.daddr.family != AF_INET6)
-   return 0;
-   memcpy(, RTA_DATA(a), 16);
-   daddr.bytelen = 16;
-   family = AF_INET6;
-   atype = TCP_METRICS_ATTR_ADDR_IPV6;
-   dlen = RTA_PAYLOAD(a);
-   } else
+   } else if ((a = attrs[TCP_METRICS_ATTR_ADDR_IPV6])) {
+   if (f.daddr.family && f.daddr.family != AF_INET6)
return 0;
+   daddr.bytelen = 16;
+   family = AF_INET6;
+   atype = TCP_METRICS_ATTR_ADDR_IPV6;
+   } else {
+   return 0;
}
+   memcpy(, RTA_DATA(a), daddr.bytelen);
+   dlen = RTA_PAYLOAD(a);
 
-   a = attrs[TCP_METRICS_ATTR_SADDR_IPV4];
-   if (a) {
+   if ((a = attrs[TCP_METRICS_ATTR_SADDR_IPV4])) {
if (f.saddr.family && f.saddr.family != AF_INET)
return 0;
memcpy(, RTA_DATA(a), 4);
saddr.bytelen = 4;
stype = TCP_METRICS_ATTR_SADDR_IPV4;
slen = RTA_PAYLOAD(a);
-   } else {
-   a = attrs[TCP_METRICS_ATTR_SADDR_IPV6];
-   if (a) {
-   if (f.saddr.family && f.saddr.family != AF_INET6)
-   return 0;
-   memcpy(, RTA_DATA(a), 16);
-   saddr.bytelen = 16;
-   stype = TCP_METRICS_ATTR_SADDR_IPV6;
-   slen = RTA_PAYLOAD(a);
-   }
+   } else if ((a = attrs[TCP_METRICS_ATTR_SADDR_IPV6])) {
+   if (f.saddr.family && f.saddr.family != AF_INET6)
+   return 0;
+   memcpy(, RTA_DATA(a), 16);
+   saddr.bytelen = 16;
+   stype = TCP_METRICS_ATTR_SADDR_IPV6;
+   slen = RTA_PAYLOAD(a);
}
 
if (f.daddr.family && f.daddr.bitlen >= 0 &&
-- 
2.8.2



[iproute PATCH v2 3/7] Replace malloc && memset by calloc

2016-06-21 Thread Phil Sutter
This only replaces occurrences where the newly allocated memory is
cleared completely afterwards, as in other cases it is a theoretical
performance hit although code would be cleaner this way.

Signed-off-by: Phil Sutter 
---
 genl/genl.c|  3 +--
 lib/names.c|  7 ++-
 misc/lnstat.c  |  6 ++
 misc/lnstat_util.c |  4 +---
 tc/em_canid.c  |  3 +--
 tc/m_action.c  |  3 +--
 tc/m_ipt.c | 13 -
 tc/m_pedit.c   |  3 +--
 tc/tc.c|  9 +++--
 tc/tc_bpf.c|  4 +---
 tc/tc_class.c  |  3 +--
 tc/tc_exec.c   |  3 +--
 12 files changed, 19 insertions(+), 42 deletions(-)

diff --git a/genl/genl.c b/genl/genl.c
index e33fafdf2f524..747074b029a7b 100644
--- a/genl/genl.c
+++ b/genl/genl.c
@@ -86,9 +86,8 @@ reg:
return f;
 
 noexist:
-   f = malloc(sizeof(*f));
+   f = calloc(1, sizeof(*f));
if (f) {
-   memset(f, 0, sizeof(*f));
strncpy(f->name, str, 15);
f->parse_genlopt = parse_nofopt;
f->print_genlopt = print_nofopt;
diff --git a/lib/names.c b/lib/names.c
index 3b5b0b1e1201a..fbd6503f22d42 100644
--- a/lib/names.c
+++ b/lib/names.c
@@ -54,15 +54,12 @@ struct db_names *db_names_alloc(void)
 {
struct db_names *db;
 
-   db = malloc(sizeof(*db));
+   db = calloc(1, sizeof(*db));
if (!db)
return NULL;
 
-   memset(db, 0, sizeof(*db));
-
db->size = MAX_ENTRIES;
-   db->hash = malloc(sizeof(struct db_entry *) * db->size);
-   memset(db->hash, 0, sizeof(struct db_entry *) * db->size);
+   db->hash = calloc(db->size, sizeof(struct db_entry *));
 
return db;
 }
diff --git a/misc/lnstat.c b/misc/lnstat.c
index 659a01bd69931..863fd4d9f03f2 100644
--- a/misc/lnstat.c
+++ b/misc/lnstat.c
@@ -182,10 +182,8 @@ static struct table_hdr *build_hdr_string(struct 
lnstat_file *lnstat_files,
static struct table_hdr th;
int ofs = 0;
 
-   for (i = 0; i < HDR_LINES; i++) {
-   th.hdr[i] = malloc(HDR_LINE_LENGTH);
-   memset(th.hdr[i], 0, HDR_LINE_LENGTH);
-   }
+   for (i = 0; i < HDR_LINES; i++)
+   th.hdr[i] = calloc(1, HDR_LINE_LENGTH);
 
for (i = 0; i < fps->num; i++) {
char *cname, *fname = fps->params[i].lf->name;
diff --git a/misc/lnstat_util.c b/misc/lnstat_util.c
index d918151282f55..cc54598fe1bef 100644
--- a/misc/lnstat_util.c
+++ b/misc/lnstat_util.c
@@ -173,15 +173,13 @@ static struct lnstat_file *alloc_and_open(const char 
*path, const char *file)
struct lnstat_file *lf;
 
/* allocate */
-   lf = malloc(sizeof(*lf));
+   lf = calloc(1, sizeof(*lf));
if (!lf) {
fprintf(stderr, "out of memory\n");
return NULL;
}
 
/* initialize */
-   memset(lf, 0, sizeof(*lf));
-
/* de->d_name is guaranteed to be <= NAME_MAX */
strcpy(lf->basename, file);
strcpy(lf->path, path);
diff --git a/tc/em_canid.c b/tc/em_canid.c
index 16f6ed5c0b7a4..11e7c515b2aac 100644
--- a/tc/em_canid.c
+++ b/tc/em_canid.c
@@ -106,8 +106,7 @@ static int canid_parse_eopt(struct nlmsghdr *n, struct 
tcf_ematch_hdr *hdr,
if (args == NULL)
return PARSE_ERR(args, "canid: missing arguments");
 
-   rules.rules_raw = malloc(sizeof(struct can_filter) * 
rules.rules_capacity);
-   memset(rules.rules_raw, 0, sizeof(struct can_filter) * 
rules.rules_capacity);
+   rules.rules_raw = calloc(rules.rules_capacity, sizeof(struct 
can_filter));
 
do {
if (!bstrcmp(args, "sff")) {
diff --git a/tc/m_action.c b/tc/m_action.c
index ce399d2e43ccc..2cc671f3a7e08 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -126,9 +126,8 @@ noexist:
goto restart_s;
}
 #endif
-   a = malloc(sizeof(*a));
+   a = calloc(1, sizeof(*a));
if (a) {
-   memset(a, 0, sizeof(*a));
strncpy(a->id, "noact", 15);
a->parse_aopt = parse_noaopt;
a->print_aopt = print_noaopt;
diff --git a/tc/m_ipt.c b/tc/m_ipt.c
index 098f610f9439a..d6f62bd6b32c9 100644
--- a/tc/m_ipt.c
+++ b/tc/m_ipt.c
@@ -164,16 +164,11 @@ get_target_name(const char *name)
return NULL;
 #endif
 
-   new_name = malloc(strlen(name) + 1);
-   lname = malloc(strlen(name) + 1);
-   if (new_name)
-   memset(new_name, '\0', strlen(name) + 1);
-   else
+   new_name = calloc(1, strlen(name) + 1);
+   lname = calloc(1, strlen(name) + 1);
+   if (!new_name)
exit_error(PARAMETER_PROBLEM, "get_target_name");
-
-   if (lname)
-   memset(lname, '\0', strlen(name) + 1);
-   else
+   if (!lname)
exit_error(PARAMETER_PROBLEM, "get_target_name");
 
strcpy(new_name, name);
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index c8f6d7c8e5ad7..3ae2e37b9c025 100644
--- 

[PATCH net-next 2/2] net: dsa: mv88e6xxx: rename single-chip support

2016-06-21 Thread Vivien Didelot
With the upcoming support for cross-chip operations, it will be hard to
distinguish portions of code supporting a single-chip or a switch fabric
of interconnected chips.

Make the code clearer now, by renaming the mv88e6xxx_priv_state chip
structure to mv88e6xxx_chip. This patch brings no functional changes.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6xxx/Makefile|2 +-
 drivers/net/dsa/mv88e6xxx/{mv88e6xxx.c => chip.c} | 1437 +++--
 drivers/net/dsa/mv88e6xxx/mv88e6xxx.h |   12 +-
 3 files changed, 731 insertions(+), 720 deletions(-)
 rename drivers/net/dsa/mv88e6xxx/{mv88e6xxx.c => chip.c} (62%)

diff --git a/drivers/net/dsa/mv88e6xxx/Makefile 
b/drivers/net/dsa/mv88e6xxx/Makefile
index 1128fc7..6e29a75 100644
--- a/drivers/net/dsa/mv88e6xxx/Makefile
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -1 +1 @@
-obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
+obj-$(CONFIG_NET_DSA_MV88E6XXX) += chip.o
diff --git a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.c 
b/drivers/net/dsa/mv88e6xxx/chip.c
similarity index 62%
rename from drivers/net/dsa/mv88e6xxx/mv88e6xxx.c
rename to drivers/net/dsa/mv88e6xxx/chip.c
index 2073f7b..5cb06f7 100644
--- a/drivers/net/dsa/mv88e6xxx/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -31,10 +31,10 @@
 #include 
 #include "mv88e6xxx.h"
 
-static void assert_reg_lock(struct mv88e6xxx_priv_state *ps)
+static void assert_reg_lock(struct mv88e6xxx_chip *chip)
 {
-   if (unlikely(!mutex_is_locked(>reg_lock))) {
-   dev_err(ps->dev, "Switch registers lock not held!\n");
+   if (unlikely(!mutex_is_locked(>reg_lock))) {
+   dev_err(chip->dev, "Switch registers lock not held!\n");
dump_stack();
}
 }
@@ -51,30 +51,30 @@ static void assert_reg_lock(struct mv88e6xxx_priv_state *ps)
  * 2 registers, used to indirectly access the internal SMI devices.
  */
 
-static int mv88e6xxx_smi_read(struct mv88e6xxx_priv_state *ps,
+static int mv88e6xxx_smi_read(struct mv88e6xxx_chip *chip,
  int addr, int reg, u16 *val)
 {
-   if (!ps->smi_ops)
+   if (!chip->smi_ops)
return -EOPNOTSUPP;
 
-   return ps->smi_ops->read(ps, addr, reg, val);
+   return chip->smi_ops->read(chip, addr, reg, val);
 }
 
-static int mv88e6xxx_smi_write(struct mv88e6xxx_priv_state *ps,
+static int mv88e6xxx_smi_write(struct mv88e6xxx_chip *chip,
   int addr, int reg, u16 val)
 {
-   if (!ps->smi_ops)
+   if (!chip->smi_ops)
return -EOPNOTSUPP;
 
-   return ps->smi_ops->write(ps, addr, reg, val);
+   return chip->smi_ops->write(chip, addr, reg, val);
 }
 
-static int mv88e6xxx_smi_single_chip_read(struct mv88e6xxx_priv_state *ps,
+static int mv88e6xxx_smi_single_chip_read(struct mv88e6xxx_chip *chip,
  int addr, int reg, u16 *val)
 {
int ret;
 
-   ret = mdiobus_read_nested(ps->bus, addr, reg);
+   ret = mdiobus_read_nested(chip->bus, addr, reg);
if (ret < 0)
return ret;
 
@@ -83,12 +83,12 @@ static int mv88e6xxx_smi_single_chip_read(struct 
mv88e6xxx_priv_state *ps,
return 0;
 }
 
-static int mv88e6xxx_smi_single_chip_write(struct mv88e6xxx_priv_state *ps,
+static int mv88e6xxx_smi_single_chip_write(struct mv88e6xxx_chip *chip,
   int addr, int reg, u16 val)
 {
int ret;
 
-   ret = mdiobus_write_nested(ps->bus, addr, reg, val);
+   ret = mdiobus_write_nested(chip->bus, addr, reg, val);
if (ret < 0)
return ret;
 
@@ -100,13 +100,13 @@ static const struct mv88e6xxx_ops 
mv88e6xxx_smi_single_chip_ops = {
.write = mv88e6xxx_smi_single_chip_write,
 };
 
-static int mv88e6xxx_smi_multi_chip_wait(struct mv88e6xxx_priv_state *ps)
+static int mv88e6xxx_smi_multi_chip_wait(struct mv88e6xxx_chip *chip)
 {
int ret;
int i;
 
for (i = 0; i < 16; i++) {
-   ret = mdiobus_read_nested(ps->bus, ps->sw_addr, SMI_CMD);
+   ret = mdiobus_read_nested(chip->bus, chip->sw_addr, SMI_CMD);
if (ret < 0)
return ret;
 
@@ -117,29 +117,29 @@ static int mv88e6xxx_smi_multi_chip_wait(struct 
mv88e6xxx_priv_state *ps)
return -ETIMEDOUT;
 }
 
-static int mv88e6xxx_smi_multi_chip_read(struct mv88e6xxx_priv_state *ps,
+static int mv88e6xxx_smi_multi_chip_read(struct mv88e6xxx_chip *chip,
 int addr, int reg, u16 *val)
 {
int ret;
 
/* Wait for the bus to become free. */
-   ret = mv88e6xxx_smi_multi_chip_wait(ps);
+   ret = mv88e6xxx_smi_multi_chip_wait(chip);
if (ret < 0)
return ret;
 
/* Transmit the read command. */
-   ret = mdiobus_write_nested(ps->bus, ps->sw_addr, SMI_CMD,
+   ret = mdiobus_write_nested(chip->bus, 

[PATCH net-next 1/2] net: dsa: mv88e6xxx: move driver in its own folder

2016-06-21 Thread Vivien Didelot
With the upcoming support for cross-chip operations and other mv88e6xxx
enhancements, new files will be added.

Similarly to mlxsw or b53, move mv88e6xxx files into their own folder.

In the meantime, update the MAINTAINERS entry to please checkpatch.pl,
by replacing the invalid 88E6352 entry with 88E6XXX, maintained by
Andrew and myself.

Signed-off-by: Vivien Didelot 
---
 MAINTAINERS | 11 ++-
 drivers/net/dsa/Kconfig | 10 ++
 drivers/net/dsa/Makefile|  2 +-
 drivers/net/dsa/mv88e6xxx/Kconfig   |  7 +++
 drivers/net/dsa/mv88e6xxx/Makefile  |  1 +
 drivers/net/dsa/{ => mv88e6xxx}/mv88e6xxx.c |  3 ++-
 drivers/net/dsa/{ => mv88e6xxx}/mv88e6xxx.h |  3 ++-
 7 files changed, 21 insertions(+), 16 deletions(-)
 create mode 100644 drivers/net/dsa/mv88e6xxx/Kconfig
 create mode 100644 drivers/net/dsa/mv88e6xxx/Makefile
 rename drivers/net/dsa/{ => mv88e6xxx}/mv88e6xxx.c (99%)
 rename drivers/net/dsa/{ => mv88e6xxx}/mv88e6xxx.h (99%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 50f69ba7..2b27168 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7172,6 +7172,12 @@ W:   http://www.kernel.org/doc/man-pages
 L: linux-...@vger.kernel.org
 S: Maintained
 
+MARVELL 88E6XXX ETHERNET SWITCH FABRIC DRIVER
+M: Andrew Lunn 
+M: Vivien Didelot 
+S: Maintained
+F: drivers/net/dsa/mv88e6xxx/
+
 MARVELL ARMADA DRM SUPPORT
 M: Russell King 
 S: Maintained
@@ -7179,11 +7185,6 @@ F:   drivers/gpu/drm/armada/
 F: include/uapi/drm/armada_drm.h
 F: Documentation/devicetree/bindings/display/armada/
 
-MARVELL 88E6352 DSA support
-M: Guenter Roeck 
-S: Maintained
-F: drivers/net/dsa/mv88e6352.c
-
 MARVELL CRYPTO DRIVER
 M: Boris Brezillon 
 M: Arnaud Ebalard 
diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index be481e1..8f45443 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -9,14 +9,6 @@ config NET_DSA_MV88E6060
  This enables support for the Marvell 88E6060 ethernet switch
  chip.
 
-config NET_DSA_MV88E6XXX
-   tristate "Marvell 88E6xxx Ethernet switch chip support"
-   depends on NET_DSA
-   select NET_DSA_TAG_EDSA
-   ---help---
- This enables support for most of the Marvell 88E6xxx models of
- Ethernet switch chips, except 88E6060.
-
 config NET_DSA_BCM_SF2
tristate "Broadcom Starfighter 2 Ethernet switch support"
depends on HAS_IOMEM && NET_DSA
@@ -30,4 +22,6 @@ config NET_DSA_BCM_SF2
 
 source "drivers/net/dsa/b53/Kconfig"
 
+source "drivers/net/dsa/mv88e6xxx/Kconfig"
+
 endmenu
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index 97bc70a..ca1e71b 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -1,5 +1,5 @@
 obj-$(CONFIG_NET_DSA_MV88E6060) += mv88e6060.o
-obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
 obj-$(CONFIG_NET_DSA_BCM_SF2)  += bcm_sf2.o
 
 obj-y  += b53/
+obj-y  += mv88e6xxx/
diff --git a/drivers/net/dsa/mv88e6xxx/Kconfig 
b/drivers/net/dsa/mv88e6xxx/Kconfig
new file mode 100644
index 000..490bc06
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/Kconfig
@@ -0,0 +1,7 @@
+config NET_DSA_MV88E6XXX
+   tristate "Marvell 88E6xxx Ethernet switch fabric support"
+   depends on NET_DSA
+   select NET_DSA_TAG_EDSA
+   help
+ This driver adds support for most of the Marvell 88E6xxx models of
+ Ethernet switch chips, except 88E6060.
diff --git a/drivers/net/dsa/mv88e6xxx/Makefile 
b/drivers/net/dsa/mv88e6xxx/Makefile
new file mode 100644
index 000..1128fc7
--- /dev/null
+++ b/drivers/net/dsa/mv88e6xxx/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_NET_DSA_MV88E6XXX) += mv88e6xxx.o
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.c
similarity index 99%
rename from drivers/net/dsa/mv88e6xxx.c
rename to drivers/net/dsa/mv88e6xxx/mv88e6xxx.c
index 9b116d8..2073f7b 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.c
@@ -1,5 +1,6 @@
 /*
- * net/dsa/mv88e6xxx.c - Marvell 88e6xxx switch chip support
+ * Marvell 88e6xxx Ethernet switch single-chip support
+ *
  * Copyright (c) 2008 Marvell Semiconductor
  *
  * Copyright (c) 2015 CMC Electronics, Inc.
diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
similarity index 99%
rename from drivers/net/dsa/mv88e6xxx.h
rename to drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
index a94acd8..856c6e5 100644
--- a/drivers/net/dsa/mv88e6xxx.h
+++ b/drivers/net/dsa/mv88e6xxx/mv88e6xxx.h
@@ -1,5 +1,6 @@
 /*
- * net/dsa/mv88e6xxx.h - Marvell 88e6xxx switch chip support
+ * Marvell 88e6xxx common definitions
+ *
  * Copyright (c) 2008 Marvell 

Re: [PATCH net-next v10 2/5] openvswitch: set skb protocol and mac_len when receiving on internal device

2016-06-21 Thread pravin shelar
On Mon, Jun 20, 2016 at 7:25 PM, Simon Horman
 wrote:
> [Cc Jiri Benc]
>
> On Sat, Jun 18, 2016 at 06:38:54PM -0700, pravin shelar wrote:
>> On Thu, Jun 16, 2016 at 10:53 PM, Simon Horman
>>  wrote:
>> > On Tue, Jun 07, 2016 at 03:45:27PM -0700, pravin shelar wrote:
>> >> On Mon, Jun 6, 2016 at 8:08 PM, Simon Horman  
>> >> wrote:
>> >> > On Thu, Jun 02, 2016 at 03:01:47PM -0700, pravin shelar wrote:
>> >> >> On Wed, Jun 1, 2016 at 11:24 PM, Simon Horman
>> >> >>  wrote:
>> >> >> > * Set skb protocol based on contents of packet. I have observed this 
>> >> >> > is
>> >> >> >   necessary to get actual protocol of a packet when it is injected 
>> >> >> > into an
>> >> >> >   internal device e.g. by libnet in which case skb protocol will be 
>> >> >> > set to
>> >> >> >   ETH_ALL.
>> 
>> 
>> >> > eth_type = eth_type_trans(skb, skb->dev);
>> >> > skb->mac_len = skb->data - skb_mac_header(skb);
>> >> > __skb_push(skb, skb->mac_len);
>> >> >
>> >> > if (eth_type == htons(ETH_P_8021Q))
>> >> > skb->mac_len += VLAN_HLEN;
>> >> >
>> >> > Perhaps that logic ought to be in a helper used by both 
>> >> > internal_dev_xmit()
>> >> > and netdev_port_receive(). Or somehow centralised in 
>> >> > ovs_vport_receive().
>> >>
>> >> This does look a bit complex. Can we use other skb metadata like
>> >> skb_mac_header_was_set()?
>> >
>> > Yes, I think that can be made to work if skb->mac_header is unset
>> > for l3 packets in netdev_port_receive(). The following is an incremental
>> > patch on the entire series. Is this the kind of thing you had in mind?
>> >
>> > diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
>> > index 86f2cfb19de3..42587d5bf894 100644
>> > --- a/net/openvswitch/flow.c
>> > +++ b/net/openvswitch/flow.c
>> > @@ -729,7 +729,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info 
>> > *tun_info,
>> > key->phy.skb_mark = skb->mark;
>> > ovs_ct_fill_key(skb, key);
>> > key->ovs_flow_hash = 0;
>> > -   key->phy.is_layer3 = skb->mac_len == 0;
>> > +   key->phy.is_layer3 = skb_mac_header_was_set(skb) == 0;
>> > key->recirc_id = 0;
>> >
>> > err = key_extract(skb, key);
>> > diff --git a/net/openvswitch/vport-internal_dev.c 
>> > b/net/openvswitch/vport-internal_dev.c
>> > index 484ba529c682..8973d4db509b 100644
>> > --- a/net/openvswitch/vport-internal_dev.c
>> > +++ b/net/openvswitch/vport-internal_dev.c
>> > @@ -50,7 +50,6 @@ static int internal_dev_xmit(struct sk_buff *skb, struct 
>> > net_device *netdev)
>> >
>> > skb->protocol = eth_type_trans(skb, netdev);
>> > skb_push(skb, ETH_HLEN);
>> > -   skb_reset_mac_len(skb);
>> >
>> > len = skb->len;
>> > rcu_read_lock();
>> > diff --git a/net/openvswitch/vport-netdev.c 
>> > b/net/openvswitch/vport-netdev.c
>> > index 3df36df62ee9..4cf3f12ffc99 100644
>> > --- a/net/openvswitch/vport-netdev.c
>> > +++ b/net/openvswitch/vport-netdev.c
>> > @@ -60,22 +60,9 @@ static void netdev_port_receive(struct sk_buff *skb)
>> > if (vport->dev->type == ARPHRD_ETHER) {
>> > skb_push(skb, ETH_HLEN);
>> > skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
>> > -   } else if (vport->dev->type == ARPHRD_NONE) {
>> > -   if (skb->protocol == htons(ETH_P_TEB)) {
>> > -   __be16 eth_type;
>> > -
>> > -   if (unlikely(skb->len < ETH_HLEN))
>> > -   goto error;
>> > -
>> > -   eth_type = eth_type_trans(skb, skb->dev);
>> > -   skb->mac_len = skb->data - skb_mac_header(skb);
>> > -   __skb_push(skb, skb->mac_len);
>> > -
>> > -   if (eth_type == htons(ETH_P_8021Q))
>> > -   skb->mac_len += VLAN_HLEN;
>> > -   } else {
>> > -   skb->mac_len = 0;
>> > -   }
>> > +   } else if (vport->dev->type == ARPHRD_NONE &&
>> > +  skb->protocol != htons(ETH_P_TEB)) {
>> > +   skb->mac_header = (typeof(skb->mac_header))~0U;
>> > }
>> >
>> > ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
>>
>> This certainly looks better. I was wondering if we can unset the mac
>> header offset in L3 tunnel devices itself. So there is no need to have
>> this check here.
>
> I think that might be possible for GRE by modifying the following in
> __ipgre_rcv().
>
> if (tunnel->dev->type != ARPHRD_NONE)
> skb_pop_mac_header(skb);
> else
> skb_reset_mac_header(skb);
>
> But I am unsure what side effects this might have on other users of the
> code.
>
I think it is fine with devices of type ARPHRD_NONE. Metadata tunnel
devices would be of this type anyway.

> Jiri, do 

[iproute PATCH v2 1/7] tc: m_action: Improve conversion to C99 style initializers

2016-06-21 Thread Phil Sutter
This improves my initial change in the following points:

- Drop superfluous comma after last expression in block.
- No need to initialize variables to zero as the key feature of C99
  initializers is to do this implicitly.
- By relocating the declaration of struct rtattr *tail, it can be
  initialized at the same time.

Fixes: a0a73b298a579 ("tc: m_action: Use C99 style initializers for struct req")
Signed-off-by: Phil Sutter 
---
 tc/m_action.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/tc/m_action.c b/tc/m_action.c
index ea16817aefd4f..ce399d2e43ccc 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -398,10 +398,9 @@ static int tc_action_gd(int cmd, unsigned int flags, int 
*argc_p, char ***argv_p
.n = {
.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
.nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
+   .nlmsg_type = cmd
},
-   .t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
+   .t.tca_family = AF_UNSPEC
};
 
argc -= 1;
@@ -491,8 +490,6 @@ static int tc_action_modify(int cmd, unsigned int flags, 
int *argc_p, char ***ar
int argc = *argc_p;
char **argv = *argv_p;
int ret = 0;
-
-   struct rtattr *tail;
struct {
struct nlmsghdr n;
struct tcamsg   t;
@@ -501,13 +498,12 @@ static int tc_action_modify(int cmd, unsigned int flags, 
int *argc_p, char ***ar
.n = {
.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
.nlmsg_flags = NLM_F_REQUEST | flags,
-   .nlmsg_type = cmd,
+   .nlmsg_type = cmd
},
-   .t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
+   .t.tca_family = AF_UNSPEC
};
+   struct rtattr *tail = NLMSG_TAIL(&req.n);
 
-   tail = NLMSG_TAIL(&req.n);
argc -= 1;
argv += 1;
if (parse_action(&argc, &argv, TCA_ACT_TAB, &req.n)) {
@@ -539,8 +535,7 @@ static int tc_act_list_or_flush(int argc, char **argv, int 
event)
charbuf[MAX_MSG];
} req = {
.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcamsg)),
-   .t.tca_family = AF_UNSPEC,
-   .buf = { 0 }
+   .t.tca_family = AF_UNSPEC
};
 
tail = NLMSG_TAIL(&req.n);
-- 
2.8.2



[iproute PATCH v2 0/7] Big C99 style initializer rework

2016-06-21 Thread Phil Sutter
This is v2 of my C99-style initializer related patch series. The changes
since v1 are:

- Rebased onto current upstream master:
  My own commit a0a73b298a579 ("tc: m_action: Use C99 style initializers
  for struct req") contains most of the changes to tc/m_action.c already,
  so I put the remaining ones into a dedicated patch (the first one here)
  with a better description.

- Tested against gcc-3.4.6:
  This is the oldest gcc version I was able to install locally. It indeed
  does not like the former changes in tc/tc_bpf.c, so I reverted them.
  Apart from emitting many warnings, it successfully compiles the
  sources.

In the process of compatibility testing, I made a few more changes which
make sense to have:

- New patch 5 allows conveniently overriding the compiler via the command
  line.

- New patch 6 eliminates a warning with old gcc but looks valid in
  general.

- A warning made me look at ip/tcp_metrics.c and I found a minor code
  simplification (patch 7).

I have a follow-up series of patches which get rid of every warning
gcc-3.4.6 emits, but am not sure whether they make sense to keep. I would
appreciate if you could review them here:
http://nwl.cc/cgi-bin/git/gitweb.cgi?p=iproute2.git;a=shortlog;h=refs/heads/c99_init

Phil Sutter (7):
  tc: m_action: Improve conversion to C99 style initializers
  Use C99 style initializers everywhere
  Replace malloc && memset by calloc
  No need to initialize rtattr fields before parsing
  Makefile: Allow to override CC
  misc/ifstat: simplify unsigned value comparison
  ip/tcp_metrics: Simplify process_msg a bit

 Makefile   |   4 +-
 bridge/fdb.c   |  29 ++--
 bridge/link.c  |  16 +++
 bridge/mdb.c   |  19 
 bridge/vlan.c  |  19 
 genl/ctrl.c|  48 +---
 genl/genl.c|   3 +-
 ip/ip6tunnel.c |  10 ++---
 ip/ipaddress.c |  33 ++
 ip/ipaddrlabel.c   |  23 +-
 ip/iplink.c|  67 +---
 ip/iplink_can.c|   4 +-
 ip/ipmaddr.c   |  27 +--
 ip/ipmroute.c  |   8 +---
 ip/ipneigh.c   |  36 +++
 ip/ipnetconf.c |  12 ++---
 ip/ipnetns.c   |  45 ++-
 ip/ipntable.c  |  27 +--
 ip/iproute.c   |  85 +++
 ip/iprule.c|  26 +--
 ip/iptoken.c   |  21 -
 ip/iptunnel.c  |  31 -
 ip/ipxfrm.c|  26 +++
 ip/link_gre.c  |  22 -
 ip/link_gre6.c |  22 -
 ip/link_ip6tnl.c   |  29 ++--
 ip/link_iptnl.c|  26 +--
 ip/link_vti.c  |  22 -
 ip/link_vti6.c |  22 -
 ip/tcp_metrics.c   |  45 ---
 ip/xfrm_policy.c   | 110 +
 ip/xfrm_state.c| 128 ++---
 lib/libnetlink.c   |  74 +--
 lib/ll_map.c   |   1 -
 lib/names.c|   7 +--
 misc/arpd.c|  68 ++--
 misc/ifstat.c  |   2 +-
 misc/lnstat.c  |   6 +--
 misc/lnstat_util.c |   4 +-
 misc/ss.c  |  41 -
 tc/e_bpf.c |   7 +--
 tc/em_canid.c  |   3 +-
 tc/em_cmp.c|   4 +-
 tc/em_ipset.c  |   4 +-
 tc/em_meta.c   |   4 +-
 tc/em_nbyte.c  |   4 +-
 tc/em_u32.c|   4 +-
 tc/f_flow.c|   3 --
 tc/f_flower.c  |   3 +-
 tc/f_fw.c  |   6 +--
 tc/f_route.c   |   3 --
 tc/f_rsvp.c|   6 +--
 tc/f_u32.c |  12 ++---
 tc/m_action.c  |  20 +++--
 tc/m_bpf.c |   5 +--
 tc/m_csum.c|   4 +-
 tc/m_ematch.c  |   4 +-
 tc/m_gact.c|   5 +--
 tc/m_ife.c |   5 +--
 tc/m_ipt.c |  13 ++
 tc/m_mirred.c  |   7 +--
 tc/m_nat.c |   4 +-
 tc/m_pedit.c   |  11 ++---
 tc/m_police.c  |   5 +--
 tc/q_atm.c |   3 +-
 tc/q_cbq.c |  22 +++--
 tc/q_choke.c   |   4 +-
 tc/q_codel.c   |   3 +-
 tc/q_dsmark.c  |   1 -
 tc/q_fifo.c|   4 +-
 tc/q_fq_codel.c|   3 +-
 tc/q_hfsc.c|  13 ++
 tc/q_htb.c |  15 +++
 tc/q_netem.c   |  16 +++
 tc/q_red.c |   4 +-
 tc/q_sfb.c |  17 ---
 tc/q_sfq.c |   4 +-
 tc/q_tbf.c |   4 +-
 tc/tc.c|   9 ++--
 tc/tc_bpf.c|  51 +
 tc/tc_class.c  |  40 +++--
 tc/tc_exec.c   |   6 +--
 tc/tc_filter.c |  35 ++-
 tc/tc_qdisc.c  |  35 ++-
 tc/tc_stab.c   |   4 +-
 tc/tc_util.c   |   3 +-
 86 files changed, 706 insertions(+), 984 deletions(-)

-- 
2.8.2



[iproute PATCH v2 6/7] misc/ifstat: simplify unsigned value comparison

2016-06-21 Thread Phil Sutter
By directly comparing the value of both unsigned variables, casting to
signed becomes unnecessary.

This also fixes compiling with older versions of gcc (at least
<=3.4.6) which emit the following warning:

| ifstat.c: In function `update_db':
| ifstat.c:542: warning: comparison is always false due to limited range of 
data type

Signed-off-by: Phil Sutter 
---
 misc/ifstat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/misc/ifstat.c b/misc/ifstat.c
index abbb4e732fcef..9a44da487599e 100644
--- a/misc/ifstat.c
+++ b/misc/ifstat.c
@@ -539,7 +539,7 @@ static void update_db(int interval)
int i;
 
for (i = 0; i < MAXS; i++) {
-   if ((long)(h1->ival[i] - n->ival[i]) < 
0) {
+   if (h1->ival[i] < n->ival[i]) {
memset(n->ival, 0, 
sizeof(n->ival));
break;
}
-- 
2.8.2



[iproute PATCH v2 4/7] No need to initialize rtattr fields before parsing

2016-06-21 Thread Phil Sutter
Since parse_rtattr_flags() calls memset already, there is no need for
callers to do so themselves.

Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 2 +-
 tc/tc_class.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index afb115e5f592a..643e9133e47b2 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -439,7 +439,7 @@ static void print_num(FILE *fp, unsigned int width, 
uint64_t count)
 
 static void print_vf_stats64(FILE *fp, struct rtattr *vfstats)
 {
-   struct rtattr *vf[IFLA_VF_STATS_MAX + 1] = {};
+   struct rtattr *vf[IFLA_VF_STATS_MAX + 1];
 
if (vfstats->rta_type != IFLA_VF_STATS) {
fprintf(stderr, "BUG: rta type is %d\n", vfstats->rta_type);
diff --git a/tc/tc_class.c b/tc/tc_class.c
index 523fafb35dd44..1b79b03efc038 100644
--- a/tc/tc_class.c
+++ b/tc/tc_class.c
@@ -221,7 +221,7 @@ static void graph_cls_show(FILE *fp, char *buf, struct 
hlist_head *root_list,
 {
struct hlist_node *n, *tmp_cls;
char cls_id_str[256] = {};
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char str[100] = {};
 
@@ -306,7 +306,7 @@ int print_class(const struct sockaddr_nl *who,
FILE *fp = (FILE *)arg;
struct tcmsg *t = NLMSG_DATA(n);
int len = n->nlmsg_len;
-   struct rtattr *tb[TCA_MAX + 1] = {};
+   struct rtattr *tb[TCA_MAX + 1];
struct qdisc_util *q;
char abuf[256];
 
-- 
2.8.2



Re: [PATCH iproute2 net-next v3 1/5] json_writer: allow base json data type to be array or object

2016-06-21 Thread Anuradha Karuppiah
On Tue, Jun 21, 2016 at 9:12 AM, Stephen Hemminger
 wrote:
> On Mon, 20 Jun 2016 23:39:43 -0700
> Roopa Prabhu  wrote:
>
>> From: Anuradha Karuppiah 
>>
>> This patch adds a type qualifier to json_writer. Type can be a
>> json object or array. This can be extended to other types like
>> json-string, json-number etc in the future.
>>
>> Signed-off-by: Anuradha Karuppiah 
>
> Since json writer is not used in many places yet, why not just
> get rid of the automatic object in the constructor.

I wanted to force the external api to start with a json-object or
json-array. It reduces the chance of mistakes vs. a typeless
constructor. With a typeless constructor you can accidentally end up
with a json output that doesn't pass json lint; especially if optional
params are being suppressed at different places.

>
>
>
> diff --git a/lib/json_writer.c b/lib/json_writer.c
> index 2af16e1..5e588d8 100644
> --- a/lib/json_writer.c
> +++ b/lib/json_writer.c
> @@ -102,7 +102,6 @@ json_writer_t *jsonw_new(FILE *f)
> self->depth = 0;
> self->pretty = false;
> self->sep = '\0';
> -   putc('{', self->out);
> }
> return self;
>  }
> @@ -114,7 +113,6 @@ void jsonw_destroy(json_writer_t **self_p)
>
> assert(self->depth == 0);
> jsonw_eol(self);
> -   fputs("}\n", self->out);
> fflush(self->out);
> free(self);
> *self_p = NULL;
> diff --git a/misc/ifstat.c b/misc/ifstat.c
> index abbb4e7..d9a7e50 100644
> --- a/misc/ifstat.c
> +++ b/misc/ifstat.c
> @@ -246,7 +246,6 @@ static void dump_raw_db(FILE *fp, int to_hist)
> h = hist_db;
> if (jw) {
> jsonw_pretty(jw, pretty);
> -   jsonw_name(jw, info_source);
> jsonw_start_object(jw);
> } else
> fprintf(fp, "#%s\n", info_source);
> @@ -452,7 +451,6 @@ static void dump_kern_db(FILE *fp)
>
> if (jw) {
> jsonw_pretty(jw, pretty);
> -   jsonw_name(jw, info_source);
> jsonw_start_object(jw);
> } else
> print_head(fp);
> @@ -466,8 +464,10 @@ static void dump_kern_db(FILE *fp)
> else
> print_one_if(fp, n, n->val);
> }
> -   if (json_output)
> -   fprintf(fp, "\n} }\n");
> +   if (jw) {
> +   jsonw_end_object(jw);
> +   jsonw_destroy(&jw);
> +   }
>  }
>
>  static void dump_incr_db(FILE *fp)
> @@ -478,7 +478,6 @@ static void dump_incr_db(FILE *fp)
> h = hist_db;
> if (jw) {
> jsonw_pretty(jw, pretty);
> -   jsonw_name(jw, info_source);
> jsonw_start_object(jw);
> } else
> print_head(fp);
> diff --git a/misc/nstat.c b/misc/nstat.c
> index a9e0f20..411cd87 100644
> --- a/misc/nstat.c
> +++ b/misc/nstat.c
> @@ -285,7 +285,6 @@ static void dump_kern_db(FILE *fp, int to_hist)
> h = hist_db;
> if (jw) {
> jsonw_pretty(jw, pretty);
> -   jsonw_name(jw, info_source);
> jsonw_start_object(jw);
> } else
> fprintf(fp, "#%s\n", info_source);


Re: [iproute PATCH v2 2/7] Use C99 style initializers everywhere

2016-06-21 Thread David Ahern

On 6/21/16 10:18 AM, Phil Sutter wrote:

This big patch was compiled by vimgrepping for memset calls and changing
to C99 initializer if applicable. One notable exception is the
initialization of union bpf_attr in tc/tc_bpf.c: changing it would break
for older gcc versions (at least <=3.4.6).

Calls to memset for struct rtattr pointer fields for parse_rtattr*()
were just dropped since they are not needed.

The changes here allowed the compiler to discover some unused variables,
so get rid of them, too.

Signed-off-by: Phil Sutter 
---
Changes since v1:
- Dropped former changes to tc/tc_bpf.c as they are incompatible with older
  gcc versions (at least <=3.4.6).



What OS versions have you compiled iproute2 against?

I downloaded CentOS 5 and 6. iproute2 fails to compile on CentOS 5.11; 
ip command builds on 6.8 but with a flurry of redefinition errors 
(BUILD_BUG_ON), but fails at tc.


  1   2   >