pull request: bluetooth-next 2016-12-08

2016-12-07 Thread Johan Hedberg
(resending since I forgot to sign the first one)

Hi Dave,

I didn't miss your "net-next is closed" email, but it did come as a bit
of a surprise, and due to time-zone differences I didn't have a chance
to react to it until now. We would have had a couple of patches in
bluetooth-next that we'd still have wanted to get to 4.10.

Out of these the most critical one is the H7/CT2 patch for Bluetooth
Security Manager Protocol, something that couldn't be published before
the Bluetooth 5.0 specification went public (yesterday). If these really
can't go to net-next we'll likely be sending at least this patch through
bluetooth.git to net.git for rc1 inclusion.
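
For context, h7 as defined in the Bluetooth 5.0 spec (Vol 3, Part H) is just
AES-CMAC keyed with a fixed SALT over the 128-bit input W. A minimal sketch of
what the new function boils down to, assuming the aes_cmac() helper that
already lives in net/bluetooth/smp.c:

static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
		  const u8 salt[16], u8 res[16])
{
	/* h7(SALT, W) = AES-CMAC_SALT(W) */
	return aes_cmac(tfm_cmac, salt, w, 16, res);
}

The CT2 auth flag advertises that the peer may use this h7-based link key
conversion instead of the older h6-based one.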

Johan

---
The following changes since commit 5fccd64aa44829f87997e3342698ef98862adffd:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next (2016-12-07 19:16:46 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git for-upstream

for you to fetch changes up to a62da6f14db79bd7ea435ab095e998b31b3dbb22:

  Bluetooth: SMP: Add support for H7 crypto function and CT2 auth flag (2016-12-08 07:50:24 +0100)


Geliang Tang (1):
  Bluetooth: btmrvl: drop duplicate header slab.h

Johan Hedberg (1):
  Bluetooth: SMP: Add support for H7 crypto function and CT2 auth flag

Stefan Schmidt (4):
  ieee802154: atusb: sync header file from firmware for new features
  ieee802154: atusb: store firmware version after retrieval for later use
  ieee802154: atusb: try to read permanent extended address from device
  ieee802154: atusb: implement .set_frame_retries ops callback

 drivers/bluetooth/btmrvl_drv.h |  1 -
 drivers/net/ieee802154/atusb.c | 79 +
 drivers/net/ieee802154/atusb.h | 11 --
 net/bluetooth/smp.c            | 85
 net/bluetooth/smp.h            |  1 +
 5 files changed, 149 insertions(+), 28 deletions(-)


[PATCH] cxgb4/cxgb4vf: Remove deprecated module parameters

2016-12-07 Thread Ganesh Goudar
Remove deprecated module parameters num_vf, dflt_msg_enable and
force_init.

Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    | 41 +-
 .../net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c    |  9 +
 2 files changed, 2 insertions(+), 48 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 449884f..48113c6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -134,24 +134,6 @@ MODULE_FIRMWARE(FW5_FNAME);
 MODULE_FIRMWARE(FW6_FNAME);
 
 /*
- * Normally we're willing to become the firmware's Master PF but will be happy
- * if another PF has already become the Master and initialized the adapter.
- * Setting "force_init" will cause this driver to forcibly establish itself as
- * the Master PF and initialize the adapter.
- */
-static uint force_init;
-
-module_param(force_init, uint, 0644);
-MODULE_PARM_DESC(force_init, "Forcibly become Master PF and initialize adapter,"
-		 "deprecated parameter");
-
-static int dflt_msg_enable = DFLT_MSG_ENABLE;
-
-module_param(dflt_msg_enable, int, 0644);
-MODULE_PARM_DESC(dflt_msg_enable, "Chelsio T4 default message enable bitmap, "
-"deprecated parameter");
-
-/*
  * The driver uses the best interrupt scheme available on a platform in the
  * order MSI-X, MSI, legacy INTx interrupts.  This parameter determines which
  * of these schemes the driver may consider as follows:
@@ -179,16 +161,6 @@ MODULE_PARM_DESC(msi, "whether to use INTx (0), MSI (1) or MSI-X (2)");
  */
 static int rx_dma_offset = 2;
 
-#ifdef CONFIG_PCI_IOV
-/* Configure the number of PCI-E Virtual Function which are to be instantiated
- * on SR-IOV Capable Physical Functions.
- */
-static unsigned int num_vf[NUM_OF_PF_WITH_SRIOV];
-
-module_param_array(num_vf, uint, NULL, 0644);
-MODULE_PARM_DESC(num_vf, "number of VFs for each of PFs 0-3, deprecated parameter - please use the pci sysfs interface.");
-#endif
-
 /* TX Queue select used to determine what algorithm to use for selecting TX
  * queue. Select between the kernel provided function (select_queue=0) or user
  * cxgb_select_queue function (select_queue=1)
@@ -4729,7 +4701,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
adapter->name = pci_name(pdev);
adapter->mbox = func;
adapter->pf = func;
-   adapter->msg_enable = dflt_msg_enable;
+   adapter->msg_enable = DFLT_MSG_ENABLE;
memset(adapter->chan_map, 0xff, sizeof(adapter->chan_map));
 
	spin_lock_init(&adapter->stats_lock);
@@ -4988,17 +4960,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 sriov:
 #ifdef CONFIG_PCI_IOV
-   if (func < ARRAY_SIZE(num_vf) && num_vf[func] > 0) {
-		dev_warn(&pdev->dev,
-"Enabling SR-IOV VFs using the num_vf module "
-"parameter is deprecated - please use the pci sysfs "
-"interface instead.\n");
-   if (pci_enable_sriov(pdev, num_vf[func]) == 0)
-			dev_info(&pdev->dev,
-"instantiated %u virtual functions\n",
-num_vf[func]);
-   }
-
adapter = kzalloc(sizeof(*adapter), GFP_KERNEL);
if (!adapter) {
err = -ENOMEM;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 5d4da0e..fa43e06d 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -70,13 +70,6 @@
 NETIF_MSG_TIMER | NETIF_MSG_IFDOWN | NETIF_MSG_IFUP |\
 NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
 
-static int dflt_msg_enable = DFLT_MSG_ENABLE;
-
-module_param(dflt_msg_enable, int, 0644);
-MODULE_PARM_DESC(dflt_msg_enable,
-"default adapter ethtool message level bitmap, "
-"deprecated parameter");
-
 /*
  * The driver uses the best interrupt scheme available on a platform in the
  * order MSI-X then MSI.  This parameter determines which of these schemes the
@@ -2891,7 +2884,7 @@ static int cxgb4vf_pci_probe(struct pci_dev *pdev,
 * Initialize adapter level features.
 */
adapter->name = pci_name(pdev);
-   adapter->msg_enable = dflt_msg_enable;
+   adapter->msg_enable = DFLT_MSG_ENABLE;
err = adap_init0(adapter);
if (err)
goto err_unmap_bar;
-- 
2.1.0



[PATCH v2 2/2] net: rfkill: Add rfkill-any LED trigger

2016-12-07 Thread Michał Kępień
Add a new "global" (i.e. not per-rfkill device) LED trigger, rfkill-any,
which may be useful on laptops with a single "radio LED" and multiple
radio transmitters.  The trigger is meant to turn a LED on whenever
there is at least one radio transmitter active and turn it off
otherwise.
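
As an illustration (not part of this patch), a platform LED can then be tied
to the new trigger simply by naming it as its default trigger, e.g.:

static struct gpio_led radio_led = {
	.name		 = "laptop::radio",
	.gpio		 = 42,			/* hypothetical GPIO number */
	.default_trigger = "rfkill-any",	/* LED on while any transmitter is unblocked */
};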

Signed-off-by: Michał Kępień 
---
Changes from v1:

  - take rfkill_global_mutex before calling rfkill_set_block() in
rfkill_resume(); the need for doing this was previously obviated by
908209c ("rfkill: don't impose global states on resume"), but given
that __rfkill_any_led_trigger_event() is called from
rfkill_set_block() unconditionally, each caller of the latter needs
to take care of locking rfkill_global_mutex,

  - declare __rfkill_any_led_trigger_event() even when
CONFIG_RFKILL_LEDS=n to prevent implicit declaration errors,

  - remove the #ifdef surrounding rfkill_any_led_trigger_{,un}register()
calls to prevent compilation warnings about functions and a label
being defined but not used,

  - move the rfkill_any_led_trigger_register() call in rfkill_init()
before the rfkill_handler_init() call to avoid the need to call
rfkill_handler_exit() from rfkill_init() and thus prevent a section
mismatch.

 net/rfkill/core.c | 73 +++
 1 file changed, 73 insertions(+)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index f28e441..cd50b11 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -176,6 +176,47 @@ static void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
	led_trigger_unregister(&rfkill->led_trigger);
 }
+
+static struct led_trigger rfkill_any_led_trigger;
+
+static void __rfkill_any_led_trigger_event(void)
+{
+   enum led_brightness brightness = LED_OFF;
+   struct rfkill *rfkill;
+
+	list_for_each_entry(rfkill, &rfkill_list, node) {
+   if (!(rfkill->state & RFKILL_BLOCK_ANY)) {
+   brightness = LED_FULL;
+   break;
+   }
+   }
+
+	led_trigger_event(&rfkill_any_led_trigger, brightness);
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+	mutex_lock(&rfkill_global_mutex);
+	__rfkill_any_led_trigger_event();
+	mutex_unlock(&rfkill_global_mutex);
+}
+
+static void rfkill_any_led_trigger_activate(struct led_classdev *led_cdev)
+{
+   rfkill_any_led_trigger_event();
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+   rfkill_any_led_trigger.name = "rfkill-any";
+   rfkill_any_led_trigger.activate = rfkill_any_led_trigger_activate;
+	return led_trigger_register(&rfkill_any_led_trigger);
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+	led_trigger_unregister(&rfkill_any_led_trigger);
+}
 #else
 static void rfkill_led_trigger_event(struct rfkill *rfkill)
 {
@@ -189,6 +230,23 @@ static inline int rfkill_led_trigger_register(struct rfkill *rfkill)
 static inline void rfkill_led_trigger_unregister(struct rfkill *rfkill)
 {
 }
+
+static void __rfkill_any_led_trigger_event(void)
+{
+}
+
+static void rfkill_any_led_trigger_event(void)
+{
+}
+
+static int rfkill_any_led_trigger_register(void)
+{
+   return 0;
+}
+
+static void rfkill_any_led_trigger_unregister(void)
+{
+}
 #endif /* CONFIG_RFKILL_LEDS */
 
 static void rfkill_fill_event(struct rfkill_event *ev, struct rfkill *rfkill,
@@ -297,6 +355,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
	spin_unlock_irqrestore(&rfkill->lock, flags);
 
rfkill_led_trigger_event(rfkill);
+   __rfkill_any_led_trigger_event();
 
if (prev != curr)
rfkill_event(rfkill);
@@ -477,6 +536,7 @@ bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
	spin_unlock_irqrestore(&rfkill->lock, flags);
 
rfkill_led_trigger_event(rfkill);
+   rfkill_any_led_trigger_event();
 
if (!rfkill->registered)
return ret;
@@ -523,6 +583,7 @@ bool rfkill_set_sw_state(struct rfkill *rfkill, bool blocked)
		schedule_work(&rfkill->uevent_work);
 
rfkill_led_trigger_event(rfkill);
+   rfkill_any_led_trigger_event();
 
return blocked;
 }
@@ -572,6 +633,7 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
			schedule_work(&rfkill->uevent_work);
 
rfkill_led_trigger_event(rfkill);
+   rfkill_any_led_trigger_event();
}
 }
 EXPORT_SYMBOL(rfkill_set_states);
@@ -815,8 +877,10 @@ static int rfkill_resume(struct device *dev)
rfkill->suspended = false;
 
if (!rfkill->persistent) {
+		mutex_lock(&rfkill_global_mutex);
		cur = !!(rfkill->state & RFKILL_BLOCK_SW);
		rfkill_set_block(rfkill, cur);
+		mutex_unlock(&rfkill_global_mutex);
}
 
if (rfkill->ops->poll && !rfkill->polling_paused)
@@ -988,6 +1052,7 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 #endif
}
 
+   __rfkill_any_led_trigger_event();

[PATCH v2 1/2] net: rfkill: Cleanup error handling in rfkill_init()

2016-12-07 Thread Michał Kępień
Use a separate label per error condition in rfkill_init() to make it a
bit cleaner and easier to extend.

Signed-off-by: Michał Kępień 
---
No changes from v1.

 net/rfkill/core.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 884027f..f28e441 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1266,24 +1266,25 @@ static int __init rfkill_init(void)
 
	error = class_register(&rfkill_class);
if (error)
-   goto out;
+   goto error_class;
 
	error = misc_register(&rfkill_miscdev);
-	if (error) {
-		class_unregister(&rfkill_class);
-   goto out;
-   }
+   if (error)
+   goto error_misc;
 
 #ifdef CONFIG_RFKILL_INPUT
error = rfkill_handler_init();
-   if (error) {
-		misc_deregister(&rfkill_miscdev);
-		class_unregister(&rfkill_class);
-   goto out;
-   }
+   if (error)
+   goto error_input;
 #endif
 
- out:
+   return 0;
+
+error_input:
+	misc_deregister(&rfkill_miscdev);
+error_misc:
+	class_unregister(&rfkill_class);
+error_class:
return error;
 }
 subsys_initcall(rfkill_init);
-- 
2.10.2



Re: [PATCH] sh_eth: add wake-on-lan support via magic packet

2016-12-07 Thread Niklas Söderlund
Hi Geert,

Thanks for testing and your feedback.

On 2016-12-07 19:14:40 +0100, Geert Uytterhoeven wrote:
> Hi Niklas,
> 
> On Wed, Dec 7, 2016 at 5:28 PM, Niklas Söderlund
>  wrote:
> > Signed-off-by: Niklas Söderlund 
> 
> Thanks, works fine on r8a7791/koelsch!
> 
> Tested-by: Geert Uytterhoeven 
> 
> > --- a/drivers/net/ethernet/renesas/sh_eth.c
> > +++ b/drivers/net/ethernet/renesas/sh_eth.c
> > @@ -624,7 +624,7 @@ static struct sh_eth_cpu_data r8a779x_data = {
> >
> > .register_type  = SH_ETH_REG_FAST_RCAR,
> >
> > -   .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD,
> > +   .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD | ECSR_MPD,
> 
> Interestingly, the ECSR_MPD bit is already set for several SoCs.

Yes, I noticed that and my assumption was that it was set 'just in case' 
to clear any MagicPacket interrupts at probe time.

> 
> Hence adding ".magic = 1" to the entry for r8a7740 instantly gave me working
> WoL support on r8a7740/armadillo. Cool!

Cool, I will set ".magic = 1" for r8a7740 in v2.

> 
> > --- a/drivers/net/ethernet/renesas/sh_eth.h
> > +++ b/drivers/net/ethernet/renesas/sh_eth.h
> > @@ -493,6 +493,7 @@ struct sh_eth_cpu_data {
> > unsigned shift_rd0:1;   /* shift Rx descriptor word 0 right by 16 */
> > unsigned rmiimode:1;/* EtherC has RMIIMODE register */
> > unsigned rtrate:1;  /* EtherC has RTRATE register */
> > +   unsigned magic:1;   /* EtherC have PMDE in ECMR and MPDIP in ECSIPR */
> 
> Instead of adding a new flag, perhaps you can just check for the ECSR_MPD flag
> in ecsr_value?

I briefly considered this but decided against it since I do not have
documentation for all versions of the device and no way to test it. You
tested and confirmed functionality on r8a7740, which leaves:

- sh7734-gether
- sh7763-gether
- sh7757-gether

still to be checked for whether they support MagicPacket in the same fashion
as r8a7740 and r8a779x. If anyone has access to documentation or hardware to
confirm this, I'd be more than happy to get rid of the magic flag in favor
of checking for ECSR_MPD in ecsr_value.
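
For the record, Geert's alternative would look roughly like this in the
ethtool get_wol handler (sketch only; field names taken from the current
sh_eth driver, not tested):

	struct sh_eth_private *mdp = netdev_priv(ndev);

	/* WoL usable if the per-SoC data already unmasks Magic Packet Detection */
	if (mdp->cd->ecsr_value & ECSR_MPD)
		wol->supported = WAKE_MAGIC;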

> 
> > @@ -529,6 +530,9 @@ struct sh_eth_private {
> > unsigned no_ether_link:1;
> > unsigned ether_link_active_low:1;
> > unsigned is_opened:1;
> > +
> > +   bool wol_enabled;
> 
> "unsigned wol_enabled:1", to merge with the bitfield above?

Thanks, looking at it now I don't know what I was thinking. I will 
change it for v2.

> 
> > +   struct clk *clk;
> 
> It's a good practice to keep all pointers at the top of the struct, to avoid
> gaps due to alignment restrictions, especially on 64-bit (I know that's not
> the case here).

Thanks, you learn new things everyday. I will move it for v2.

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- 
> ge...@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like 
> that.
> -- Linus Torvalds

-- 
Regards,
Niklas Söderlund


PROBLEM:

2016-12-07 Thread Tony

Hello,

NB. This is a re-send. I've been advised to send again as I gather I'm 
supposed to receive a URL back.


I am reporting this as requested below:

Anthony Buckley, the issue you are reporting is an upstream one. Could
you please report this problem following the instructions verbatim at
https://wiki.ubuntu.com/Bugs/Upstream/kernel  to the appropriate mailing
list (TO: Eric Dumazet, and David S. Miller, CC netdev)?

Please provide a direct URL to your post to the mailing list when it
becomes available so that it may be tracked.

Thank you for your help.

** Changed in: linux (Ubuntu)
   Status: Confirmed => Triaged

** Summary changed:

- Network scanner not detected by xsane after upgrade to 16.04
+ Network scanner not detected by xsane after kernel upgrade


Apologies if I have sent this to the wrong area(s). I'm a bit new to this.


Kernel.org format information

[1] One line summary of the problem:

Network scanner not detected by xsane after kernel upgrade

[2] Full description of the problem/report:

The scanner on my 'Epson WF-3520' multi-function is no longer detected
by xsane (and other scan apps.) when connected wirelessly to the network.
The problem occurs on a Dell 64 bit desktop, an Asus 64 bit laptop and a
Medion 32 bit laptop.
Printing works normally and the scanner is detected if connected via a
USB cable.
To reproduce, I turn on the scanner and start xsane. There is some delay
and then a 'no devices found' window appears.

[3] Keywords. Leave blank.

[4] Kernel version

cat /proc/version
Linux version 4.9.0-040900rc4-generic (kernel@tangerine) (gcc version 
6.2.0 20161005 (Ubuntu 6.2.0-5ubuntu12) ) #201611052031 SMP Sun Nov 6 
00:33:05 UTC 2016


[5] Not applicable

[6] Not applicable

[7] Environment

lsb_release -rd
Description:Ubuntu 16.04.1 LTS
Release:16.04

[7.1] Software (add the output of the ver_linux script here)

If some fields are empty or look unusual you may have an old version.
Compare to the current minimal requirements in Documentation/Changes.

Linux Handel 4.9.0-040900rc4-generic #201611052031 SMP Sun Nov 6 
00:33:05 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux


GNU C   5.4.0
GNU Make4.1
Binutils2.26.1
Util-linux  2.27.1
Mount   2.27.1
Module-init-tools   22
E2fsprogs   1.42.13
Pcmciautils 018
PPP 2.4.7
Linux C Library 2.23
Dynamic linker (ldd)2.23
Linux C++ Library   6.0.21
Procps  3.3.10
Net-tools   1.60
Kbd 1.15.5
Console-tools   1.15.5
Sh-utils8.25
Udev229
Wireless-tools  30
Modules Loaded  amdgpu amd_iommu_v2 amdkfd autofs4 binfmt_misc 
bluetooth bnep btbcm btintel btrtl btusb coretemp crc_itu_t dcdbas 
dell_smm_hwmon drm drm_kms_helper e1000e edac_core fb_sys_fops 
firewire_core firewire_ohci fjes gpio_ich hid hid_generic i2c_algo_bit 
i5500_temp i7core_edac input_leds intel_cstate ip6table_filter 
ip6_tables ip6t_REJECT ip6t_rt iptable_filter ip_tables ipt_REJECT 
irqbypass joydev kvm kvm_intel lp lpc_ich mac_hid nf_conntrack 
nf_conntrack_broadcast nf_conntrack_ftp nf_conntrack_ipv4 
nf_conntrack_ipv6 nf_conntrack_netbios_ns nf_defrag_ipv4 nf_defrag_ipv6 
nf_log_common nf_log_ipv4 nf_log_ipv6 nf_nat nf_nat_ftp nf_reject_ipv4 
nf_reject_ipv6 parport parport_pc pata_acpi ppdev pps_core psmouse ptp 
radeon rfcomm serio_raw shpchp snd snd_hda_codec snd_hda_codec_generic 
snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_core snd_hda_intel 
snd_hwdep snd_pcm snd_rawmidi snd_seq snd_seq_device snd_seq_midi 
snd_seq_midi_event snd_timer soundcore syscopyarea sysfillrect sysimgblt 
ttm uas usbhid usb_storage x_tables xt_addrtype xt_conntrack xt_hl 
xt_limit xt_LOG xt_multiport xt_recent xt_tcpudp


[7.2] Processor information

cat /proc/cpuinfo
processor: 0
vendor_id: GenuineIntel
cpu family: 6
model: 26
model name: Intel(R) Core(TM) i7 CPU 920  @ 2.67GHz
stepping: 4
microcode: 0x11
cpu MHz: 1600.000
cache size: 8192 KB
physical id: 0
siblings: 8
core id: 0
cpu cores: 4
apicid: 0
initial apicid: 0
fpu: yes
fpu_exception: yes
cpuid level: 11
wp: yes
flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca 
cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall 
nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology 
nonstop_tsc aperfmperf eagerfpu pni dtes64 monitor ds_cpl vmx est tm2 
ssse3 cx16 xtpr pdcm sse4_1 sse4_2 popcnt lahf_lm tpr_shadow vnmi 
flexpriority ept vpid dtherm ida

bugs:
bogomips: 5320.35
clflush size: 64
cache_alignment: 64
address sizes: 36 bits physical, 48 bits virtual
power management:

processor: 1
vendor_id: GenuineIntel
cpu family: 6
model: 26
model name: Intel(R) Core(TM) i7 

[PATCH] ethtool: add one ethtool option to set relax ordering mode

2016-12-07 Thread Mao Wenan
This patch provides a way to enable/disable the IXGBE NIC TX and RX
relaxed ordering mode from ethtool.
Relaxed ordering is a mode of the 82599 NIC; enabling it can improve
performance on some CPU architectures.
Example:
ethtool -s enp1s0f0 relaxorder off
ethtool -s enp1s0f0 relaxorder on

Signed-off-by: Mao Wenan 
---
 ethtool-copy.h |  6 ++
 ethtool.c  | 24 +++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/ethtool-copy.h b/ethtool-copy.h
index 3d299e3..37d93be 100644
--- a/ethtool-copy.h
+++ b/ethtool-copy.h
@@ -1329,6 +1329,8 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_PHY_GTUNABLE   0x004e /* Get PHY tunable configuration */
 #define ETHTOOL_PHY_STUNABLE   0x004f /* Set PHY tunable configuration */
 
+#define ETHTOOL_SRELAXORDER	0x0050 /* Set relax ordering mode, on or off */
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET ETHTOOL_GSET
 #define SPARC_ETH_SSET ETHTOOL_SSET
@@ -1558,6 +1560,10 @@ static __inline__ int ethtool_validate_duplex(__u8 
duplex)
 #define WAKE_MAGIC (1 << 5)
 #define WAKE_MAGICSECURE   (1 << 6) /* only meaningful if WAKE_MAGIC */
 
+/* Relax Ordering mode, on or off. */
+#define RELAXORDER_OFF 0x00
+#define RELAXORDER_ON  0x01
+
 /* L2-L4 network traffic flow types */
 #defineTCP_V4_FLOW 0x01/* hash or spec (tcp_ip4_spec) */
 #defineUDP_V4_FLOW 0x02/* hash or spec (udp_ip4_spec) */
diff --git a/ethtool.c b/ethtool.c
index 7af039e..acafd71 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -2738,6 +2738,8 @@ static int do_sset(struct cmd_context *ctx)
int msglvl_changed = 0;
u32 msglvl_wanted = 0;
u32 msglvl_mask = 0;
+   int relaxorder_wanted = -1;
+   int relaxorder_changed = 0;
struct cmdline_info cmdline_msglvl[ARRAY_SIZE(flags_msglvl)];
int argc = ctx->argc;
char **argp = ctx->argp;
@@ -2873,6 +2875,16 @@ static int do_sset(struct cmd_context *ctx)
ARRAY_SIZE(cmdline_msglvl));
break;
}
+   } else if (!strcmp(argp[i], "relaxorder")) {
+   relaxorder_changed = 1;
+   i += 1;
+   if (i >= argc)
+   exit_bad_args();
+   if (!strcmp(argp[i], "on"))
+   relaxorder_wanted = RELAXORDER_ON;
+   else if (!strcmp(argp[i], "off"))
+   relaxorder_wanted = RELAXORDER_OFF;
+			else
+				exit_bad_args();
} else {
exit_bad_args();
}
@@ -3093,6 +3105,15 @@ static int do_sset(struct cmd_context *ctx)
}
}
 
+   if (relaxorder_changed) {
+   struct ethtool_value edata;
+
+   edata.cmd = ETHTOOL_SRELAXORDER;
+   edata.data = relaxorder_wanted;
+		err = send_ioctl(ctx, &edata);
+   if (err < 0)
+   perror("Cannot set relax ordering mode");
+   }
return 0;
 }
 
@@ -4690,7 +4711,8 @@ static const struct option {
  " [ xcvr internal|external ]\n"
  " [ wol p|u|m|b|a|g|s|d... ]\n"
  " [ sopass %x:%x:%x:%x:%x:%x ]\n"
- " [ msglvl %d | msglvl type on|off ... ]\n" },
+ " [ msglvl %d | msglvl type on|off ... ]\n"
+ " [ relaxorder on|off ]\n" },
{ "-a|--show-pause", 1, do_gpause, "Show pause options" },
{ "-A|--pause", 1, do_spause, "Set pause options",
  " [ autoneg on|off ]\n"
-- 
2.7.0




[PATCH] net: add one ethtool option to set relax ordering mode

2016-12-07 Thread Mao Wenan
This patch provides a way to enable/disable the IXGBE NIC TX and RX
relaxed ordering mode from ethtool.
Relaxed ordering is a mode of the 82599 NIC; enabling it can improve
performance on some CPU architectures.
Example:
ethtool -s enp1s0f0 relaxorder off
ethtool -s enp1s0f0 relaxorder on

Signed-off-by: Mao Wenan 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 34 
 include/linux/ethtool.h  |  2 ++
 include/uapi/linux/ethtool.h |  6 +
 net/core/ethtool.c   |  5 
 4 files changed, 47 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index f49f803..9650539 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -493,6 +493,39 @@ static void ixgbe_set_msglevel(struct net_device *netdev, u32 data)
adapter->msg_enable = data;
 }
 
+static void ixgbe_set_relaxorder(struct net_device *netdev, u32 data)
+{
+   struct ixgbe_adapter *adapter = netdev_priv(netdev);
+	struct ixgbe_hw *hw = &adapter->hw;
+	u32 i = 0;
+	pr_info("set relax ordering mode: %s\n", data ? "on" : "off");
+
+   for (i = 0; i < hw->mac.max_tx_queues; i++) {
+   u32 regval;
+
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i));
+   if (data)
+   regval |= IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+   else
+   regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN;
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval);
+   }
+
+   for (i = 0; i < hw->mac.max_rx_queues; i++) {
+   u32 regval;
+   
+   regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i));
+   if (data)
+   regval |= (IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+   else
+   regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN |
+   IXGBE_DCA_RXCTRL_HEAD_WRO_EN);
+   IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval);
+   }
+
+}
+
 static int ixgbe_get_regs_len(struct net_device *netdev)
 {
 #define IXGBE_REGS_LEN  1139
@@ -3274,6 +3307,7 @@ static const struct ethtool_ops ixgbe_ethtool_ops = {
.get_ts_info= ixgbe_get_ts_info,
.get_module_info= ixgbe_get_module_info,
.get_module_eeprom  = ixgbe_get_module_eeprom,
+   .set_relaxorder = ixgbe_set_relaxorder,
 };
 
 void ixgbe_set_ethtool_ops(struct net_device *netdev)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 9ded8c6..0fae148 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -286,6 +286,7 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
  * fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
  * instead of the latter), any change to them will be overwritten
  * by kernel. Returns a negative error code or zero.
+ * @set_relaxorder: set relax ordering mode, on|off.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -372,5 +373,6 @@ struct ethtool_ops {
  struct ethtool_link_ksettings *);
int (*set_link_ksettings)(struct net_device *,
  const struct ethtool_link_ksettings *);
+   void(*set_relaxorder)(struct net_device *, u32);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 8e54723..86349b9 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1314,6 +1314,8 @@ struct ethtool_per_queue_op {
 #define ETHTOOL_GLINKSETTINGS  0x004c /* Get ethtool_link_settings */
 #define ETHTOOL_SLINKSETTINGS  0x004d /* Set ethtool_link_settings */
 
+#define ETHTOOL_SRELAXORDER	0x0050 /* Set relax ordering mode, on or off */
+
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET ETHTOOL_GSET
@@ -1494,6 +1496,10 @@ static inline int ethtool_validate_speed(__u32 speed)
 #define DUPLEX_FULL0x01
 #define DUPLEX_UNKNOWN 0xff
 
+/* Relax Ordering mode, on or off. */
+#define RELAXORDER_OFF  0x00
+#define RELAXORDER_ON   0x01
+
 static inline int ethtool_validate_duplex(__u8 duplex)
 {
switch (duplex) {
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 047a175..b7629d1 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2685,6 +2685,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SLINKSETTINGS:
rc = ethtool_set_link_ksettings(dev, useraddr);
break;
+   case ETHTOOL_SRELAXORDER:
+ 

Re: [net-next PATCH v5 5/6] virtio_net: add XDP_TX support

2016-12-07 Thread Michael S. Tsirkin
On Wed, Dec 07, 2016 at 12:12:45PM -0800, John Fastabend wrote:
> This adds support for the XDP_TX action to virtio_net. When an XDP
> program is run and returns the XDP_TX action the virtio_net XDP
> implementation will transmit the packet on a TX queue that aligns
> with the current CPU that the XDP packet was processed on.
> 
> Before sending the packet the header is zeroed.  Also XDP is expected
> to handle checksum correctly so no checksum offload  support is
> provided.
> 
> Signed-off-by: John Fastabend 
> ---
>  drivers/net/virtio_net.c |   99 
> +++---
>  1 file changed, 92 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 28b1196..8e5b13c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -330,12 +330,57 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
>   return skb;
>  }
>  
> +static void virtnet_xdp_xmit(struct virtnet_info *vi,
> +  struct receive_queue *rq,
> +  struct send_queue *sq,
> +  struct xdp_buff *xdp)
> +{
> + struct page *page = virt_to_head_page(xdp->data);
> + struct virtio_net_hdr_mrg_rxbuf *hdr;
> + unsigned int num_sg, len;
> + void *xdp_sent;
> + int err;
> +
> + /* Free up any pending old buffers before queueing new ones. */
> +	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> + struct page *sent_page = virt_to_head_page(xdp_sent);
> +
> + if (vi->mergeable_rx_bufs)
> + put_page(sent_page);
> + else
> + give_pages(rq, sent_page);
> + }

Looks like this is the only place where you do virtqueue_get_buf.
No interrupt handler?
This means that if you fill up the queue, nothing will clean it
and things will get stuck.
Can this be the issue you saw?


> +
> + /* Zero header and leave csum up to XDP layers */
> + hdr = xdp->data;
> + memset(hdr, 0, vi->hdr_len);
> +
> + num_sg = 1;
> + sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
> + err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> +xdp->data, GFP_ATOMIC);
> + if (unlikely(err)) {
> + if (vi->mergeable_rx_bufs)
> + put_page(page);
> + else
> + give_pages(rq, page);
> + } else if (!vi->mergeable_rx_bufs) {
> + /* If not mergeable bufs must be big packets so cleanup pages */
> + give_pages(rq, (struct page *)page->private);
> + page->private = 0;
> + }
> +
> + virtqueue_kick(sq->vq);

Is this unconditional kick a work-around for hang
we could not figure out yet?
I guess this helps because it just slows down the guest.
I don't much like it ...

> +}
> +
>  static u32 do_xdp_prog(struct virtnet_info *vi,
> +struct receive_queue *rq,
>  struct bpf_prog *xdp_prog,
>  struct page *page, int offset, int len)
>  {
>   int hdr_padded_len;
>   struct xdp_buff xdp;
> + unsigned int qp;
>   u32 act;
>   u8 *buf;
>  
> @@ -353,9 +398,15 @@ static u32 do_xdp_prog(struct virtnet_info *vi,
>   switch (act) {
>   case XDP_PASS:
>   return XDP_PASS;
> + case XDP_TX:
> + qp = vi->curr_queue_pairs -
> + vi->xdp_queue_pairs +
> + smp_processor_id();
> + xdp.data = buf + (vi->mergeable_rx_bufs ? 0 : 4);
> +		virtnet_xdp_xmit(vi, rq, &vi->sq[qp], &xdp);
> + return XDP_TX;
>   default:
>   bpf_warn_invalid_xdp_action(act);
> - case XDP_TX:
>   case XDP_ABORTED:
>   case XDP_DROP:
>   return XDP_DROP;
> @@ -390,9 +441,17 @@ static struct sk_buff *receive_big(struct net_device 
> *dev,
>  
>   if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>   goto err_xdp;
> - act = do_xdp_prog(vi, xdp_prog, page, 0, len);
> - if (act == XDP_DROP)
> + act = do_xdp_prog(vi, rq, xdp_prog, page, 0, len);
> + switch (act) {
> + case XDP_PASS:
> + break;
> + case XDP_TX:
> + rcu_read_unlock();
> + goto xdp_xmit;
> + case XDP_DROP:
> + default:
>   goto err_xdp;
> + }
>   }
>   rcu_read_unlock();
>  
> @@ -407,6 +466,7 @@ static struct sk_buff *receive_big(struct net_device *dev,
>  err:
>   dev->stats.rx_dropped++;
>   give_pages(rq, page);
> +xdp_xmit:
>   return NULL;
>  }
>  
> @@ -425,6 +485,8 @@ static struct sk_buff *receive_mergeable(struct 
> net_device *dev,
>   struct bpf_prog *xdp_prog;
>   unsigned int truesize;
>  
> + head_skb = NULL;
> +
>   

Re: [net-next PATCH v5 4/6] virtio_net: add dedicated XDP transmit queues

2016-12-07 Thread Michael S. Tsirkin
On Wed, Dec 07, 2016 at 12:12:23PM -0800, John Fastabend wrote:
> XDP requires using isolated transmit queues to avoid interference
> with normal networking stack (BQL, NETDEV_TX_BUSY, etc).
> This patch
> adds a XDP queue per cpu when a XDP program is loaded and does not
> expose the queues to the OS via the normal API call to
> netif_set_real_num_tx_queues(). This way the stack will never push
> an skb to these queues.
> 
> However virtio/vhost/qemu implementation only allows for creating
> TX/RX queue pairs at this time so creating only TX queues was not
> possible. And because the associated RX queues are being created I
> went ahead and exposed these to the stack and let the backend use
> them. This creates more RX queues visible to the network stack than
> TX queues which is worth mentioning but does not cause any issues as
> far as I can tell.
> 
> Signed-off-by: John Fastabend 
> ---
>  drivers/net/virtio_net.c |   30 --
>  1 file changed, 28 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a009299..28b1196 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -114,6 +114,9 @@ struct virtnet_info {
>   /* # of queue pairs currently used by the driver */
>   u16 curr_queue_pairs;
>  
> + /* # of XDP queue pairs currently used by the driver */
> + u16 xdp_queue_pairs;
> +
>   /* I like... big packets and I cannot lie! */
>   bool big_packets;
>  
> @@ -1547,7 +1550,8 @@ static int virtnet_xdp_set(struct net_device *dev, 
> struct bpf_prog *prog)
>   unsigned long int max_sz = PAGE_SIZE - sizeof(struct padded_vnet_hdr);
>   struct virtnet_info *vi = netdev_priv(dev);
>   struct bpf_prog *old_prog;
> - int i;
> + u16 xdp_qp = 0, curr_qp;
> + int i, err;
>  
>   if ((dev->features & NETIF_F_LRO) && prog) {
>   netdev_warn(dev, "can't set XDP while LRO is on, disable LRO 
> first\n");
> @@ -1564,12 +1568,34 @@ static int virtnet_xdp_set(struct net_device *dev, 
> struct bpf_prog *prog)
>   return -EINVAL;
>   }
>  
> + curr_qp = vi->curr_queue_pairs - vi->xdp_queue_pairs;
> + if (prog)
> + xdp_qp = nr_cpu_ids;
> +
> + /* XDP requires extra queues for XDP_TX */
> + if (curr_qp + xdp_qp > vi->max_queue_pairs) {
> + netdev_warn(dev, "request %i queues but max is %i\n",
> + curr_qp + xdp_qp, vi->max_queue_pairs);
> + return -ENOMEM;
> + }

Can't we disable XDP_TX somehow? Many people might only want RX drop,
and extra queues are not always there.


> +
> + err = virtnet_set_queues(vi, curr_qp + xdp_qp);
> + if (err) {
> + dev_warn(>dev, "XDP Device queue allocation failure.\n");
> + return err;
> + }
> +
>   if (prog) {
>   prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
> - if (IS_ERR(prog))
> + if (IS_ERR(prog)) {
> + virtnet_set_queues(vi, curr_qp);
>   return PTR_ERR(prog);
> + }
>   }
>  
> + vi->xdp_queue_pairs = xdp_qp;
> + netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
> +
>   for (i = 0; i < vi->max_queue_pairs; i++) {
>   old_prog = rtnl_dereference(vi->rq[i].xdp_prog);
>   rcu_assign_pointer(vi->rq[i].xdp_prog, prog);


Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support

2016-12-07 Thread Michael S. Tsirkin
On Wed, Dec 07, 2016 at 09:14:48PM -0800, John Fastabend wrote:
> On 16-12-07 08:48 PM, Michael S. Tsirkin wrote:
> > On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
> >> From: John Fastabend 
> >>
> >> This adds XDP support to virtio_net. Some requirements must be
> >> met for XDP to be enabled depending on the mode. First it will
> >> only be supported with LRO disabled so that data is not pushed
> >> across multiple buffers. Second the MTU must be less than a page
> >> size to avoid having to handle XDP across multiple pages.
> >>
> >> If mergeable receive is enabled this patch only supports the case
> >> where header and data are in the same buf which we can check when
> >> a packet is received by looking at num_buf. If the num_buf is
> >> greater than 1 and a XDP program is loaded the packet is dropped
> >> and a warning is thrown. When any_header_sg is set this does not
> >> happen and both header and data is put in a single buffer as expected
> >> so we check this when XDP programs are loaded.  Subsequent patches
> >> will process the packet in a degraded mode to ensure connectivity
> >> and correctness is not lost even if backend pushes packets into
> >> multiple buffers.
> >>
> >> If big packets mode is enabled and MTU/LRO conditions above are
> >> met then XDP is allowed.
> >>
> >> This patch was tested with qemu with vhost=on and vhost=off where
> >> mergeable and big_packet modes were forced via hard coding feature
> >> negotiation. Multiple buffers per packet was forced via a small
> >> test patch to vhost.c in the vhost=on qemu mode.
> >>
> >> Suggested-by: Shrijeet Mukherjee 
> >> Signed-off-by: John Fastabend 
> > 
> > I'd like to note that I don't think disabling LRO is a good
> > plan long-term. It's really important for virtio performance,
> > so IMHO we need a fix for that.
> > I'm guessing that a subset of XDP programs would be quite
> > happy with just looking at headers, and that is there in the 1st buffer.
> > So how about teaching XDP that there could be a truncated packet?
> > 
> > Then we won't have to disable LRO.
> > 
> 
> Agreed long-term we can drop this requirement this type of improvement
> would also allow working with jumbo frames on nics.
> 
> I don't think it should block this patch series though.
> 
> .John

Right.



Re: [PATCH] linux/types.h: enable endian checks for all sparse builds

2016-12-07 Thread Bart Van Assche
On 12/07/16 18:29, Michael S. Tsirkin wrote:
> By now, linux is mostly endian-clean. Enabling endian-ness
> checks for everyone produces about 200 new sparse warnings for me -
> less than 10% over the 2000 sparse warnings already there.
>
> Not a big deal, OTOH enabling this helps people notice
> they are introducing new bugs.
>
> So let's just drop __CHECK_ENDIAN__. Follow-up patches
> can drop distinction between __bitwise and __bitwise__.

Hello Michael,

This patch makes a whole bunch of ccflags-y += -D__CHECK_ENDIAN__ 
statements obsolete. Have you considered removing these statements?

Additionally, there are notable exceptions to the rule that most drivers 
are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it 
would remain possible to check such drivers with sparse without enabling 
endianness checks. Have you considered changing #ifdef __CHECK_ENDIAN__ 
into e.g. #ifndef __DONT_CHECK_ENDIAN__?

Thanks,

Bart.


Re: [PATCH] linux/types.h: enable endian checks for all sparse builds

2016-12-07 Thread Michael S. Tsirkin
On Thu, Dec 08, 2016 at 05:21:47AM +, Bart Van Assche wrote:
> On 12/07/16 18:29, Michael S. Tsirkin wrote:
> > By now, linux is mostly endian-clean. Enabling endian-ness
> > checks for everyone produces about 200 new sparse warnings for me -
> > less than 10% over the 2000 sparse warnings already there.
> >
> > Not a big deal, OTOH enabling this helps people notice
> > they are introducing new bugs.
> >
> > So let's just drop __CHECK_ENDIAN__. Follow-up patches
> > can drop distinction between __bitwise and __bitwise__.
> 
> Hello Michael,
> 
> This patch makes a whole bunch of ccflags-y += -D__CHECK_ENDIAN__ 
> statements obsolete. Have you considered removing these statements?

Absolutely. Just waiting for feedback on the idea.

> Additionally, there are notable exceptions to the rule that most drivers 
> are endian-clean, e.g. drivers/scsi/qla2xxx. I would appreciate it if it 
> would remain possible to check such drivers with sparse without enabling 
> endianness checks. Have you considered changing #ifdef __CHECK_ENDIAN__ 
> into e.g. #ifndef __DONT_CHECK_ENDIAN__?
> 
> Thanks,
> 
> Bart.

The right thing is probably just to fix these, isn't it?
Until then, why not just ignore the warnings?

-- 
MST


Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support

2016-12-07 Thread John Fastabend
On 16-12-07 08:48 PM, Michael S. Tsirkin wrote:
> On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
>> From: John Fastabend 
>>
>> This adds XDP support to virtio_net. Some requirements must be
>> met for XDP to be enabled depending on the mode. First it will
>> only be supported with LRO disabled so that data is not pushed
>> across multiple buffers. Second the MTU must be less than a page
>> size to avoid having to handle XDP across multiple pages.
>>
>> If mergeable receive is enabled this patch only supports the case
>> where header and data are in the same buf which we can check when
>> a packet is received by looking at num_buf. If the num_buf is
>> greater than 1 and a XDP program is loaded the packet is dropped
>> and a warning is thrown. When any_header_sg is set this does not
>> happen and both header and data is put in a single buffer as expected
>> so we check this when XDP programs are loaded.  Subsequent patches
>> will process the packet in a degraded mode to ensure connectivity
>> and correctness is not lost even if backend pushes packets into
>> multiple buffers.
>>
>> If big packets mode is enabled and MTU/LRO conditions above are
>> met then XDP is allowed.
>>
>> This patch was tested with qemu with vhost=on and vhost=off where
>> mergeable and big_packet modes were forced via hard coding feature
>> negotiation. Multiple buffers per packet was forced via a small
>> test patch to vhost.c in the vhost=on qemu mode.
>>
>> Suggested-by: Shrijeet Mukherjee 
>> Signed-off-by: John Fastabend 
> 
> I'd like to note that I don't think disabling LRO is a good
> plan long-term. It's really important for virtio performance,
> so IMHO we need a fix for that.
> I'm guessing that a subset of XDP programs would be quite
> happy with just looking at headers, and that is there in the 1st buffer.
> So how about teaching XDP that there could be a truncated packet?
> 
> Then we won't have to disable LRO.
> 

Agreed long-term we can drop this requirement this type of improvement
would also allow working with jumbo frames on nics.

I don't think it should block this patch series though.

.John


Re: Misalignment, MIPS, and ip_hdr(skb)->version

2016-12-07 Thread Daniel Kahn Gillmor
On Wed 2016-12-07 19:30:34 -0500, Hannes Frederic Sowa wrote:
> Your custom protocol should be designed in a way you get an aligned ip
> header. Most protocols of the IETF follow this mantra and it is always
> possible to e.g. pad options so you end up on aligned boundaries for the
> next header.

fwiw, i'm not convinced that "most protocols of the IETF follow this
mantra".  we've had multiple discussions in different protocol groups
about shaving or bloating by a few bytes here or there in different
protocols, and i don't think anyone has brought up memory alignment as
an argument in any of the discussions i've followed.

that said, it sure does sound like it would make things simpler to
construct the protocol that way :)
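
(for comparison, on the receive side drivers usually get an aligned IP header
by reserving NET_IP_ALIGN bytes of headroom before copying the frame in; a
minimal sketch, not tied to any particular driver:

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb)
		return NULL;
	skb_reserve(skb, NET_IP_ALIGN);	/* usually 2, so the 14-byte Ethernet
					 * header leaves the IP header aligned */
	memcpy(skb_put(skb, len), data, len);

a protocol that keeps its own headers a multiple of 4 bytes gets the same
effect for free.)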

  --dkg


Re: [net-next PATCH v5 3/6] virtio_net: Add XDP support

2016-12-07 Thread Michael S. Tsirkin
On Wed, Dec 07, 2016 at 12:11:57PM -0800, John Fastabend wrote:
> From: John Fastabend 
> 
> This adds XDP support to virtio_net. Some requirements must be
> met for XDP to be enabled depending on the mode. First it will
> only be supported with LRO disabled so that data is not pushed
> across multiple buffers. Second the MTU must be less than a page
> size to avoid having to handle XDP across multiple pages.
> 
> If mergeable receive is enabled this patch only supports the case
> where header and data are in the same buf which we can check when
> a packet is received by looking at num_buf. If the num_buf is
> greater than 1 and a XDP program is loaded the packet is dropped
> and a warning is thrown. When any_header_sg is set this does not
> happen and both header and data is put in a single buffer as expected
> so we check this when XDP programs are loaded.  Subsequent patches
> will process the packet in a degraded mode to ensure connectivity
> and correctness is not lost even if backend pushes packets into
> multiple buffers.
> 
> If big packets mode is enabled and MTU/LRO conditions above are
> met then XDP is allowed.
> 
> This patch was tested with qemu with vhost=on and vhost=off where
> mergeable and big_packet modes were forced via hard coding feature
> negotiation. Multiple buffers per packet was forced via a small
> test patch to vhost.c in the vhost=on qemu mode.
> 
> Suggested-by: Shrijeet Mukherjee 
> Signed-off-by: John Fastabend 

I'd like to note that I don't think disabling LRO is a good
plan long-term. It's really important for virtio performance,
so IMHO we need a fix for that.
I'm guessing that a subset of XDP programs would be quite
happy with just looking at headers, and that is there in the 1st buffer.
So how about teaching XDP that there could be a truncated packet?

Then we won't have to disable LRO.


> ---
>  drivers/net/virtio_net.c |  175 
> +-
>  1 file changed, 170 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a5c47b1..a009299 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -81,6 +82,8 @@ struct receive_queue {
>  
>   struct napi_struct napi;
>  
> + struct bpf_prog __rcu *xdp_prog;
> +
>   /* Chain pages by the private ptr. */
>   struct page *pages;
>  
> @@ -324,6 +327,38 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> *vi,
>   return skb;
>  }
>  
> +static u32 do_xdp_prog(struct virtnet_info *vi,
> +struct bpf_prog *xdp_prog,
> +struct page *page, int offset, int len)
> +{
> + int hdr_padded_len;
> + struct xdp_buff xdp;
> + u32 act;
> + u8 *buf;
> +
> + buf = page_address(page) + offset;
> +
> + if (vi->mergeable_rx_bufs)
> + hdr_padded_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> + else
> + hdr_padded_len = sizeof(struct padded_vnet_hdr);
> +
> + xdp.data = buf + hdr_padded_len;
> + xdp.data_end = xdp.data + (len - vi->hdr_len);
> +
> +	act = bpf_prog_run_xdp(xdp_prog, &xdp);
> + switch (act) {
> + case XDP_PASS:
> + return XDP_PASS;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + case XDP_TX:
> + case XDP_ABORTED:
> + case XDP_DROP:
> + return XDP_DROP;
> + }
> +}
> +
>  static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, 
> unsigned int len)
>  {
>   struct sk_buff * skb = buf;
> @@ -340,14 +375,32 @@ static struct sk_buff *receive_big(struct net_device 
> *dev,
>  void *buf,
>  unsigned int len)
>  {
> + struct bpf_prog *xdp_prog;
>   struct page *page = buf;
> - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
> + struct sk_buff *skb;
>  
> + rcu_read_lock();
> + xdp_prog = rcu_dereference(rq->xdp_prog);
> + if (xdp_prog) {
> + struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
> + u32 act;
> +
> + if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
> + goto err_xdp;
> + act = do_xdp_prog(vi, xdp_prog, page, 0, len);
> + if (act == XDP_DROP)
> + goto err_xdp;
> + }
> + rcu_read_unlock();
> +
> + skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
>   if (unlikely(!skb))
>   goto err;
>  
>   return skb;
>  
> +err_xdp:
> + rcu_read_unlock();
>  err:
>   dev->stats.rx_dropped++;
>   give_pages(rq, page);
> @@ -365,11 +418,42 @@ static struct sk_buff *receive_mergeable(struct 
> net_device *dev,
>   u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
>   struct 

Re: net-next closing, README

2016-12-07 Thread Stephen Hemminger
On Wed, 07 Dec 2016 16:28:45 -0500 (EST)
David Miller  wrote:

> The merge window is about to open soon, and next week I will be
> having sporadic internet access while travelling around, therefore
> I am closing net-next up tonight.
> 
> Therefore, please do not submit any new features or cleanups for
> net-next.  Bug fixes for problems introduced in net-next are fine,
> however.
> 
> Thank you.

I have a couple of patches that I would like to get into net-next, but
it is not critical. They replace the hardcoded workarounds with code
that negotiates values with the host. Would these be acceptable?
Sorry for the delay but needed to test on oldest supported version
to ensure negotiation worked.


Re: net-next closing, README

2016-12-07 Thread Stephen Hemminger
On Wed, 7 Dec 2016 19:13:45 -0800
Stephen Hemminger  wrote:

> On Wed, 07 Dec 2016 16:28:45 -0500 (EST)
> David Miller  wrote:
> 
> > The merge window is about to open soon, and next week I will be
> > having sporadic internet access while travelling around, therefore
> > I am closing net-next up tonight.
> > 
> > Therefore, please do not submit any new features or cleanups for
> > net-next.  Bug fixes for problems introduced in net-next are fine,
> > however.
> > 
> > Thank you.  
> 
> I have a couple of patches that I would like to get into net-next, but
> it is not critical. They replace the hardcoded workarounds with code
> that negotiates values with the host. Would these be acceptable?
> Sorry for the delay but needed to test on oldest supported version
> to ensure negotiation worked.

Never mind, although the changes work on older versions of Windows Server,
the performance would be worse.  Basically old servers don't do UDP checksum
offload but still are capable of handling TCP.  Let me work up a better
solution that handles both cases.



[PATCH net v2 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed

2016-12-07 Thread fgao
From: Gao Feng 

When ipvlan_set_port_mode() fails in ipvlan_link_new(), we need to
unlink the ipvlan dev from the upper dev with netdev_upper_dev_unlink().

Signed-off-by: Gao Feng 
---
 v2: Rename the label to unlink_netdev, per Mahesh Bandewar
 v1: Initial patch

 drivers/net/ipvlan/ipvlan_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 0fef178..dfbc4ef 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
}
err = ipvlan_set_port_mode(port, mode);
if (err) {
-   goto unregister_netdev;
+   goto unlink_netdev;
}
 
	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
netif_stacked_transfer_operstate(phy_dev, dev);
return 0;
 
+unlink_netdev:
+   netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
unregister_netdevice(dev);
 destroy_ipvlan_port:
-- 
1.9.1




Re: commit : ppp: add rtnetlink device creation support - breaks netcf on my machine.

2016-12-07 Thread Brad Campbell

On 08/12/16 01:43, Thomas Haller wrote:

On Tue, 2016-12-06 at 17:12 -0600, Dan Williams wrote:



libnl1 rejects the IFLA_INFO_DATA attribute because it expects it
to
contain a sub-attribute. Since the payload size is zero it doesn't
match the policy and parsing fails.

There's no problem with libnl3 because its policy accepts empty
payloads for NLA_NESTED attributes (see libnl3 commit 4be02ace4826


Hi,

libnl1 is unmaintained these days. I don't think it makes sense to
backport that patch. The last upstream release was 3+ years ago, with
no upstream development since then.

IMHO netcf should drop libnl-1 support.



G'day Thomas,

I'm not sure anyone was suggesting fixing libnl1, it was more around a 
discussion with regard to a change in the kernel breaking old userspace 
and whether it needs to be fixed in the kernel.


Personally, now that I have a solution to *my* immediate problem (that being 
that any kernel 4.7 or later prevented libvirtd from starting on my servers 
because my netcf was compiled against libnl1), I can upgrade the relevant 
userspace components to work around the issue.


Also, now that this issue is a number of months old and I appear to be the 
only person reporting it, maybe it's not worth tackling. I would 
absolutely say that netcf needs to drop libnl1 now though as it *is* 
broken on newer kernels under the right circumstances.


I appreciate the assistance in tracking it down anyway. Thanks guys.

Regards,
Brad



[PATCH] linux/types.h: enable endian checks for all sparse builds

2016-12-07 Thread Michael S. Tsirkin
By now, linux is mostly endian-clean. Enabling endian-ness
checks for everyone produces about 200 new sparse warnings for me -
less than 10% over the 2000 sparse warnings already there.

Not a big deal, OTOH enabling this helps people notice
they are introducing new bugs.

So let's just drop __CHECK_ENDIAN__. Follow-up patches
can drop distinction between __bitwise and __bitwise__.
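
To illustrate the class of bug this flags (example only, not from this patch):
with __bitwise always enabled, sparse complains about any direct mixing of
__le16/__be16 values with plain integers, e.g.

	static u16 example_get_len(__le16 wire_len)
	{
		u16 host_len;

		host_len = wire_len;		/* sparse: incorrect type in assignment
						 * (different base types) */
		host_len = le16_to_cpu(wire_len);	/* correct, no warning */

		return host_len;
	}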

Cc: Linus Torvalds 
Suggested-by: Christoph Hellwig 
Signed-off-by: Michael S. Tsirkin 
---

Linus, could you ack this for upstream? If yes I'll
merge through my tree as a replacement for enabling
this just for virtio.

 include/uapi/linux/types.h | 4 
 1 file changed, 4 deletions(-)

diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h
index acf0979..41e5914 100644
--- a/include/uapi/linux/types.h
+++ b/include/uapi/linux/types.h
@@ -23,11 +23,7 @@
 #else
 #define __bitwise__
 #endif
-#ifdef __CHECK_ENDIAN__
 #define __bitwise __bitwise__
-#else
-#define __bitwise
-#endif
 
 typedef __u16 __bitwise __le16;
 typedef __u16 __bitwise __be16;
-- 
MST


Re: [PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed

2016-12-07 Thread Gao Feng
On Thu, Dec 8, 2016 at 9:39 AM, Mahesh Bandewar (महेश बंडेवार)
 wrote:
> On Wed, Dec 7, 2016 at 5:21 PM,   wrote:
>> From: Gao Feng 
>>
>> When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
>> unlink the ipvlan dev with upper dev.
>>
>> Signed-off-by: Gao Feng 
>> ---
>>  drivers/net/ipvlan/ipvlan_main.c | 4 +++-
>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ipvlan/ipvlan_main.c 
>> b/drivers/net/ipvlan/ipvlan_main.c
>> index 0fef178..189adbc 100644
>> --- a/drivers/net/ipvlan/ipvlan_main.c
>> +++ b/drivers/net/ipvlan/ipvlan_main.c
>> @@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct 
>> net_device *dev,
>> }
>> err = ipvlan_set_port_mode(port, mode);
>> if (err) {
>> -   goto unregister_netdev;
>> +   goto dev_unlink;
>> }
>>
>> list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
>> netif_stacked_transfer_operstate(phy_dev, dev);
>> return 0;
>>
>> +dev_unlink:
> probably an 'unlink_netdev' label, in line with the other labels used. Thanks.

OK, it is better name.
I will follow it and send v2 update.

Regards
Feng

>> +   netdev_upper_dev_unlink(phy_dev, dev);
>>  unregister_netdev:
>> unregister_netdevice(dev);
>>  destroy_ipvlan_port:
>> --
>> 1.9.1
>>
>>




Re: [PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed

2016-12-07 Thread महेश बंडेवार
On Wed, Dec 7, 2016 at 5:21 PM,   wrote:
> From: Gao Feng 
>
> When netdev_upper_dev_unlink failed in ipvlan_link_new, need to
> unlink the ipvlan dev with upper dev.
>
> Signed-off-by: Gao Feng 
> ---
>  drivers/net/ipvlan/ipvlan_main.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ipvlan/ipvlan_main.c 
> b/drivers/net/ipvlan/ipvlan_main.c
> index 0fef178..189adbc 100644
> --- a/drivers/net/ipvlan/ipvlan_main.c
> +++ b/drivers/net/ipvlan/ipvlan_main.c
> @@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct 
> net_device *dev,
> }
> err = ipvlan_set_port_mode(port, mode);
> if (err) {
> -   goto unregister_netdev;
> +   goto dev_unlink;
> }
>
> list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
> netif_stacked_transfer_operstate(phy_dev, dev);
> return 0;
>
> +dev_unlink:
probably an 'unlink_netdev' label, in line with the other labels used. Thanks.
> +   netdev_upper_dev_unlink(phy_dev, dev);
>  unregister_netdev:
> unregister_netdevice(dev);
>  destroy_ipvlan_port:
> --
> 1.9.1
>
>


Re: [Intel-wired-lan] [PATCH 1/1] ixgbe: fcoe: return value of skb_linearize should be handled

2016-12-07 Thread Rustad, Mark D

Zhouyi Zhou  wrote:

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

index fee1f29..4926d48 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2173,8 +2173,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector  
*q_vector,

total_rx_bytes += ddp_bytes;
total_rx_packets += DIV_ROUND_UP(ddp_bytes,
 mss);
-   }
-   if (!ddp_bytes) {
+   } else {
dev_kfree_skb_any(skb);
continue;
}


This is changing the logic by treating a negative ddp_bytes value (an error  
return) the same as a 0 value. This is probably wrong and inappropriate for  
this patch in any case.
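
A sketch of the distinction being pointed out here (not the ixgbe code; the
helper name and the counters are hypothetical): keep the negative error return
separate from the legitimate "zero DDP bytes" case instead of folding both
into the same branch.

#include <linux/kernel.h>
#include <linux/types.h>

static bool demo_account_ddp(int ddp_bytes, unsigned int mss,
			     u64 *rx_bytes, u64 *rx_pkts, u64 *rx_errors)
{
	if (ddp_bytes > 0) {
		*rx_bytes += ddp_bytes;
		*rx_pkts += DIV_ROUND_UP(ddp_bytes, mss);
		return true;			/* keep processing the skb */
	}

	if (ddp_bytes < 0)
		(*rx_errors)++;			/* error return, not "no data" */

	return false;				/* caller drops the skb */
}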


--
Mark Rustad, Networking Division, Intel Corporation


signature.asc
Description: Message signed with OpenPGP using GPGMail


[PATCH net 1/1] driver: ipvlan: Unlink the upper dev when ipvlan_link_new failed

2016-12-07 Thread fgao
From: Gao Feng 

When ipvlan_set_port_mode fails in ipvlan_link_new, we need to
unlink the ipvlan dev from the upper dev before unregistering it.

Signed-off-by: Gao Feng 
---
 drivers/net/ipvlan/ipvlan_main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 0fef178..189adbc 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -546,13 +546,15 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
}
err = ipvlan_set_port_mode(port, mode);
if (err) {
-   goto unregister_netdev;
+   goto dev_unlink;
}
 
list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
netif_stacked_transfer_operstate(phy_dev, dev);
return 0;
 
+dev_unlink:
+   netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
unregister_netdevice(dev);
 destroy_ipvlan_port:
-- 
1.9.1
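
The fix follows the usual kernel unwind pattern: each label undoes the step set
up just before the failing call, in reverse order of setup. A standalone sketch
with hypothetical stand-in functions (not the ipvlan code itself):

/* stand-ins for the real setup/teardown steps (hypothetical) */
static int  demo_register_netdevice(void)   { return 0; }
static void demo_unregister_netdevice(void) { }
static int  demo_upper_dev_link(void)       { return 0; }
static void demo_upper_dev_unlink(void)     { }
static int  demo_set_port_mode(void)        { return 0; }

static int demo_link_new(void)
{
	int err;

	err = demo_register_netdevice();
	if (err)
		return err;

	err = demo_upper_dev_link();
	if (err)
		goto unregister_netdev;

	err = demo_set_port_mode();
	if (err)
		goto unlink_netdev;	/* undo the upper-dev link first */

	return 0;

unlink_netdev:
	demo_upper_dev_unlink();
unregister_netdev:
	demo_unregister_netdevice();
	return err;
}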




Re: [PATCH 1/1] ixgbe: fcoe: return value of skb_linearize should be handled

2016-12-07 Thread Zhouyi Zhou
Thanks Jeff for your advice.
Sorry for my inexperience as a Linux kernel rookie.

Zhouyi

On Thu, Dec 8, 2016 at 1:30 AM, Jeff Kirsher
 wrote:
> On Wed, 2016-12-07 at 15:43 +0800, Zhouyi Zhou wrote:
>> Signed-off-by: Zhouyi Zhou 
>> Reviewed-by: Cong Wang 
>> Reviewed-by: Yuval Shaia 
>> Reviewed-by: Eric Dumazet 
>> ---
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c | 6 +-
>>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +--
>>  2 files changed, 6 insertions(+), 3 deletions(-)
>
> Did Cong, Yuval and Eric give their Reviewed-by offline?  I see they made
> comments and suggestions, but never saw them actually give you their
> Reviewed-by.  You cannot automatically add their Reviewed-by, Signed-off-by,
> etc. just because someone provides feedback on your patch.


RE: [net-next v2 02/19] i40e: simplify txd use count calculation

2016-12-07 Thread Duyck, Alexander H
> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Wednesday, December 7, 2016 5:04 PM
> To: Duyck, Alexander H 
> Cc: Kirsher, Jeffrey T ; da...@davemloft.net;
> Williams, Mitch A ; netdev@vger.kernel.org;
> nhor...@redhat.com; sassm...@redhat.com; jogre...@redhat.com;
> guru.anbalag...@oracle.com
> Subject: Re: [net-next v2 02/19] i40e: simplify txd use count calculation
> 
> On Thu, 2016-12-08 at 00:35 +, Duyck, Alexander H wrote:
> 
> > Well there ends up being a few aspects to it.  First we don't need the
> > precision of a full 64b inverse multiplication, that is why we can get
> > away with multiplying by 85 and a shift.  The assumption is we should never
> > see a buffer larger than 64K for a TSO frame.  That being the case we
> > can do the same thing without having to use a 64b value which isn't an
> > option on 32b architectures.
> >
> > So basically what it comes down to is dealing with the "optimized for
> > size" kernel option, and 32b architectures not being able to do this.
> > Arguably both are corner cases but better to deal with them than take
> > a performance hit we don't have to.
> 
> ok ok ;)
> 
> Too bad the 65536 value is accepted, (is it ?) otherwise
> 
> unsigned int foo(unsigned short size)
> {
>   return size / 0x3000;
> }
> 
> -> generates the same kind of instructions, with maybe a better
> precision.
> 
> foo:
>   movzwl  %di, %eax
>   imull   $43691, %eax, %eax
>   shrl$29, %eax
>   ret

I haven't ever looked all that closely.  I'm assuming the frame size can 
probably exceed that by at least ETH_HLEN - 1, since the IP header length can 
theoretically reach 64K - 1 before we are maxed out.

We don't really need the extra precision anyway.  We will most likely be off 
by 1 in many cases anyway, since the actual hardware can handle up to 16K - 1 
in either the first and/or last buffer of any series: the real limit is the 
14 bits in the Tx descriptor used to report the buffer length, and we try to 
keep the split between buffers 4K aligned.

- Alex
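
The trade-off being discussed is easy to check from userspace. A standalone
sketch (not driver code) comparing the multiply-and-shift estimate against an
exact DIV_ROUND_UP(size, 12K) over the whole 64K range; the only differences
are the intentional off-by-one just above each multiple of 12K, which the
4K - 1 of alignment slack mentioned in the patch comment absorbs:

#include <stdio.h>

static unsigned int txd_estimate(unsigned int size)
{
	return ((size * 85) >> 20) + 1;		/* (size / 4K / 3) + 1 */
}

static unsigned int txd_exact(unsigned int size)
{
	return (size + 12288 - 1) / 12288;	/* DIV_ROUND_UP(size, 12K) */
}

int main(void)
{
	unsigned int size, diffs = 0;

	for (size = 1; size <= 65536; size++) {
		if (txd_estimate(size) != txd_exact(size)) {
			if (diffs < 3)
				printf("size %u: estimate %u, exact %u\n",
				       size, txd_estimate(size),
				       txd_exact(size));
			diffs++;
		}
	}
	printf("%u sizes differ over 1..64K\n", diffs);
	return 0;
}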


RE: [net-next] icmp: correct return value of icmp_rcv()

2016-12-07 Thread 张胜举
> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Wednesday, December 07, 2016 10:18 PM
> To: Zhang Shengju 
> Cc: netdev@vger.kernel.org
> Subject: Re: [net-next] icmp: correct return value of icmp_rcv()
> 
> On Wed, 2016-12-07 at 14:52 +0800, Zhang Shengju wrote:
> > Currently, icmp_rcv() always return zero on a packet delivery upcall.
> >
> > To make its behavior more compliant with the way this API should be
> > used, this patch changes this to let it return NET_RX_SUCCESS when the
> > packet is proper handled, and NET_RX_DROP otherwise.
> >
> > Signed-off-by: Zhang Shengju 
> > ---
> >  net/ipv4/icmp.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 691146a..f79d7a8
> > 100644
> > --- a/net/ipv4/icmp.c
> > +++ b/net/ipv4/icmp.c
> > @@ -1047,12 +1047,12 @@ int icmp_rcv(struct sk_buff *skb)
> >
> > if (success)  {
> > consume_skb(skb);
> > -   return 0;
> > +   return NET_RX_SUCCESS;
> > }
> >
> >  drop:
> > kfree_skb(skb);
> > -   return 0;
> > +   return NET_RX_DROP;
> >  csum_error:
> > __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
> >  error:
> 
> 
> I am curious, what external/visible effects do you expect from such a change ?
> 
> We now have a very precise monitoring of where packets are dropped
> (consume_skb()/kfree_skb())
> 
> 

I know that the return value is always ignored; I just want to make it 
compliant with what this API requires, as I said in the comment.

Thanks,





Re: [net-next v2 02/19] i40e: simplify txd use count calculation

2016-12-07 Thread Eric Dumazet
On Thu, 2016-12-08 at 00:35 +, Duyck, Alexander H wrote:

> Well there ends up being a few aspects to it.  First we don't need the
> precision of a full 64b inverse multiplication, that is why we can get
> away with multiplying by 85 and a shift.  The assumption is we should never
> see a buffer larger than 64K for a TSO frame.  That being the case we
> can do the same thing without having to use a 64b value which isn't an
> option on 32b architectures.
> 
> So basically what it comes down to is dealing with the "optimized for
> size" kernel option, and 32b architectures not being able to do this.
> Arguably both are corner cases but better to deal with them than take
> a performance hit we don't have to.

ok ok ;)

Too bad the 65536 value is accepted, (is it ?) otherwise

unsigned int foo(unsigned short size)
{
return size / 0x3000;
}

-> generates the same kind of instructions, with maybe a better
precision.

foo:
movzwl  %di, %eax
imull   $43691, %eax, %eax
shrl$29, %eax
ret







[PATCH 1/1] orinoco: fix improper return value

2016-12-07 Thread Pan Bian
Function orinoco_ioctl_commit() returns 0 (indicating success) when the
call to orinoco_lock() fails. Thus, the return value is inconsistent with
the execution status. It may be better to return "-EBUSY" when the call 
to orinoco_lock() fails.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=188671

Signed-off-by: Pan Bian 
---
 drivers/net/wireless/intersil/orinoco/wext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/intersil/orinoco/wext.c 
b/drivers/net/wireless/intersil/orinoco/wext.c
index 1d4dae4..fee57ea 100644
--- a/drivers/net/wireless/intersil/orinoco/wext.c
+++ b/drivers/net/wireless/intersil/orinoco/wext.c
@@ -1314,7 +1314,7 @@ static int orinoco_ioctl_commit(struct net_device *dev,
return 0;
 
if (orinoco_lock(priv, &flags) != 0)
-   return err;
+   return -EBUSY;
 
err = orinoco_commit(priv);
 
-- 
1.9.1




Re: [PATCH 00/50] Netfilter/IPVS updates for net-next

2016-12-07 Thread David Miller
From: Pablo Neira Ayuso 
Date: Wed,  7 Dec 2016 22:52:06 +0100

> The following patchset contains a large Netfilter update for net-next,
> to summarise:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Pulled, thanks a lot Pablo.


Re: Misalignment, MIPS, and ip_hdr(skb)->version

2016-12-07 Thread David Miller
From: "Jason A. Donenfeld" 
Date: Thu, 8 Dec 2016 01:29:42 +0100

> On Wed, Dec 7, 2016 at 8:52 PM, David Miller  wrote:
>> The only truly difficult case to handle is GRE encapsulation.  Is
>> that the situation you are running into?
>>
>> If not, please figure out what the header configuration looks like
>> in the case that hits for you, and what the originating device is
>> just in case it is a device driver issue.
> 
> My case is my own driver and my own protocol, which uses a 13 byte
> header. I can, if absolutely necessary, change the protocol to add
> another byte of padding. Or I can choose not to decrypt in place but
> rather use a different trick, like overwriting the header during
> decryption, though this removes some of the scatterwalk optimizations
> when src and dst are the same. Or something else. I wrote the top
> email of this thread inquiring about just exactly how bad it is to
> call netif_rx(skb) when skb->data is unaligned.

You really have to land the IP header on a proper 4 byte boundary.

I would suggest pushing 3 dummy garbage bytes of padding at the front
or the end of your header.
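
A sketch of the padding idea (not from any driver, and assuming the outer
header itself starts on a 4-byte boundary): rounding the 13-byte header up to
16 keeps the inner IP header 4-byte aligned once the outer header is pulled.

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

#define DEMO_HDR_LEN	13
#define DEMO_HDR_PAD	(ALIGN(DEMO_HDR_LEN, 4) - DEMO_HDR_LEN)	/* 3 bytes */

static void demo_rx(struct sk_buff *skb)
{
	/* outer header plus padding is a multiple of 4, so the IP header
	 * that follows stays aligned after the pull
	 */
	skb_pull(skb, DEMO_HDR_LEN + DEMO_HDR_PAD);
	skb_reset_network_header(skb);
	netif_rx(skb);
}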


Re: Misalignment, MIPS, and ip_hdr(skb)->version

2016-12-07 Thread Jason A. Donenfeld
On Wed, Dec 7, 2016 at 8:52 PM, David Miller  wrote:
> The only truly difficult case to handle is GRE encapsulation.  Is
> that the situation you are running into?
>
> If not, please figure out what the header configuration looks like
> in the case that hits for you, and what the originating device is
> just in case it is a device driver issue.

My case is my own driver and my own protocol, which uses a 13 byte
header. I can, if absolutely necessary, change the protocol to add
another byte of padding. Or I can choose not to decrypt in place but
rather use a different trick, like overwriting the header during
decryption, though this removes some of the scatterwalk optimizations
when src and dst are the same. Or something else. I wrote the top
email of this thread inquiring about just exactly how bad it is to
call netif_rx(skb) when skb->data is unaligned.


RE: [net-next v2 02/19] i40e: simplify txd use count calculation

2016-12-07 Thread Duyck, Alexander H
> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Wednesday, December 7, 2016 4:16 PM
> To: Kirsher, Jeffrey T 
> Cc: da...@davemloft.net; Williams, Mitch A ;
> netdev@vger.kernel.org; nhor...@redhat.com; sassm...@redhat.com;
> jogre...@redhat.com; guru.anbalag...@oracle.com; Duyck, Alexander H
> 
> Subject: Re: [net-next v2 02/19] i40e: simplify txd use count calculation
> 
> On Wed, 2016-12-07 at 14:19 -0800, Jeff Kirsher wrote:
> > From: Mitch Williams 
> >
> > The i40e_txd_use_count function was fast but confusing. In the
> > comments, it even admits that it's ugly. So replace it with a new
> > function that is
> > (very) slightly faster and has extensive commenting to help the
> > thicker among us (including the author, who will forget in a week)
> > understand how it works.
> >
> > Change-ID: Ifb533f13786a0bf39cb29f77969a5be2c83d9a87
> > Signed-off-by: Mitch Williams 
> > Signed-off-by: Alexander Duyck 
> > Tested-by: Andrew Bowers 
> > Signed-off-by: Jeff Kirsher 
> > ---
> >  drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 45 
> > +-
> -
> >  drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 45
> > +--
> >  2 files changed, 56 insertions(+), 34 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > index de8550f..e065321 100644
> > --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> > @@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union
> > i40e_rx_desc *rx_desc,  #define I40E_MAX_DATA_PER_TXD_ALIGNED \
> > (I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
> >
> > -/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where
> > X is
> > - * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the
> > fact
> > - * that 12K is not a power of 2 and division is expensive.  It is
> > used to
> > - * approximate the number of descriptors used per linear buffer.
> > Note
> > - * that this will overestimate in some cases as it doesn't account
> > for the
> > - * fact that we will add up to 4K - 1 in aligning the 12K buffer,
> > however
> > - * the error should not impact things much as large buffers usually
> > mean
> > - * we will use fewer descriptors then there are frags in an skb.
> > +/**
> > + * i40e_txd_use_count  - estimate the number of descriptors needed
> > +for Tx
> > + * @size: transmit request size in bytes
> > + *
> > + * Due to hardware alignment restrictions (4K alignment), we need to
> > + * assume that we can have no more than 12K of data per descriptor,
> > +even
> > + * though each descriptor can take up to 16K - 1 bytes of aligned memory.
> > + * Thus, we need to divide by 12K. But division is slow! Instead,
> > + * we decompose the operation into shifts and one relatively cheap
> > + * multiply operation.
> > + *
> > + * To divide by 12K, we first divide by 4K, then divide by 3:
> > + * To divide by 4K, shift right by 12 bits
> > + * To divide by 3, multiply by 85, then divide by 256
> > + * (Divide by 256 is done by shifting right by 8 bits)
> > + * Finally, we add one to round up. Because 256 isn't an exact
> > +multiple of
> > + * 3, we'll underestimate near each multiple of 12K. This is actually
> > +more
> > + * accurate as we have 4K - 1 of wiggle room that we can fit into the
> > +last
> > + * segment.  For our purposes this is accurate out to 1M which is
> > +orders of
> > + * magnitude greater than our largest possible GSO size.
> > + *
> > + * This would then be implemented as:
> > + * return (((size >> 12) * 85) >> 8) + 1;
> > + *
> > + * Since multiplication and division are commutative, we can reorder
> > + * operations into:
> > + * return ((size * 85) >> 20) + 1;
> >   */
> >  static inline unsigned int i40e_txd_use_count(unsigned int size)  {
> > -   const unsigned int max = I40E_MAX_DATA_PER_TXD_ALIGNED;
> > -   const unsigned int reciprocal = ((1ull << 32) - 1 + (max / 2)) / max;
> > -   unsigned int adjust = ~(u32)0;
> > -
> > -   /* if we rounded up on the reciprocal pull down the adjustment */
> > -   if ((max * reciprocal) > adjust)
> > -   adjust = ~(u32)(reciprocal - 1);
> > -
> > -   return (u32)((((u64)size * reciprocal) + adjust) >> 32);
> > +   return ((size * 85) >> 20) + 1;
> >  }
> 
> But...
> 
> I thought gcc already implemented reciprocal divides ?
> 
> 
> $ cat div.c
> unsigned int foo(unsigned int size)
> {
>   return size / 0x3000;
> }
> $ gcc -O2 -S div.c && cat div.s
> foo:
> .LFB0:
>   .cfi_startproc
>   movl%edi, %eax
>   movl$-1431655765, %edx  // 0xaaab
>   mull%edx
>   shrl$13, %edx
>   movl  

Re: Misalignment, MIPS, and ip_hdr(skb)->version

2016-12-07 Thread Hannes Frederic Sowa
Hi Jason,

On 07.12.2016 19:35, Jason A. Donenfeld wrote:
> I receive encrypted packets with a 13 byte header. I decrypt the
> ciphertext in place, and then discard the header. I then pass the
> plaintext to the rest of the networking stack. The plaintext is an IP
> packet. Due to the 13 byte header that was discarded, the plaintext
> possibly begins at an unaligned location (depending on whether
> dev->needed_headroom was respected).
> 
> Does this matter? Is this bad? Will there be a necessary performance hit?

Your custom protocol should be designed in a way that you get an aligned IP
header. Most IETF protocols follow this mantra, and it is always possible
to e.g. pad options so you end up on an aligned boundary for the next
header.

GRE-TEB for example needs skb_copy_bits to extract the header so it can
access it in an aligned way.

> In order to find out, I instrumented the MIPS unaligned access
> exception handler to see where I was actually in trouble.
> Surprisingly, the only part of the stack that seemed to be upset was
> on calls to ip_hdr(skb)->version.
> 
> Two things disturb me about this. First, this seems too good to be
> true. Does it seem reasonable to you that this is actually the only
> place that would be problematic? Or was my testing methodology wrong
> to arrive at such an optimistic conclusion?
> 
> Secondly, why should a call to ip_hdr(skb)->version cause an unaligned
> access anyway? This struct member is simply the second half of a
> single byte in a bit field. I'd expect for the compiler to generate a
> single byte load, followed by a bitshift or a mask. Instead, the
> compiler appears to generate a double byte load, hence the exception.
> What's up with this? Stupid compiler that should be fixed? Some odd
> optimization? What to do?

I don't see an issue with that at all. Why do you think it could be a
problem?

Bye,
Hannes
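
If the bitfield access width ever does turn out to matter, the version nibble
can also be read with an explicit single-byte load; a sketch only, not a
proposed kernel change:

#include <linux/skbuff.h>
#include <linux/types.h>

static inline unsigned int demo_ip_version(const struct sk_buff *skb)
{
	const u8 *p = skb_network_header(skb);

	return p[0] >> 4;	/* version is the high nibble of the first byte */
}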



Re: [PATCH 2/2] [v2] net: qcom/emac: add support for the Qualcomm Technologies QDF2400

2016-12-07 Thread kbuild test robot
Hi Timur,

[auto build test ERROR on net-next/master]
[also build test ERROR on v4.9-rc8 next-20161207]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Timur-Tabi/net-qcom-emac-simplify-support-for-different-SOCs/20161208-064231
config: xtensa-allmodconfig (attached as .config)
compiler: xtensa-linux-gcc (GCC) 4.9.0
reproduce:
wget 
https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross
 -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=xtensa 

All errors (new ones prefixed by >>):

   drivers/net/ethernet/qualcomm/emac/emac-sgmii.c: In function 
'emac_sgmii_acpi_match':
>> drivers/net/ethernet/qualcomm/emac/emac-sgmii.c:178:3: error: implicit 
>> declaration of function 'acpi_evaluate_integer' 
>> [-Werror=implicit-function-declaration]
  status = acpi_evaluate_integer(handle, "_HRV", NULL, &hrv);
  ^
   cc1: some warnings being treated as errors

vim +/acpi_evaluate_integer +178 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c

   172  
   173  if (id) {
   174  acpi_handle handle = ACPI_HANDLE(dev);
   175  unsigned long long hrv;
   176  acpi_status status;
   177  
  > 178  status = acpi_evaluate_integer(handle, "_HRV", NULL, &hrv);
   179  if (status) {
   180  if (status == AE_NOT_FOUND)
   181  /* Older versions of the QDF2432 ACPI 
tables do

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip
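
One common way this kind of build failure is avoided (a sketch only, not the
actual fix for this report; the function name is hypothetical) is to compile
the _HRV lookup out when CONFIG_ACPI is not set, since acpi_evaluate_integer()
is only available in that configuration:

#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/errno.h>

static int demo_read_hrv(struct device *dev, unsigned long long *hrv)
{
#ifdef CONFIG_ACPI
	acpi_handle handle = ACPI_HANDLE(dev);
	acpi_status status;

	if (!handle)
		return -ENODEV;

	status = acpi_evaluate_integer(handle, "_HRV", NULL, hrv);
	return ACPI_FAILURE(status) ? -ENODEV : 0;
#else
	return -ENODEV;
#endif
}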


Re: [net-next v2 02/19] i40e: simplify txd use count calculation

2016-12-07 Thread Eric Dumazet
On Wed, 2016-12-07 at 14:19 -0800, Jeff Kirsher wrote:
> From: Mitch Williams 
> 
> The i40e_txd_use_count function was fast but confusing. In the comments,
> it even admits that it's ugly. So replace it with a new function that is
> (very) slightly faster and has extensive commenting to help the thicker
> among us (including the author, who will forget in a week) understand
> how it works.
> 
> Change-ID: Ifb533f13786a0bf39cb29f77969a5be2c83d9a87
> Signed-off-by: Mitch Williams 
> Signed-off-by: Alexander Duyck 
> Tested-by: Andrew Bowers 
> Signed-off-by: Jeff Kirsher 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 45 
> +--
>  drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 45 
> +--
>  2 files changed, 56 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h 
> b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> index de8550f..e065321 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
> @@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union i40e_rx_desc 
> *rx_desc,
>  #define I40E_MAX_DATA_PER_TXD_ALIGNED \
>   (I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
>  
> -/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where X is
> - * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the fact
> - * that 12K is not a power of 2 and division is expensive.  It is used to
> - * approximate the number of descriptors used per linear buffer.  Note
> - * that this will overestimate in some cases as it doesn't account for the
> - * fact that we will add up to 4K - 1 in aligning the 12K buffer, however
> - * the error should not impact things much as large buffers usually mean
> - * we will use fewer descriptors then there are frags in an skb.
> +/**
> + * i40e_txd_use_count  - estimate the number of descriptors needed for Tx
> + * @size: transmit request size in bytes
> + *
> + * Due to hardware alignment restrictions (4K alignment), we need to
> + * assume that we can have no more than 12K of data per descriptor, even
> + * though each descriptor can take up to 16K - 1 bytes of aligned memory.
> + * Thus, we need to divide by 12K. But division is slow! Instead,
> + * we decompose the operation into shifts and one relatively cheap
> + * multiply operation.
> + *
> + * To divide by 12K, we first divide by 4K, then divide by 3:
> + * To divide by 4K, shift right by 12 bits
> + * To divide by 3, multiply by 85, then divide by 256
> + * (Divide by 256 is done by shifting right by 8 bits)
> + * Finally, we add one to round up. Because 256 isn't an exact multiple of
> + * 3, we'll underestimate near each multiple of 12K. This is actually more
> + * accurate as we have 4K - 1 of wiggle room that we can fit into the last
> + * segment.  For our purposes this is accurate out to 1M which is orders of
> + * magnitude greater than our largest possible GSO size.
> + *
> + * This would then be implemented as:
> + * return (((size >> 12) * 85) >> 8) + 1;
> + *
> + * Since multiplication and division are commutative, we can reorder
> + * operations into:
> + * return ((size * 85) >> 20) + 1;
>   */
>  static inline unsigned int i40e_txd_use_count(unsigned int size)
>  {
> - const unsigned int max = I40E_MAX_DATA_PER_TXD_ALIGNED;
> - const unsigned int reciprocal = ((1ull << 32) - 1 + (max / 2)) / max;
> - unsigned int adjust = ~(u32)0;
> -
> - /* if we rounded up on the reciprocal pull down the adjustment */
> - if ((max * reciprocal) > adjust)
> - adjust = ~(u32)(reciprocal - 1);
> -
> - return (u32)((((u64)size * reciprocal) + adjust) >> 32);
> + return ((size * 85) >> 20) + 1;
>  }

But...

I thought gcc already implemented reciprocal divides ?


$ cat div.c
unsigned int foo(unsigned int size)
{
return size / 0x3000;
}
$ gcc -O2 -S div.c && cat div.s
foo:
.LFB0:
.cfi_startproc
movl%edi, %eax
movl$-1431655765, %edx  // 0xaaab
mull%edx
shrl$13, %edx
movl%edx, %eax
ret




Re: [net-next v2 00/19][pull request] 40GbE Intel Wired LAN Driver Updates 2016-12-07

2016-12-07 Thread David Miller
From: Jeff Kirsher 
Date: Wed,  7 Dec 2016 14:18:59 -0800

> This series contains updates to i40e and i40evf only.

Pulled, thanks Jeff.


[PATCH v4 net-next 0/4]: Allow head adjustment in XDP prog

2016-12-07 Thread Martin KaFai Lau
This series adds a helper to allow head adjusting in XDP prog.  mlx4
driver has been modified to support this feature.  An example is written
to encapsulate a packet with an IPv4/v6 header and then XDP_TX it
out.
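
For context, a minimal sketch of the kind of program the new helper enables
(this is not the sample from patch 4/4; ENCAP_LEN and the section name are made
up, and the samples/bpf bpf_helpers.h that this series extends is assumed):

#include <linux/bpf.h>
#include "bpf_helpers.h"

#define ENCAP_LEN 20			/* hypothetical outer header size */

SEC("xdp_demo")
int xdp_push_header(struct xdp_md *ctx)
{
	void *data, *data_end;

	/* grow the headroom so a new outer header can be written */
	if (bpf_xdp_adjust_head(ctx, 0 - ENCAP_LEN))
		return XDP_DROP;

	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;
	if (data + ENCAP_LEN > data_end)
		return XDP_DROP;

	__builtin_memset(data, 0, ENCAP_LEN);	/* fill in the real header here */
	return XDP_TX;
}

char _license[] SEC("license") = "GPL";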

v4:
1. Remove XDP_QUERY_FEATURES command.  Instead, check
   the prog->xdp_adjust_head bit inside the driver itself
   during XDP_SETUP_PROG in patch 1of4.
   Thanks for everybody's ideas.
2. Nit changes on sample code per Jesper

v3:
1. Check if the driver supports head adjustment before
   setting the xdp_prog fd to the device in patch 1of4.
2. Remove the page alignment assumption on the data_hard_start.
   Instead, add data_hard_start to the struct xdp_buff and the
   driver has to fill it if it supports head adjustment.
3. Keep the wire MTU as before in mlx4
4. Set map0_byte_count to PAGE_SIZE in patch 3of4

v2:
1. Make a variable name change in bpf_xdp_adjust_head() in patch 1
2. Ensure no less than ETH_HLEN data in bpf_xdp_adjust_head() in patch 1
3. Some clarifications in commit log messages of patch 2 and 3

Thanks,
Martin

Martin KaFai Lau (4):
  bpf: xdp: Allow head adjustment in XDP prog
  mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs
  mlx4: xdp: Reserve headroom for receiving packet when XDP prog is
active
  bpf: xdp: Add XDP example for head adjustment

 arch/powerpc/net/bpf_jit_comp64.c  |   4 +-
 arch/s390/net/bpf_jit_comp.c   |   2 +-
 arch/x86/net/bpf_jit_comp.c|   2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  29 ++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c |  70 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c |   9 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |   5 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c|   4 +
 drivers/net/ethernet/qlogic/qede/qede_main.c   |   5 +
 include/linux/filter.h |   6 +-
 include/uapi/linux/bpf.h   |  11 +-
 kernel/bpf/core.c  |   2 +-
 kernel/bpf/syscall.c   |   2 +
 kernel/bpf/verifier.c  |   2 +-
 net/core/filter.c  |  28 ++-
 samples/bpf/Makefile   |   4 +
 samples/bpf/bpf_helpers.h  |   2 +
 samples/bpf/bpf_load.c |  94 
 samples/bpf/bpf_load.h |   1 +
 samples/bpf/xdp1_user.c|  93 
 samples/bpf/xdp_tx_iptunnel_common.h   |  37 +++
 samples/bpf/xdp_tx_iptunnel_kern.c | 236 +++
 samples/bpf/xdp_tx_iptunnel_user.c | 256 +
 24 files changed, 762 insertions(+), 145 deletions(-)
 create mode 100644 samples/bpf/xdp_tx_iptunnel_common.h
 create mode 100644 samples/bpf/xdp_tx_iptunnel_kern.c
 create mode 100644 samples/bpf/xdp_tx_iptunnel_user.c

-- 
2.5.1



[PATCH v4 net-next 4/4] bpf: xdp: Add XDP example for head adjustment

2016-12-07 Thread Martin KaFai Lau
The XDP prog checks if the incoming packet matches any VIP:PORT
combination in the BPF hashmap.  If it does, it will encapsulate
the packet with an IPv4/v6 header as instructed by the value of
the BPF hashmap and then XDP_TX it out.

The VIP:PORT -> IP-Encap-Info can be specified by the cmd args
of the user prog.

Acked-by: Alexei Starovoitov 
Signed-off-by: Martin KaFai Lau 
---
 samples/bpf/Makefile |   4 +
 samples/bpf/bpf_helpers.h|   2 +
 samples/bpf/bpf_load.c   |  94 +
 samples/bpf/bpf_load.h   |   1 +
 samples/bpf/xdp1_user.c  |  93 -
 samples/bpf/xdp_tx_iptunnel_common.h |  37 +
 samples/bpf/xdp_tx_iptunnel_kern.c   | 236 
 samples/bpf/xdp_tx_iptunnel_user.c   | 256 +++
 8 files changed, 630 insertions(+), 93 deletions(-)
 create mode 100644 samples/bpf/xdp_tx_iptunnel_common.h
 create mode 100644 samples/bpf/xdp_tx_iptunnel_kern.c
 create mode 100644 samples/bpf/xdp_tx_iptunnel_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 00cd3081c038..f2219c1489e5 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -33,6 +33,7 @@ hostprogs-y += trace_event
 hostprogs-y += sampleip
 hostprogs-y += tc_l2_redirect
 hostprogs-y += lwt_len_hist
+hostprogs-y += xdp_tx_iptunnel
 
 test_lru_dist-objs := test_lru_dist.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
@@ -67,6 +68,7 @@ trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
 tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o libbpf.o lwt_len_hist_user.o
+xdp_tx_iptunnel-objs := bpf_load.o libbpf.o xdp_tx_iptunnel_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -99,6 +101,7 @@ always += test_current_task_under_cgroup_kern.o
 always += trace_event_kern.o
 always += sampleip_kern.o
 always += lwt_len_hist_kern.o
+always += xdp_tx_iptunnel_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/testing/selftests/bpf/
@@ -129,6 +132,7 @@ HOSTLOADLIBES_trace_event += -lelf
 HOSTLOADLIBES_sampleip += -lelf
 HOSTLOADLIBES_tc_l2_redirect += -l elf
 HOSTLOADLIBES_lwt_len_hist += -l elf
+HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 8370a6e3839d..faaffe2e139a 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -57,6 +57,8 @@ static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int 
size) =
(void *) BPF_FUNC_skb_set_tunnel_opt;
 static unsigned long long (*bpf_get_prandom_u32)(void) =
(void *) BPF_FUNC_get_prandom_u32;
+static int (*bpf_xdp_adjust_head)(void *ctx, int offset) =
+   (void *) BPF_FUNC_xdp_adjust_head;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 49b45ccbe153..e30b6de94f2e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -12,6 +12,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -450,3 +454,93 @@ struct ksym *ksym_search(long key)
/* out of range. return _stext */
return &syms[0];
 }
+
+int set_link_xdp_fd(int ifindex, int fd)
+{
+   struct sockaddr_nl sa;
+   int sock, seq = 0, len, ret = -1;
+   char buf[4096];
+   struct nlattr *nla, *nla_xdp;
+   struct {
+   struct nlmsghdr  nh;
+   struct ifinfomsg ifinfo;
+   char attrbuf[64];
+   } req;
+   struct nlmsghdr *nh;
+   struct nlmsgerr *err;
+
+   memset(&sa, 0, sizeof(sa));
+   sa.nl_family = AF_NETLINK;
+
+   sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+   if (sock < 0) {
+   printf("open netlink socket: %s\n", strerror(errno));
+   return -1;
+   }
+
+   if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+   printf("bind to netlink: %s\n", strerror(errno));
+   goto cleanup;
+   }
+
+   memset(&req, 0, sizeof(req));
+   req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+   req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+   req.nh.nlmsg_type = RTM_SETLINK;
+   req.nh.nlmsg_pid = 0;
+   req.nh.nlmsg_seq = ++seq;
+   req.ifinfo.ifi_family = AF_UNSPEC;
+   req.ifinfo.ifi_index = ifindex;
+   nla = (struct nlattr *)(((char *)&req)
+   + NLMSG_ALIGN(req.nh.nlmsg_len));
+   nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
+
+   nla_xdp = (struct nlattr *)((char *)nla + 

[PATCH v4 net-next 2/4] mlx4: xdp: Allow raising MTU up to one page minus eth and vlan hdrs

2016-12-07 Thread Martin KaFai Lau
When XDP is active in mlx4, mlx4 is using one page/pkt.
At the same time (i.e. when XDP is active), it is currently
limiting MTU to be FRAG_SZ0 - ETH_HLEN - (2 * VLAN_HLEN)
which is 1514 in x86.  AFAICT, we can at least raise the MTU
limit up to PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) which this
patch is doing.  It will be useful in the next patch which
allows XDP program to extend the packet by adding new header(s).

Note: In the earlier XDP patches, there is already an existing guard
to ensure the page/pkt scheme only applies when XDP is active
in mlx4.
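
A back-of-the-envelope check of the new limit (standalone, assuming x86 with
4K pages; not driver code):

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096
#define DEMO_ETH_HLEN	14
#define DEMO_VLAN_HLEN	4

int main(void)
{
	/* prints 4074, up from the old FRAG_SZ0-based limit of 1514 */
	printf("max XDP MTU: %d\n",
	       DEMO_PAGE_SIZE - DEMO_ETH_HLEN - 2 * DEMO_VLAN_HLEN);
	return 0;
}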

Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 28 +++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 46 ++
 2 files changed, 44 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index f441eda63bec..c97d25b06444 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,6 +51,8 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+
 int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2249,6 +2251,19 @@ void mlx4_en_destroy_netdev(struct net_device *dev)
free_netdev(dev);
 }
 
+static bool mlx4_en_check_xdp_mtu(struct net_device *dev, int mtu)
+{
+   struct mlx4_en_priv *priv = netdev_priv(dev);
+
+   if (mtu > MLX4_EN_MAX_XDP_MTU) {
+   en_err(priv, "mtu:%d > max:%d when XDP prog is attached\n",
+  mtu, MLX4_EN_MAX_XDP_MTU);
+   return false;
+   }
+
+   return true;
+}
+
 static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -2258,11 +2273,10 @@ static int mlx4_en_change_mtu(struct net_device *dev, 
int new_mtu)
en_dbg(DRV, priv, "Change MTU called - current:%d new:%d\n",
 dev->mtu, new_mtu);
 
-   if (priv->tx_ring_num[TX_XDP] && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
-   en_err(priv, "MTU size:%d requires frags but XDP running\n",
-  new_mtu);
-   return -EOPNOTSUPP;
-   }
+   if (priv->tx_ring_num[TX_XDP] &&
+   !mlx4_en_check_xdp_mtu(dev, new_mtu))
+   return -ENOTSUPP;
+
dev->mtu = new_mtu;
 
if (netif_running(dev)) {
@@ -2715,10 +2729,8 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
return 0;
}
 
-   if (priv->num_frags > 1) {
-   en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
+   if (!mlx4_en_check_xdp_mtu(dev, dev->mtu))
return -EOPNOTSUPP;
-   }
 
tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
if (!tmp)
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6562f78b07f4..23e9d04d1ef4 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -1164,37 +1164,39 @@ static const int frag_sizes[] = {
 
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
-   enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
struct mlx4_en_priv *priv = netdev_priv(dev);
int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
-   int order = MLX4_EN_ALLOC_PREFER_ORDER;
-   u32 align = SMP_CACHE_BYTES;
-   int buf_size = 0;
int i = 0;
 
/* bpf requires buffers to be set up as 1 packet per page.
 * This only works when num_frags == 1.
 */
if (priv->tx_ring_num[TX_XDP]) {
-   dma_dir = PCI_DMA_BIDIRECTIONAL;
-   /* This will gain efficient xdp frame recycling at the expense
-* of more costly truesize accounting
+   priv->frag_info[0].order = 0;
+   priv->frag_info[0].frag_size = eff_mtu;
+   priv->frag_info[0].frag_prefix_size = 0;
+   /* This will gain efficient xdp frame recycling at the
+* expense of more costly truesize accounting
 */
-   align = PAGE_SIZE;
-   order = 0;
-   }
-
-   while (buf_size < eff_mtu) {
-   priv->frag_info[i].order = order;
-   priv->frag_info[i].frag_size =
-   (eff_mtu > buf_size + frag_sizes[i]) ?
-   frag_sizes[i] : eff_mtu - buf_size;
-   priv->frag_info[i].frag_prefix_size = buf_size;
-   priv->frag_info[i].frag_stride =
-   ALIGN(priv->frag_info[i].frag_size, align);
-   priv->frag_info[i].dma_dir = dma_dir;
-   buf_size += priv->frag_info[i].frag_size;
-   i++;
+   priv->frag_info[0].frag_stride = PAGE_SIZE;
+

[PATCH v4 net-next 1/4] bpf: xdp: Allow head adjustment in XDP prog

2016-12-07 Thread Martin KaFai Lau
This patch allows XDP prog to extend/remove the packet
data at the head (like adding or removing header).  It is
done by adding a new XDP helper bpf_xdp_adjust_head().

It also renames bpf_helper_changes_skb_data() to
bpf_helper_changes_pkt_data() to better reflect
that XDP prog does not work on skb.

This patch adds one "xdp_adjust_head" bit to bpf_prog for the
XDP-capable driver to check if the XDP prog requires
bpf_xdp_adjust_head() support.  The driver can then decide
to error out during XDP_SETUP_PROG.

Signed-off-by: Martin KaFai Lau 
---
 arch/powerpc/net/bpf_jit_comp64.c  |  4 ++--
 arch/s390/net/bpf_jit_comp.c   |  2 +-
 arch/x86/net/bpf_jit_comp.c|  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  5 
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 
 .../net/ethernet/netronome/nfp/nfp_net_common.c|  4 
 drivers/net/ethernet/qlogic/qede/qede_main.c   |  5 
 include/linux/filter.h |  6 +++--
 include/uapi/linux/bpf.h   | 11 -
 kernel/bpf/core.c  |  2 +-
 kernel/bpf/syscall.c   |  2 ++
 kernel/bpf/verifier.c  |  2 +-
 net/core/filter.c  | 28 --
 13 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 0fe98a567125..73a5cf18fd84 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -766,7 +766,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
func = (u8 *) __bpf_call_base + imm;
 
/* Save skb pointer if we need to re-cache skb data */
-   if (bpf_helper_changes_skb_data(func))
+   if (bpf_helper_changes_pkt_data(func))
PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));
 
bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -775,7 +775,7 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image,
PPC_MR(b2p[BPF_REG_0], 3);
 
/* refresh skb cache */
-   if (bpf_helper_changes_skb_data(func)) {
+   if (bpf_helper_changes_pkt_data(func)) {
/* reload skb pointer to r3 */
PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
bpf_jit_emit_skb_loads(image, ctx);
diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index bee281f3163d..167b31b186c1 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, 
struct bpf_prog *fp, int i
EMIT2(0x0d00, REG_14, REG_W1);
/* lgr %b0,%r2: load return value into %b0 */
EMIT4(0xb904, BPF_REG_0, REG_2);
-   if (bpf_helper_changes_skb_data((void *)func)) {
+   if (bpf_helper_changes_pkt_data((void *)func)) {
jit->seen |= SEEN_SKB_CHANGE;
/* lg %b1,ST_OFF_SKBP(%r15) */
EMIT6_DISP_LH(0xe300, 0x0004, BPF_REG_1, REG_0,
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe04a04dab8e..e76d1af60f7a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ xadd:   if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
-   reload_skb_data = 
bpf_helper_changes_skb_data(func);
+   reload_skb_data = 
bpf_helper_changes_pkt_data(func);
if (reload_skb_data) {
EMIT1(0x57); /* push %rdi */
jmp_offset += 22; /* pop, mov, sub, mov 
*/
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 49a81f1fc1d6..f441eda63bec 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2686,6 +2686,11 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
int err;
int i;
 
+   if (prog && prog->xdp_adjust_head) {
+   en_err(priv, "Does not support bpf_xdp_adjust_head()\n");
+   return -EOPNOTSUPP;
+   }
+
xdp_ring_num = prog ? priv->rx_ring_num : 0;
 
/* No need to reconfigure buffers when simply swapping the
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 

[PATCH v4 net-next 3/4] mlx4: xdp: Reserve headroom for receiving packet when XDP prog is active

2016-12-07 Thread Martin KaFai Lau
Reserve XDP_PACKET_HEADROOM for packet and enable bpf_xdp_adjust_head()
support.  This patch only affects the code path when XDP is active.

After testing, the tx_dropped counter is incremented if the xdp_prog sends
a packet larger than the wire MTU.

Signed-off-by: Martin KaFai Lau 
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c |  8 ++--
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 24 ++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c |  9 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  3 ++-
 4 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index c97d25b06444..bcd955339058 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -51,7 +51,8 @@
 #include "mlx4_en.h"
 #include "en_port.h"
 
-#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN)))
+#define MLX4_EN_MAX_XDP_MTU ((int)(PAGE_SIZE - ETH_HLEN - (2 * VLAN_HLEN) - \
+  XDP_PACKET_HEADROOM))
 
 int mlx4_en_setup_tc(struct net_device *dev, u8 up)
 {
@@ -2700,11 +2701,6 @@ static int mlx4_xdp_set(struct net_device *dev, struct 
bpf_prog *prog)
int err;
int i;
 
-   if (prog && prog->xdp_adjust_head) {
-   en_err(priv, "Does not support bpf_xdp_adjust_head()\n");
-   return -EOPNOTSUPP;
-   }
-
xdp_ring_num = prog ? priv->rx_ring_num : 0;
 
/* No need to reconfigure buffers when simply swapping the
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 23e9d04d1ef4..3c37e216bbf3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -96,7 +96,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
const struct mlx4_en_frag_info *frag_info;
struct page *page;
-   dma_addr_t dma;
int i;
 
for (i = 0; i < priv->num_frags; i++) {
@@ -115,9 +114,10 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
 
for (i = 0; i < priv->num_frags; i++) {
frags[i] = ring_alloc[i];
-   dma = ring_alloc[i].dma + ring_alloc[i].page_offset;
+   frags[i].page_offset += priv->frag_info[i].rx_headroom;
+   rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
+   frags[i].page_offset);
ring_alloc[i] = page_alloc[i];
-   rx_desc->data[i].addr = cpu_to_be64(dma);
}
 
return 0;
@@ -250,7 +250,8 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv 
*priv,
 
if (ring->page_cache.index > 0) {
frags[0] = ring->page_cache.buf[--ring->page_cache.index];
-   rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+   rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
+   frags[0].page_offset);
return 0;
}
 
@@ -889,6 +890,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
if (xdp_prog) {
struct xdp_buff xdp;
dma_addr_t dma;
+   void *orig_data;
u32 act;
 
dma = be64_to_cpu(rx_desc->data[0].addr);
@@ -896,11 +898,19 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
priv->frag_info[0].frag_size,
DMA_FROM_DEVICE);
 
-   xdp.data = page_address(frags[0].page) +
-   frags[0].page_offset;
+   xdp.data_hard_start = page_address(frags[0].page);
+   xdp.data = xdp.data_hard_start + frags[0].page_offset;
xdp.data_end = xdp.data + length;
+   orig_data = xdp.data;
 
act = bpf_prog_run_xdp(xdp_prog, &xdp);
+
+   if (xdp.data != orig_data) {
+   length = xdp.data_end - xdp.data;
+   frags[0].page_offset = xdp.data -
+   xdp.data_hard_start;
+   }
+
switch (act) {
case XDP_PASS:
break;
@@ -1180,6 +1190,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 */
priv->frag_info[0].frag_stride = PAGE_SIZE;
priv->frag_info[0].dma_dir = PCI_DMA_BIDIRECTIONAL;
+   priv->frag_info[0].rx_headroom = XDP_PACKET_HEADROOM;
i = 1;
} else {
  

Re: [PATCH 2/2] net: ethernet: stmmac: remove private tx queue lock

2016-12-07 Thread David Miller
From: Pavel Machek 
Date: Wed, 7 Dec 2016 22:37:57 +0100

> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 982c952..7415bc2 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -1308,7 +1308,7 @@ static void stmmac_tx_clean(struct stmmac_priv *priv)
>   unsigned int bytes_compl = 0, pkts_compl = 0;
>   unsigned int entry = priv->dirty_tx;
>  
> - spin_lock(&priv->tx_lock);
> + netif_tx_lock_bh(priv->dev);
>  
>   priv->xstats.tx_clean++;
>  

stmmac_tx_clean() runs from either the timer or the NAPI poll handler,
both execute from software interrupts, therefore _bh() should be
unnecessary.
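
A sketch of the pattern being described (hypothetical driver code, not stmmac):
because the completion handler only ever runs in softirq context, the plain
netif_tx_lock()/netif_tx_unlock() pair is enough there; the _bh variants are
only needed when the same lock is also taken from process context.

#include <linux/netdevice.h>

static void demo_tx_clean(struct net_device *dev)
{
	netif_tx_lock(dev);	/* already in softirq (NAPI poll or timer) */

	/* reclaim completed descriptors and, if the queue was stopped
	 * because the ring was full, wake it up again here
	 */

	netif_tx_unlock(dev);
}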


Re: [PATCH 2/2] net: ethernet: stmmac: remove private tx queue lock

2016-12-07 Thread Pavel Machek
On Wed 2016-12-07 23:34:19, Lino Sanfilippo wrote:
> On 07.12.2016 22:43, Lino Sanfilippo wrote:
> > Hi Pavel,
> > 
> > On 07.12.2016 22:37, Pavel Machek wrote:
> >> On Wed 2016-12-07 21:05:38, Lino Sanfilippo wrote:
> >>> The driver uses a private lock for synchronization between the xmit
> >>> function and the xmit completion handler, but since the NETIF_F_LLTX flag
> >>> is not set, the xmit function is also called with the xmit_lock held.
> >>> 
> >>> On the other hand the xmit completion handler first takes the private lock
> >>> and (in case that the tx queue has been stopped) the xmit_lock, leading to
> >>> a reverse locking order and the potential danger of a deadlock.
> >>> 
> >>> Fix this by removing the private lock completely and synchronizing the 
> >>> xmit
> >>> function and completion handler solely by means of the xmit_lock. By doing
> >>> this remove also the now unnecessary double check for a stopped tx queue.
> >>> 
> >> 
> >> FYI, here's modified version. I believe _bh versions are needed, and
> >> I'm testing that version now. (Oh and I also ported it to net-next).
> >> 
> >> It survived 30 minutes of testing so far...
> >> 
> > 
> > First off, thanks for testing.
> > Hmm. I dont understand why _bh would be needed. We call that function from
> > BH context only (napi poll and timer).
> > Any idea?
> > 
> 
> Could this once again be caused by irq coalescing? When the tx queue has been 
> stopped
> the cleanup handler has to wakeup the queue within a certain time span, 
> otherwise the
> watchdog will complain (as it happened in your test). Could you retest this 
> with
> irq coalescing disabled?

I actually had TX coalescing disabled, with

-#define STMMAC_TX_FRAMES   64
+#define STMMAC_TX_FRAMES   0



Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH 1/2] net: ethernet: sxgbe: remove private tx queue lock

2016-12-07 Thread Francois Romieu
Lino Sanfilippo  :
> The driver uses a private lock for synchronization between the xmit
> function and the xmit completion handler, but since the NETIF_F_LLTX flag
> is not set, the xmit function is also called with the xmit_lock held.
> 
> On the other hand the xmit completion handler first takes the private lock
> and (in case that the tx queue has been stopped) the xmit_lock, leading
> to a reverse locking order and the potential danger of a deadlock.

netif_tx_stop_queue is used by:
1. xmit function before releasing lock and returning.
2. sxgbe_restart_tx_queue()
   <- sxgbe_tx_interrupt
   <- sxgbe_reset_all_tx_queues()
  <- sxgbe_tx_timeout()

Given xmit won't be called again until tx queue is enabled, it's not clear
how a deadlock could happen due to #1.

Regardless of deadlocks anywhere else, #2 has a serious problem due to
the lack of exclusion between the tx queue restart handler and the xmit
handler.

-- 
Ueimor


Re: [PATCH 2/2] net: ethernet: stmmac: remove private tx queue lock

2016-12-07 Thread Lino Sanfilippo
On 07.12.2016 22:43, Lino Sanfilippo wrote:
> Hi Pavel,
> 
> On 07.12.2016 22:37, Pavel Machek wrote:
>> On Wed 2016-12-07 21:05:38, Lino Sanfilippo wrote:
>>> The driver uses a private lock for synchronization between the xmit
>>> function and the xmit completion handler, but since the NETIF_F_LLTX flag
>>> is not set, the xmit function is also called with the xmit_lock held.
>>> 
>>> On the other hand the xmit completion handler first takes the private lock
>>> and (in case that the tx queue has been stopped) the xmit_lock, leading to
>>> a reverse locking order and the potential danger of a deadlock.
>>> 
>>> Fix this by removing the private lock completely and synchronizing the xmit
>>> function and completion handler solely by means of the xmit_lock. By doing
>>> this remove also the now unnecessary double check for a stopped tx queue.
>>> 
>> 
>> FYI, here's modified version. I believe _bh versions are needed, and
>> I'm testing that version now. (Oh and I also ported it to net-next).
>> 
>> It survived 30 minutes of testing so far...
>> 
> 
> First off, thanks for testing.
> Hmm. I dont understand why _bh would be needed. We call that function from
> BH context only (napi poll and timer).
> Any idea?
> 

Could this once again be caused by irq coalescing? When the tx queue has been
stopped, the cleanup handler has to wake up the queue within a certain time
span, otherwise the watchdog will complain (as it happened in your test).
Could you retest this with irq coalescing disabled?



[net-next v2 07/19] Changed version from 1.6.21 to 1.6.25

2016-12-07 Thread Jeff Kirsher
From: Bimmy Pujari 

Signed-off-by: Bimmy Pujari 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index dbb854b..aecf63b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -41,7 +41,7 @@ static const char i40e_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 21
+#define DRV_VERSION_BUILD 25
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD)DRV_KERN
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index ca85021..c0fc533 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -38,7 +38,7 @@ static const char i40evf_driver_string[] =
 
 #define DRV_VERSION_MAJOR 1
 #define DRV_VERSION_MINOR 6
-#define DRV_VERSION_BUILD 21
+#define DRV_VERSION_BUILD 25
 #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \
 __stringify(DRV_VERSION_MINOR) "." \
 __stringify(DRV_VERSION_BUILD) \
-- 
2.9.3



[net-next v2 09/19] i40e: Add support for 25G devices

2016-12-07 Thread Jeff Kirsher
From: Carolyn Wyborny 

Add support for 25G devices - defines and data structures.

One tricky part here is that the firmware support for these
devices introduces a mismatch between the PHY type enum and
the bitfields for the PHY types.

This change creates a macro and uses it to increment the 25G
PHY values when creating 25G bitfields.
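
A purely illustrative sketch of the kind of adjustment described (the macro
names, the offset value, and the cutoff are assumptions, not the actual i40e
definitions): the 25G enum values get shifted by a fixed offset when they are
turned into capability bits.

#include <linux/bitops.h>

/* all names and the offset value below are hypothetical */
#define DEMO_25G_PHY_OFFSET	1

#define DEMO_PHY_TYPE_BIT(phy_type)					\
	BIT_ULL((phy_type) >= I40E_PHY_TYPE_25GBASE_KR ?		\
		(phy_type) + DEMO_25G_PHY_OFFSET : (phy_type))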

Change-ID: I69b24d837d44cf9220bf5cb8dd46c5be89ce490b
Signed-off-by: Carolyn Wyborny 
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  | 30 +++-
 drivers/net/ethernet/intel/i40e/i40e_common.c  | 11 ++-
 drivers/net/ethernet/intel/i40e/i40e_devids.h  |  2 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 26 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c|  6 +-
 drivers/net/ethernet/intel/i40e/i40e_type.h| 82 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  3 +
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h| 30 +++-
 drivers/net/ethernet/intel/i40evf/i40e_common.c|  2 +
 drivers/net/ethernet/intel/i40evf/i40e_devids.h|  2 +
 drivers/net/ethernet/intel/i40evf/i40e_type.h  | 82 +-
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  8 +++
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c|  3 +
 13 files changed, 208 insertions(+), 79 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index 67e396b..c9d1f91 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1642,6 +1642,10 @@ enum i40e_aq_phy_type {
I40E_PHY_TYPE_1000BASE_LX   = 0x1C,
I40E_PHY_TYPE_1000BASE_T_OPTICAL= 0x1D,
I40E_PHY_TYPE_20GBASE_KR2   = 0x1E,
+   I40E_PHY_TYPE_25GBASE_KR= 0x1F,
+   I40E_PHY_TYPE_25GBASE_CR= 0x20,
+   I40E_PHY_TYPE_25GBASE_SR= 0x21,
+   I40E_PHY_TYPE_25GBASE_LR= 0x22,
I40E_PHY_TYPE_MAX
 };
 
@@ -1650,6 +1654,7 @@ enum i40e_aq_phy_type {
 #define I40E_LINK_SPEED_10GB_SHIFT 0x3
 #define I40E_LINK_SPEED_40GB_SHIFT 0x4
 #define I40E_LINK_SPEED_20GB_SHIFT 0x5
+#define I40E_LINK_SPEED_25GB_SHIFT 0x6
 
 enum i40e_aq_link_speed {
I40E_LINK_SPEED_UNKNOWN = 0,
@@ -1657,7 +1662,8 @@ enum i40e_aq_link_speed {
I40E_LINK_SPEED_1GB = BIT(I40E_LINK_SPEED_1000MB_SHIFT),
I40E_LINK_SPEED_10GB= BIT(I40E_LINK_SPEED_10GB_SHIFT),
I40E_LINK_SPEED_40GB= BIT(I40E_LINK_SPEED_40GB_SHIFT),
-   I40E_LINK_SPEED_20GB= BIT(I40E_LINK_SPEED_20GB_SHIFT)
+   I40E_LINK_SPEED_20GB= BIT(I40E_LINK_SPEED_20GB_SHIFT),
+   I40E_LINK_SPEED_25GB= BIT(I40E_LINK_SPEED_25GB_SHIFT),
 };
 
 struct i40e_aqc_module_desc {
@@ -1690,7 +1696,13 @@ struct i40e_aq_get_phy_abilities_resp {
__le32  eeer_val;
u8  d3_lpan;
 #define I40E_AQ_SET_PHY_D3_LPAN_ENA0x01
-   u8  reserved[3];
+   u8  phy_type_ext;
+#define I40E_AQ_PHY_TYPE_EXT_25G_KR0X01
+#define I40E_AQ_PHY_TYPE_EXT_25G_CR0X02
+#define I40E_AQ_PHY_TYPE_EXT_25G_SR0x04
+#define I40E_AQ_PHY_TYPE_EXT_25G_LR0x08
+   u8  mod_type_ext;
+   u8  ext_comp_code;
u8  phy_id[4];
u8  module_type[3];
u8  qualified_module_count;
@@ -1712,7 +1724,12 @@ struct i40e_aq_set_phy_config { /* same bits as above in 
all */
__le16  eee_capability;
__le32  eeer;
u8  low_power_ctrl;
-   u8  reserved[3];
+   u8  phy_type_ext;
+#define I40E_AQ_PHY_TYPE_EXT_25G_KR0X01
+#define I40E_AQ_PHY_TYPE_EXT_25G_CR0X02
+#define I40E_AQ_PHY_TYPE_EXT_25G_SR0x04
+#define I40E_AQ_PHY_TYPE_EXT_25G_LR0x08
+   u8  reserved[2];
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config);
@@ -1792,6 +1809,13 @@ struct i40e_aqc_get_link_status {
 #define I40E_AQ_LINK_TX_DRAINED0x01
 #define I40E_AQ_LINK_TX_FLUSHED0x03
 #define I40E_AQ_LINK_FORCED_40G0x10
+/* 25G Error Codes */
+#define I40E_AQ_25G_NO_ERR 0X00
+#define I40E_AQ_25G_NOT_PRESENT0X01
+#define I40E_AQ_25G_NVM_CRC_ERR0X02
+#define I40E_AQ_25G_SBUS_UCODE_ERR 0X03
+#define I40E_AQ_25G_SERDES_UCODE_ERR   0X04
+#define I40E_AQ_25G_NIMB_UCODE_ERR 0X05
u8  loopback; /* use defines from i40e_aqc_set_lb_mode */
__le16  max_frame_size;
u8  config;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index eb392d6..1318c7d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ 

[net-next v2 05/19] i40e: remove code to handle dev_addr specially

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

The netdev->dev_addr MAC filter already exists in the
MAC/VLAN hash table, as it is added when we configure
the netdev in i40e_configure_netdev. Because we already
know that this address will be updated in the
hash_for_each loops, we do not need to handle it
specially. This removes duplicate code and simplifies
the i40e_vsi_add_vlan and i40e_vsi_kill_vlan functions.
Because we know these filters must be part of the
MAC/VLAN hash table, this should not have any functional
impact on what filters are included and is merely a code
simplification.

Change-ID: I5e648302dbdd7cc29efc6d203b7019c11f0b5705
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 43 +
 1 file changed, 7 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c467cc4..ae4a2b2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2515,17 +2515,6 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
/* Locked once because all functions invoked below iterates list*/
spin_lock_bh(&vsi->mac_filter_hash_lock);
 
-   if (vsi->netdev) {
-   add_f = i40e_add_filter(vsi, vsi->netdev->dev_addr, vid);
-   if (!add_f) {
-   dev_info(&vsi->back->pdev->dev,
-"Could not add vlan filter %d for %pM\n",
-vid, vsi->netdev->dev_addr);
-   spin_unlock_bh(&vsi->mac_filter_hash_lock);
-   return -ENOMEM;
-   }
-   }
-
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
if (f->state == I40E_FILTER_REMOVE)
continue;
@@ -2539,28 +2528,14 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
}
}
 
-   /* Now if we add a vlan tag, make sure to check if it is the first
-* tag (i.e. a "tag" -1 does exist) and if so replace the -1 "tag"
-* with 0, so we now accept untagged and specified tagged traffic
-* (and not all tags along with untagged)
+   /* When we add a new VLAN filter, we need to make sure that all existing
+* filters which are marked as vid=-1 (I40E_VLAN_ANY) are converted to
+* vid=0. The simplest way is just search for all filters marked as
+* vid=-1 and replace them with vid=0. This converts all filters that
+* were marked to receive all traffic (tagged or untagged) into
+* filters to receive only untagged traffic, so that we don't receive
+* tagged traffic for VLANs which we have not configured.
 */
-   if (vid > 0 && vsi->netdev) {
-   del_f = i40e_find_filter(vsi, vsi->netdev->dev_addr,
-I40E_VLAN_ANY);
-   if (del_f) {
-   __i40e_del_filter(vsi, del_f);
-   add_f = i40e_add_filter(vsi, vsi->netdev->dev_addr, 0);
-   if (!add_f) {
-				dev_info(&vsi->back->pdev->dev,
-"Could not add filter 0 for %pM\n",
-vsi->netdev->dev_addr);
-				spin_unlock_bh(&vsi->mac_filter_hash_lock);
-   return -ENOMEM;
-   }
-   }
-   }
-
-   /* Do not assume that I40E_VLAN_ANY should be reset to VLAN 0 */
if (vid > 0 && !vsi->info.pvid) {
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
if (f->state == I40E_FILTER_REMOVE)
@@ -2597,7 +2572,6 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
  **/
 void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
 {
-   struct net_device *netdev = vsi->netdev;
struct i40e_mac_filter *f;
struct hlist_node *h;
int bkt;
@@ -2605,9 +2579,6 @@ void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
/* Locked once because all functions invoked below iterates list */
	spin_lock_bh(&vsi->mac_filter_hash_lock);
 
-   if (vsi->netdev)
-   i40e_del_filter(vsi, netdev->dev_addr, vid);
-
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
if (f->vlan == vid)
__i40e_del_filter(vsi, f);
-- 
2.9.3



[net-next v2 01/19] i40e: Driver prints log message on link speed change

2016-12-07 Thread Jeff Kirsher
From: Filip Sadowski 

This patch makes the driver log link speed changes. Before this patch,
link messages were printed only on a link state change. Now a message is
printed when the link is brought up or down and when the link speed changes.

Change-ID: Ifbee14b4b16c24967450b3cecac6e8351dcc8f74
Signed-off-by: Filip Sadowski 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h  | 1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c | 6 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 4cb8fb3..06e3c23 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -596,6 +596,7 @@ struct i40e_vsi {
u16 veb_idx;/* index of VEB parent */
struct kobject *kobj;   /* sysfs object */
bool current_isup;  /* Sync 'link up' logging */
+   enum i40e_aq_link_speed current_speed;  /* Sync link speed logging */
 
void *priv; /* client driver data reference. */
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 5777e49..4534d41 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -5225,12 +5225,16 @@ static int i40e_init_pf_dcb(struct i40e_pf *pf)
  */
 void i40e_print_link_message(struct i40e_vsi *vsi, bool isup)
 {
+   enum i40e_aq_link_speed new_speed;
char *speed = "Unknown";
char *fc = "Unknown";
 
-   if (vsi->current_isup == isup)
+   new_speed = vsi->back->hw.phy.link_info.link_speed;
+
+   if ((vsi->current_isup == isup) && (vsi->current_speed == new_speed))
return;
vsi->current_isup = isup;
+   vsi->current_speed = new_speed;
if (!isup) {
netdev_info(vsi->netdev, "NIC Link is Down\n");
return;
-- 
2.9.3



[net-next v2 04/19] i40e/i40evf: napi_poll must return the work done

2016-12-07 Thread Jeff Kirsher
From: Alexander Duyck 

Currently the function i40e_napi_poll() returns 0 when it completely
cleans the Rx rings, but this fouls budget accounting in the core code.

Fix this by returning the actual work done, capped to budget - 1, since
the core doesn't allow returning the full budget when the driver modifies
the NAPI status.

This is based on a similar change that was made for the ixgbe driver by
Paolo Abeni.
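
For illustration, a minimal sketch of the poll callback pattern the core
expects (the example_* names are hypothetical and not part of this patch;
napi_complete_done() is the stock kernel helper):

  static int example_napi_poll(struct napi_struct *napi, int budget)
  {
  	/* hypothetical helper: clean Rx and report packets processed */
  	int work_done = example_clean_rx_irq(napi, budget);

  	if (work_done < budget) {
  		/* all work finished; tell the core and re-arm interrupts */
  		napi_complete_done(napi, work_done);
  		example_enable_irq(napi);
  	}

  	/* never report the full budget once the NAPI state was modified;
  	 * capping at budget - 1 keeps the core's accounting correct
  	 */
  	return min(work_done, budget - 1);
  }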

Change-ID: Ic3d93ad2fa2fc8ce3164bc461e69367da0f9173b
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 5544b50..352cf7c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2027,7 +2027,7 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
else
i40e_update_enable_itr(vsi, q_vector);
 
-   return 0;
+   return min(work_done, budget - 1);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index c4b174a..df67ef3 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1490,7 +1490,7 @@ int i40evf_napi_poll(struct napi_struct *napi, int budget)
else
i40e_update_enable_itr(vsi, q_vector);
 
-   return 0;
+   return min(work_done, budget - 1);
 }
 
 /**
-- 
2.9.3



[net-next v2 10/19] i40e: Add FEC for 25g

2016-12-07 Thread Jeff Kirsher
From: Carolyn Wyborny 

This patch adds adminq support for Forward Error
Correction ("FEC") for 25G products.

Change-ID: Iaff4910737c239d2c730e5c22a313ce9c37d3964
Signed-off-by: Carolyn Wyborny 
Signed-off-by: Mitch Williams 
Signed-off-by: Jacek Naczyk 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  | 25 --
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  2 ++
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h| 25 --
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
index c9d1f91..b2101a5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h
@@ -1686,6 +1686,8 @@ struct i40e_aq_get_phy_abilities_resp {
 #define I40E_AQ_PHY_LINK_ENABLED   0x08
 #define I40E_AQ_PHY_AN_ENABLED 0x10
 #define I40E_AQ_PHY_FLAG_MODULE_QUAL   0x20
+#define I40E_AQ_PHY_FEC_ABILITY_KR 0x40
+#define I40E_AQ_PHY_FEC_ABILITY_RS 0x80
__le16  eee_capability;
 #define I40E_AQ_EEE_100BASE_TX 0x0002
 #define I40E_AQ_EEE_1000BASE_T 0x0004
@@ -1701,7 +1703,16 @@ struct i40e_aq_get_phy_abilities_resp {
 #define I40E_AQ_PHY_TYPE_EXT_25G_CR0X02
 #define I40E_AQ_PHY_TYPE_EXT_25G_SR0x04
 #define I40E_AQ_PHY_TYPE_EXT_25G_LR0x08
-   u8  mod_type_ext;
+   u8  fec_cfg_curr_mod_ext_info;
+#define I40E_AQ_ENABLE_FEC_KR  0x01
+#define I40E_AQ_ENABLE_FEC_RS  0x02
+#define I40E_AQ_REQUEST_FEC_KR 0x04
+#define I40E_AQ_REQUEST_FEC_RS 0x08
+#define I40E_AQ_ENABLE_FEC_AUTO0x10
+#define I40E_AQ_FEC
+#define I40E_AQ_MODULE_TYPE_EXT_MASK   0xE0
+#define I40E_AQ_MODULE_TYPE_EXT_SHIFT  5
+
u8  ext_comp_code;
u8  phy_id[4];
u8  module_type[3];
@@ -1729,7 +1740,15 @@ struct i40e_aq_set_phy_config { /* same bits as above in 
all */
 #define I40E_AQ_PHY_TYPE_EXT_25G_CR0X02
 #define I40E_AQ_PHY_TYPE_EXT_25G_SR0x04
 #define I40E_AQ_PHY_TYPE_EXT_25G_LR0x08
-   u8  reserved[2];
+   u8  fec_config;
+#define I40E_AQ_SET_FEC_ABILITY_KR BIT(0)
+#define I40E_AQ_SET_FEC_ABILITY_RS BIT(1)
+#define I40E_AQ_SET_FEC_REQUEST_KR BIT(2)
+#define I40E_AQ_SET_FEC_REQUEST_RS BIT(3)
+#define I40E_AQ_SET_FEC_AUTO   BIT(4)
+#define I40E_AQ_PHY_FEC_CONFIG_SHIFT   0x0
+#define I40E_AQ_PHY_FEC_CONFIG_MASK(0x1F << I40E_AQ_PHY_FEC_CONFIG_SHIFT)
+   u8  reserved;
 };
 
 I40E_CHECK_CMD_LENGTH(i40e_aq_set_phy_config);
@@ -1819,6 +1838,8 @@ struct i40e_aqc_get_link_status {
u8  loopback; /* use defines from i40e_aqc_set_lb_mode */
__le16  max_frame_size;
u8  config;
+#define I40E_AQ_CONFIG_FEC_KR_ENA  0x01
+#define I40E_AQ_CONFIG_FEC_RS_ENA  0x02
 #define I40E_AQ_CONFIG_CRC_ENA 0x04
 #define I40E_AQ_CONFIG_PACING_MASK 0x78
u8  external_power_ability;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 1318c7d..f8c4c14 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -1714,6 +1714,8 @@ enum i40e_status_code i40e_set_fc(struct i40e_hw *hw, u8 
*aq_failures,
config.eee_capability = abilities.eee_capability;
config.eeer = abilities.eeer_val;
config.low_power_ctrl = abilities.d3_lpan;
+   config.fec_config = abilities.fec_cfg_curr_mod_ext_info &
+   I40E_AQ_PHY_FEC_CONFIG_MASK;
status = i40e_aq_set_phy_config(hw, , NULL);
 
if (status)
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h 
b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
index f8d7d95..eeb9864 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h
@@ -1683,6 +1683,8 @@ struct i40e_aq_get_phy_abilities_resp {
 #define I40E_AQ_PHY_LINK_ENABLED   0x08
 #define I40E_AQ_PHY_AN_ENABLED 0x10
 #define I40E_AQ_PHY_FLAG_MODULE_QUAL   0x20
+#define I40E_AQ_PHY_FEC_ABILITY_KR 0x40
+#define I40E_AQ_PHY_FEC_ABILITY_RS 0x80
__le16  eee_capability;
 #define I40E_AQ_EEE_100BASE_TX 0x0002
 #define I40E_AQ_EEE_1000BASE_T 0x0004
@@ -1698,7 +1700,16 @@ struct i40e_aq_get_phy_abilities_resp {
 #define I40E_AQ_PHY_TYPE_EXT_25G_CR0X02
 #define I40E_AQ_PHY_TYPE_EXT_25G_SR0x04
 #define I40E_AQ_PHY_TYPE_EXT_25G_LR0x08
-   u8  mod_type_ext;
+   u8  fec_cfg_curr_mod_ext_info;
+#define I40E_AQ_ENABLE_FEC_KR  0x01
+#define 

[net-next v2 06/19] i40e: Blink LED on 1G BaseT boards

2016-12-07 Thread Jeff Kirsher
From: Henry Tieman 

Before this patch "ethtool -p" was not blinking the LEDs on boards
with 1G BaseT PHYs.

This commit identifies 1G BaseT boards as having the LEDs connected
to the MAC. Also, renamed the flag to be more descriptive of usage.
The flag is now I40E_FLAG_PHY_CONTROLS_LEDS.

Change-ID: I4eb741da9780da7849ddf2dc4c0cb27ffa42a801
Signed-off-by: Henry Tieman 
Signed-off-by: Harshitha Ramamurthy 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 10 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c|  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 06e3c23..b8f2978 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -356,7 +356,7 @@ struct i40e_pf {
 #define I40E_FLAG_NO_DCB_SUPPORT   BIT_ULL(45)
 #define I40E_FLAG_USE_SET_LLDP_MIB BIT_ULL(46)
 #define I40E_FLAG_STOP_FW_LLDP BIT_ULL(47)
-#define I40E_FLAG_HAVE_10GBASET_PHYBIT_ULL(48)
+#define I40E_FLAG_PHY_CONTROLS_LEDSBIT_ULL(48)
 #define I40E_FLAG_PF_MAC   BIT_ULL(50)
 #define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT_ULL(51)
 #define I40E_FLAG_HAVE_CRT_RETIMER BIT_ULL(52)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 76753e1..6ba0035 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1890,7 +1890,7 @@ static int i40e_set_phys_id(struct net_device *netdev,
 
switch (state) {
case ETHTOOL_ID_ACTIVE:
-   if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY)) {
+   if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS)) {
pf->led_status = i40e_led_get(hw);
} else {
i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, NULL);
@@ -1900,20 +1900,20 @@ static int i40e_set_phys_id(struct net_device *netdev,
}
return blink_freq;
case ETHTOOL_ID_ON:
-   if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY))
+   if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS))
i40e_led_set(hw, 0xf, false);
else
ret = i40e_led_set_phy(hw, true, pf->led_status, 0);
break;
case ETHTOOL_ID_OFF:
-   if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY))
+   if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS))
i40e_led_set(hw, 0x0, false);
else
ret = i40e_led_set_phy(hw, false, pf->led_status, 0);
break;
case ETHTOOL_ID_INACTIVE:
-   if (!(pf->flags & I40E_FLAG_HAVE_10GBASET_PHY)) {
-   i40e_led_set(hw, false, pf->led_status);
+   if (!(pf->flags & I40E_FLAG_PHY_CONTROLS_LEDS)) {
+   i40e_led_set(hw, pf->led_status, false);
} else {
ret = i40e_led_set_phy(hw, false, pf->led_status,
   (pf->phy_led_val |
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index ae4a2b2..dbb854b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -11380,8 +11380,8 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
   pf->main_vsi_seid);
 
if ((pf->hw.device_id == I40E_DEV_ID_10G_BASE_T) ||
-   (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
-   pf->flags |= I40E_FLAG_HAVE_10GBASET_PHY;
+   (pf->hw.device_id == I40E_DEV_ID_10G_BASE_T4))
+   pf->flags |= I40E_FLAG_PHY_CONTROLS_LEDS;
if (pf->hw.device_id == I40E_DEV_ID_SFP_I_X722)
pf->flags |= I40E_FLAG_HAVE_CRT_RETIMER;
/* print a string summarizing features */
-- 
2.9.3



[net-next v2 03/19] i40e: restore workaround for removing default MAC filter

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

A previous commit 53cb6e9e8949 ("i40e: Removal of workaround for simple
MAC address filter deletion") removed a workaround for some
firmware versions which was reported to not be necessary in production
NICs. Unfortunately this workaround is necessary in some configurations,
specifically the Ethernet Controller XL710 for 40GbE QSFP+ (8086:1583).

Without this patch, the mentioned NICs with current firmware exhibit
issues when adding VLANs, as outlined by the following reproduction:

  $modprobe i40e
  $ip link set  up
  $ip link add link  vlan100 type vlan id 100
  $dmesg | tail
  
  kernel: i40e :82:00.0: Error I40E_AQ_RC_EINVAL adding RX
filters on PF, promiscuous mode forced on

This results in filters being marked as FAILED and setting the device in
promiscuous mode.

The root cause of receiving the -EINVAL error response appears to be due
to a conflict with the default MAC filter which still exists on the
default firmware for this device. Attempting to add a new VLAN filter on
the default MAC address conflicts with the IGNORE_VLAN setting on the
default rule.

Change-ID: I4d8f6d48ac5f60cfe981b3baad30eb4d7c170d61
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 41 +
 1 file changed, 41 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 4534d41..c467cc4 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1226,6 +1226,39 @@ bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi)
 }
 
 /**
+ * i40e_rm_default_mac_filter - Remove the default MAC filter set by NVM
+ * @vsi: the PF Main VSI - inappropriate for any other VSI
+ * @macaddr: the MAC address
+ *
+ * Remove whatever filter the firmware set up so the driver can manage
+ * its own filtering intelligently.
+ **/
+static void i40e_rm_default_mac_filter(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   struct i40e_aqc_remove_macvlan_element_data element;
+   struct i40e_pf *pf = vsi->back;
+
+   /* Only appropriate for the PF main VSI */
+   if (vsi->type != I40E_VSI_MAIN)
+   return;
+
+	memset(&element, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.vlan_tag = 0;
+   /* Ignore error returns, some firmware does it this way... */
+   element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH;
+	i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
+
+	memset(&element, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.vlan_tag = 0;
+   /* ...and some firmware does it this way. */
+   element.flags = I40E_AQC_MACVLAN_DEL_PERFECT_MATCH |
+   I40E_AQC_MACVLAN_DEL_IGNORE_VLAN;
+	i40e_aq_remove_macvlan(&pf->hw, vsi->seid, &element, 1, NULL);
+}
+
+/**
  * i40e_add_filter - Add a mac/vlan filter to the VSI
  * @vsi: the VSI to be searched
  * @macaddr: the MAC address
@@ -9295,6 +9328,12 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
if (vsi->type == I40E_VSI_MAIN) {
SET_NETDEV_DEV(netdev, >pdev->dev);
ether_addr_copy(mac_addr, hw->mac.perm_addr);
+   /* The following steps are necessary to prevent reception
+* of tagged packets - some older NVM configurations load a
+* default a MAC-VLAN filter that accepts any tagged packet
+* which must be replaced by a normal filter.
+*/
+   i40e_rm_default_mac_filter(vsi, mac_addr);
		spin_lock_bh(&vsi->mac_filter_hash_lock);
i40e_add_filter(vsi, mac_addr, I40E_VLAN_ANY);
		spin_unlock_bh(&vsi->mac_filter_hash_lock);
@@ -9828,6 +9867,8 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
pf->vsi[pf->lan_vsi]->tc_config.enabled_tc = 0;
pf->vsi[pf->lan_vsi]->seid = pf->main_vsi_seid;
i40e_vsi_config_tc(pf->vsi[pf->lan_vsi], enabled_tc);
+   if (vsi->type == I40E_VSI_MAIN)
+   i40e_rm_default_mac_filter(vsi, pf->hw.mac.perm_addr);
 
/* assign it some queues */
ret = i40e_alloc_rings(vsi);
-- 
2.9.3



[net-next v2 12/19] i40e: lock service task correctly

2016-12-07 Thread Jeff Kirsher
From: Mitch Williams 

The service task lock was being set in the scheduling function, not the
actual service task. This would potentially leave the bit set for a long
time before the task actually ran. Furthermore, if the service task
takes too long, it calls the schedule function to reschedule itself -
which would fail to take the lock and do nothing.

Instead, set and clear the lock bit in the service task itself. In the
process, get rid of the i40e_service_event_complete() function, which is
really just two lines of code that can be put right in the service task
itself.
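
Sketched as a pattern (example_pf and __EXAMPLE_SCHED are hypothetical
stand-ins; the real change is in the diff below):

  #define __EXAMPLE_SCHED	0

  struct example_pf {
  	unsigned long state;		/* the "in progress" bit lives here */
  	struct work_struct service_task;
  };

  static void example_service_task(struct work_struct *work)
  {
  	struct example_pf *pf = container_of(work, struct example_pf,
  					     service_task);

  	/* the task itself, not the scheduler, owns the bit lock */
  	if (test_and_set_bit(__EXAMPLE_SCHED, &pf->state))
  		return;

  	/* ... run the periodic subtasks ... */

  	/* flush memory so state is correct before the next watchdog */
  	smp_mb__before_atomic();
  	clear_bit(__EXAMPLE_SCHED, &pf->state);
  }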

Change-ID: I83155e682b686121e2897f4429eb7d3f7c669168
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b0486c9..c47e9c5 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -288,8 +288,7 @@ struct i40e_vsi *i40e_find_vsi_from_id(struct i40e_pf *pf, 
u16 id)
 void i40e_service_event_schedule(struct i40e_pf *pf)
 {
	if (!test_bit(__I40E_DOWN, &pf->state) &&
-	    !test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state) &&
-	    !test_and_set_bit(__I40E_SERVICE_SCHED, &pf->state))
+	    !test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state))
		queue_work(i40e_wq, &pf->service_task);
 }
 
@@ -5955,19 +5954,6 @@ static void i40e_handle_lan_overflow_event(struct 
i40e_pf *pf,
 }
 
 /**
- * i40e_service_event_complete - Finish up the service event
- * @pf: board private structure
- **/
-static void i40e_service_event_complete(struct i40e_pf *pf)
-{
-	WARN_ON(!test_bit(__I40E_SERVICE_SCHED, &pf->state));
-
-   /* flush memory to make sure state is correct before next watchog */
-   smp_mb__before_atomic();
-	clear_bit(__I40E_SERVICE_SCHED, &pf->state);
-}
-
-/**
  * i40e_get_cur_guaranteed_fd_count - Get the consumed guaranteed FD filters
  * @pf: board private structure
  **/
@@ -7276,10 +7262,12 @@ static void i40e_service_task(struct work_struct *work)
 
/* don't bother with service tasks if a reset is in progress */
	if (test_bit(__I40E_RESET_RECOVERY_PENDING, &pf->state)) {
-   i40e_service_event_complete(pf);
return;
}
 
+	if (test_and_set_bit(__I40E_SERVICE_SCHED, &pf->state))
+   return;
+
i40e_detect_recover_hung(pf);
i40e_sync_filters_subtask(pf);
i40e_reset_subtask(pf);
@@ -7292,7 +7280,9 @@ static void i40e_service_task(struct work_struct *work)
i40e_sync_udp_filters_subtask(pf);
i40e_clean_adminq_subtask(pf);
 
-   i40e_service_event_complete(pf);
+   /* flush memory to make sure state is correct before next watchdog */
+   smp_mb__before_atomic();
+	clear_bit(__I40E_SERVICE_SCHED, &pf->state);
 
/* If the tasks have taken longer than one timer cycle or there
 * is more work to be done, reschedule the service task now
-- 
2.9.3



[net-next v2 17/19] i40e: factor out addition/deletion of VLAN per each MAC address

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

A future refactor of how the PF assigns a PVID to a VF will want to be
able to add and remove a block of filters by VLAN without worrying about
accidentally triggering the accounting for I40E_VLAN_ANY. Additionally
the PVID assignment would like to be able to batch several changes under
one use of the mac_filter_hash_lock.

Factor out the addition and deletion of a VLAN on all MACs into their
own function which i40e_vsi_(add|kill)_vlan can use. These new functions
expect the caller to take the hash lock, as well as perform any
necessary accounting for updating I40E_VLAN_ANY filters if we are now
operating under VLAN mode.
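
The intended call pattern looks roughly like this (old_vid and new_vid
are hypothetical; error handling trimmed):

  spin_lock_bh(&vsi->mac_filter_hash_lock);

  /* several VLAN-wide changes batched under one hold of the hash lock;
   * neither helper touches the I40E_VLAN_ANY accounting
   */
  i40e_rm_vlan_all_mac(vsi, old_vid);
  err = i40e_add_vlan_all_mac(vsi, new_vid);

  spin_unlock_bh(&vsi->mac_filter_hash_lock);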

Change-ID: If79e5b60b770433275350a74b3f1880333a185d5
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 68 +++--
 1 file changed, 55 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f9e9c90..8aedfb7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2493,19 +2493,24 @@ static void i40e_vlan_rx_register(struct net_device 
*netdev, u32 features)
 }
 
 /**
- * i40e_vsi_add_vlan - Add vsi membership for given vlan
+ * i40e_add_vlan_all_mac - Add a MAC/VLAN filter for each existing MAC address
  * @vsi: the vsi being configured
  * @vid: vlan id to be added (0 = untagged only , -1 = any)
+ *
+ * This is a helper function for adding a new MAC/VLAN filter with the
+ * specified VLAN for each existing MAC address already in the hash table.
+ * This function does *not* perform any accounting to update filters based on
+ * VLAN mode.
+ *
+ * NOTE: this function expects to be called while under the
+ * mac_filter_hash_lock
  **/
-int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
+static int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
 {
-   struct i40e_mac_filter *f, *add_f, *del_f;
+   struct i40e_mac_filter *f, *add_f;
struct hlist_node *h;
int bkt;
 
-   /* Locked once because all functions invoked below iterates list*/
-   spin_lock_bh(>mac_filter_hash_lock);
-
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
if (f->state == I40E_FILTER_REMOVE)
continue;
@@ -2514,11 +2519,33 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
			dev_info(&vsi->back->pdev->dev,
 "Could not add vlan filter %d for %pM\n",
 vid, f->macaddr);
-			spin_unlock_bh(&vsi->mac_filter_hash_lock);
return -ENOMEM;
}
}
 
+   return 0;
+}
+
+/**
+ * i40e_vsi_add_vlan - Add VSI membership for given VLAN
+ * @vsi: the VSI being configured
+ * @vid: VLAN id to be added (0 = untagged only , -1 = any)
+ **/
+int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
+{
+   struct i40e_mac_filter *f, *add_f, *del_f;
+   struct hlist_node *h;
+   int bkt, err;
+
+   /* Locked once because all functions invoked below iterates list*/
+	spin_lock_bh(&vsi->mac_filter_hash_lock);
+
+   err = i40e_add_vlan_all_mac(vsi, vid);
+   if (err) {
+		spin_unlock_bh(&vsi->mac_filter_hash_lock);
+   return err;
+   }
+
/* When we add a new VLAN filter, we need to make sure that all existing
 * filters which are marked as vid=-1 (I40E_VLAN_ANY) are converted to
 * vid=0. The simplest way is just search for all filters marked as
@@ -2557,24 +2584,39 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
 }
 
 /**
- * i40e_vsi_kill_vlan - Remove vsi membership for given vlan
+ * i40e_rm_vlan_all_mac - Remove MAC/VLAN pair for all MAC with the given VLAN
  * @vsi: the vsi being configured
  * @vid: vlan id to be removed (0 = untagged only , -1 = any)
- **/
-void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
+ *
+ * This function should be used to remove all VLAN filters which match the
+ * given VID. It does not schedule the service event and does not take the
+ * mac_filter_hash_lock so it may be combined with other operations under
+ * a single invocation of the mac_filter_hash_lock.
+ *
+ * NOTE: this function expects to be called while under the
+ * mac_filter_hash_lock
+ */
+static void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
 {
struct i40e_mac_filter *f;
struct hlist_node *h;
int bkt;
 
-   /* Locked once because all functions invoked below iterates list */
-	spin_lock_bh(&vsi->mac_filter_hash_lock);
-
hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
if (f->vlan == vid)
__i40e_del_filter(vsi, f);
}
+}
 
+/**
+ * i40e_vsi_kill_vlan - 

[net-next v2 19/19] i40e: move all updates for VLAN mode into i40e_sync_vsi_filters

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

In a similar fashion to how we handled exiting VLAN mode, move the logic
in i40e_vsi_add_vlan into i40e_sync_vsi_filters. Extract this logic into
its own function for ease of understanding as it will become quite
complex.

The new function, i40e_correct_mac_vlan_filters() correctly updates all
filters for when we need to enter VLAN mode, exit VLAN mode, and also
enforces the PVID when assigned.

Call i40e_correct_mac_vlan_filters from i40e_sync_vsi_filters passing it
the number of active VLAN filters, and the two temporary lists.

Remove the function for updating VLAN=0 filters from i40e_vsi_add_vlan.

The end result is that the logic for entering and exiting VLAN mode is
in one location which has the most knowledge about all filters. This
ensures that we always correctly have the non-VLAN filters assigned to
VID=0 or VID=-1 regardless of how we ended up getting to this result.

Additionally this enforces the PVID at sync time so that we know for
certain that an assigned PVID results in only filters with that PVID
being added to the firmware.

Change-ID: I895cee81e9c92d0a16baee38bd0ca51bbb14e372
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 214 +++-
 1 file changed, 113 insertions(+), 101 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 49261cc..da4cbe3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1227,6 +1227,107 @@ bool i40e_is_vsi_in_vlan(struct i40e_vsi *vsi)
 }
 
 /**
+ * i40e_correct_mac_vlan_filters - Correct non-VLAN filters if necessary
+ * @vsi: the VSI to configure
+ * @tmp_add_list: list of filters ready to be added
+ * @tmp_del_list: list of filters ready to be deleted
+ * @vlan_filters: the number of active VLAN filters
+ *
+ * Update VLAN=0 and VLAN=-1 (I40E_VLAN_ANY) filters properly so that they
+ * behave as expected. If we have any active VLAN filters remaining or about
+ * to be added then we need to update non-VLAN filters to be marked as VLAN=0
+ * so that they only match against untagged traffic. If we no longer have any
+ * active VLAN filters, we need to make all non-VLAN filters marked as VLAN=-1
+ * so that they match against both tagged and untagged traffic. In this way,
+ * we ensure that we correctly receive the desired traffic. This ensures that
+ * when we have an active VLAN we will receive only untagged traffic and
+ * traffic matching active VLANs. If we have no active VLANs then we will
+ * operate in non-VLAN mode and receive all traffic, tagged or untagged.
+ *
+ * Finally, in a similar fashion, this function also corrects filters when
+ * there is an active PVID assigned to this VSI.
+ *
+ * In case of memory allocation failure return -ENOMEM. Otherwise, return 0.
+ *
+ * This function is only expected to be called from within
+ * i40e_sync_vsi_filters.
+ *
+ * NOTE: This function expects to be called while under the
+ * mac_filter_hash_lock
+ */
+static int i40e_correct_mac_vlan_filters(struct i40e_vsi *vsi,
+struct hlist_head *tmp_add_list,
+struct hlist_head *tmp_del_list,
+int vlan_filters)
+{
+   struct i40e_mac_filter *f, *add_head;
+   struct hlist_node *h;
+   int bkt, new_vlan;
+
+   /* To determine if a particular filter needs to be replaced we
+* have the three following conditions:
+*
+* a) if we have a PVID assigned, then all filters which are
+*not marked as VLAN=PVID must be replaced with filters that
+*are.
+* b) otherwise, if we have any active VLANS, all filters
+*which are marked as VLAN=-1 must be replaced with
+*filters marked as VLAN=0
+* c) finally, if we do not have any active VLANS, all filters
+*which are marked as VLAN=0 must be replaced with filters
+*marked as VLAN=-1
+*/
+
+   /* Update the filters about to be added in place */
+   hlist_for_each_entry(f, tmp_add_list, hlist) {
+   if (vsi->info.pvid && f->vlan != vsi->info.pvid)
+   f->vlan = vsi->info.pvid;
+   else if (vlan_filters && f->vlan == I40E_VLAN_ANY)
+   f->vlan = 0;
+   else if (!vlan_filters && f->vlan == 0)
+   f->vlan = I40E_VLAN_ANY;
+   }
+
+   /* Update the remaining active filters */
+   hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) {
+   /* Combine the checks for whether a filter needs to be changed
+* and then determine the new VLAN inside the if block, in
+* order to 

[net-next v2 14/19] i40e: recalculate vsi->active_filters from hash contents

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

Previous code refactors have accidentally caused issues with the
counting of active_filters. Avoid similar issues in the future by simply
re-counting the active filters every time after we handle add and delete
of all the filters. Additionally this allows us to simplify the check
for when we exit promiscuous mode, since the check for failed filters
can be folded into the same pass.

Additionally since we recount filters at the end we need to set
vsi->promisc_threshold as well.

The resulting code takes a bit longer since we do have to loop over
filters again. However, the result is more readable and less likely to
become incorrect due to failed accounting of filters in the future.
Finally, this ensures that it is not possible for vsi->active_filters to
ever underflow since we never decrement it.

Change-ID: Ib4f3a377e60eb1fa6c91ea86cc02238c08edd102
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 52 -
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 806fd56..2ccf376 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1870,12 +1870,10 @@ void i40e_aqc_add_filters(struct i40e_vsi *vsi, const 
char *vsi_name,
aq_ret = i40e_aq_add_macvlan(hw, vsi->seid, list, num_add, NULL);
aq_err = hw->aq.asq_last_status;
fcnt = i40e_update_filter_state(num_add, list, add_head, aq_ret);
-   vsi->active_filters += fcnt;
 
if (fcnt != num_add) {
*promisc_changed = true;
		set_bit(__I40E_FILTER_OVERFLOW_PROMISC, &vsi->state);
-   vsi->promisc_threshold = (vsi->active_filters * 3) / 4;
		dev_warn(&vsi->back->pdev->dev,
 "Error %s adding RX filters on %s, promiscuous mode 
forced on\n",
 i40e_aq_str(hw, aq_err),
@@ -1939,6 +1937,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
struct i40e_hw *hw = >back->hw;
unsigned int vlan_any_filters = 0;
unsigned int non_vlan_filters = 0;
+   unsigned int failed_filters = 0;
unsigned int vlan_filters = 0;
bool promisc_changed = false;
char vsi_name[16] = "PF";
@@ -1985,7 +1984,6 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
/* Move the element into temporary del_list */
			hash_del(&f->hlist);
			hlist_add_head(&f->hlist, &tmp_del_list);
-   vsi->active_filters--;
 
/* Avoid counting removed filters */
continue;
@@ -2046,7 +2044,6 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
f->state = I40E_FILTER_REMOVE;
			hash_del(&f->hlist);
			hlist_add_head(&f->hlist, &tmp_del_list);
-   vsi->active_filters--;
}
 
/* Also update any filters on the tmp_add list */
@@ -2203,27 +2200,36 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi)
add_list = NULL;
}
 
-   /* Check to see if we can drop out of overflow promiscuous mode. */
+   /* Determine the number of active and failed filters. */
+	spin_lock_bh(&vsi->mac_filter_hash_lock);
+   vsi->active_filters = 0;
+   hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) {
+   if (f->state == I40E_FILTER_ACTIVE)
+   vsi->active_filters++;
+   else if (f->state == I40E_FILTER_FAILED)
+   failed_filters++;
+   }
+	spin_unlock_bh(&vsi->mac_filter_hash_lock);
+
+   /* If promiscuous mode has changed, we need to calculate a new
+* threshold for when we are safe to exit
+*/
+   if (promisc_changed)
+   vsi->promisc_threshold = (vsi->active_filters * 3) / 4;
+
+   /* Check if we are able to exit overflow promiscuous mode. We can
+* safely exit if we didn't just enter, we no longer have any failed
+* filters, and we have reduced filters below the threshold value.
+*/
	if (test_bit(__I40E_FILTER_OVERFLOW_PROMISC, &vsi->state) &&
+   !promisc_changed && !failed_filters &&
(vsi->active_filters < vsi->promisc_threshold)) {
-   int failed_count = 0;
-   /* See if we have any failed filters. We can't drop out of
-* promiscuous until these have all been deleted.
-*/
-		spin_lock_bh(&vsi->mac_filter_hash_lock);
-   hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) {
-   

[net-next v2 02/19] i40e: simplify txd use count calculation

2016-12-07 Thread Jeff Kirsher
From: Mitch Williams 

The i40e_txd_use_count function was fast but confusing. In the comments,
it even admits that it's ugly. So replace it with a new function that is
(very) slightly faster and has extensive commenting to help the thicker
among us (including the author, who will forget in a week) understand
how it works.
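
As a standalone sanity check of the shift-and-multiply trick (not part
of the patch; 12288 is I40E_MAX_DATA_PER_TXD_ALIGNED, and the dip just
past 12K shows the intentional near-boundary underestimate the new
comment talks about):

  #include <stdio.h>

  /* exact DIV_ROUND_UP(size, 12K) */
  static unsigned int exact(unsigned int size)
  {
  	return (size + 12288 - 1) / 12288;
  }

  /* the approximation used by the new i40e_txd_use_count() */
  static unsigned int approx(unsigned int size)
  {
  	return ((size * 85) >> 20) + 1;
  }

  int main(void)
  {
  	unsigned int sizes[] = { 2048, 12288, 12289, 32768, 65536 };
  	unsigned int i;

  	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
  		printf("size %6u: exact %u approx %u\n",
  		       sizes[i], exact(sizes[i]), approx(sizes[i]));
  	return 0;
  }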

Change-ID: Ifb533f13786a0bf39cb29f77969a5be2c83d9a87
Signed-off-by: Mitch Williams 
Signed-off-by: Alexander Duyck 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   | 45 +--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 45 +--
 2 files changed, 56 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index de8550f..e065321 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union i40e_rx_desc 
*rx_desc,
 #define I40E_MAX_DATA_PER_TXD_ALIGNED \
(I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
 
-/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where X is
- * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the fact
- * that 12K is not a power of 2 and division is expensive.  It is used to
- * approximate the number of descriptors used per linear buffer.  Note
- * that this will overestimate in some cases as it doesn't account for the
- * fact that we will add up to 4K - 1 in aligning the 12K buffer, however
- * the error should not impact things much as large buffers usually mean
- * we will use fewer descriptors then there are frags in an skb.
+/**
+ * i40e_txd_use_count  - estimate the number of descriptors needed for Tx
+ * @size: transmit request size in bytes
+ *
+ * Due to hardware alignment restrictions (4K alignment), we need to
+ * assume that we can have no more than 12K of data per descriptor, even
+ * though each descriptor can take up to 16K - 1 bytes of aligned memory.
+ * Thus, we need to divide by 12K. But division is slow! Instead,
+ * we decompose the operation into shifts and one relatively cheap
+ * multiply operation.
+ *
+ * To divide by 12K, we first divide by 4K, then divide by 3:
+ * To divide by 4K, shift right by 12 bits
+ * To divide by 3, multiply by 85, then divide by 256
+ * (Divide by 256 is done by shifting right by 8 bits)
+ * Finally, we add one to round up. Because 256 isn't an exact multiple of
+ * 3, we'll underestimate near each multiple of 12K. This is actually more
+ * accurate as we have 4K - 1 of wiggle room that we can fit into the last
+ * segment.  For our purposes this is accurate out to 1M which is orders of
+ * magnitude greater than our largest possible GSO size.
+ *
+ * This would then be implemented as:
+ * return (((size >> 12) * 85) >> 8) + 1;
+ *
+ * Since multiplication and division are commutative, we can reorder
+ * operations into:
+ * return ((size * 85) >> 20) + 1;
  */
 static inline unsigned int i40e_txd_use_count(unsigned int size)
 {
-   const unsigned int max = I40E_MAX_DATA_PER_TXD_ALIGNED;
-   const unsigned int reciprocal = ((1ull << 32) - 1 + (max / 2)) / max;
-   unsigned int adjust = ~(u32)0;
-
-   /* if we rounded up on the reciprocal pull down the adjustment */
-   if ((max * reciprocal) > adjust)
-   adjust = ~(u32)(reciprocal - 1);
-
-   return (u32)u64)size * reciprocal) + adjust) >> 32);
+   return ((size * 85) >> 20) + 1;
 }
 
 /* Tx Descriptors needed, worst case */
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
index a586e19..a5fc789 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
@@ -173,26 +173,37 @@ static inline bool i40e_test_staterr(union i40e_rx_desc 
*rx_desc,
 #define I40E_MAX_DATA_PER_TXD_ALIGNED \
(I40E_MAX_DATA_PER_TXD & ~(I40E_MAX_READ_REQ_SIZE - 1))
 
-/* This ugly bit of math is equivalent to DIV_ROUNDUP(size, X) where X is
- * the value I40E_MAX_DATA_PER_TXD_ALIGNED.  It is needed due to the fact
- * that 12K is not a power of 2 and division is expensive.  It is used to
- * approximate the number of descriptors used per linear buffer.  Note
- * that this will overestimate in some cases as it doesn't account for the
- * fact that we will add up to 4K - 1 in aligning the 12K buffer, however
- * the error should not impact things much as large buffers usually mean
- * we will use fewer descriptors then there are frags in an skb.
+/**
+ * i40e_txd_use_count  - estimate the number of descriptors needed for Tx
+ * @size: transmit request size in bytes
+ *
+ * Due to hardware alignment restrictions (4K alignment), we need to
+ * 

[net-next v2 00/19][pull request] 40GbE Intel Wired LAN Driver Updates 2016-12-07

2016-12-07 Thread Jeff Kirsher
This series contains updates to i40e and i40evf only.

Filip modifies the i40e to log link speed change and when the link is
brought up and down.

Mitch replaces i40e_txd_use_count() with a new function which is slightly
faster and better documented so the dim-witted can better follow the
code.  He also fixes the locking of the service task so that it is
actually done in the service task and not in the scheduling function
which calls it.

Jacob, being the busy little beaver he is, provides most of the changes,
starting with restoring a workaround that is still needed in some
configurations, specifically the Ethernet Controller XL710 for 40GbE
QSFP+.  He removes duplicate code and simplifies the i40e_vsi_add_vlan()
and i40e_vsi_kill_vlan() functions.  He removes detection of PTP frames
over L4 (UDP) on the XL710 MAC, since there was a product decision to
defeature it.  He fixes a previous refactor of active filters which had
broken the accounting of active_filters.  The remaining work cleans up
the VLAN filter code to improve readability and simplify it as much as
possible to reduce inconsistencies.

Alex fixes fouled budget accounting in the core code by returning the
actual work done, capped to budget - 1.

Henry fixes the "ethtool -p" function for 1G BaseT PHYs.

Carolyn adds support for 25G devices for i40e and i40evf.

Michal adds functions to apply the correct access method for external PHYs
which could use Clause22 or Clause45 depending on the PHY.

v2: dropped last patch from previous series, since changes are needed based
on feedback from Sergei Shtylyov

The following are changes since commit d4aea20d889e05575bb331a3dadf176176f7d631:
  tun: Use netif_receive_skb instead of netif_rx
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Alexander Duyck (1):
  i40e/i40evf: napi_poll must return the work done

Bimmy Pujari (1):
  Changed version from 1.6.21 to 1.6.25

Carolyn Wyborny (2):
  i40e: Add support for 25G devices
  i40e: Add FEC for 25g

Filip Sadowski (1):
  i40e: Driver prints log message on link speed change

Henry Tieman (1):
  i40e: Blink LED on 1G BaseT boards

Jacob Keller (10):
  i40e: restore workaround for removing default MAC filter
  i40e: remove code to handle dev_addr specially
  i40e: use unsigned printf format specifier for active_filters count
  i40e: defeature support for PTP L4 frame detection on XL710
  i40e: recalculate vsi->active_filters from hash contents
  i40e: refactor i40e_update_filter_state to avoid passing aq_err
  i40e: delete filter after adding its replacement when converting
  i40e: factor out addition/deletion of VLAN per each MAC address
  i40e: use (add|rm)_vlan_all_mac helper functions when changing PVID
  i40e: move all updates for VLAN mode into i40e_sync_vsi_filters

Michal Kosiarz (1):
  i40e: Add functions which apply correct PHY access method for read and
write operation

Mitch Williams (2):
  i40e: simplify txd use count calculation
  i40e: lock service task correctly

 drivers/net/ethernet/intel/i40e/i40e.h |   6 +-
 drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h  |  51 ++-
 drivers/net/ethernet/intel/i40e/i40e_common.c  |  85 +++-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_devids.h  |   2 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  51 ++-
 drivers/net/ethernet/intel/i40e/i40e_main.c| 485 -
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |   4 +
 drivers/net/ethernet/intel/i40e/i40e_ptp.c |  21 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c|   2 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|  45 +-
 drivers/net/ethernet/intel/i40e/i40e_type.h|  82 ++--
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  46 +-
 .../net/ethernet/intel/i40evf/i40e_adminq_cmd.h|  51 ++-
 drivers/net/ethernet/intel/i40evf/i40e_common.c|   2 +
 drivers/net/ethernet/intel/i40evf/i40e_devids.h|   2 +
 drivers/net/ethernet/intel/i40evf/i40e_prototype.h |   4 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  |   2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  |  45 +-
 drivers/net/ethernet/intel/i40evf/i40e_type.h  |  82 ++--
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |   8 +
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|   2 +-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c|   3 +
 23 files changed, 723 insertions(+), 360 deletions(-)

-- 
2.9.3



[net-next v2 16/19] i40e: delete filter after adding its replacement when converting

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

Fix a subtle issue with the code for converting VID=-1 filters into VID=0
filters when adding a new VLAN. Previously the code deleted the VID=-1
filter, and then added a new VID=0 filter. In the rare case that the
addition fails due to -ENOMEM, we end up completely deleting the filter
which prevents recovery if memory pressure subsides. While it is not
strictly an issue because it is likely that memory issues would result
in many other problems, we shouldn't delete the filter until after the
addition succeeds.

Change-ID: Icba07ddd04ecc6a3b27c2e29f2c1c8673d266826
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8e65972..f9e9c90 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2535,7 +2535,6 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
 I40E_VLAN_ANY);
if (!del_f)
continue;
-   __i40e_del_filter(vsi, del_f);
add_f = i40e_add_filter(vsi, f->macaddr, 0);
if (!add_f) {
				dev_info(&vsi->back->pdev->dev,
@@ -2544,6 +2543,7 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
				spin_unlock_bh(&vsi->mac_filter_hash_lock);
return -ENOMEM;
}
+   __i40e_del_filter(vsi, del_f);
}
}
 
-- 
2.9.3



[net-next v2 13/19] i40e: defeature support for PTP L4 frame detection on XL710

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

A product decision has been made to defeature detection of PTP frames
over L4 (UDP) on the XL710 MAC. Do not advertise support for L4
timestamping.

Change-ID: I41fbb0f84ebb27c43e23098c08156f2625c6ee06
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  1 +
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 15 +--
 drivers/net/ethernet/intel/i40e/i40e_main.c|  3 ++-
 drivers/net/ethernet/intel/i40e/i40e_ptp.c | 21 +++--
 4 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index b8f2978..f1d838f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -360,6 +360,7 @@ struct i40e_pf {
 #define I40E_FLAG_PF_MAC   BIT_ULL(50)
 #define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT_ULL(51)
 #define I40E_FLAG_HAVE_CRT_RETIMER BIT_ULL(52)
+#define I40E_FLAG_PTP_L4_CAPABLE   BIT_ULL(53)
 
/* tracks features that get auto disabled by errors */
u64 auto_disable_flags;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 935160a..cc1465a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1681,8 +1681,19 @@ static int i40e_get_ts_info(struct net_device *dev,
info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON);
 
info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
-  BIT(HWTSTAMP_FILTER_PTP_V1_L4_EVENT) |
-  BIT(HWTSTAMP_FILTER_PTP_V2_EVENT);
+  BIT(HWTSTAMP_FILTER_PTP_V2_L2_EVENT) |
+  BIT(HWTSTAMP_FILTER_PTP_V2_L2_SYNC) |
+  BIT(HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ);
+
+   if (pf->flags & I40E_FLAG_PTP_L4_CAPABLE)
+   info->rx_filters |= BIT(HWTSTAMP_FILTER_PTP_V1_L4_SYNC) |
+   BIT(HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_SYNC) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_L4_SYNC) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) |
+   BIT(HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ);
 
return 0;
 }
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index c47e9c5..806fd56 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -8699,7 +8699,8 @@ static int i40e_sw_init(struct i40e_pf *pf)
 I40E_FLAG_MULTIPLE_TCP_UDP_RSS_PCTYPE |
 I40E_FLAG_NO_PCI_LINK_CHECK |
 I40E_FLAG_USE_SET_LLDP_MIB |
-I40E_FLAG_GENEVE_OFFLOAD_CAPABLE;
+I40E_FLAG_GENEVE_OFFLOAD_CAPABLE |
+I40E_FLAG_PTP_L4_CAPABLE;
} else if ((pf->hw.aq.api_maj_ver > 1) ||
   ((pf->hw.aq.api_maj_ver == 1) &&
(pf->hw.aq.api_min_ver > 4))) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c 
b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index 5e2272c..9e49ffa 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -521,6 +521,8 @@ static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf,
case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+   if (!(pf->flags & I40E_FLAG_PTP_L4_CAPABLE))
+   return -ERANGE;
pf->ptp_rx = true;
tsyntype = I40E_PRTTSYN_CTL1_V1MESSTYPE0_MASK |
   I40E_PRTTSYN_CTL1_TSYNTYPE_V1 |
@@ -528,19 +530,26 @@ static int i40e_ptp_set_timestamp_mode(struct i40e_pf *pf,
config->rx_filter = HWTSTAMP_FILTER_PTP_V1_L4_EVENT;
break;
case HWTSTAMP_FILTER_PTP_V2_EVENT:
-   case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
case HWTSTAMP_FILTER_PTP_V2_SYNC:
-   case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
-   case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+   if (!(pf->flags & I40E_FLAG_PTP_L4_CAPABLE))
+   return -ERANGE;
+   

[net-next v2 18/19] i40e: use (add|rm)_vlan_all_mac helper functions when changing PVID

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

The current flow for adding or updating the PVID for a VF uses
i40e_vsi_add_vlan and i40e_vsi_kill_vlan which each take, then release
the hash lock. In addition the two functions also must take special care
that they do not perform VLAN mode changes as this will make the code in
i40e_ndo_set_vf_port_vlan behave incorrectly.

Fix these issues by using the new helper functions i40e_add_vlan_all_mac
and i40e_rm_vlan_all_mac which expect the hash lock to already be taken.
Additionally these functions do not perform any state updates in regards
to VLAN mode, so they are safe to use in the PVID update flow.

It should be noted that we don't need the VLAN mode update code here,
because there are only a few flows here.

(a) we're adding a new PVID
  In this case, if we already had VLAN filters the VSI is knocked
  offline so we don't need to worry about pre-existing VLAN filters

(b) we're replacing an existing PVID
  In this case, we can't have any VLAN filters except those with the old
  PVID which we already take care of manually.

(c) we're removing an existing PVID
  Similarly to above, we can't have any existing VLAN filters except
  those with the old PVID which we already take care of correctly.

Because of this, we do not need (or even want) the special accounting
done in i40e_vsi_add_vlan, so use of the helpers is a saner alternative.
It also opens the door for a future patch which will refactor the flow
of i40e_vsi_add_vlan now that it is not needed in this function.

Change-ID: Ia841f63da94e12b106f41cf7d28ce8ce92f2ad99
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c|  4 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 43 ++
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index f1d838f..ba8d309 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -852,7 +852,9 @@ int i40e_open(struct net_device *netdev);
 int i40e_close(struct net_device *netdev);
 int i40e_vsi_open(struct i40e_vsi *vsi);
 void i40e_vlan_stripping_disable(struct i40e_vsi *vsi);
+int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid);
 int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid);
+void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid);
 void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid);
 struct i40e_mac_filter *i40e_put_mac_in_vlan(struct i40e_vsi *vsi,
 const u8 *macaddr);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 8aedfb7..49261cc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2505,7 +2505,7 @@ static void i40e_vlan_rx_register(struct net_device 
*netdev, u32 features)
  * NOTE: this function expects to be called while under the
  * mac_filter_hash_lock
  **/
-static int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
+int i40e_add_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
 {
struct i40e_mac_filter *f, *add_f;
struct hlist_node *h;
@@ -2596,7 +2596,7 @@ int i40e_vsi_add_vlan(struct i40e_vsi *vsi, s16 vid)
  * NOTE: this function expects to be called while under the
  * mac_filter_hash_lock
  */
-static void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
+void i40e_rm_vlan_all_mac(struct i40e_vsi *vsi, s16 vid)
 {
struct i40e_mac_filter *f;
struct hlist_node *h;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index d28b684..a6198b7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -2766,7 +2766,6 @@ int i40e_ndo_set_vf_port_vlan(struct net_device *netdev, 
int vf_id,
u16 vlanprio = vlan_id | (qos << I40E_VLAN_PRIORITY_SHIFT);
struct i40e_netdev_priv *np = netdev_priv(netdev);
struct i40e_pf *pf = np->vsi->back;
-   bool is_vsi_in_vlan = false;
struct i40e_vsi *vsi;
struct i40e_vf *vf;
int ret = 0;
@@ -2803,11 +2802,10 @@ int i40e_ndo_set_vf_port_vlan(struct net_device 
*netdev, int vf_id,
/* duplicate request, so just return success */
goto error_pvid;
 
+   /* Locked once because multiple functions below iterate list */
	spin_lock_bh(&vsi->mac_filter_hash_lock);
-   is_vsi_in_vlan = i40e_is_vsi_in_vlan(vsi);
-	spin_unlock_bh(&vsi->mac_filter_hash_lock);
 
-   if (le16_to_cpu(vsi->info.pvid) == 0 && is_vsi_in_vlan) {
+   if (le16_to_cpu(vsi->info.pvid) == 0 && i40e_is_vsi_in_vlan(vsi)) {

[net-next v2 11/19] i40e: Add functions which apply correct PHY access method for read and write operation

2016-12-07 Thread Jeff Kirsher
From: Michal Kosiarz 

Depending on the external PHY type, the register access method differs:
either Clause 22 or Clause 45 may be required. The implemented functions
apply the correct access method for the device in use.
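
A hedged usage sketch (EXAMPLE_PAGE and EXAMPLE_REG are placeholders,
not real register definitions; only the new wrappers and
i40e_get_phy_address() come from the driver):

  u8 phy_addr = i40e_get_phy_address(hw, hw->port);
  u16 val;
  i40e_status status;

  /* the wrappers pick Clause 22 vs. Clause 45 based on hw->device_id */
  status = i40e_read_phy_register(hw, EXAMPLE_PAGE, EXAMPLE_REG,
  				  phy_addr, &val);
  if (!status)
  	status = i40e_write_phy_register(hw, EXAMPLE_PAGE, EXAMPLE_REG,
  					 phy_addr, val);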

Change-ID: If39d5f0da9c0b905a8cbdc1ab89885535e7d0426
Signed-off-by: Michal Kosiarz 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_common.c  | 72 ++
 drivers/net/ethernet/intel/i40e/i40e_prototype.h   |  4 ++
 drivers/net/ethernet/intel/i40evf/i40e_prototype.h |  4 ++
 3 files changed, 80 insertions(+)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index f8c4c14..1287359 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -4676,6 +4676,78 @@ i40e_status i40e_write_phy_register_clause45(struct 
i40e_hw *hw,
 }
 
 /**
+ * i40e_write_phy_register
+ * @hw: pointer to the HW structure
+ * @page: registers page number
+ * @reg: register address in the page
+ * @phy_adr: PHY address on MDIO interface
+ * @value: PHY register value
+ *
+ * Writes value to specified PHY register
+ **/
+i40e_status i40e_write_phy_register(struct i40e_hw *hw,
+   u8 page, u16 reg, u8 phy_addr, u16 value)
+{
+   i40e_status status;
+
+   switch (hw->device_id) {
+   case I40E_DEV_ID_1G_BASE_T_X722:
+   status = i40e_write_phy_register_clause22(hw, reg, phy_addr,
+ value);
+   break;
+   case I40E_DEV_ID_10G_BASE_T:
+   case I40E_DEV_ID_10G_BASE_T4:
+   case I40E_DEV_ID_10G_BASE_T_X722:
+   case I40E_DEV_ID_25G_B:
+   case I40E_DEV_ID_25G_SFP28:
+   status = i40e_write_phy_register_clause45(hw, page, reg,
+ phy_addr, value);
+   break;
+   default:
+   status = I40E_ERR_UNKNOWN_PHY;
+   break;
+   }
+
+   return status;
+}
+
+/**
+ * i40e_read_phy_register
+ * @hw: pointer to the HW structure
+ * @page: registers page number
+ * @reg: register address in the page
+ * @phy_adr: PHY address on MDIO interface
+ * @value: PHY register value
+ *
+ * Reads specified PHY register value
+ **/
+i40e_status i40e_read_phy_register(struct i40e_hw *hw,
+  u8 page, u16 reg, u8 phy_addr, u16 *value)
+{
+   i40e_status status;
+
+   switch (hw->device_id) {
+   case I40E_DEV_ID_1G_BASE_T_X722:
+   status = i40e_read_phy_register_clause22(hw, reg, phy_addr,
+value);
+   break;
+   case I40E_DEV_ID_10G_BASE_T:
+   case I40E_DEV_ID_10G_BASE_T4:
+   case I40E_DEV_ID_10G_BASE_T_X722:
+   case I40E_DEV_ID_25G_B:
+   case I40E_DEV_ID_25G_SFP28:
+   status = i40e_read_phy_register_clause45(hw, page, reg,
+phy_addr, value);
+   break;
+   default:
+   status = I40E_ERR_UNKNOWN_PHY;
+   break;
+   }
+
+   return status;
+}
+
+/**
  * i40e_get_phy_address
  * @hw: pointer to the HW structure
  * @dev_num: PHY port num that address we want
diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h 
b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
index 37d67e7..2551fc8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h
@@ -373,6 +373,10 @@ i40e_status i40e_read_phy_register_clause45(struct i40e_hw 
*hw,
u8 page, u16 reg, u8 phy_addr, u16 *value);
 i40e_status i40e_write_phy_register_clause45(struct i40e_hw *hw,
u8 page, u16 reg, u8 phy_addr, u16 value);
+i40e_status i40e_read_phy_register(struct i40e_hw *hw, u8 page, u16 reg,
+  u8 phy_addr, u16 *value);
+i40e_status i40e_write_phy_register(struct i40e_hw *hw, u8 page, u16 reg,
+   u8 phy_addr, u16 value);
 u8 i40e_get_phy_address(struct i40e_hw *hw, u8 dev_num);
 i40e_status i40e_blink_phy_link_led(struct i40e_hw *hw,
u32 time, u32 interval);
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h 
b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
index d89d521..ba6c6bd 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_prototype.h
@@ -115,6 +115,10 @@ i40e_status i40e_read_phy_register(struct i40e_hw *hw, u8 
page,
   u16 reg, u8 phy_addr, u16 *value);
 i40e_status i40e_write_phy_register(struct i40e_hw *hw, u8 page,
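
A minimal, self-contained userspace C sketch of the same dispatch idea (the
device IDs and MDIO access functions below are illustrative stand-ins, not
the driver's own symbols):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical device IDs; the real driver switches on I40E_DEV_ID_*. */
enum dev_id { DEV_1G_BASE_T = 1, DEV_10G_BASE_T = 2, DEV_25G_SFP = 3 };

static int write_clause22(uint16_t reg, uint8_t phy, uint16_t val)
{
	printf("c22: phy %u reg %u <- 0x%04x\n",
	       (unsigned)phy, (unsigned)reg, (unsigned)val);
	return 0;
}

static int write_clause45(uint8_t page, uint16_t reg, uint8_t phy, uint16_t val)
{
	printf("c45: phy %u dev %u reg %u <- 0x%04x\n",
	       (unsigned)phy, (unsigned)page, (unsigned)reg, (unsigned)val);
	return 0;
}

/* Single entry point: pick the MDIO access method from the device type. */
static int phy_write(enum dev_id id, uint8_t page, uint16_t reg,
		     uint8_t phy, uint16_t val)
{
	switch (id) {
	case DEV_1G_BASE_T:		/* this PHY speaks Clause 22 */
		return write_clause22(reg, phy, val);
	case DEV_10G_BASE_T:
	case DEV_25G_SFP:		/* these PHYs use Clause 45 */
		return write_clause45(page, reg, phy, val);
	default:
		return -1;		/* unknown PHY type */
	}
}

int main(void)
{
	phy_write(DEV_1G_BASE_T, 0, 0x00, 1, 0x1140);
	phy_write(DEV_10G_BASE_T, 7, 0x20, 1, 0x2058);
	return 0;
}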

[net-next v2 15/19] i40e: refactor i40e_update_filter_state to avoid passing aq_err

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

The current caller of i40e_update_filter_state incorrectly passes
aq_ret, an i40e_status variable, instead of the expected aq_err. This
happens to work because i40e_status is actually just an integer typedef,
and 0 is still the successful return value. However, i40e_update_filter_state
has special handling for ENOSPC, which is currently being ignored.

Also notice that firmware does not update the per-filter response for
many types of errors, such as EINVAL. Thus, modify the filter setup so
that the firmware response memory is pre-set with I40E_AQC_MM_ERR_NO_RES.

This enables us to refactor i40e_update_filter_state, removing the need
to pass aq_err and avoiding a need for having 3 different flows for
checking the filter state.

The resulting code for i40e_update_filter_state is much simpler: a single
loop that always checks each filter's response value. Since we pre-set the
response value to match our expected error, this works correctly for all
success and error flows.

Change-ID: Ie292c9511f34ee18c6ef40f955ad13e28b7aea7d
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_main.c | 58 +++--
 1 file changed, 21 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2ccf376..8e65972 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1757,7 +1757,6 @@ static void i40e_undo_filter_entries(struct i40e_vsi *vsi,
  * @count: Number of filters added
  * @add_list: return data from fw
  * @head: pointer to first filter in current batch
- * @aq_err: status from fw
  *
  * MAC filter entries from list were slated to be added to device. Returns
  * number of successful filters. Note that 0 does NOT mean success!
@@ -1765,47 +1764,30 @@ static void i40e_undo_filter_entries(struct i40e_vsi 
*vsi,
 static int
 i40e_update_filter_state(int count,
 struct i40e_aqc_add_macvlan_element_data *add_list,
-struct i40e_mac_filter *add_head, int aq_err)
+struct i40e_mac_filter *add_head)
 {
int retval = 0;
int i;
 
-
-   if (!aq_err) {
-   retval = count;
-   /* Everything's good, mark all filters active. */
-   for (i = 0; i < count ; i++) {
-   add_head->state = I40E_FILTER_ACTIVE;
-   add_head = hlist_entry(add_head->hlist.next,
-  typeof(struct i40e_mac_filter),
-  hlist);
-   }
-   } else if (aq_err == I40E_AQ_RC_ENOSPC) {
-   /* Device ran out of filter space. Check the return value
-* for each filter to see which ones are active.
+   for (i = 0; i < count; i++) {
+   /* Always check status of each filter. We don't need to check
+* the firmware return status because we pre-set the filter
+* status to I40E_AQC_MM_ERR_NO_RES when sending the filter
+* request to the adminq. Thus, if it no longer matches then
+* we know the filter is active.
 */
-   for (i = 0; i < count ; i++) {
-   if (add_list[i].match_method ==
-   I40E_AQC_MM_ERR_NO_RES) {
-   add_head->state = I40E_FILTER_FAILED;
-   } else {
-   add_head->state = I40E_FILTER_ACTIVE;
-   retval++;
-   }
-   add_head = hlist_entry(add_head->hlist.next,
-  typeof(struct i40e_mac_filter),
-  hlist);
-   }
-   } else {
-   /* Some other horrible thing happened, fail all filters */
-   retval = 0;
-   for (i = 0; i < count ; i++) {
+   if (add_list[i].match_method == I40E_AQC_MM_ERR_NO_RES) {
add_head->state = I40E_FILTER_FAILED;
-   add_head = hlist_entry(add_head->hlist.next,
-  typeof(struct i40e_mac_filter),
-  hlist);
+   } else {
+   add_head->state = I40E_FILTER_ACTIVE;
+   retval++;
}
+
+   add_head = hlist_entry(add_head->hlist.next,
+  typeof(struct i40e_mac_filter),
+  hlist);
}
+
return retval;
 }
 
@@ -1864,12 +1846,11 @@ void i40e_aqc_add_filters(struct i40e_vsi 
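
The pre-set-sentinel idea can be seen in isolation with a small standalone C
sketch (the names and the sentinel value here are illustrative, not the
driver's):

#include <stdio.h>
#include <string.h>

#define ERR_NO_RES 0xff	/* sentinel: "firmware never answered / no resources" */

struct fw_resp { unsigned char match_method; };

/* Count the filters the "firmware" actually accepted. Because every slot was
 * pre-set to ERR_NO_RES, any slot still holding the sentinel is a failure,
 * regardless of whether the whole request succeeded, ran out of space, or
 * failed with an error the firmware does not report per filter. */
static int count_active(const struct fw_resp *resp, int count)
{
	int i, active = 0;

	for (i = 0; i < count; i++)
		if (resp[i].match_method != ERR_NO_RES)
			active++;
	return active;
}

int main(void)
{
	struct fw_resp resp[4];

	memset(resp, ERR_NO_RES, sizeof(resp));	/* pre-set before the request */
	resp[0].match_method = 0;		/* firmware accepted filter 0 */
	resp[2].match_method = 0;		/* ... and filter 2 */

	printf("%d of 4 filters active\n", count_active(resp, 4));
	return 0;
}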

[net-next v2 08/19] i40e: use unsigned printf format specifier for active_filters count

2016-12-07 Thread Jeff Kirsher
From: Jacob Keller 

Replace the %d specifier used for printing vsi->active_filters and
vsi->promisc_threshold with an unsigned %u format specifier. While it is
unlikely in practice that these values will ever reach such a large
number, they are unsigned values and thus should not be interpreted as
negative numbers.

Change-ID: Iff050fad5a1c8537c4c57fcd527441cd95cfc0d4
Signed-off-by: Jacob Keller 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index b8a03a0..f1f41f1 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -172,7 +172,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
 f->macaddr, f->vlan,
 i40e_filter_state_string[f->state]);
}
-	dev_info(&pf->pdev->dev, "active_filters %d, promisc_threshold %d, overflow promisc %s\n",
+	dev_info(&pf->pdev->dev, "active_filters %u, promisc_threshold %u, overflow promisc %s\n",
 		 vsi->active_filters, vsi->promisc_threshold,
 		 (test_bit(__I40E_FILTER_OVERFLOW_PROMISC, &vsi->state) ?
  "ON" : "OFF"));
-- 
2.9.3
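
A two-line standalone C demo of why the specifier matters for large unsigned
values:

#include <stdio.h>

int main(void)
{
	unsigned int active_filters = 3000000000u;	/* larger than INT_MAX */

	/* %d reinterprets the bits as signed and typically prints a negative
	 * number (and is formally mismatched); %u prints the real value. */
	printf("%%d: %d\n", active_filters);
	printf("%%u: %u\n", active_filters);
	return 0;
}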



Re: [PATCH] net: pch_gbe: Fix TX RX descriptor accesses for big endian systems

2016-12-07 Thread Francois Romieu
Hassan Naveed  :
> Fix pch_gbe driver for ethernet operations for a big endian CPU.
> Values written to and read from transmit and receive descriptors
> in the pch_gbe driver are byte swapped from the perspective of a
> big endian CPU, since the ethernet controller always operates in
> little endian mode. Rectify this by appropriately byte swapping
> these descriptor field values in the driver software.

You should also use __le{16/32} types in struct pch_gbe_{rx/tx}_desc.

-- 
Ueimor
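
For readers less familiar with the issue, a hedged userspace sketch (using
the glibc/BSD <endian.h> helpers rather than the kernel's cpu_to_le*/
le*_to_cpu, and an invented descriptor layout) of how little-endian device
descriptors are handled from a possibly big-endian CPU:

#include <endian.h>	/* htole32()/le32toh() etc.; provided by glibc/musl */
#include <stdint.h>
#include <stdio.h>

/* An invented descriptor layout; the device reads these fields as
 * little-endian regardless of the host CPU's byte order. */
struct rx_desc {
	uint32_t buffer_addr_lo;	/* __le32 in a real driver */
	uint16_t length;		/* __le16 */
	uint16_t status;		/* __le16 */
};

int main(void)
{
	struct rx_desc d = { 0 };

	/* CPU -> device: convert to little-endian before writing. */
	d.buffer_addr_lo = htole32(0x12345678);
	d.length = htole16(1514);

	/* Device -> CPU: convert back when reading. These are no-ops on a
	 * little-endian host and byte swaps on a big-endian one, which is
	 * exactly the conversion the fix adds to the driver. */
	printf("len=%u addr=0x%08x\n",
	       (unsigned)le16toh(d.length),
	       (unsigned)le32toh(d.buffer_addr_lo));
	return 0;
}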


Re: [patch] ser_gigaset: return -ENOMEM on error instead of success

2016-12-07 Thread Tilman Schmidt
Hi Paul,

Am 07.12.2016 um 22:08 schrieb Paul Bolle:
> On Wed, 2016-12-07 at 21:57 +0100, Tilman Schmidt wrote:
>> Not much of a mess, I reckon. Everything that has been allocated and
>> registered up to that point is properly deallocated and unregistered.
>> The code just fails to tell the kernel that module initialization has
>> failed, so the module remains loaded even though it can never be
>> called because it isn't hooked anywhere. That's a nuisance and a
>> waste of RAM, but not much more.
> 
> Yes.
> 
> But then the removal of the module, which is the only reasonable thing to do
> after all this has happened, seems to trigger a WARN in driver_unregister().
> And it's that WARN that I think requires the entire stable song and dance.

Ah, yes, of course, because driver_unregister() has already been run
in the failure path of module_init and is now called a second time.
Not sure how much evil that does beyond the WARN, but I agree it's
worth investigating.

Best regards,
Tilman

-- 
Tilman Schmidt  E-Mail: til...@imap.cc
Bonn, Germany
We have flowers and candles to protect us.



signature.asc
Description: OpenPGP digital signature


[PATCH 27/50] netfilter: nft_fib_ipv4: initialize *dest to zero

2016-12-07 Thread Pablo Neira Ayuso
From: Liping Zhang 

Otherwise, if the fib lookup fails, *dest will be filled with a garbage
value, and reverse path filtering will not work properly:
 # nft add rule x prerouting fib saddr oif eq 0 drop

Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression")
Signed-off-by: Liping Zhang 
Acked-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nft_fib_ipv4.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c 
b/net/ipv4/netfilter/nft_fib_ipv4.c
index bfffa742f397..258136364f5e 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -122,6 +122,8 @@ void nft_fib4_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
fl4.saddr = get_saddr(iph->daddr);
}
 
+   *dest = 0;
+
 	if (fib_lookup(nft_net(pkt), &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return;
 
-- 
2.1.4
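
The pattern being fixed, reduced to a standalone C sketch (lookup_oif() is an
invented stand-in for fib_lookup(), which likewise leaves its result untouched
on failure):

#include <stdio.h>

/* A lookup that only writes *oif on success, mirroring how fib_lookup()
 * leaves the result untouched when no route is found. */
static int lookup_oif(int addr, unsigned int *oif)
{
	if (addr != 42)
		return -1;	/* not found: *oif left as-is */
	*oif = 3;
	return 0;
}

int main(void)
{
	unsigned int dest;	/* imagine this is the nft destination register */

	dest = 0;		/* the fix: clear it before the lookup */
	if (lookup_oif(7, &dest))
		printf("lookup failed, dest=%u (0 means 'no interface')\n", dest);
	return 0;
}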



[PATCH 03/50] netfilter: update Arturo Borrero Gonzalez email address

2016-12-07 Thread Pablo Neira Ayuso
From: Arturo Borrero Gonzalez 

The email address has changed, let's update the copyright statements.

Signed-off-by: Arturo Borrero Gonzalez 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nft_masq_ipv4.c  | 4 ++--
 net/ipv4/netfilter/nft_redir_ipv4.c | 4 ++--
 net/ipv6/netfilter/nft_masq_ipv6.c  | 4 ++--
 net/ipv6/netfilter/nft_redir_ipv6.c | 4 ++--
 net/netfilter/nft_masq.c| 4 ++--
 net/netfilter/nft_redir.c   | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c 
b/net/ipv4/netfilter/nft_masq_ipv4.c
index 4f697e431811..4d69f99b8707 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -77,5 +77,5 @@ module_init(nft_masq_ipv4_module_init);
 module_exit(nft_masq_ipv4_module_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez ");
+MODULE_AUTHOR("Arturo Borrero Gonzalez ");
 MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c 
b/net/ipv4/netfilter/nft_redir_ipv4.c
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -71,5 +71,5 @@ module_init(nft_redir_ipv4_module_init);
 module_exit(nft_redir_ipv4_module_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez ");
+MODULE_AUTHOR("Arturo Borrero Gonzalez ");
 MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c 
b/net/ipv6/netfilter/nft_masq_ipv6.c
index a2aff1277b40..93d758f70334 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -78,5 +78,5 @@ module_init(nft_masq_ipv6_module_init);
 module_exit(nft_masq_ipv6_module_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez ");
+MODULE_AUTHOR("Arturo Borrero Gonzalez ");
 MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c 
b/net/ipv6/netfilter/nft_redir_ipv6.c
index bfcd5af6bc15..2850fcd8583f 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -72,5 +72,5 @@ module_init(nft_redir_ipv6_module_init);
 module_exit(nft_redir_ipv6_module_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez ");
+MODULE_AUTHOR("Arturo Borrero Gonzalez ");
 MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 81b5ad6165ac..bf92de01410f 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -105,4 +105,4 @@ int nft_masq_dump(struct sk_buff *skb, const struct 
nft_expr *expr)
 EXPORT_SYMBOL_GPL(nft_masq_dump);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez ");
+MODULE_AUTHOR("Arturo Borrero Gonzalez ");
diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
index 03f7bf40ae75..967e09b099b2 100644
--- a/net/netfilter/nft_redir.c
+++ b/net/netfilter/nft_redir.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2014 Arturo Borrero Gonzalez 
+ * Copyright (c) 2014 Arturo Borrero Gonzalez 

[PATCH 06/50] netfilter: built-in NAT support for UDPlite

2016-12-07 Thread Pablo Neira Ayuso
From: Davide Caratti 

CONFIG_NF_NAT_PROTO_UDPLITE is no longer a tristate. When set to y, NAT
support for the UDPlite protocol is built into nf_nat.ko.

footprint test:

(nf_nat_proto_)   | udplite ||  nf_nat
------------------+---------++---------
no builtin        |  408048 || 2241312
UDPLITE builtin   |    -    || 2577256

Signed-off-by: Davide Caratti 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_nat_l4proto.h |  3 +++
 net/netfilter/Kconfig  |  2 +-
 net/netfilter/Makefile |  5 ++---
 net/netfilter/nf_nat_core.c|  4 
 net/netfilter/nf_nat_proto_udplite.c   | 35 +-
 5 files changed, 11 insertions(+), 38 deletions(-)

diff --git a/include/net/netfilter/nf_nat_l4proto.h 
b/include/net/netfilter/nf_nat_l4proto.h
index 2cbaf3856e21..3923150f2a1e 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -60,6 +60,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #ifdef CONFIG_NF_NAT_PROTO_SCTP
 extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+extern const struct nf_nat_l4proto nf_nat_l4proto_udplite;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 enum nf_nat_manip_type maniptype,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index ad72edf1f6ec..496e1dcbd003 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -389,7 +389,7 @@ config NF_NAT_PROTO_DCCP
default NF_NAT && NF_CT_PROTO_DCCP
 
 config NF_NAT_PROTO_UDPLITE
-   tristate
+   bool
depends on NF_NAT && NF_CT_PROTO_UDPLITE
default NF_NAT && NF_CT_PROTO_UDPLITE
 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 02ef6decf94d..3b97d89df2cd 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,8 +45,10 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
 nf_nat-y   := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
 
+# NAT protocols (nf_nat)
 nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
 nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
 
 # generic transport layer logging
 obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
@@ -57,9 +59,6 @@ obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o
 obj-$(CONFIG_NF_NAT) += nf_nat.o
 obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
 obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 80858bd110cc..94b14c5a8b17 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -690,6 +690,10 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto 
*l3proto)
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
 				 &nf_nat_l4proto_sctp);
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_UDPLITE
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDPLITE],
+			 &nf_nat_l4proto_udplite);
+#endif
 	mutex_unlock(&nf_nat_proto_mutex);
 
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
diff --git a/net/netfilter/nf_nat_proto_udplite.c 
b/net/netfilter/nf_nat_proto_udplite.c
index 58340c97bd83..366bfbfd82a1 100644
--- a/net/netfilter/nf_nat_proto_udplite.c
+++ b/net/netfilter/nf_nat_proto_udplite.c
@@ -8,11 +8,9 @@
  */
 
 #include 
-#include 
 #include 
 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -64,7 +62,7 @@ udplite_manip_pkt(struct sk_buff *skb,
return true;
 }
 
-static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
+const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.l4proto= IPPROTO_UDPLITE,
.manip_pkt  = udplite_manip_pkt,
.in_range   = nf_nat_l4proto_in_range,
@@ -73,34 +71,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_udplite = {
.nlattr_to_range= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
-
-static int __init nf_nat_proto_udplite_init(void)
-{
-   int err;
-
-	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-   if (err < 0)
-   goto err1;
-	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
-   if (err < 0)
-   goto err2;
-   return 0;
-
-err2:
-	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_udplite);
-err1:
-   return err;
-}
-
-static void __exit nf_nat_proto_udplite_fini(void)
-{
-	nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_udplite);
-   

RE: [net-next 20/20] i40e: don't allow i40e_vsi_(add|kill)_vlan to operate when VID<1

2016-12-07 Thread Keller, Jacob E


> -Original Message-
> From: Kirsher, Jeffrey T
> Sent: Wednesday, December 07, 2016 1:53 PM
> To: Keller, Jacob E ; Sergei Shtylyov
> ; da...@davemloft.net
> Cc: netdev@vger.kernel.org; nhor...@redhat.com; sassm...@redhat.com;
> jogre...@redhat.com; guru.anbalag...@oracle.com
> Subject: Re: [net-next 20/20] i40e: don't allow i40e_vsi_(add|kill)_vlan to 
> operate
> when VID<1
> 
> On Wed, 2016-12-07 at 13:50 -0800, Keller, Jacob E wrote:
> > > -Original Message-
> > > From: Sergei Shtylyov [mailto:sergei.shtyl...@cogentembedded.com]
> > > Sent: Wednesday, December 07, 2016 2:11 AM
> > > To: Kirsher, Jeffrey T ; davem@davemloft.n
> > > et
> > > Cc: Keller, Jacob E ; netdev@vger.kernel.org;
> > > nhor...@redhat.com; sassm...@redhat.com; jogre...@redhat.com;
> > > guru.anbalag...@oracle.com
> > > Subject: Re: [net-next 20/20] i40e: don't allow
> > > i40e_vsi_(add|kill)_vlan to operate
> > > when VID<1
> > >
> > > Hello!
> > > > +   if (!(vid > 0) || vsi->info.pvid)
> > >
> > >  Why not just '!vid'?
> >
> > Left over artifact of this previously being a signed value. We can fix
> > this.
> >
> > Thanks,
> > Jake
> >
> > > > -void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
> > > > +void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid)
> > > >   {
> > > > +   if (!(vid > 0) || vsi->info.pvid)
> > >
> > >  Likewise.
> >
> > Same here. Can get this fixed.
> 
> While you are fixing this up and sending me a new version of this patch, I
> will just drop this from the series and re-send.

Yes, since it's the last patch that's fine.

Thanks,
Jake
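
For completeness, a tiny C check confirming that the two forms are equivalent
for unsigned VLAN IDs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t vid;

	/* For an unsigned type, vid > 0 is exactly vid != 0, so the two
	 * spellings can never disagree; !vid just says it more directly. */
	for (vid = 0; vid < 4096; vid++)
		assert(!(vid > 0) == !vid);

	printf("!(vid > 0) and !vid agree for all 4096 VLAN IDs\n");
	return 0;
}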


[PATCH 32/50] netfilter: nf_tables: add stateful objects

2016-12-07 Thread Pablo Neira Ayuso
This patch augments nf_tables to support stateful objects. This new
infrastructure allows you to create, dump and delete stateful objects,
which are identified by a user-defined name.

This patch adds the generic infrastructure, follow up patches add
support for two stateful objects: counters and quotas.

This patch provides a native infrastructure for nf_tables to replace
nfacct, the extended accounting infrastructure for iptables.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables.h|  79 +
 include/uapi/linux/netfilter/nf_tables.h |  29 ++
 net/netfilter/nf_tables_api.c| 516 +++
 3 files changed, 624 insertions(+)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 32970cba184a..903cd618f50e 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -875,6 +875,7 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, void 
*priv);
  * @list: used internally
  * @chains: chains in the table
  * @sets: sets in the table
+ * @objects: stateful objects in the table
  * @hgenerator: handle generator state
  * @use: number of chain references to this table
  * @flags: table flag (see enum nft_table_flags)
@@ -885,6 +886,7 @@ struct nft_table {
struct list_headlist;
struct list_headchains;
struct list_headsets;
+   struct list_headobjects;
u64 hgenerator;
u32 use;
u16 flags:14,
@@ -935,6 +937,73 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 const struct nft_verdict *v);
 
 /**
+ * struct nft_object - nf_tables stateful object
+ *
+ * @list: table stateful object list node
+ * @type: pointer to object type
+ * @data: pointer to object data
+ * @name: name of this stateful object
+ * @genmask: generation mask
+ * @use: number of references to this stateful object
+ * @data: object data, layout depends on type
+ */
+struct nft_object {
+   struct list_headlist;
+   charname[NFT_OBJ_MAXNAMELEN];
+   u32 genmask:2,
+   use:30;
+   /* runtime data below here */
+   const struct nft_object_type*type cacheline_aligned;
+   unsigned char   data[]
+   __attribute__((aligned(__alignof__(u64;
+};
+
+static inline void *nft_obj_data(const struct nft_object *obj)
+{
+   return (void *)obj->data;
+}
+
+#define nft_expr_obj(expr) *((struct nft_object **)nft_expr_priv(expr))
+
+struct nft_object *nf_tables_obj_lookup(const struct nft_table *table,
+   const struct nlattr *nla, u32 objtype,
+   u8 genmask);
+
+/**
+ * struct nft_object_type - stateful object type
+ *
+ * @eval: stateful object evaluation function
+ * @list: list node in list of object types
+ * @type: stateful object numeric type
+ * @size: stateful object size
+ * @owner: module owner
+ * @maxattr: maximum netlink attribute
+ * @policy: netlink attribute policy
+ * @init: initialize object from netlink attributes
+ * @destroy: release existing stateful object
+ * @dump: netlink dump stateful object
+ */
+struct nft_object_type {
+   void(*eval)(struct nft_object *obj,
+   struct nft_regs *regs,
+   const struct nft_pktinfo *pkt);
+   struct list_headlist;
+   u32 type;
+   unsigned intsize;
+   unsigned intmaxattr;
+   struct module   *owner;
+   const struct nla_policy *policy;
+   int (*init)(const struct nlattr * const 
tb[],
+   struct nft_object *obj);
+   void(*destroy)(struct nft_object *obj);
+   int (*dump)(struct sk_buff *skb,
+   const struct nft_object *obj);
+};
+
+int nft_register_obj(struct nft_object_type *obj_type);
+void nft_unregister_obj(struct nft_object_type *obj_type);
+
+/**
  * struct nft_traceinfo - nft tracing information and state
  *
  * @pkt: pktinfo currently processed
@@ -981,6 +1050,9 @@ void nft_trace_notify(struct nft_traceinfo *info);
 #define MODULE_ALIAS_NFT_SET() \
MODULE_ALIAS("nft-set")
 
+#define MODULE_ALIAS_NFT_OBJ(type) \
+   MODULE_ALIAS("nft-obj-" __stringify(type))
+
 /*
  * The gencursor defines two generations, the currently 
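
As a rough userspace analogy of the infrastructure (all names below are
illustrative, not the nf_tables API): a named object bundles per-type state
with a table of operations, and rules only reference it by name:

#include <stdio.h>

struct obj_ops {
	void (*eval)(void *data);		/* update state on each packet */
	void (*dump)(const void *data);		/* report state to the user */
};

struct object {
	char name[32];
	const struct obj_ops *ops;
	unsigned long counter;			/* "state" for this toy type */
};

static void counter_eval(void *data) { ++*(unsigned long *)data; }
static void counter_dump(const void *data)
{
	printf("packets=%lu\n", *(const unsigned long *)data);
}

static const struct obj_ops counter_ops = {
	.eval = counter_eval,
	.dump = counter_dump,
};

int main(void)
{
	struct object o = { .ops = &counter_ops };

	snprintf(o.name, sizeof(o.name), "https-traffic");	/* user-defined name */

	o.ops->eval(&o.counter);	/* rules reference the object by name ... */
	o.ops->eval(&o.counter);	/* ... and each hit updates its state     */
	printf("%s: ", o.name);
	o.ops->dump(&o.counter);	/* dumping works independently of rules   */
	return 0;
}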

[PATCH 00/50] Netfilter/IPVS updates for net-next

2016-12-07 Thread Pablo Neira Ayuso
Hi David,

The following patchset contains a large Netfilter update for net-next,
to summarise:

1) Add support for stateful objects. This series provides a nf_tables
   native alternative to the extended accounting infrastructure for
   nf_tables. Two initial stateful objects are supported: counters and
   quotas. Objects are identified by a user-defined name; you can fetch
   and reset them at any time. You can also use maps to allow fast lookups
   using any arbitrary key combination. More info at:

   http://marc.info/?l=netfilter-devel&m=148029128323837&w=2

2) On-demand registration of nf_conntrack and defrag hooks per netns.
   Register the nf_conntrack hooks if we have a stateful ruleset, i.e.
   state-based filtering or NAT. The new nf_conntrack_default_on sysctl
   enables this for newly created net namespaces. The default behaviour is
   not modified. Patches from Florian Westphal.

3) Allocate 4k chunks and then use these for x_tables counter allocation
   requests; this improves ruleset load time and also datapath ruleset
   evaluation. Patches from Florian Westphal.

4) Add support for ebpf to the existing x_tables bpf extension.
   From Willem de Bruijn.

5) Update the layer 4 checksum if any of the pseudoheader fields is updated.
   This provides a limited form of 1:1 stateless NAT that makes sense in
   specific scenarios, e.g. load balancing.

6) Add support to flush sets in nf_tables. This series comes with a new
   set->ops->deactivate_one() indirection given that we have to walk
   over the list of set elements, then deactivate them one by one.
   The existing set->ops->deactivate() performs an element lookup that
   we don't need.

7) Two patches to avoid cloning packets, thus speed up packet forwarding
   via nft_fwd from ingress. From Florian Westphal.

8) Two IPVS patches via Simon Horman: decrement the TTL in all modes to
   prevent infinite loops (patch from Dwip Banerjee), and one minor
   refactoring from Gao Feng.

9) Revisit recent log support for nf_tables netdev families: One patch
   to ensure that we correctly handle non-ethernet packets. Another
   patch to add missing logger definition for netdev. Patches from
   Liping Zhang.

10) Three patches for nft_fib: one to address insufficient register
initialization and another to fix an incorrect (although harmless)
byteswap operation. Moreover, update xt_rpfilter and nft_fib to match
lbcast packets with zeronet as source, e.g. DHCP Discover packets
(0.0.0.0 -> 255.255.255.255). Also from Liping Zhang.

11) Built-in DCCP, SCTP and UDPlite conntrack and NAT support, from
Davide Caratti. While DCCP is rather hopeless lately, and UDPlite has
been broken in multicast mode for some time, let's give them a
chance by placing them at the same level as other existing protocols.
Thus, users don't explicitly have to modprobe support for this and
NAT rules work for them. Some people point to the lack of support in
SOHO Linux-based routers that makes deployment of new protocols harder.
I guess other middleboxes out there on the Internet are also to blame.
Anyway, let's see if this has any impact in the medium run.

12) Skip software SCTP checksum calculation if the NIC comes with SCTP
checksum offload support. From Davide Caratti.

13) Initial core factoring to prepare conversion to hook array. Three
patches from Aaron Conole.

14) Gao Feng made an incorrect conversion to a switch statement in the
xt_multiport extension in a patch from the previous batch. Fix it in
this batch.

15) Get vmalloc call in sync with kmalloc flags to avoid a warning
and likely OOM killer intervention from x_tables. From Marcelo
Ricardo Leitner.

16) Update Arturo Borrero's email address in all source code headers.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git

Thanks!



The following changes since commit adc176c5472214971d77c1a61c83db9b01e9cdc7:

  ipv6 addrconf: Implemented enhanced DAD (RFC7527) (2016-12-03 23:21:37 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git HEAD

for you to fetch changes up to 73c25fb139337ac4fe1695ae3c056961855594db:

  netfilter: nft_quota: allow to restore consumed quota (2016-12-07 14:40:53 
+0100)


Aaron Conole (3):
  netfilter: introduce accessor functions for hook entries
  netfilter: decouple nf_hook_entry and nf_hook_ops
  netfilter: convert while loops to for loops

Arturo Borrero Gonzalez (1):
  netfilter: update Arturo Borrero Gonzalez email address

Davide Caratti (8):
  netfilter: built-in NAT support for DCCP
  netfilter: built-in NAT support for SCTP
  netfilter: built-in NAT support for UDPlite
  netfilter: nf_conntrack_tuple_common.h: fix #include
  netfilter: conntrack: built-in support for DCCP

[PATCH 26/50] netfilter: nft_fib: convert htonl to ntohl properly

2016-12-07 Thread Pablo Neira Ayuso
From: Liping Zhang 

Actually, ntohl and htonl are identical, so this doesn't affect
anything, but it is conceptually wrong.

Signed-off-by: Liping Zhang 
Acked-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nft_fib_ipv4.c | 2 +-
 net/ipv6/netfilter/nft_fib_ipv6.c | 2 +-
 net/netfilter/nft_fib.c   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c 
b/net/ipv4/netfilter/nft_fib_ipv4.c
index 1b49966484b3..bfffa742f397 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -198,7 +198,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx,
if (!tb[NFTA_FIB_RESULT])
return ERR_PTR(-EINVAL);
 
-   result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+   result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
 
switch (result) {
case NFT_FIB_RESULT_OIF:
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c 
b/net/ipv6/netfilter/nft_fib_ipv6.c
index d526bb594956..c947aad8bcc6 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -235,7 +235,7 @@ nft_fib6_select_ops(const struct nft_ctx *ctx,
if (!tb[NFTA_FIB_RESULT])
return ERR_PTR(-EINVAL);
 
-   result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+   result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
 
switch (result) {
case NFT_FIB_RESULT_OIF:
diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c
index 249c9b80c150..29a4906adc27 100644
--- a/net/netfilter/nft_fib.c
+++ b/net/netfilter/nft_fib.c
@@ -86,7 +86,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct 
nft_expr *expr,
if ((priv->flags & (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR)) == 0)
return -EINVAL;
 
-   priv->result = htonl(nla_get_be32(tb[NFTA_FIB_RESULT]));
+   priv->result = ntohl(nla_get_be32(tb[NFTA_FIB_RESULT]));
priv->dreg = nft_parse_register(tb[NFTA_FIB_DREG]);
 
switch (priv->result) {
-- 
2.1.4
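
A small standalone C check of the claim:

#include <arpa/inet.h>
#include <assert.h>
#include <stdio.h>

int main(void)
{
	uint32_t x = 0x12345678;

	/* Both are the identity on big-endian hosts and a full byte swap on
	 * little-endian ones, so the results are always identical ... */
	assert(htonl(x) == ntohl(x));

	/* ... which is why using htonl() on data that is actually in network
	 * byte order "works", but ntohl() states the intent correctly. */
	printf("host 0x%08x -> net 0x%08x -> host 0x%08x\n",
	       x, htonl(x), ntohl(htonl(x)));
	return 0;
}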



[PATCH 23/50] netfilter: x_tables: pass xt_counters struct instead of packet counter

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

On SMP we overload the packet counter (unsigned long) to contain the
percpu offset. Hide this from callers and pass the xt_counters address
instead.

Preparation patch to allocate the percpu counters in page-sized batch
chunks.

Signed-off-by: Florian Westphal 
Acked-by: Eric Dumazet 
Signed-off-by: Pablo Neira Ayuso 
---
 include/linux/netfilter/x_tables.h | 6 +-
 net/ipv4/netfilter/arp_tables.c| 4 ++--
 net/ipv4/netfilter/ip_tables.c | 4 ++--
 net/ipv6/netfilter/ip6_tables.c| 5 ++---
 net/netfilter/x_tables.c   | 9 +
 5 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h 
b/include/linux/netfilter/x_tables.h
index cd4eaf8df445..6e61edeb68e3 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -430,11 +430,7 @@ static inline unsigned long xt_percpu_counter_alloc(void)
 
return 0;
 }
-static inline void xt_percpu_counter_free(u64 pcnt)
-{
-   if (nr_cpu_ids > 1)
-   free_percpu((void __percpu *) (unsigned long) pcnt);
-}
+void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
 xt_get_this_cpu_counter(struct xt_counters *cnt)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 848a0704b28f..019f8e8dda6d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -439,7 +439,7 @@ find_check_entry(struct arpt_entry *e, const char *name, 
unsigned int size)
 err:
module_put(t->u.kernel.target->me);
 out:
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
return ret;
 }
@@ -519,7 +519,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
if (par.target->destroy != NULL)
par.target->destroy();
module_put(par.target->me);
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 46815c8a60d7..acc9a0c45bdf 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -582,7 +582,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, 
const char *name,
cleanup_match(ematch, net);
}
 
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
return ret;
 }
@@ -670,7 +670,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
if (par.target->destroy != NULL)
par.target->destroy();
module_put(par.target->me);
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6ff42b8301cc..88b56a98905b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -612,7 +612,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, 
const char *name,
cleanup_match(ematch, net);
}
 
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 
return ret;
 }
@@ -699,8 +699,7 @@ static void cleanup_entry(struct ip6t_entry *e, struct net 
*net)
if (par.target->destroy != NULL)
par.target->destroy();
module_put(par.target->me);
-
-   xt_percpu_counter_free(e->counters.pcnt);
+	xt_percpu_counter_free(&e->counters);
 }
 
 /* Checks and translates the user-supplied table segment (held in
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index ad818e52859b..0580029eb0ee 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1615,6 +1615,15 @@ void xt_proto_fini(struct net *net, u_int8_t af)
 }
 EXPORT_SYMBOL_GPL(xt_proto_fini);
 
+void xt_percpu_counter_free(struct xt_counters *counters)
+{
+   unsigned long pcnt = counters->pcnt;
+
+   if (nr_cpu_ids > 1)
+   free_percpu((void __percpu *)pcnt);
+}
+EXPORT_SYMBOL_GPL(xt_percpu_counter_free);
+
 static int __net_init xt_net_init(struct net *net)
 {
int i;
-- 
2.1.4
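
A userspace sketch of the overloading trick that this series hides behind
helpers (the struct and helpers below are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* On "SMP" the pcnt field is overloaded to hold the address of a per-CPU
 * array; on UP it holds the count itself. The helpers hide which case
 * applies, which is what passing the struct (instead of the raw pcnt value)
 * enables for xt_counters. */
struct counters { uint64_t pcnt; };

static int nr_cpus = 4;			/* pretend we are on SMP */

static int counters_alloc(struct counters *c)
{
	if (nr_cpus <= 1) {
		c->pcnt = 0;		/* the field is the counter itself */
		return 1;
	}
	c->pcnt = (uint64_t)(uintptr_t)calloc(nr_cpus, sizeof(uint64_t));
	return c->pcnt != 0;
}

static uint64_t *counters_this_cpu(struct counters *c, int cpu)
{
	if (nr_cpus <= 1)
		return &c->pcnt;			/* UP: no indirection */
	return (uint64_t *)(uintptr_t)c->pcnt + cpu;	/* SMP: per-CPU slot */
}

static void counters_free(struct counters *c)
{
	if (nr_cpus > 1)
		free((void *)(uintptr_t)c->pcnt);
}

int main(void)
{
	struct counters c;

	if (!counters_alloc(&c))
		return 1;
	(*counters_this_cpu(&c, 2))++;		/* count one packet on CPU 2 */
	printf("cpu2 = %llu\n", (unsigned long long)*counters_this_cpu(&c, 2));
	counters_free(&c);
	return 0;
}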



[PATCH 05/50] netfilter: built-in NAT support for SCTP

2016-12-07 Thread Pablo Neira Ayuso
From: Davide Caratti 

CONFIG_NF_NAT_PROTO_SCTP is no longer a tristate. When set to y, NAT
support for the SCTP protocol is built into nf_nat.ko.

footprint test:

(nf_nat_proto_)   |  sctp   ||  nf_nat
------------------+---------++---------
no builtin        |  428344 || 2241312
SCTP builtin      |    -    || 2597032

Signed-off-by: Davide Caratti 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_nat_l4proto.h |  3 +++
 net/netfilter/Kconfig  |  2 +-
 net/netfilter/Makefile |  2 +-
 net/netfilter/nf_nat_core.c|  4 
 net/netfilter/nf_nat_proto_sctp.c  | 35 +-
 5 files changed, 10 insertions(+), 36 deletions(-)

diff --git a/include/net/netfilter/nf_nat_l4proto.h 
b/include/net/netfilter/nf_nat_l4proto.h
index 92b147be00ef..2cbaf3856e21 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -57,6 +57,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
 #ifdef CONFIG_NF_NAT_PROTO_DCCP
 extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+extern const struct nf_nat_l4proto nf_nat_l4proto_sctp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 enum nf_nat_manip_type maniptype,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 13092e5cd245..ad72edf1f6ec 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -394,7 +394,7 @@ config NF_NAT_PROTO_UDPLITE
default NF_NAT && NF_CT_PROTO_UDPLITE
 
 config NF_NAT_PROTO_SCTP
-   tristate
+   bool
default NF_NAT && NF_CT_PROTO_SCTP
depends on NF_NAT && NF_CT_PROTO_SCTP
select LIBCRC32C
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 9ea0c98e51e6..02ef6decf94d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -46,6 +46,7 @@ nf_nat-y  := nf_nat_core.o nf_nat_proto_unknown.o 
nf_nat_proto_common.o \
   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
 
 nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+nf_nat-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
 
 # generic transport layer logging
 obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
@@ -58,7 +59,6 @@ obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 
 # NAT protocols (nf_nat)
 obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
-obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
 
 # NAT helpers
 obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 69b121d11275..80858bd110cc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -686,6 +686,10 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto 
*l3proto)
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
 				 &nf_nat_l4proto_dccp);
 #endif
+#ifdef CONFIG_NF_NAT_PROTO_SCTP
+	RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_SCTP],
+			 &nf_nat_l4proto_sctp);
+#endif
 	mutex_unlock(&nf_nat_proto_mutex);
 
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
diff --git a/net/netfilter/nf_nat_proto_sctp.c 
b/net/netfilter/nf_nat_proto_sctp.c
index cbc7ade1487b..2e14108ff697 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -7,9 +7,7 @@
  */
 
 #include 
-#include 
 #include 
-#include 
 #include 
 
 #include 
@@ -54,7 +52,7 @@ sctp_manip_pkt(struct sk_buff *skb,
return true;
 }
 
-static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
+const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.l4proto= IPPROTO_SCTP,
.manip_pkt  = sctp_manip_pkt,
.in_range   = nf_nat_l4proto_in_range,
@@ -63,34 +61,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_sctp = {
.nlattr_to_range= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
-
-static int __init nf_nat_proto_sctp_init(void)
-{
-   int err;
-
-	err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-   if (err < 0)
-   goto err1;
-	err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
-   if (err < 0)
-   goto err2;
-   return 0;
-
-err2:
-	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-err1:
-   return err;
-}
-
-static void __exit nf_nat_proto_sctp_exit(void)
-{
-	nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_sctp);
-	nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_sctp);
-}
-
-module_init(nf_nat_proto_sctp_init);
-module_exit(nf_nat_proto_sctp_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("SCTP NAT protocol helper");
-MODULE_AUTHOR("Patrick McHardy ");
-- 
2.1.4



[PATCH 30/50] netfilter: ingress: translate 0 nf_hook_slow retval to -1

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

The caller assumes that < 0 means that skb was stolen (or free'd).

All other return values continue skb processing.

nf_hook_slow returns 3 different return value types:

A) a (negative) errno value: the skb was dropped (NF_DROP, e.g.
by iptables '-j DROP' rule).

B) 0. The skb was stolen by the hook or queued to userspace.

C) 1. all hooks returned NF_ACCEPT so the caller should invoke
   the okfn so packet processing can continue.

nft ingress facility currently doesn't have the 'okfn' that
the NF_HOOK() macros use; there is no nfqueue support either.

So 1 means that nf_hook_ingress() caller should go on processing the skb.

In order to allow use of NF_STOLEN from ingress we need to translate
this to an errno number, else we'd crash because we would continue with an
already-freed (or about-to-be-freed) skb.

The errno value isn't checked; it's just important that it is less than 0,
so return -1.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 include/linux/netfilter_ingress.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/netfilter_ingress.h 
b/include/linux/netfilter_ingress.h
index 2dc3b49b804a..59476061de86 100644
--- a/include/linux/netfilter_ingress.h
+++ b/include/linux/netfilter_ingress.h
@@ -19,6 +19,7 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 {
struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress);
struct nf_hook_state state;
+   int ret;
 
/* Must recheck the ingress hook head, in the event it became NULL
 * after the check in nf_hook_ingress_active evaluated to true.
@@ -29,7 +30,11 @@ static inline int nf_hook_ingress(struct sk_buff *skb)
 	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
 			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
 			   dev_net(skb->dev), NULL);
-	return nf_hook_slow(skb, &state, e);
+	ret = nf_hook_slow(skb, &state, e);
+   if (ret == 0)
+   return -1;
+
+   return ret;
 }
 
 static inline void nf_hook_ingress_init(struct net_device *dev)
-- 
2.1.4
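
The return-value mapping, reduced to a standalone C sketch (hook_slow() and
hook_ingress() are stand-ins for nf_hook_slow() and nf_hook_ingress()):

#include <stdio.h>

/* Tri-state helper, like nf_hook_slow(): <0 drop, 0 stolen/queued, 1 accept. */
static int hook_slow(int verdict)
{
	return verdict;
}

/* The ingress caller only understands "< 0: stop touching the skb" versus
 * "continue processing", so 0 (stolen) must be mapped to a negative value. */
static int hook_ingress(int verdict)
{
	int ret = hook_slow(verdict);

	if (ret == 0)
		return -1;	/* skb is gone; the caller must not use it */
	return ret;
}

int main(void)
{
	printf("accept  -> %d\n", hook_ingress(1));	/* continue */
	printf("stolen  -> %d\n", hook_ingress(0));	/* mapped to -1 */
	printf("dropped -> %d\n", hook_ingress(-22));	/* already negative */
	return 0;
}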



[PATCH 31/50] netfilter: add and use nf_fwd_netdev_egress

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

... so we can use current skb instead of working with a clone.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_dup_netdev.h |  1 +
 net/netfilter/nf_dup_netdev.c | 33 +
 net/netfilter/nft_fwd_netdev.c|  4 ++--
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/include/net/netfilter/nf_dup_netdev.h 
b/include/net/netfilter/nf_dup_netdev.h
index 397dcae349f9..3e919356bedf 100644
--- a/include/net/netfilter/nf_dup_netdev.h
+++ b/include/net/netfilter/nf_dup_netdev.h
@@ -2,5 +2,6 @@
 #define _NF_DUP_NETDEV_H_
 
 void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif);
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif);
 
 #endif
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index 44ae986c383f..c9d7f95768ab 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -14,6 +14,29 @@
 #include 
 #include 
 
+static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
+{
+   if (skb_mac_header_was_set(skb))
+   skb_push(skb, skb->mac_len);
+
+   skb->dev = dev;
+   dev_queue_xmit(skb);
+}
+
+void nf_fwd_netdev_egress(const struct nft_pktinfo *pkt, int oif)
+{
+   struct net_device *dev;
+
+   dev = dev_get_by_index_rcu(nft_net(pkt), oif);
+   if (!dev) {
+   kfree_skb(pkt->skb);
+   return;
+   }
+
+   nf_do_netdev_egress(pkt->skb, dev);
+}
+EXPORT_SYMBOL_GPL(nf_fwd_netdev_egress);
+
 void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
 {
struct net_device *dev;
@@ -24,14 +47,8 @@ void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int 
oif)
return;
 
skb = skb_clone(pkt->skb, GFP_ATOMIC);
-   if (skb == NULL)
-   return;
-
-   if (skb_mac_header_was_set(skb))
-   skb_push(skb, skb->mac_len);
-
-   skb->dev = dev;
-   dev_queue_xmit(skb);
+   if (skb)
+   nf_do_netdev_egress(skb, dev);
 }
 EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
 
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 763ebc3e0b2b..ce13a50b9189 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -26,8 +26,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
 
-   nf_dup_netdev_egress(pkt, oif);
-   regs->verdict.code = NF_DROP;
+   nf_fwd_netdev_egress(pkt, oif);
+   regs->verdict.code = NF_STOLEN;
 }
 
 static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
-- 
2.1.4



[PATCH 29/50] netfilter: xt_multiport: Fix wrong unmatch result with multiple ports

2016-12-07 Thread Pablo Neira Ayuso
From: Gao Feng 

I missed one test case in the last commit for xt_multiport.
For example, take the rule "-m multiport --dports 22,80,443".
When the first port is unmatched and the second is matched, the current
code does not return the right result: it returns false directly when
the first port is unmatched.

Fixes: dd2602d00f80 ("netfilter: xt_multiport: Use switch case instead
of multiple condition checks")
Signed-off-by: Gao Feng 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/xt_multiport.c | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c
index ec06fb1cb16f..1cde0e4985b7 100644
--- a/net/netfilter/xt_multiport.c
+++ b/net/netfilter/xt_multiport.c
@@ -44,12 +44,18 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
 
switch (minfo->flags) {
case XT_MULTIPORT_SOURCE:
-   return (src >= s && src <= e) ^ minfo->invert;
+   if (src >= s && src <= e)
+   return true ^ minfo->invert;
+   break;
case XT_MULTIPORT_DESTINATION:
-   return (dst >= s && dst <= e) ^ minfo->invert;
+   if (dst >= s && dst <= e)
+   return true ^ minfo->invert;
+   break;
case XT_MULTIPORT_EITHER:
-   return ((dst >= s && dst <= e) ||
-   (src >= s && src <= e)) ^ minfo->invert;
+   if ((dst >= s && dst <= e) ||
+   (src >= s && src <= e))
+   return true ^ minfo->invert;
+   break;
default:
break;
}
@@ -59,11 +65,17 @@ ports_match_v1(const struct xt_multiport_v1 *minfo,
 
switch (minfo->flags) {
case XT_MULTIPORT_SOURCE:
-   return (src == s) ^ minfo->invert;
+   if (src == s)
+   return true ^ minfo->invert;
+   break;
case XT_MULTIPORT_DESTINATION:
-   return (dst == s) ^ minfo->invert;
+   if (dst == s)
+   return true ^ minfo->invert;
+   break;
case XT_MULTIPORT_EITHER:
-   return (src == s || dst == s) ^ minfo->invert;
+   if (src == s || dst == s)
+   return true ^ minfo->invert;
+   break;
default:
break;
}
-- 
2.1.4
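
The essence of the bug and the fix, as a standalone C sketch (simplified to
plain equality matches, without ranges or the invert flag):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Buggy shape: deciding (and returning) on the first port entry means
 * "--dports 22,80,443" can never match 80 or 443. */
static bool match_buggy(uint16_t dst, const uint16_t *ports, int n)
{
	(void)n;			/* later entries never checked */
	return dst == ports[0];
}

/* Fixed shape: a non-matching entry just falls through to the next one;
 * only a positive match ends the loop early. */
static bool match_fixed(uint16_t dst, const uint16_t *ports, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (dst == ports[i])
			return true;
	return false;
}

int main(void)
{
	const uint16_t dports[] = { 22, 80, 443 };

	printf("buggy: dst 80 matches? %d\n", match_buggy(80, dports, 3));	/* 0 */
	printf("fixed: dst 80 matches? %d\n", match_fixed(80, dports, 3));	/* 1 */
	return 0;
}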



[PATCH 39/50] netfilter: nft_quota: add depleted flag for objects

2016-12-07 Thread Pablo Neira Ayuso
Notify on depleted quota objects. The NFT_QUOTA_F_DEPLETED flag
indicates that we have gone over quota.

Add a pointer to the table from nft_object, so we can use it when sending
the depletion notification to userspace.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables.h|  2 ++
 include/uapi/linux/netfilter/nf_tables.h |  1 +
 net/netfilter/nf_tables_api.c|  1 +
 net/netfilter/nft_quota.c| 36 +---
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index 339e374c28b5..ce6fb6e83b32 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -940,6 +940,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
  * struct nft_object - nf_tables stateful object
  *
  * @list: table stateful object list node
+ * @table: table this object belongs to
  * @type: pointer to object type
  * @data: pointer to object data
  * @name: name of this stateful object
@@ -950,6 +951,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type,
 struct nft_object {
struct list_headlist;
charname[NFT_OBJ_MAXNAMELEN];
+   struct nft_table*table;
u32 genmask:2,
use:30;
/* runtime data below here */
diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 399eac1eee91..4864caca1e8e 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -983,6 +983,7 @@ enum nft_queue_attributes {
 
 enum nft_quota_flags {
NFT_QUOTA_F_INV = (1 << 0),
+   NFT_QUOTA_F_DEPLETED= (1 << 1),
 };
 
 /**
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 9d2ed3f520ef..c5419701ca79 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4075,6 +4075,7 @@ static int nf_tables_newobj(struct net *net, struct sock 
*nlsk,
err = PTR_ERR(obj);
goto err1;
}
+   obj->table = table;
nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN);
 
+	err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 5d25f57497cb..7f27ebdce7ab 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -17,7 +17,7 @@
 
 struct nft_quota {
u64 quota;
-   boolinvert;
+   unsigned long   flags;
atomic64_t  consumed;
 };
 
@@ -27,11 +27,16 @@ static inline bool nft_overquota(struct nft_quota *priv,
 	return atomic64_add_return(skb->len, &priv->consumed) >= priv->quota;
 }
 
+static inline bool nft_quota_invert(struct nft_quota *priv)
+{
+   return priv->flags & NFT_QUOTA_F_INV;
+}
+
 static inline void nft_quota_do_eval(struct nft_quota *priv,
 struct nft_regs *regs,
 const struct nft_pktinfo *pkt)
 {
-   if (nft_overquota(priv, pkt->skb) ^ priv->invert)
+   if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv))
regs->verdict.code = NFT_BREAK;
 }
 
@@ -40,19 +45,29 @@ static const struct nla_policy 
nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
[NFTA_QUOTA_FLAGS]  = { .type = NLA_U32 },
 };
 
+#define NFT_QUOTA_DEPLETED_BIT 1   /* From NFT_QUOTA_F_DEPLETED. */
+
 static void nft_quota_obj_eval(struct nft_object *obj,
   struct nft_regs *regs,
   const struct nft_pktinfo *pkt)
 {
struct nft_quota *priv = nft_obj_data(obj);
+   bool overquota;
 
-   nft_quota_do_eval(priv, regs, pkt);
+   overquota = nft_overquota(priv, pkt->skb);
+   if (overquota ^ nft_quota_invert(priv))
+   regs->verdict.code = NFT_BREAK;
+
+   if (overquota &&
+	    !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags))
+   nft_obj_notify(nft_net(pkt), obj->table, obj, 0, 0,
+  NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC);
 }
 
 static int nft_quota_do_init(const struct nlattr * const tb[],
 struct nft_quota *priv)
 {
-   u32 flags = 0;
+   unsigned long flags = 0;
u64 quota;
 
if (!tb[NFTA_QUOTA_BYTES])
@@ -66,10 +81,12 @@ static int nft_quota_do_init(const struct nlattr * const 
tb[],
flags = ntohl(nla_get_be32(tb[NFTA_QUOTA_FLAGS]));
if (flags & ~NFT_QUOTA_F_INV)
return -EINVAL;
+   if (flags & NFT_QUOTA_F_DEPLETED)
+   return -EOPNOTSUPP;
}
 
priv->quota = quota;
-   priv->invert = (flags & NFT_QUOTA_F_INV) ? true : false;
+   priv->flags = flags;
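
A compact userspace sketch of the notify-once pattern, using C11 atomics in
place of atomic64_add_return()/test_and_set_bit() (the names and the quota
value are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong consumed;
static atomic_flag depleted = ATOMIC_FLAG_INIT;
static const unsigned long quota = 1000;	/* bytes, arbitrary for the demo */

/* Account one packet; emit the "depleted" notification exactly once, even
 * if many packets race past the threshold. atomic_flag_test_and_set() plays
 * the role of test_and_set_bit() on priv->flags. */
static void quota_eval(unsigned long len)
{
	unsigned long total = atomic_fetch_add(&consumed, len) + len;

	if (total >= quota && !atomic_flag_test_and_set(&depleted))
		printf("quota depleted at %lu bytes: notify userspace once\n",
		       total);
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		quota_eval(200);	/* crosses the quota on the 5th packet */
	return 0;
}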
  

[PATCH 15/50] netfilter: nat: add dependencies on conntrack module

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

MASQUERADE, S/DNAT and REDIRECT already call functions that depend on the
conntrack module.

However, since the conntrack hooks are now registered in a lazy fashion
(i.e., only when needed) a symbol reference is not enough.

Thus, when something is added to a nat table, make sure that it will see
packets by calling nf_ct_netns_get(), which will register the conntrack
hooks in the current netns.

An alternative would be to add these dependencies to the NAT table.

However, that has problems when using non-modular builds -- we might
register e.g. ipv6 conntrack before its initcall has run, leading to NULL
deref crashes since its per-netns storage has not yet been allocated.

Adding the dependency in the modules instead has the advantage that the
nat table also does not register its hooks until rules are added.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/ipt_MASQUERADE.c |  8 +++-
 net/netfilter/xt_NETMAP.c   | 11 +--
 net/netfilter/xt_REDIRECT.c | 12 ++--
 net/netfilter/xt_nat.c  | 18 +-
 4 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c 
b/net/ipv4/netfilter/ipt_MASQUERADE.c
index 34cfb9b0bc0a..a03e4e7ef5f9 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,7 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param 
*par)
pr_debug("bad rangesize %u\n", mr->rangesize);
return -EINVAL;
}
-   return 0;
+   return nf_ct_netns_get(par->net, par->family);
 }
 
 static unsigned int
@@ -59,6 +59,11 @@ masquerade_tg(struct sk_buff *skb, const struct 
xt_action_param *par)
  xt_out(par));
 }
 
+static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
+{
+   nf_ct_netns_put(par->net, par->family);
+}
+
 static struct xt_target masquerade_tg_reg __read_mostly = {
.name   = "MASQUERADE",
.family = NFPROTO_IPV4,
@@ -67,6 +72,7 @@ static struct xt_target masquerade_tg_reg __read_mostly = {
.table  = "nat",
.hooks  = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
+   .destroy= masquerade_tg_destroy,
.me = THIS_MODULE,
 };
 
diff --git a/net/netfilter/xt_NETMAP.c b/net/netfilter/xt_NETMAP.c
index 94d0b5411192..e45a01255e70 100644
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -60,7 +60,12 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param 
*par)
 
if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
return -EINVAL;
-   return 0;
+   return nf_ct_netns_get(par->net, par->family);
+}
+
+static void netmap_tg_destroy(const struct xt_tgdtor_param *par)
+{
+   nf_ct_netns_put(par->net, par->family);
 }
 
 static unsigned int
@@ -111,7 +116,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param 
*par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
-   return 0;
+   return nf_ct_netns_get(par->net, par->family);
 }
 
 static struct xt_target netmap_tg_reg[] __read_mostly = {
@@ -127,6 +132,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
  (1 << NF_INET_LOCAL_OUT) |
  (1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg6_checkentry,
+   .destroy= netmap_tg_destroy,
.me = THIS_MODULE,
},
{
@@ -141,6 +147,7 @@ static struct xt_target netmap_tg_reg[] __read_mostly = {
  (1 << NF_INET_LOCAL_OUT) |
  (1 << NF_INET_LOCAL_IN),
.checkentry = netmap_tg4_check,
+   .destroy= netmap_tg_destroy,
.me = THIS_MODULE,
},
 };
diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
index 651dce65a30b..98a4c6d4f1cb 100644
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -40,7 +40,13 @@ static int redirect_tg6_checkentry(const struct 
xt_tgchk_param *par)
 
if (range->flags & NF_NAT_RANGE_MAP_IPS)
return -EINVAL;
-   return 0;
+
+   return nf_ct_netns_get(par->net, par->family);
+}
+
+static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
+{
+   nf_ct_netns_put(par->net, par->family);
 }
 
 /* FIXME: Take multiple ranges --RR */
@@ -56,7 +62,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param 
*par)
pr_debug("bad rangesize %u.\n", mr->rangesize);
return -EINVAL;
}
-   return 0;
+   return nf_ct_netns_get(par->net, par->family);
 }
 
 static unsigned int
@@ -72,6 +78,7 @@ static struct xt_target redirect_tg_reg[] 
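
The checkentry/destroy pairing, reduced to a standalone C sketch (the
functions below are stand-ins, not the netfilter API):

#include <stdio.h>

static int ct_refcount;	/* stand-in for the per-netns conntrack user count */

static int nf_ct_get_demo(void)  { ct_refcount++; return 0; }
static void nf_ct_put_demo(void) { ct_refcount--; }

/* The pattern the patch applies to MASQUERADE/NETMAP/REDIRECT: take the
 * conntrack reference when a rule using the target is added (checkentry),
 * drop it when the rule goes away (destroy), so the hooks are only
 * registered in network namespaces that actually need them. */
static int target_checkentry(void) { return nf_ct_get_demo(); }
static void target_destroy(void)   { nf_ct_put_demo(); }

int main(void)
{
	target_checkentry();			/* rule added   -> refcount 1 */
	printf("after add: %d user(s)\n", ct_refcount);
	target_destroy();			/* rule removed -> refcount 0 */
	printf("after del: %d user(s)\n", ct_refcount);
	return 0;
}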

[PATCH 24/50] netfilter: x_tables: pass xt_counters struct to counter allocator

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

Keeps some noise away from a followup patch.

Signed-off-by: Florian Westphal 
Acked-by: Eric Dumazet 
Signed-off-by: Pablo Neira Ayuso 
---
 include/linux/netfilter/x_tables.h | 27 +--
 net/ipv4/netfilter/arp_tables.c|  5 +
 net/ipv4/netfilter/ip_tables.c |  5 +
 net/ipv6/netfilter/ip6_tables.c|  5 +
 net/netfilter/x_tables.c   | 30 ++
 5 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h 
b/include/linux/netfilter/x_tables.h
index 6e61edeb68e3..05a94bd32c55 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -404,32 +404,7 @@ static inline unsigned long ifname_compare_aligned(const 
char *_a,
 }
 
 
-/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
- * real (percpu) counter.  On !SMP, its just the packet count,
- * so nothing needs to be done there.
- *
- * xt_percpu_counter_alloc returns the address of the percpu
- * counter, or 0 on !SMP. We force an alignment of 16 bytes
- * so that bytes/packets share a common cache line.
- *
- * Hence caller must use IS_ERR_VALUE to check for error, this
- * allows us to return 0 for single core systems without forcing
- * callers to deal with SMP vs. NONSMP issues.
- */
-static inline unsigned long xt_percpu_counter_alloc(void)
-{
-   if (nr_cpu_ids > 1) {
-   void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
-   sizeof(struct xt_counters));
-
-   if (res == NULL)
-   return -ENOMEM;
-
-   return (__force unsigned long) res;
-   }
-
-   return 0;
-}
+bool xt_percpu_counter_alloc(struct xt_counters *counters);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 019f8e8dda6d..808deb275ceb 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -415,13 +415,10 @@ find_check_entry(struct arpt_entry *e, const char *name, 
unsigned int size)
 {
struct xt_entry_target *t;
struct xt_target *target;
-   unsigned long pcnt;
int ret;
 
-   pcnt = xt_percpu_counter_alloc();
-   if (IS_ERR_VALUE(pcnt))
+	if (!xt_percpu_counter_alloc(&e->counters))
return -ENOMEM;
-   e->counters.pcnt = pcnt;
 
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index acc9a0c45bdf..a48430d3420f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -539,12 +539,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, 
const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
-   unsigned long pcnt;
 
-   pcnt = xt_percpu_counter_alloc();
-   if (IS_ERR_VALUE(pcnt))
+   if (!xt_percpu_counter_alloc(&e->counters))
return -ENOMEM;
-   e->counters.pcnt = pcnt;
 
j = 0;
mtpar.net   = net;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 88b56a98905b..a5a92083fd62 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -570,12 +570,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, 
const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
-   unsigned long pcnt;
 
-   pcnt = xt_percpu_counter_alloc();
-   if (IS_ERR_VALUE(pcnt))
+   if (!xt_percpu_counter_alloc(&e->counters))
return -ENOMEM;
-   e->counters.pcnt = pcnt;
 
j = 0;
mtpar.net   = net;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 0580029eb0ee..be5e83047594 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1615,6 +1615,36 @@ void xt_proto_fini(struct net *net, u_int8_t af)
 }
 EXPORT_SYMBOL_GPL(xt_proto_fini);
 
+/**
+ * xt_percpu_counter_alloc - allocate x_tables rule counter
+ *
+ * @counter: pointer to counter struct inside the ip(6)/arpt_entry struct
+ *
+ * On SMP, the packet counter [ ip(6)t_entry->counters.pcnt ] will then
+ * contain the address of the real (percpu) counter.
+ *
+ * Rule evaluation needs to use xt_get_this_cpu_counter() helper
+ * to fetch the real percpu counter.
+ *
+ * returns false on error.
+ */
+bool xt_percpu_counter_alloc(struct xt_counters *counter)
+{
+   void __percpu *res;
+
+   if (nr_cpu_ids <= 1)
+   return true;
+
+   res = __alloc_percpu(sizeof(struct xt_counters),
+sizeof(struct xt_counters));
+   if (!res)
+   

[PATCH 14/50] netfilter: add and use nf_ct_netns_get/put

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

Currently these are just aliases for try_module_get/_put. The next patch
will turn them into functions that use the ->net argument to store a use
count per l3proto tracker.

This is needed so that we can avoid registering the conntrack hooks in all
network namespaces and instead enable connection tracking only in those
namespaces that actually need it.
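
As a rough userspace sketch of where this is heading (the pernet_state
struct and the register/unregister helpers below are made-up stand-ins,
not the actual conntrack code): the first user in a namespace registers
the hooks, the last one removes them.

#include <pthread.h>
#include <stdio.h>

/* Made-up stand-in for per-netns conntrack state. */
struct pernet_state {
        pthread_mutex_t lock;
        unsigned int users;        /* rules/expressions needing conntrack */
        int hooks_registered;
};

static void register_hooks(struct pernet_state *ns)   { ns->hooks_registered = 1; }
static void unregister_hooks(struct pernet_state *ns) { ns->hooks_registered = 0; }

/* First user in this namespace: register the conntrack hooks. */
static int ct_netns_get(struct pernet_state *ns)
{
        pthread_mutex_lock(&ns->lock);
        if (ns->users++ == 0)
                register_hooks(ns);
        pthread_mutex_unlock(&ns->lock);
        return 0;
}

/* Last user gone: hooks are removed, packets skip conntrack again. */
static void ct_netns_put(struct pernet_state *ns)
{
        pthread_mutex_lock(&ns->lock);
        if (--ns->users == 0)
                unregister_hooks(ns);
        pthread_mutex_unlock(&ns->lock);
}

int main(void)
{
        struct pernet_state ns = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        ct_netns_get(&ns);        /* e.g. first conntrack-using rule */
        ct_netns_get(&ns);        /* second user, no re-registration */
        ct_netns_put(&ns);
        ct_netns_put(&ns);        /* hooks gone again */
        printf("hooks registered: %d\n", ns.hooks_registered);
        return 0;
}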

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_conntrack.h |  4 
 net/ipv4/netfilter/ipt_CLUSTERIP.c   |  4 ++--
 net/ipv4/netfilter/ipt_SYNPROXY.c|  4 ++--
 net/ipv6/netfilter/ip6t_SYNPROXY.c   |  4 ++--
 net/netfilter/nf_conntrack_proto.c   | 12 
 net/netfilter/nft_ct.c   | 26 +-
 net/netfilter/xt_CONNSECMARK.c   |  4 ++--
 net/netfilter/xt_CT.c|  6 +++---
 net/netfilter/xt_connbytes.c |  4 ++--
 net/netfilter/xt_connlabel.c |  6 +++---
 net/netfilter/xt_connlimit.c |  6 +++---
 net/netfilter/xt_connmark.c  |  8 
 net/netfilter/xt_conntrack.c |  4 ++--
 net/netfilter/xt_helper.c|  4 ++--
 net/netfilter/xt_state.c |  4 ++--
 15 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h 
b/include/net/netfilter/nf_conntrack.h
index d9d52c020a70..5916aa9ab3f0 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -181,6 +181,10 @@ static inline void nf_ct_put(struct nf_conn *ct)
 int nf_ct_l3proto_try_module_get(unsigned short l3proto);
 void nf_ct_l3proto_module_put(unsigned short l3proto);
 
+/* load module; enable/disable conntrack in this namespace */
+int nf_ct_netns_get(struct net *net, u8 nfproto);
+void nf_ct_netns_put(struct net *net, u8 nfproto);
+
 /*
  * Allocate a hashtable of hlist_head (if nulls == 0),
  * or hlist_nulls_head (if nulls == 1)
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c 
b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index e6e206fa86c8..21db00d0362b 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -419,7 +419,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param 
*par)
}
cipinfo->config = config;
 
-   ret = nf_ct_l3proto_try_module_get(par->family);
+   ret = nf_ct_netns_get(par->net, par->family);
if (ret < 0)
pr_info("cannot load conntrack support for proto=%u\n",
par->family);
@@ -444,7 +444,7 @@ static void clusterip_tg_destroy(const struct 
xt_tgdtor_param *par)
 
clusterip_config_put(cipinfo->config);
 
-   nf_ct_l3proto_module_put(par->family);
+   nf_ct_netns_put(par->net, par->family);
 }
 
 #ifdef CONFIG_COMPAT
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c 
b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 361411688221..30c0de53e254 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -418,12 +418,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param 
*par)
e->ip.invflags & XT_INV_PROTO)
return -EINVAL;
 
-   return nf_ct_l3proto_try_module_get(par->family);
+   return nf_ct_netns_get(par->net, par->family);
 }
 
 static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
 {
-   nf_ct_l3proto_module_put(par->family);
+   nf_ct_netns_put(par->net, par->family);
 }
 
 static struct xt_target synproxy_tg4_reg __read_mostly = {
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c 
b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index 99a1216287c8..98c8dd38575a 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -440,12 +440,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param 
*par)
e->ipv6.invflags & XT_INV_PROTO)
return -EINVAL;
 
-   return nf_ct_l3proto_try_module_get(par->family);
+   return nf_ct_netns_get(par->net, par->family);
 }
 
 static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
 {
-   nf_ct_l3proto_module_put(par->family);
+   nf_ct_netns_put(par->net, par->family);
 }
 
 static struct xt_target synproxy_tg6_reg __read_mostly = {
diff --git a/net/netfilter/nf_conntrack_proto.c 
b/net/netfilter/nf_conntrack_proto.c
index b218e70b2f74..948f1e2fc80b 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -125,6 +125,18 @@ void nf_ct_l3proto_module_put(unsigned short l3proto)
 }
 EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
 
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+   return nf_ct_l3proto_try_module_get(nfproto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, u8 nfproto)
+{
+   nf_ct_l3proto_module_put(nfproto);
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
 struct nf_conntrack_l4proto *
 nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
 {
diff --git a/net/netfilter/nft_ct.c 

[PATCH 43/50] netfilter: rpfilter: bypass ipv4 lbcast packets with zeronet source

2016-12-07 Thread Pablo Neira Ayuso
From: Liping Zhang 

Otherwise, DHCP Discover packets (0.0.0.0 -> 255.255.255.255) may be
incorrectly dropped.
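
For illustration, the address tests involved boil down to the following
(plain userspace C; the is_zeronet/is_lbcast/is_local_multicast helpers
below mirror the semantics of the kernel's ipv4_is_*() helpers but are
reimplemented here for the example):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace re-implementations mirroring the kernel's ipv4_is_*() checks. */
static bool is_zeronet(uint32_t addr)           /* 0.0.0.0/8 */
{
        return (addr & htonl(0xff000000)) == htonl(0x00000000);
}

static bool is_lbcast(uint32_t addr)            /* 255.255.255.255 */
{
        return addr == htonl(0xffffffff);
}

static bool is_local_multicast(uint32_t addr)   /* 224.0.0.0/24 */
{
        return (addr & htonl(0xffffff00)) == htonl(0xe0000000);
}

/* The check the patch adds: a packet from an unconfigured host to the
 * limited broadcast or link-local multicast range can never pass a
 * reverse-path lookup, so let it through instead of dropping it. */
static bool rpfilter_bypass(uint32_t saddr, uint32_t daddr)
{
        return is_zeronet(saddr) &&
               (is_lbcast(daddr) || is_local_multicast(daddr));
}

int main(void)
{
        uint32_t saddr = inet_addr("0.0.0.0");
        uint32_t daddr = inet_addr("255.255.255.255");

        /* DHCP Discover: 0.0.0.0 -> 255.255.255.255 must not be dropped. */
        printf("bypass=%d\n", rpfilter_bypass(saddr, daddr));
        return 0;
}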

Signed-off-by: Liping Zhang 
Acked-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/ipt_rpfilter.c |  8 +---
 net/ipv4/netfilter/nft_fib_ipv4.c | 13 +++--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_rpfilter.c 
b/net/ipv4/netfilter/ipt_rpfilter.c
index 59b49945b481..f273098e48fd 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -83,10 +83,12 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
return true ^ invert;
 
iph = ip_hdr(skb);
-   if (ipv4_is_multicast(iph->daddr)) {
-   if (ipv4_is_zeronet(iph->saddr))
-   return ipv4_is_local_multicast(iph->daddr) ^ invert;
+   if (ipv4_is_zeronet(iph->saddr)) {
+   if (ipv4_is_lbcast(iph->daddr) ||
+   ipv4_is_local_multicast(iph->daddr))
+   return true ^ invert;
}
+
flow.flowi4_iif = LOOPBACK_IFINDEX;
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c 
b/net/ipv4/netfilter/nft_fib_ipv4.c
index 258136364f5e..965b1a161369 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -101,12 +101,13 @@ void nft_fib4_eval(const struct nft_expr *expr, struct 
nft_regs *regs,
}
 
iph = ip_hdr(pkt->skb);
-   if (ipv4_is_multicast(iph->daddr) &&
-   ipv4_is_zeronet(iph->saddr) &&
-   ipv4_is_local_multicast(iph->daddr)) {
-   nft_fib_store_result(dest, priv->result, pkt,
-get_ifindex(pkt->skb->dev));
-   return;
+   if (ipv4_is_zeronet(iph->saddr)) {
+   if (ipv4_is_lbcast(iph->daddr) ||
+   ipv4_is_local_multicast(iph->daddr)) {
+   nft_fib_store_result(dest, priv->result, pkt,
+get_ifindex(pkt->skb->dev));
+   return;
+   }
}
 
if (priv->flags & NFTA_FIB_F_MARK)
-- 
2.1.4



[PATCH 25/50] netfilter: x_tables: pack percpu counter allocations

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

Instead of allocating each xt_counter individually, allocate 4k chunks
and then serve counter allocation requests from them.

This should speed up rule evaluation by increasing data locality, and it
also speeds up ruleset loading because we reduce the number of calls to
the percpu allocator.

As Eric points out, we can't use PAGE_SIZE: the allocation would fail on
arches with a 64k page size.
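
A simplified userspace sketch of the packing idea (BLOCK_SIZE,
counter_alloc() and the structs below are illustrative stand-ins, not the
kernel implementation): slots are carved out of one 4k block until it is
exhausted, and only then is a fresh block allocated.

#include <stdio.h>
#include <stdlib.h>

#define BLOCK_SIZE 4096          /* fixed chunk size, independent of PAGE_SIZE */

struct counters {                /* stand-in for struct xt_counters */
        unsigned long long packets, bytes;
};

struct alloc_state {             /* mirrors the idea of the new alloc state */
        unsigned int off;        /* next free offset in the current block */
        char *mem;               /* current 4k block, NULL if none yet */
};

/* Hand out sizeof(struct counters) slots from 4k blocks; a fresh block is
 * only requested once the current one is exhausted.  (This sketch never
 * frees anything.) */
static struct counters *counter_alloc(struct alloc_state *state)
{
        struct counters *c;

        if (!state->mem) {
                state->mem = calloc(1, BLOCK_SIZE);
                if (!state->mem)
                        return NULL;
                state->off = 0;
        }

        c = (struct counters *)(state->mem + state->off);
        state->off += sizeof(*c);

        if (state->off + sizeof(*c) > BLOCK_SIZE)
                state->mem = NULL;   /* next call grabs a fresh block */

        return c;
}

int main(void)
{
        struct alloc_state state = { 0, NULL };
        struct counters *first = counter_alloc(&state);
        struct counters *second = counter_alloc(&state);

        /* Counters of adjacent rules land in adjacent slots of one block. */
        printf("distance: %ld bytes\n",
               (long)((char *)second - (char *)first));
        return 0;
}

Counters for consecutive rules thus end up next to each other in memory,
which is what improves cache behaviour during rule evaluation.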

Suggested-by: Eric Dumazet 
Signed-off-by: Florian Westphal 
Acked-by: Eric Dumazet 
Signed-off-by: Pablo Neira Ayuso 
---
 include/linux/netfilter/x_tables.h |  7 ++-
 net/ipv4/netfilter/arp_tables.c|  9 ++---
 net/ipv4/netfilter/ip_tables.c |  9 ++---
 net/ipv6/netfilter/ip6_tables.c|  9 ++---
 net/netfilter/x_tables.c   | 33 -
 5 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h 
b/include/linux/netfilter/x_tables.h
index 05a94bd32c55..5117e4d2ddfa 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -403,8 +403,13 @@ static inline unsigned long ifname_compare_aligned(const 
char *_a,
return ret;
 }
 
+struct xt_percpu_counter_alloc_state {
+   unsigned int off;
+   const char __percpu *mem;
+};
 
-bool xt_percpu_counter_alloc(struct xt_counters *counters);
+bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state,
+struct xt_counters *counter);
 void xt_percpu_counter_free(struct xt_counters *cnt);
 
 static inline struct xt_counters *
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 808deb275ceb..1258a9ab62ef 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -411,13 +411,14 @@ static inline int check_target(struct arpt_entry *e, 
const char *name)
 }
 
 static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+struct xt_percpu_counter_alloc_state *alloc_state)
 {
struct xt_entry_target *t;
struct xt_target *target;
int ret;
 
-   if (!xt_percpu_counter_alloc(&e->counters))
+   if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
 
t = arpt_get_target(e);
@@ -525,6 +526,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 static int translate_table(struct xt_table_info *newinfo, void *entry0,
   const struct arpt_replace *repl)
 {
+   struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct arpt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -587,7 +589,8 @@ static int translate_table(struct xt_table_info *newinfo, 
void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
-   ret = find_check_entry(iter, repl->name, repl->size);
+   ret = find_check_entry(iter, repl->name, repl->size,
+  &alloc_state);
if (ret != 0)
break;
++i;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a48430d3420f..308b456723f0 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -531,7 +531,8 @@ static int check_target(struct ipt_entry *e, struct net 
*net, const char *name)
 
 static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
-unsigned int size)
+unsigned int size,
+struct xt_percpu_counter_alloc_state *alloc_state)
 {
struct xt_entry_target *t;
struct xt_target *target;
@@ -540,7 +541,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, 
const char *name,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
 
-   if (!xt_percpu_counter_alloc(&e->counters))
+   if (!xt_percpu_counter_alloc(alloc_state, &e->counters))
return -ENOMEM;
 
j = 0;
@@ -676,6 +677,7 @@ static int
 translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
const struct ipt_replace *repl)
 {
+   struct xt_percpu_counter_alloc_state alloc_state = { 0 };
struct ipt_entry *iter;
unsigned int *offsets;
unsigned int i;
@@ -735,7 +737,8 @@ translate_table(struct net *net, struct xt_table_info 
*newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
-   ret = find_check_entry(iter, net, repl->name, repl->size);
+   ret = find_check_entry(iter, net, repl->name, repl->size,
+  &alloc_state);
if (ret != 0)
 

[PATCH 09/50] netfilter: nf_conntrack_tuple_common.h: fix #include

2016-12-07 Thread Pablo Neira Ayuso
From: Davide Caratti 

To allow usage of enum ip_conntrack_dir in include/net/netns/conntrack.h,
this patch encloses #include <linux/netfilter.h> in an #ifndef __KERNEL__
directive, so that compiler errors caused by unwanted inclusion of
include/linux/netfilter.h are avoided.
In addition, a #include <linux/netfilter/nf_conntrack_common.h> line has
been added so that the CTINFO2DIR macro resolves correctly.
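
The resulting layout follows the usual UAPI pattern; a minimal sketch
with illustrative names (not the actual contents of
nf_conntrack_tuple_common.h):

/* example_uapi.h - sketch of a UAPI header shared with userspace */
#ifndef _EXAMPLE_UAPI_H
#define _EXAMPLE_UAPI_H

#include <linux/types.h>

#ifndef __KERNEL__
/* Userspace builds need this header for the definitions below; kernel
 * builds already get them elsewhere, and including it here would drag in
 * unwanted dependencies. */
#include <linux/netfilter.h>
#endif

enum example_dir {
        EXAMPLE_DIR_ORIGINAL,
        EXAMPLE_DIR_REPLY,
};

#endif /* _EXAMPLE_UAPI_H */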

Signed-off-by: Davide Caratti 
Acked-by: Mikko Rapeli 
Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_conntrack_tuple_common.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h 
b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
index a9c3834abdd4..526b42496b78 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tuple_common.h
@@ -2,7 +2,10 @@
 #define _NF_CONNTRACK_TUPLE_COMMON_H
 
 #include <linux/types.h>
+#ifndef __KERNEL__
 #include <linux/netfilter.h>
+#endif
+#include <linux/netfilter/nf_conntrack_common.h> /* IP_CT_IS_REPLY */
 
 enum ip_conntrack_dir {
IP_CT_DIR_ORIGINAL,
-- 
2.1.4



[PATCH 04/50] netfilter: built-in NAT support for DCCP

2016-12-07 Thread Pablo Neira Ayuso
From: Davide Caratti 

CONFIG_NF_NAT_PROTO_DCCP is no longer a tristate. When set to y, NAT
support for the DCCP protocol is built into nf_nat.ko.

footprint test:

(nf_nat_proto_) |  dccp  | nf_nat
----------------+--------+---------
no builtin      | 409800 | 2241312
DCCP builtin    |      - | 2578968

Signed-off-by: Davide Caratti 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_nat_l4proto.h |  3 +++
 net/netfilter/Kconfig  |  2 +-
 net/netfilter/Makefile |  3 ++-
 net/netfilter/nf_nat_core.c|  4 
 net/netfilter/nf_nat_proto_dccp.c  | 36 +-
 5 files changed, 11 insertions(+), 37 deletions(-)

diff --git a/include/net/netfilter/nf_nat_l4proto.h 
b/include/net/netfilter/nf_nat_l4proto.h
index 12f4cc841b6e..92b147be00ef 100644
--- a/include/net/netfilter/nf_nat_l4proto.h
+++ b/include/net/netfilter/nf_nat_l4proto.h
@@ -54,6 +54,9 @@ extern const struct nf_nat_l4proto nf_nat_l4proto_udp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmp;
 extern const struct nf_nat_l4proto nf_nat_l4proto_icmpv6;
 extern const struct nf_nat_l4proto nf_nat_l4proto_unknown;
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+extern const struct nf_nat_l4proto nf_nat_l4proto_dccp;
+#endif
 
 bool nf_nat_l4proto_in_range(const struct nf_conntrack_tuple *tuple,
 enum nf_nat_manip_type maniptype,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 44410d30d461..13092e5cd245 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -384,7 +384,7 @@ config NF_NAT_NEEDED
default y
 
 config NF_NAT_PROTO_DCCP
-   tristate
+   bool
depends on NF_NAT && NF_CT_PROTO_DCCP
default NF_NAT && NF_CT_PROTO_DCCP
 
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5bbf767672ec..9ea0c98e51e6 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -45,6 +45,8 @@ obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o
 nf_nat-y   := nf_nat_core.o nf_nat_proto_unknown.o nf_nat_proto_common.o \
   nf_nat_proto_udp.o nf_nat_proto_tcp.o nf_nat_helper.o
 
+nf_nat-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
+
 # generic transport layer logging
 obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o
 
@@ -55,7 +57,6 @@ obj-$(CONFIG_NF_NAT) += nf_nat.o
 obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o
 
 # NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_DCCP) += nf_nat_proto_dccp.o
 obj-$(CONFIG_NF_NAT_PROTO_UDPLITE) += nf_nat_proto_udplite.o
 obj-$(CONFIG_NF_NAT_PROTO_SCTP) += nf_nat_proto_sctp.o
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 5b9c884a452e..69b121d11275 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -682,6 +682,10 @@ int nf_nat_l3proto_register(const struct nf_nat_l3proto 
*l3proto)
 &nf_nat_l4proto_tcp);
 RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_UDP],
 &nf_nat_l4proto_udp);
+#ifdef CONFIG_NF_NAT_PROTO_DCCP
+   RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_DCCP],
+   &nf_nat_l4proto_dccp);
+#endif
mutex_unlock(&nf_nat_proto_mutex);
 
RCU_INIT_POINTER(nf_nat_l3protos[l3proto->l3proto], l3proto);
diff --git a/net/netfilter/nf_nat_proto_dccp.c 
b/net/netfilter/nf_nat_proto_dccp.c
index 15c47b246d0d..269fcd5dc34c 100644
--- a/net/netfilter/nf_nat_proto_dccp.c
+++ b/net/netfilter/nf_nat_proto_dccp.c
@@ -10,8 +10,6 @@
  */
 
 #include 
-#include 
-#include 
 #include 
 #include 
 
@@ -73,7 +71,7 @@ dccp_manip_pkt(struct sk_buff *skb,
return true;
 }
 
-static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
+const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.l4proto= IPPROTO_DCCP,
.manip_pkt  = dccp_manip_pkt,
.in_range   = nf_nat_l4proto_in_range,
@@ -82,35 +80,3 @@ static const struct nf_nat_l4proto nf_nat_l4proto_dccp = {
.nlattr_to_range= nf_nat_l4proto_nlattr_to_range,
 #endif
 };
-
-static int __init nf_nat_proto_dccp_init(void)
-{
-   int err;
-
-   err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-   if (err < 0)
-   goto err1;
-   err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
-   if (err < 0)
-   goto err2;
-   return 0;
-
-err2:
-   nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-err1:
-   return err;
-}
-
-static void __exit nf_nat_proto_dccp_fini(void)
-{
-   nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_dccp);
-   nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_dccp);
-
-}
-
-module_init(nf_nat_proto_dccp_init);
-module_exit(nf_nat_proto_dccp_fini);
-
-MODULE_AUTHOR("Patrick McHardy ");
-MODULE_DESCRIPTION("DCCP 

[PATCH 16/50] netfilter: nf_tables: add conntrack dependencies for nat/masq/redir expressions

2016-12-07 Thread Pablo Neira Ayuso
From: Florian Westphal 

so that the conntrack core will add the needed hooks in this namespace.

Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nft_masq_ipv4.c  |  7 +++
 net/ipv4/netfilter/nft_redir_ipv4.c |  7 +++
 net/ipv6/netfilter/nft_masq_ipv6.c  |  7 +++
 net/ipv6/netfilter/nft_redir_ipv6.c |  7 +++
 net/netfilter/nft_masq.c|  2 +-
 net/netfilter/nft_nat.c | 11 ++-
 net/netfilter/nft_redir.c   |  2 +-
 7 files changed, 40 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c 
b/net/ipv4/netfilter/nft_masq_ipv4.c
index 4d69f99b8707..a0ea8aad1bf1 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -35,12 +35,19 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
, nft_out(pkt));
 }
 
+static void
+nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+   nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
 static struct nft_expr_type nft_masq_ipv4_type;
 static const struct nft_expr_ops nft_masq_ipv4_ops = {
.type   = &nft_masq_ipv4_type,
.size   = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval   = nft_masq_ipv4_eval,
.init   = nft_masq_init,
+   .destroy= nft_masq_ipv4_destroy,
.dump   = nft_masq_dump,
.validate   = nft_masq_validate,
 };
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c 
b/net/ipv4/netfilter/nft_redir_ipv4.c
index 62c18e68ac58..1650ed23c15d 100644
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ b/net/ipv4/netfilter/nft_redir_ipv4.c
@@ -38,12 +38,19 @@ static void nft_redir_ipv4_eval(const struct nft_expr *expr,
regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, , nft_hook(pkt));
 }
 
+static void
+nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+   nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
+}
+
 static struct nft_expr_type nft_redir_ipv4_type;
 static const struct nft_expr_ops nft_redir_ipv4_ops = {
.type   = &nft_redir_ipv4_type,
.size   = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval   = nft_redir_ipv4_eval,
.init   = nft_redir_init,
+   .destroy= nft_redir_ipv4_destroy,
.dump   = nft_redir_dump,
.validate   = nft_redir_validate,
 };
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c 
b/net/ipv6/netfilter/nft_masq_ipv6.c
index 93d758f70334..6c5b5b1830a7 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -36,12 +36,19 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
nft_out(pkt));
 }
 
+static void
+nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+   nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
 static struct nft_expr_type nft_masq_ipv6_type;
 static const struct nft_expr_ops nft_masq_ipv6_ops = {
.type   = &nft_masq_ipv6_type,
.size   = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
.eval   = nft_masq_ipv6_eval,
.init   = nft_masq_init,
+   .destroy= nft_masq_ipv6_destroy,
.dump   = nft_masq_dump,
.validate   = nft_masq_validate,
 };
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c 
b/net/ipv6/netfilter/nft_redir_ipv6.c
index 2850fcd8583f..f5ac080fc084 100644
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ b/net/ipv6/netfilter/nft_redir_ipv6.c
@@ -39,12 +39,19 @@ static void nft_redir_ipv6_eval(const struct nft_expr *expr,
nf_nat_redirect_ipv6(pkt->skb, , nft_hook(pkt));
 }
 
+static void
+nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+{
+   nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
+}
+
 static struct nft_expr_type nft_redir_ipv6_type;
 static const struct nft_expr_ops nft_redir_ipv6_ops = {
.type   = &nft_redir_ipv6_type,
.size   = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
.eval   = nft_redir_ipv6_eval,
.init   = nft_redir_init,
+   .destroy= nft_redir_ipv6_destroy,
.dump   = nft_redir_dump,
.validate   = nft_redir_validate,
 };
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index bf92de01410f..11ce016cd479 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -77,7 +77,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
}
}
 
-   return 0;
+   return nf_ct_netns_get(ctx->net, ctx->afi->family);
 }
 EXPORT_SYMBOL_GPL(nft_masq_init);
 
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
index ee2d71753746..19a7bf3236f9 100644
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ 

[PATCH 35/50] netfilter: nf_tables: add stateful object reference expression

2016-12-07 Thread Pablo Neira Ayuso
This new expression allows us to refer to existing stateful objects from
rules.

Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_tables.h |  14 
 net/netfilter/Kconfig|   6 ++
 net/netfilter/Makefile   |   1 +
 net/netfilter/nft_objref.c   | 112 +++
 4 files changed, 133 insertions(+)
 create mode 100644 net/netfilter/nft_objref.c

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index ad0577ba5d2a..1043ce4250c5 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1138,6 +1138,20 @@ enum nft_fwd_attributes {
 #define NFTA_FWD_MAX   (__NFTA_FWD_MAX - 1)
 
 /**
+ * enum nft_objref_attributes - nf_tables stateful object expression netlink 
attributes
+ *
+ * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: 
nft_register)
+ * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING)
+ */
+enum nft_objref_attributes {
+   NFTA_OBJREF_UNSPEC,
+   NFTA_OBJREF_IMM_TYPE,
+   NFTA_OBJREF_IMM_NAME,
+   __NFTA_OBJREF_MAX
+};
+#define NFTA_OBJREF_MAX(__NFTA_OBJREF_MAX - 1)
+
+/**
  * enum nft_gen_attributes - nf_tables ruleset generation attributes
  *
  * @NFTA_GEN_ID: Ruleset generation ID (NLA_U32)
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index def4be06cda6..63729b489c2c 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -551,6 +551,12 @@ config NFT_NAT
  This option adds the "nat" expression that you can use to perform
  typical Network Address Translation (NAT) packet transformations.
 
+config NFT_OBJREF
+   tristate "Netfilter nf_tables stateful object reference module"
+   help
+ This option adds the "objref" expression that allows you to refer to
+ stateful objects, such as counters and quotas.
+
 config NFT_QUEUE
depends on NETFILTER_NETLINK_QUEUE
tristate "Netfilter nf_tables queue module"
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index e4c8c1d7aaed..ca30d1960f1d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -88,6 +88,7 @@ obj-$(CONFIG_NFT_NUMGEN)  += nft_numgen.o
 obj-$(CONFIG_NFT_CT)   += nft_ct.o
 obj-$(CONFIG_NFT_LIMIT)+= nft_limit.o
 obj-$(CONFIG_NFT_NAT)  += nft_nat.o
+obj-$(CONFIG_NFT_OBJREF)   += nft_objref.o
 obj-$(CONFIG_NFT_QUEUE)+= nft_queue.o
 obj-$(CONFIG_NFT_QUOTA)+= nft_quota.o
 obj-$(CONFIG_NFT_REJECT)   += nft_reject.o
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
new file mode 100644
index ..23820f796aad
--- /dev/null
+++ b/net/netfilter/nft_objref.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2012-2016 Pablo Neira Ayuso 
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define nft_objref_priv(expr)  *((struct nft_object **)nft_expr_priv(expr))
+
+static void nft_objref_eval(const struct nft_expr *expr,
+   struct nft_regs *regs,
+   const struct nft_pktinfo *pkt)
+{
+   struct nft_object *obj = nft_objref_priv(expr);
+
+   obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_init(const struct nft_ctx *ctx,
+  const struct nft_expr *expr,
+  const struct nlattr * const tb[])
+{
+   struct nft_object *obj = nft_objref_priv(expr);
+   u8 genmask = nft_genmask_next(ctx->net);
+   u32 objtype;
+
+   if (!tb[NFTA_OBJREF_IMM_NAME] ||
+   !tb[NFTA_OBJREF_IMM_TYPE])
+   return -EINVAL;
+
+   objtype = ntohl(nla_get_be32(tb[NFTA_OBJREF_IMM_TYPE]));
+   obj = nf_tables_obj_lookup(ctx->table, tb[NFTA_OBJREF_IMM_NAME], 
objtype,
+  genmask);
+   if (IS_ERR(obj))
+   return -ENOENT;
+
+   nft_objref_priv(expr) = obj;
+   obj->use++;
+
+   return 0;
+}
+
+static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+   const struct nft_object *obj = nft_objref_priv(expr);
+
+   if (nla_put_string(skb, NFTA_OBJREF_IMM_NAME, obj->name) ||
+   nla_put_be32(skb, NFTA_OBJREF_IMM_TYPE, htonl(obj->type->type)))
+   goto nla_put_failure;
+
+   return 0;
+
+nla_put_failure:
+   return -1;
+}
+
+static void nft_objref_destroy(const struct nft_ctx *ctx,
+  const struct nft_expr *expr)
+{
+   struct nft_object *obj = nft_objref_priv(expr);
+
+   obj->use--;
+}
+
+static struct nft_expr_type nft_objref_type;
+static const struct 

[PATCH 44/50] netfilter: nat: skip checksum on offload SCTP packets

2016-12-07 Thread Pablo Neira Ayuso
From: Davide Caratti 

SCTP GSO and hardware can do CRC32c computation after netfilter processing,
so we can avoid calling sctp_compute_cksum() on the skb if skb->ip_summed
is equal to CHECKSUM_PARTIAL. Moreover, set skb->ip_summed to CHECKSUM_NONE
when the NAT code computes the CRC, to prevent offloaders from computing
it again (on ixgbe this resulted in transmissions with a wrong L4 checksum).
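
Sketched in plain userspace C (the fake_pkt struct, the csum_state enum
and compute_crc32c() below are stand-ins, not kernel API), the resulting
decision looks like this:

#include <stdio.h>

/* Stand-ins for the relevant skb->ip_summed states (TX direction). */
enum csum_state {
        CSUM_NONE,               /* checksum in the packet is final */
        CSUM_PARTIAL,            /* GSO/hardware will compute CRC32c later */
};

struct fake_pkt {                /* stand-in for the skb + SCTP header */
        enum csum_state ip_summed;
        unsigned int crc32c;
};

static unsigned int compute_crc32c(const struct fake_pkt *pkt)
{
        (void)pkt;
        return 0xdeadbeef;       /* placeholder for the real CRC32c */
}

/* After NAT rewrites the ports, fix the checksum only if nobody else will,
 * and mark it final so an offload engine does not overwrite it. */
static void sctp_nat_fixup(struct fake_pkt *pkt)
{
        if (pkt->ip_summed != CSUM_PARTIAL) {
                pkt->crc32c = compute_crc32c(pkt);
                pkt->ip_summed = CSUM_NONE;
        }
}

int main(void)
{
        struct fake_pkt offloaded = { CSUM_PARTIAL, 0 };
        struct fake_pkt plain = { CSUM_NONE, 0 };

        sctp_nat_fixup(&offloaded);   /* left alone: hardware will do it */
        sctp_nat_fixup(&plain);       /* recomputed in software */
        printf("offloaded crc=%#x, software crc=%#x\n",
               offloaded.crc32c, plain.crc32c);
        return 0;
}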

Signed-off-by: Davide Caratti 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_nat_proto_sctp.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_proto_sctp.c 
b/net/netfilter/nf_nat_proto_sctp.c
index 2e14108ff697..31d358691af0 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -47,7 +47,10 @@ sctp_manip_pkt(struct sk_buff *skb,
hdr->dest = tuple->dst.u.sctp.port;
}
 
-   hdr->checksum = sctp_compute_cksum(skb, hdroff);
+   if (skb->ip_summed != CHECKSUM_PARTIAL) {
+   hdr->checksum = sctp_compute_cksum(skb, hdroff);
+   skb->ip_summed = CHECKSUM_NONE;
+   }
 
return true;
 }
-- 
2.1.4



Re: [net-next 20/20] i40e: don't allow i40e_vsi_(add|kill)_vlan to operate when VID<1

2016-12-07 Thread Jeff Kirsher
On Wed, 2016-12-07 at 13:50 -0800, Keller, Jacob E wrote:
> > -Original Message-
> > From: Sergei Shtylyov [mailto:sergei.shtyl...@cogentembedded.com]
> > Sent: Wednesday, December 07, 2016 2:11 AM
> > To: Kirsher, Jeffrey T ; davem@davemloft.n
> > et
> > Cc: Keller, Jacob E ; netdev@vger.kernel.org;
> > nhor...@redhat.com; sassm...@redhat.com; jogre...@redhat.com;
> > guru.anbalag...@oracle.com
> > Subject: Re: [net-next 20/20] i40e: don't allow
> > i40e_vsi_(add|kill)_vlan to operate
> > when VID<1
> > 
> > Hello!
> > > +   if (!(vid > 0) || vsi->info.pvid)
> > 
> >  Why not just '!vid'?
> 
> Left over artifact of this previously being a signed value. We can fix
> this.
> 
> Thanks,
> Jake
> 
> > > -void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, s16 vid)
> > > +void i40e_vsi_kill_vlan(struct i40e_vsi *vsi, u16 vid)
> > >   {
> > > +   if (!(vid > 0) || vsi->info.pvid)
> > 
> >  Likewise.
> 
> Same here. Can get this fixed.

While you are fixing this up and sending me a new version of this patch, I
will just drop this from the series and re-send.

signature.asc
Description: This is a digitally signed message part


[PATCH 22/50] netfilter: convert while loops to for loops

2016-12-07 Thread Pablo Neira Ayuso
From: Aaron Conole 

This is to facilitate converting from a singly-linked list to an array
of elements.
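
A self-contained userspace sketch of the same pointer-to-pointer walk
(simplified types, no RCU; insert_sorted() below is illustrative, not the
kernel function):

#include <stdio.h>

struct hook {
        int priority;
        struct hook *next;
};

/* Walk pp to the pointer that must be updated, then splice the new entry
 * in, keeping the list sorted by priority -- the same shape as the
 * converted nf_register_net_hook() loop. */
static void insert_sorted(struct hook **pp, struct hook *entry)
{
        struct hook *p;

        for (; (p = *pp) != NULL; pp = &p->next) {
                if (entry->priority < p->priority)
                        break;
        }
        entry->next = p;
        *pp = entry;
}

int main(void)
{
        struct hook a = { 10, NULL }, b = { -5, NULL }, c = { 0, NULL };
        struct hook *head = NULL;

        insert_sorted(&head, &a);
        insert_sorted(&head, &b);
        insert_sorted(&head, &c);

        for (struct hook *p = head; p; p = p->next)
                printf("%d\n", p->priority);        /* -5, 0, 10 */
        return 0;
}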

Signed-off-by: Aaron Conole 
Signed-off-by: Pablo Neira Ayuso 
---
 net/bridge/br_netfilter_hooks.c | 8 
 net/netfilter/core.c| 6 ++
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index adad2eed29e6..b12501a77f18 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1008,10 +1008,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net 
*net,
struct nf_hook_state state;
int ret;
 
-   elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
-
-   while (elem && (nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF))
-   elem = rcu_dereference(elem->next);
+   for (elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]);
+elem && nf_hook_entry_priority(elem) <= NF_BR_PRI_BRNF;
+elem = rcu_dereference(elem->next))
+   ;
 
if (!elem)
return okfn(net, sk, skb);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 2bb46e2d8d30..ce6adfae521a 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -107,10 +107,9 @@ int nf_register_net_hook(struct net *net, const struct 
nf_hook_ops *reg)
mutex_lock(&nf_hook_mutex);
 
/* Find the spot in the list */
-   while ((p = nf_entry_dereference(*pp)) != NULL) {
+   for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
if (reg->priority < nf_hook_entry_priority(p))
break;
-   pp = &p->next;
}
rcu_assign_pointer(entry->next, p);
rcu_assign_pointer(*pp, entry);
@@ -137,12 +136,11 @@ void nf_unregister_net_hook(struct net *net, const struct 
nf_hook_ops *reg)
return;
 
mutex_lock(&nf_hook_mutex);
-   while ((p = nf_entry_dereference(*pp)) != NULL) {
+   for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) {
if (nf_hook_entry_ops(p) == reg) {
rcu_assign_pointer(*pp, p->next);
break;
}
-   pp = &p->next;
}
mutex_unlock(&nf_hook_mutex);
if (!p) {
-- 
2.1.4



[PATCH 41/50] netfilter: nft_objref: support for stateful object maps

2016-12-07 Thread Pablo Neira Ayuso
This patch allows us to refer to stateful object dictionaries: the
source register indicates the key data used to look up the corresponding
stateful object. We can refer to these maps by name or, alternatively, by
the map transaction id, which allows us to refer to both anonymous and
named maps.

Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_tables.h |   6 ++
 net/netfilter/nf_tables_api.c|   4 ++
 net/netfilter/nft_objref.c   | 116 ++-
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index a6b52dbff08c..881d49e94569 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1153,11 +1153,17 @@ enum nft_fwd_attributes {
  *
  * @NFTA_OBJREF_IMM_TYPE: object type for immediate reference (NLA_U32: 
nft_register)
  * @NFTA_OBJREF_IMM_NAME: object name for immediate reference (NLA_STRING)
+ * @NFTA_OBJREF_SET_SREG: source register of the data to look for (NLA_U32: 
nft_registers)
+ * @NFTA_OBJREF_SET_NAME: name of the set where to look for (NLA_STRING)
+ * @NFTA_OBJREF_SET_ID: id of the set where to look for in this transaction 
(NLA_U32)
  */
 enum nft_objref_attributes {
NFTA_OBJREF_UNSPEC,
NFTA_OBJREF_IMM_TYPE,
NFTA_OBJREF_IMM_NAME,
+   NFTA_OBJREF_SET_SREG,
+   NFTA_OBJREF_SET_NAME,
+   NFTA_OBJREF_SET_ID,
__NFTA_OBJREF_MAX
 };
 #define NFTA_OBJREF_MAX(__NFTA_OBJREF_MAX - 1)
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 8228714c42d5..b4db5bf4c135 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2504,6 +2504,7 @@ struct nft_set *nf_tables_set_lookup(const struct 
nft_table *table,
}
return ERR_PTR(-ENOENT);
 }
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup);
 
 struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
  const struct nlattr *nla,
@@ -2522,6 +2523,7 @@ struct nft_set *nf_tables_set_lookup_byid(const struct 
net *net,
}
return ERR_PTR(-ENOENT);
 }
+EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid);
 
 static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
const char *name)
@@ -3124,6 +3126,7 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct 
nft_set *set,
list_add_tail_rcu(&binding->list, &set->bindings);
return 0;
 }
+EXPORT_SYMBOL_GPL(nf_tables_bind_set);
 
 void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
  struct nft_set_binding *binding)
@@ -3134,6 +3137,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, 
struct nft_set *set,
nft_is_active(ctx->net, set))
nf_tables_set_destroy(ctx, set);
 }
+EXPORT_SYMBOL_GPL(nf_tables_unbind_set);
 
 const struct nft_set_ext_type nft_set_ext_types[] = {
[NFT_SET_EXT_KEY]   = {
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 23820f796aad..415a65ba2b85 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -81,14 +81,128 @@ static const struct nft_expr_ops nft_objref_ops = {
.dump   = nft_objref_dump,
 };
 
+struct nft_objref_map {
+   struct nft_set  *set;
+   enum nft_registers  sreg:8;
+   struct nft_set_binding  binding;
+};
+
+static void nft_objref_map_eval(const struct nft_expr *expr,
+   struct nft_regs *regs,
+   const struct nft_pktinfo *pkt)
+{
+   struct nft_objref_map *priv = nft_expr_priv(expr);
+   const struct nft_set *set = priv->set;
+   const struct nft_set_ext *ext;
+   struct nft_object *obj;
+   bool found;
+
+   found = set->ops->lookup(nft_net(pkt), set, &regs->data[priv->sreg],
+&ext);
+   if (!found) {
+   regs->verdict.code = NFT_BREAK;
+   return;
+   }
+   obj = *nft_set_ext_obj(ext);
+   obj->type->eval(obj, regs, pkt);
+}
+
+static int nft_objref_map_init(const struct nft_ctx *ctx,
+  const struct nft_expr *expr,
+  const struct nlattr * const tb[])
+{
+   struct nft_objref_map *priv = nft_expr_priv(expr);
+   u8 genmask = nft_genmask_next(ctx->net);
+   struct nft_set *set;
+   int err;
+
+   set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], 
genmask);
+   if (IS_ERR(set)) {
+   if (tb[NFTA_OBJREF_SET_ID]) {
+   set = nf_tables_set_lookup_byid(ctx->net,
+   tb[NFTA_OBJREF_SET_ID],
+   genmask);
+   }
+   if (IS_ERR(set))
+  

[PATCH 40/50] netfilter: nf_tables: add stateful object reference to set elements

2016-12-07 Thread Pablo Neira Ayuso
This patch allows you to refer to stateful objects from set elements.
This provides the infrastructure to create maps where the right hand
side of the mapping is a stateful object.

This allows us to build dictionaries of stateful objects that can be
used to perform fast lookups with any arbitrary key combination.

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables.h|  9 
 include/uapi/linux/netfilter/nf_tables.h |  8 
 net/netfilter/nf_tables_api.c| 72 +++-
 3 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/include/net/netfilter/nf_tables.h 
b/include/net/netfilter/nf_tables.h
index ce6fb6e83b32..85f0f03f1e87 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -326,6 +326,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
  * @name: name of the set
  * @ktype: key type (numeric type defined by userspace, not used in the 
kernel)
  * @dtype: data type (verdict or numeric type defined by userspace)
+ * @objtype: object type (see NFT_OBJECT_* definitions)
  * @size: maximum set size
  * @nelems: number of elements
  * @ndeact: number of deactivated elements queued for removal
@@ -347,6 +348,7 @@ struct nft_set {
char name[NFT_SET_MAXNAMELEN];
u32 ktype;
u32 dtype;
+   u32 objtype;
u32 size;
atomic_t nelems;
u32 ndeact;
@@ -416,6 +418,7 @@ void nf_tables_unbind_set(const struct nft_ctx *ctx, struct 
nft_set *set,
  * @NFT_SET_EXT_EXPIRATION: element expiration time
  * @NFT_SET_EXT_USERDATA: user data associated with the element
  * @NFT_SET_EXT_EXPR: expression assiociated with the element
+ * @NFT_SET_EXT_OBJREF: stateful object reference associated with element
  * @NFT_SET_EXT_NUM: number of extension types
  */
 enum nft_set_extensions {
@@ -426,6 +429,7 @@ enum nft_set_extensions {
NFT_SET_EXT_EXPIRATION,
NFT_SET_EXT_USERDATA,
NFT_SET_EXT_EXPR,
+   NFT_SET_EXT_OBJREF,
NFT_SET_EXT_NUM
 };
 
@@ -554,6 +558,11 @@ static inline struct nft_set_ext *nft_set_elem_ext(const 
struct nft_set *set,
return elem + set->ops->elemsize;
 }
 
+static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext 
*ext)
+{
+   return nft_set_ext(ext, NFT_SET_EXT_OBJREF);
+}
+
 void *nft_set_elem_init(const struct nft_set *set,
const struct nft_set_ext_tmpl *tmpl,
const u32 *key, const u32 *data,
diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 4864caca1e8e..a6b52dbff08c 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -255,6 +255,7 @@ enum nft_rule_compat_attributes {
  * @NFT_SET_MAP: set is used as a dictionary
  * @NFT_SET_TIMEOUT: set uses timeouts
  * @NFT_SET_EVAL: set contains expressions for evaluation
+ * @NFT_SET_OBJECT: set contains stateful objects
  */
 enum nft_set_flags {
NFT_SET_ANONYMOUS   = 0x1,
@@ -263,6 +264,7 @@ enum nft_set_flags {
NFT_SET_MAP = 0x8,
NFT_SET_TIMEOUT = 0x10,
NFT_SET_EVAL= 0x20,
+   NFT_SET_OBJECT  = 0x40,
 };
 
 /**
@@ -304,6 +306,7 @@ enum nft_set_desc_attributes {
  * @NFTA_SET_TIMEOUT: default timeout value (NLA_U64)
  * @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
  * @NFTA_SET_USERDATA: user data (NLA_BINARY)
+ * @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
  */
 enum nft_set_attributes {
NFTA_SET_UNSPEC,
@@ -321,6 +324,7 @@ enum nft_set_attributes {
NFTA_SET_GC_INTERVAL,
NFTA_SET_USERDATA,
NFTA_SET_PAD,
+   NFTA_SET_OBJ_TYPE,
__NFTA_SET_MAX
 };
 #define NFTA_SET_MAX   (__NFTA_SET_MAX - 1)
@@ -344,6 +348,7 @@ enum nft_set_elem_flags {
  * @NFTA_SET_ELEM_EXPIRATION: expiration time (NLA_U64)
  * @NFTA_SET_ELEM_USERDATA: user data (NLA_BINARY)
  * @NFTA_SET_ELEM_EXPR: expression (NLA_NESTED: nft_expr_attributes)
+ * @NFTA_SET_ELEM_OBJREF: stateful object reference (NLA_STRING)
  */
 enum nft_set_elem_attributes {
NFTA_SET_ELEM_UNSPEC,
@@ -355,6 +360,7 @@ enum nft_set_elem_attributes {
NFTA_SET_ELEM_USERDATA,
NFTA_SET_ELEM_EXPR,
NFTA_SET_ELEM_PAD,
+   NFTA_SET_ELEM_OBJREF,
__NFTA_SET_ELEM_MAX
 };
 #define NFTA_SET_ELEM_MAX  (__NFTA_SET_ELEM_MAX - 1)
@@ -1207,6 +1213,8 @@ enum nft_fib_flags {
 #define NFT_OBJECT_UNSPEC  0
 #define NFT_OBJECT_COUNTER 1
 #define NFT_OBJECT_QUOTA   2
+#define __NFT_OBJECT_MAX   3
+#define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1)
 
 /**

[PATCH 42/50] netfilter: nf_tables: allow to filter stateful object dumps by type

2016-12-07 Thread Pablo Neira Ayuso
This patch adds the netlink code to filter dumps of stateful objects by
type, through the NFTA_OBJ_TYPE netlink attribute.

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_api.c | 50 +++
 1 file changed, 50 insertions(+)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index b4db5bf4c135..b04d4ee1d533 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4183,12 +4183,18 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, 
struct net *net,
return -1;
 }
 
+struct nft_obj_filter {
char table[NFT_OBJ_MAXNAMELEN];
+   u32 type;
+};
+
 static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
 {
const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
const struct nft_af_info *afi;
const struct nft_table *table;
unsigned int idx = 0, s_idx = cb->args[0];
+   struct nft_obj_filter *filter = cb->data;
struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_object *obj;
@@ -4213,6 +4219,13 @@ static int nf_tables_dump_obj(struct sk_buff *skb, 
struct netlink_callback *cb)
if (idx > s_idx)
memset(&cb->args[1], 0,
   sizeof(cb->args) - 
sizeof(cb->args[0]));
+   if (filter->table[0] &&
+   strcmp(filter->table, table->name))
+   goto cont;
+   if (filter->type != NFT_OBJECT_UNSPEC &&
+   obj->type->type != filter->type)
+   goto cont;
+
if (nf_tables_fill_obj_info(skb, net, 
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NFT_MSG_NEWOBJ,
@@ -4233,6 +4246,31 @@ static int nf_tables_dump_obj(struct sk_buff *skb, 
struct netlink_callback *cb)
return skb->len;
 }
 
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+{
+   kfree(cb->data);
+
+   return 0;
+}
+
+static struct nft_obj_filter *
+nft_obj_filter_alloc(const struct nlattr * const nla[])
+{
+   struct nft_obj_filter *filter;
+
+   filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+   if (!filter)
+   return ERR_PTR(-ENOMEM);
+
+   if (nla[NFTA_OBJ_TABLE])
+   nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE],
+   NFT_TABLE_MAXNAMELEN);
+   if (nla[NFTA_OBJ_TYPE])
+   filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
+
+   return filter;
+}
+
 static int nf_tables_getobj(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
@@ -4251,7 +4289,19 @@ static int nf_tables_getobj(struct net *net, struct sock 
*nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = nf_tables_dump_obj,
+   .done = nf_tables_dump_obj_done,
};
+
+   if (nla[NFTA_OBJ_TABLE] ||
+   nla[NFTA_OBJ_TYPE]) {
+   struct nft_obj_filter *filter;
+
+   filter = nft_obj_filter_alloc(nla);
+   if (IS_ERR(filter))
+   return -ENOMEM;
+
+   c.data = filter;
+   }
return netlink_dump_start(nlsk, skb, nlh, &c);
}
 
-- 
2.1.4



[PATCH 34/50] netfilter: nft_quota: add stateful object type

2016-12-07 Thread Pablo Neira Ayuso
Register a new quota stateful object type into the new stateful object
infrastructure.

Signed-off-by: Pablo Neira Ayuso 
---
 include/uapi/linux/netfilter/nf_tables.h |  1 +
 net/netfilter/nft_quota.c| 96 +++-
 2 files changed, 84 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index e352ef65d753..ad0577ba5d2a 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1187,6 +1187,7 @@ enum nft_fib_flags {
 
 #define NFT_OBJECT_UNSPEC  0
 #define NFT_OBJECT_COUNTER 1
+#define NFT_OBJECT_QUOTA   2
 
 /**
  * enum nft_object_attributes - nf_tables stateful object netlink attributes
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index c00104c07095..09ce72b1d6bf 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -27,12 +27,10 @@ static inline bool nft_overquota(struct nft_quota *priv,
return atomic64_sub_return(pkt->skb->len, &priv->remain) < 0;
 }
 
-static void nft_quota_eval(const struct nft_expr *expr,
-  struct nft_regs *regs,
-  const struct nft_pktinfo *pkt)
+static inline void nft_quota_do_eval(struct nft_quota *priv,
+struct nft_regs *regs,
+const struct nft_pktinfo *pkt)
 {
-   struct nft_quota *priv = nft_expr_priv(expr);
-
if (nft_overquota(priv, pkt) ^ priv->invert)
regs->verdict.code = NFT_BREAK;
 }
@@ -42,11 +40,18 @@ static const struct nla_policy 
nft_quota_policy[NFTA_QUOTA_MAX + 1] = {
[NFTA_QUOTA_FLAGS]  = { .type = NLA_U32 },
 };
 
-static int nft_quota_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
+static void nft_quota_obj_eval(struct nft_object *obj,
+  struct nft_regs *regs,
+  const struct nft_pktinfo *pkt)
+{
+   struct nft_quota *priv = nft_obj_data(obj);
+
+   nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_do_init(const struct nlattr * const tb[],
+struct nft_quota *priv)
 {
-   struct nft_quota *priv = nft_expr_priv(expr);
u32 flags = 0;
u64 quota;
 
@@ -70,9 +75,16 @@ static int nft_quota_init(const struct nft_ctx *ctx,
return 0;
 }
 
-static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_quota_obj_init(const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+   struct nft_quota *priv = nft_obj_data(obj);
+
+   return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_do_dump(struct sk_buff *skb, const struct nft_quota *priv)
 {
-   const struct nft_quota *priv = nft_expr_priv(expr);
u32 flags = priv->invert ? NFT_QUOTA_F_INV : 0;
 
if (nla_put_be64(skb, NFTA_QUOTA_BYTES, cpu_to_be64(priv->quota),
@@ -85,6 +97,49 @@ static int nft_quota_dump(struct sk_buff *skb, const struct 
nft_expr *expr)
return -1;
 }
 
+static int nft_quota_obj_dump(struct sk_buff *skb, const struct nft_object 
*obj)
+{
+   struct nft_quota *priv = nft_obj_data(obj);
+
+   return nft_quota_do_dump(skb, priv);
+}
+
+static struct nft_object_type nft_quota_obj __read_mostly = {
+   .type   = NFT_OBJECT_QUOTA,
+   .size   = sizeof(struct nft_quota),
+   .maxattr= NFTA_QUOTA_MAX,
+   .policy = nft_quota_policy,
+   .init   = nft_quota_obj_init,
+   .eval   = nft_quota_obj_eval,
+   .dump   = nft_quota_obj_dump,
+   .owner  = THIS_MODULE,
+};
+
+static void nft_quota_eval(const struct nft_expr *expr,
+  struct nft_regs *regs,
+  const struct nft_pktinfo *pkt)
+{
+   struct nft_quota *priv = nft_expr_priv(expr);
+
+   nft_quota_do_eval(priv, regs, pkt);
+}
+
+static int nft_quota_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+   struct nft_quota *priv = nft_expr_priv(expr);
+
+   return nft_quota_do_init(tb, priv);
+}
+
+static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+   const struct nft_quota *priv = nft_expr_priv(expr);
+
+   return nft_quota_do_dump(skb, priv);
+}
+
 static struct nft_expr_type nft_quota_type;
 static const struct nft_expr_ops nft_quota_ops = {
.type   = _quota_type,
@@ -105,12 +160,26 @@ static struct nft_expr_type nft_quota_type __read_mostly 
= {
 
 static int __init nft_quota_module_init(void)
 {
-return nft_register_expr(&nft_quota_type);
+   int err;
+
+   err = nft_register_obj(&nft_quota_obj);
+   if (err 

[PATCH 28/50] netfilter: nft_payload: layer 4 checksum adjustment for pseudoheader fields

2016-12-07 Thread Pablo Neira Ayuso
This patch adds a new flag that signals the kernel to update the layer 4
checksum if the mangled packet field belongs to the layer 4 pseudoheader.
This implicitly provides stateless 1:1 NAT, which is useful in very
specific use cases.

Since rules mangling layer 3 fields that are part of the pseudoheader may
potentially convey any layer 4 packet, we have to deal with the layer 4
checksum adjustment using protocol-specific code.

This patch adds support for TCP, UDP and ICMPv6, since they include the
pseudoheader in the layer 4 checksum calculation. ICMP doesn't, so we can
skip it.
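
For reference, the fixup that such a pseudoheader change requires is the
classic RFC 1624 incremental checksum update. A minimal userspace sketch
(toy data, host-order 16-bit words, no skb handling; csum_full() and
csum_replace16() are illustrative names, not kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator into a 16-bit one's-complement sum. */
static uint16_t csum_fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

/* Full checksum over 16-bit words (host order throughout this demo). */
static uint16_t csum_full(const uint16_t *words, int n)
{
        uint32_t sum = 0;

        for (int i = 0; i < n; i++)
                sum += words[i];
        return (uint16_t)~csum_fold(sum);
}

/* RFC 1624 incremental update: word 'from' was replaced by 'to'. */
static uint16_t csum_replace16(uint16_t check, uint16_t from, uint16_t to)
{
        uint32_t sum = (uint16_t)~check;

        sum += (uint16_t)~from;
        sum += to;
        return (uint16_t)~csum_fold(sum);
}

int main(void)
{
        /* Toy "pseudoheader + segment": saddr, daddr, ports, payload word. */
        uint16_t words[] = { 0xc0a8, 0x0001, 0xc0a8, 0x0002,
                             0x1234, 0x0050, 0xbeef };
        uint16_t check = csum_full(words, 7);

        /* Rewrite the low half of saddr: 192.168.0.1 -> 192.168.0.9. */
        uint16_t incr = csum_replace16(check, words[1], 0x0009);

        words[1] = 0x0009;
        printf("recomputed=%#06x incremental=%#06x\n",
               csum_full(words, 7), incr);        /* both 0xad2f */
        return 0;
}

Replacing a 32-bit address amounts to two such 16-bit updates; the actual
code below additionally handles UDP's optional checksum and the
CSUM_MANGLED_0 case.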

Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_tables_core.h   |   1 +
 include/uapi/linux/netfilter/nf_tables.h |   6 ++
 net/netfilter/nft_payload.c  | 107 +--
 3 files changed, 109 insertions(+), 5 deletions(-)

diff --git a/include/net/netfilter/nf_tables_core.h 
b/include/net/netfilter/nf_tables_core.h
index 862373d4ea9d..8f690effec37 100644
--- a/include/net/netfilter/nf_tables_core.h
+++ b/include/net/netfilter/nf_tables_core.h
@@ -45,6 +45,7 @@ struct nft_payload_set {
enum nft_registers  sreg:8;
u8  csum_type;
u8  csum_offset;
+   u8  csum_flags;
 };
 
 extern const struct nft_expr_ops nft_payload_fast_ops;
diff --git a/include/uapi/linux/netfilter/nf_tables.h 
b/include/uapi/linux/netfilter/nf_tables.h
index 14e5f619167e..f030e59aa2ec 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -659,6 +659,10 @@ enum nft_payload_csum_types {
NFT_PAYLOAD_CSUM_INET,
 };
 
+enum nft_payload_csum_flags {
+   NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0),
+};
+
 /**
  * enum nft_payload_attributes - nf_tables payload expression netlink 
attributes
  *
@@ -669,6 +673,7 @@ enum nft_payload_csum_types {
  * @NFTA_PAYLOAD_SREG: source register to load data from (NLA_U32: 
nft_registers)
  * @NFTA_PAYLOAD_CSUM_TYPE: checksum type (NLA_U32)
  * @NFTA_PAYLOAD_CSUM_OFFSET: checksum offset relative to base (NLA_U32)
+ * @NFTA_PAYLOAD_CSUM_FLAGS: checksum flags (NLA_U32)
  */
 enum nft_payload_attributes {
NFTA_PAYLOAD_UNSPEC,
@@ -679,6 +684,7 @@ enum nft_payload_attributes {
NFTA_PAYLOAD_SREG,
NFTA_PAYLOAD_CSUM_TYPE,
NFTA_PAYLOAD_CSUM_OFFSET,
+   NFTA_PAYLOAD_CSUM_FLAGS,
__NFTA_PAYLOAD_MAX
 };
 #define NFTA_PAYLOAD_MAX   (__NFTA_PAYLOAD_MAX - 1)
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 98fb5d7b8087..36d2b1096546 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008-2009 Patrick McHardy 
+ * Copyright (c) 2016 Pablo Neira Ayuso 
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -17,6 +18,10 @@
 #include 
 #include 
 #include 
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmpv6.h>
 
 /* add vlan header into the user buffer for if tag was removed by offloads */
 static bool
@@ -164,6 +169,87 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.dump   = nft_payload_dump,
 };
 
+static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
+{
+   *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+   if (*sum == 0)
+   *sum = CSUM_MANGLED_0;
+}
+
+static bool nft_payload_udp_checksum(struct sk_buff *skb, unsigned int thoff)
+{
+   struct udphdr *uh, _uh;
+
+   uh = skb_header_pointer(skb, thoff, sizeof(_uh), &_uh);
+   if (!uh)
+   return false;
+
+   return uh->check;
+}
+
+static int nft_payload_l4csum_offset(const struct nft_pktinfo *pkt,
+struct sk_buff *skb,
+unsigned int *l4csum_offset)
+{
+   switch (pkt->tprot) {
+   case IPPROTO_TCP:
+   *l4csum_offset = offsetof(struct tcphdr, check);
+   break;
+   case IPPROTO_UDP:
+   if (!nft_payload_udp_checksum(skb, pkt->xt.thoff))
+   return -1;
+   /* Fall through. */
+   case IPPROTO_UDPLITE:
+   *l4csum_offset = offsetof(struct udphdr, check);
+   break;
+   case IPPROTO_ICMPV6:
+   *l4csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+   break;
+   default:
+   return -1;
+   }
+
+   *l4csum_offset += pkt->xt.thoff;
+   return 0;
+}
+
+static int nft_payload_l4csum_update(const struct nft_pktinfo *pkt,
+struct sk_buff *skb,
+__wsum fsum, __wsum tsum)
+{
+   int l4csum_offset;
+   __sum16 sum;
+
+   /* If we cannot determine layer 4 checksum offset or this packet 
