Re: bnx2x: kernel panic in the bnx2x driver

2018-06-22 Thread Vishwanath Pai
The patch below worked for me (on 4.14.51 LTS kernel):

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 1e33abd..2b3863a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3387,14 +3387,20 @@ static int bnx2x_set_rss_flags(struct bnx2x *bp, struct 
ethtool_rxnfc *info)
DP(BNX2X_MSG_ETHTOOL,
   "rss re-configured, UDP 4-tupple %s\n",
   udp_rss_requested ? "enabled" : "disabled");
-   return bnx2x_rss(bp, >rss_conf_obj, false, true);
+   if (bp->state == BNX2X_STATE_OPEN)
+   return bnx2x_rss(bp, >rss_conf_obj, false, 
true);
+   else
+   return 0;
} else if ((info->flow_type == UDP_V6_FLOW) &&
   (bp->rss_conf_obj.udp_rss_v6 != udp_rss_requested)) {
bp->rss_conf_obj.udp_rss_v6 = udp_rss_requested;
DP(BNX2X_MSG_ETHTOOL,
   "rss re-configured, UDP 4-tupple %s\n",
   udp_rss_requested ? "enabled" : "disabled");
-   return bnx2x_rss(bp, >rss_conf_obj, false, true);
+   if (bp->state == BNX2X_STATE_OPEN)
+   return bnx2x_rss(bp, >rss_conf_obj, false, 
true);
+   else
+   return 0;
}
return 0;
 
Although I think there might be another place where we may need to fix this as 
well:
bnx2x_config_rss_eth()

Thanks,
Vishwanath

On 06/22/2018 10:57 AM, Vishwanath Pai wrote:
> Ah, that is great! I will test it out on my machine and let you know.
>
> Thanks,
> Vishwanath
>
> On 06/22/2018 10:21 AM, Kalluru, Sudarsana wrote:
>> Hi Vishwanath,
>> The config will be cached in the device structure 
>> (bp->rss_conf_obj.udp_rss_v4) in this scenario, and will be applied in the 
>> load path (bnx2x_nic_load() --> bnx2x_init_rss()). Have unit tested the 
>> change on my setup.
>>
>> Thanks,
>> Sudarsana
>>
>> -Original Message-
>> From: Vishwanath Pai [mailto:v...@akamai.com] 
>> Sent: 22 June 2018 18:52
>> To: Kalluru, Sudarsana ; Elior, Ariel 
>> ; Dept-Eng Everest Linux L2 
>> 
>> Cc: da...@davemloft.net; netdev@vger.kernel.org; dbane...@akamai.com; 
>> pai.vishw...@gmail.com
>> Subject: Re: bnx2x: kernel panic in the bnx2x driver
>>
>> Hi Sudarsana,
>>
>> Thanks for taking a look at my email. The fix you suggested would definitely 
>> fix the kernel panic, but at the same time wouldn't it also silently ignore 
>> the request by ethtool to set rx-flow-hash?
>>
>> Thanks,
>> Vishwanath
>>
>> On 06/22/2018 06:20 AM, Kalluru, Sudarsana wrote:
>>> Hi Vishwanath,
>>> Thanks for your mail, and the analysis.
>>> The fix would be to invoke bnx2x_rss() only when the device is opened,
>>>   if (bp->state == BNX2X_STATE_OPEN)
>>>   return bnx2x_rss(bp, >rss_conf_obj, false, true);
>>>   else
>>>   return 0;
>>> Ariel,
>>>Could you please review the path (bnx2x_set_rss_flags()--> bnx2x_rss()) 
>>> and confirm/correct on the above.
>>>
>>> Thanks,
>>> Sudarsana
>>>
>>> -Original Message-
>>> From: Vishwanath Pai [mailto:v...@akamai.com]
>>> Sent: 22 June 2018 10:37
>>> To: Elior, Ariel ; Dept-Eng Everest Linux L2 
>>> 
>>> Cc: da...@davemloft.net; netdev@vger.kernel.org; dbane...@akamai.com; 
>>> pai.vishw...@gmail.com
>>> Subject: bnx2x: kernel panic in the bnx2x driver
>>>
>>> External Email
>>>
>>> Hi,
>>>
>>> We recently noticed a kernel panic in the bnx2x driver when trying to 
>>> set rx-flow-hash parameters via ethtool during if-pre-up.d. I am 
>>> running kernel
>>> v4.17.2 from ubuntu-mainline-ppa. I have added the stack trace below:
>>>
>>> [   18.280209] BUG: unable to handle kernel NULL pointer dereference at 
>>> 
>>> [   18.280212] PGD 800407a79067 P4D 800407a79067 PUD 40ce8a067 PMD 0
>>> [   18.280214] Oops: 0010 [#1] SMP PTI
>>> [   18.280215] Modules linked in: intel_rapl x86_pkg_temp_thermal 
>>> intel_powerclamp kvm_intel joydev input_led kvm irqbypass crct10dif_pclmul 
>>> cr

Re: bnx2x: kernel panic in the bnx2x driver

2018-06-22 Thread Vishwanath Pai
Ah, that is great! I will test it out on my machine and let you know.

Thanks,
Vishwanath

On 06/22/2018 10:21 AM, Kalluru, Sudarsana wrote:
> Hi Vishwanath,
> The config will be cached in the device structure 
> (bp->rss_conf_obj.udp_rss_v4) in this scenario, and will be applied in the 
> load path (bnx2x_nic_load() --> bnx2x_init_rss()). Have unit tested the 
> change on my setup.
>
> Thanks,
> Sudarsana
>
> -Original Message-
> From: Vishwanath Pai [mailto:v...@akamai.com] 
> Sent: 22 June 2018 18:52
> To: Kalluru, Sudarsana ; Elior, Ariel 
> ; Dept-Eng Everest Linux L2 
> 
> Cc: da...@davemloft.net; netdev@vger.kernel.org; dbane...@akamai.com; 
> pai.vishw...@gmail.com
> Subject: Re: bnx2x: kernel panic in the bnx2x driver
>
> Hi Sudarsana,
>
> Thanks for taking a look at my email. The fix you suggested would definitely 
> fix the kernel panic, but at the same time wouldn't it also silently ignore 
> the request by ethtool to set rx-flow-hash?
>
> Thanks,
> Vishwanath
>
> On 06/22/2018 06:20 AM, Kalluru, Sudarsana wrote:
>> Hi Vishwanath,
>> Thanks for your mail, and the analysis.
>> The fix would be to invoke bnx2x_rss() only when the device is opened,
>>   if (bp->state == BNX2X_STATE_OPEN)
>>   return bnx2x_rss(bp, >rss_conf_obj, false, true);
>>   else
>>   return 0;
>> Ariel,
>>Could you please review the path (bnx2x_set_rss_flags()--> bnx2x_rss()) 
>> and confirm/correct on the above.
>>
>> Thanks,
>> Sudarsana
>>
>> -Original Message-
>> From: Vishwanath Pai [mailto:v...@akamai.com]
>> Sent: 22 June 2018 10:37
>> To: Elior, Ariel ; Dept-Eng Everest Linux L2 
>> 
>> Cc: da...@davemloft.net; netdev@vger.kernel.org; dbane...@akamai.com; 
>> pai.vishw...@gmail.com
>> Subject: bnx2x: kernel panic in the bnx2x driver
>>
>> External Email
>>
>> Hi,
>>
>> We recently noticed a kernel panic in the bnx2x driver when trying to 
>> set rx-flow-hash parameters via ethtool during if-pre-up.d. I am 
>> running kernel
>> v4.17.2 from ubuntu-mainline-ppa. I have added the stack trace below:
>>
>> [   18.280209] BUG: unable to handle kernel NULL pointer dereference at 
>> 
>> [   18.280212] PGD 800407a79067 P4D 800407a79067 PUD 40ce8a067 PMD 0
>> [   18.280214] Oops: 0010 [#1] SMP PTI
>> [   18.280215] Modules linked in: intel_rapl x86_pkg_temp_thermal 
>> intel_powerclamp kvm_intel joydev input_led kvm irqbypass crct10dif_pclmul 
>> crc32_pclmul ghash_clmulni_intel pcbc hid_eneric aesni_intel gpio_ich 
>> aes_x86_64 usbhid lpc_ich crpto_simd ie31200_edac cryptd glue_helper 
>> intel_cstate mac_hid intel_rapl_perf bnx2x mdio tcp_bbr netconsole 
>> ipmi_devintf ipmi_msghandler i2c_i801 coretemp autofs4 raid10 raid456 
>> libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx 
>> raid6_pq raid1 raid0 multipath linear sha26_mb mcryptd sha256_ssse3 hid ast 
>> i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mpt3sas 
>> fb_sys_fops drm raid_class scsi_transport_sas ahci libahci shpchp video
>> [   18.280241] CPU: 6 PID: 1081 Comm: ethtool Not tainted 
>> 4.17.2-041702-generic #201806160433
>> [   18.280242] Hardware name: Foxconn CangJie/CangJie, BIOS CC1F108D 
>> 02/26/2014
>> [   18.280243] RIP: 0010:  (null)
>> [   18.280243] RSP: 0018:b84bc260b9c0 EFLAGS: 00010246
>> [   18.280244] RAX:  RBX: 92f987f020f0 RCX: 
>> 
>> [   18.280245] RDX:  RSI: b84bc260b9f8 RDI: 
>> 92f987f020f0
>> [   18.280245] RBP: b8bc260b9e8 R08: 0001 R09: 
>> 
>> [   18.280246] R10: b84bc260bd20 R11:  R12: 
>> b84bc260b9f8
>> [   18.280246] R13: 92f987f008c0 R14: 7ffdb75bec40 R15: 
>> 
>> [   18.280247] FS:  7fc0e8798700() GS:92f99fd8() 
>> knlGS:
>> [   18.280248] CS:  0010 DS:  ES:  CR0: 80050033
>> [   18.280249] CR2:  CR3: 000409b4c003 CR4: 
>> 001606e0
>> [   18.280249] Call Trace:
>> [   18.280263]  ? bnx2x_config_rss+0x2f/0xd0 [bnx2x]
>> [   18.280270]  bnx2x_rss+0x1d9/0x210 [bnx2x]
>> [   18.280276]  bnx2x_set_rxnfc+0x17d/0x380 [bnx2x]
>> [   18.280279]  ethtool_set_rxnfc+0x9b/0x110
>> [   18.280281]  ? __do_page_cache_readahead+0x1da/0x2c0
>> [   18.280283]  ? security_capable+0x3c/0x60
>> [   18.280284]  dev_ethtool+0350/0x2610
>> [   

Re: bnx2x: kernel panic in the bnx2x driver

2018-06-22 Thread Vishwanath Pai
Hi Sudarsana,

Thanks for taking a look at my email. The fix you suggested would
definitely fix the kernel panic, but at the same time wouldn't it also
silently ignore the request by ethtool to set rx-flow-hash?

Thanks,
Vishwanath

On 06/22/2018 06:20 AM, Kalluru, Sudarsana wrote:
> Hi Vishwanath,
> Thanks for your mail, and the analysis.
> The fix would be to invoke bnx2x_rss() only when the device is opened,
>   if (bp->state == BNX2X_STATE_OPEN)
>   return bnx2x_rss(bp, >rss_conf_obj, false, true);
>   else
>   return 0;
> Ariel,
>Could you please review the path (bnx2x_set_rss_flags()--> bnx2x_rss()) 
> and confirm/correct on the above.
> 
> Thanks,
> Sudarsana
> 
> -Original Message-
> From: Vishwanath Pai [mailto:v...@akamai.com] 
> Sent: 22 June 2018 10:37
> To: Elior, Ariel ; Dept-Eng Everest Linux L2 
> 
> Cc: da...@davemloft.net; netdev@vger.kernel.org; dbane...@akamai.com; 
> pai.vishw...@gmail.com
> Subject: bnx2x: kernel panic in the bnx2x driver
> 
> External Email
> 
> Hi,
> 
> We recently noticed a kernel panic in the bnx2x driver when trying to set 
> rx-flow-hash parameters via ethtool during if-pre-up.d. I am running kernel
> v4.17.2 from ubuntu-mainline-ppa. I have added the stack trace below:
> 
> [   18.280209] BUG: unable to handle kernel NULL pointer dereference at 
> 
> [   18.280212] PGD 800407a79067 P4D 800407a79067 PUD 40ce8a067 PMD 0
> [   18.280214] Oops: 0010 [#1] SMP PTI
> [   18.280215] Modules linked in: intel_rapl x86_pkg_temp_thermal 
> intel_powerclamp kvm_intel joydev input_led kvm irqbypass crct10dif_pclmul 
> crc32_pclmul ghash_clmulni_intel pcbc hid_eneric aesni_intel gpio_ich 
> aes_x86_64 usbhid lpc_ich crpto_simd ie31200_edac cryptd glue_helper 
> intel_cstate mac_hid intel_rapl_perf bnx2x mdio tcp_bbr netconsole 
> ipmi_devintf ipmi_msghandler i2c_i801 coretemp autofs4 raid10 raid456 
> libcrc32c async_raid6_recov async_memcpy async_pq async_xor xor async_tx 
> raid6_pq raid1 raid0 multipath linear sha26_mb mcryptd sha256_ssse3 hid ast 
> i2c_algo_bit ttm drm_kms_helper syscopyarea sysfillrect sysimgblt mpt3sas 
> fb_sys_fops drm raid_class scsi_transport_sas ahci libahci shpchp video
> [   18.280241] CPU: 6 PID: 1081 Comm: ethtool Not tainted 
> 4.17.2-041702-generic #201806160433
> [   18.280242] Hardware name: Foxconn CangJie/CangJie, BIOS CC1F108D 
> 02/26/2014
> [   18.280243] RIP: 0010:  (null)
> [   18.280243] RSP: 0018:b84bc260b9c0 EFLAGS: 00010246
> [   18.280244] RAX:  RBX: 92f987f020f0 RCX: 
> 
> [   18.280245] RDX:  RSI: b84bc260b9f8 RDI: 
> 92f987f020f0
> [   18.280245] RBP: b8bc260b9e8 R08: 0001 R09: 
> 
> [   18.280246] R10: b84bc260bd20 R11:  R12: 
> b84bc260b9f8
> [   18.280246] R13: 92f987f008c0 R14: 7ffdb75bec40 R15: 
> 
> [   18.280247] FS:  7fc0e8798700() GS:92f99fd8() 
> knlGS:
> [   18.280248] CS:  0010 DS:  ES:  CR0: 80050033
> [   18.280249] CR2:  CR3: 000409b4c003 CR4: 
> 001606e0
> [   18.280249] Call Trace:
> [   18.280263]  ? bnx2x_config_rss+0x2f/0xd0 [bnx2x]
> [   18.280270]  bnx2x_rss+0x1d9/0x210 [bnx2x]
> [   18.280276]  bnx2x_set_rxnfc+0x17d/0x380 [bnx2x]
> [   18.280279]  ethtool_set_rxnfc+0x9b/0x110
> [   18.280281]  ? __do_page_cache_readahead+0x1da/0x2c0
> [   18.280283]  ? security_capable+0x3c/0x60
> [   18.280284]  dev_ethtool+0350/0x2610
> [   18.280286]  ? page_cache_async_readahead+0x71/0x80
> [   18.280288]  ? page_add_file_rmap+0x5d/0x220
> [   18.280290]  ? inet_ioctl+0x182/0x1a0
> [   18.280291]  dev_ioctl+0x203/0x3f0
> [   18.280293]  ? dev_ioctl+0x203/0x3f0
> [   18.280294]  sock_do_ioctl+0xae/0x150
> [   18.280296]  sock_ioctl+0x1e2/0x330
> [   18.280296]  ? sock_ioctl+0x1e2/0x330
> [   18.280299]  do_vfs_ioctl+0xa8/0x620
> [   18.280300]  ? dlci_ioctl_set+0x30/0x30
> [   18.280301]  ? do_vfs_ioctl+0xa8/0x620
> [   18.280302]  ? handle_mm_fault+0xe3/0x220
> [   18.280304]  ksys_ioctl+0x75/0x80
> [   18.280305]  __x64_sys_ioctl+0x1a/0x20
> [   18.280307]  do_syscall_64+0x5a/0x120
> [   18.280309]  entry_SYSCALL_64_aftr_hwframe+0x44/0xa9
> [   18.280310] RIP: 0033:0x7fc0e7fba107
> [   18.280311] RSP: 002b:7ffdb75beb78 EFLAGS: 0206 ORIG_RAX: 
> 0010
> [   18.280312] RAX: ffda RBX:  RCX: 
> 7fc0e7fba107
> [   18.280312] RDX: 7ffdb75bed60 RSI: 8946 RDI: 
> 0003
> [   18.280313] RBP: 7ffdb75bed50 R08: 7ffdb75bed60 R09: 
> 00

bnx2x: kernel panic in the bnx2x driver

2018-06-21 Thread Vishwanath Pai
Hi,

We recently noticed a kernel panic in the bnx2x driver when trying to set
rx-flow-hash parameters via ethtool during if-pre-up.d. I am running kernel
v4.17.2 from ubuntu-mainline-ppa. I have added the stack trace below:

[   18.280209] BUG: unable to handle kernel NULL pointer dereference at 

[   18.280212] PGD 800407a79067 P4D 800407a79067 PUD 40ce8a067 PMD 0
[   18.280214] Oops: 0010 [#1] SMP PTI
[   18.280215] Modules linked in: intel_rapl x86_pkg_temp_thermal 
intel_powerclamp kvm_intel joydev input_led kvm irqbypass crct10dif_pclmul 
crc32_pclmul ghash_clmulni_intel pcbc hid_eneric aesni_intel gpio_ich 
aes_x86_64 usbhid lpc_ich crpto_simd ie31200_edac cryptd glue_helper 
intel_cstate mac_hid intel_rapl_perf bnx2x mdio tcp_bbr netconsole ipmi_devintf 
ipmi_msghandler i2c_i801 coretemp autofs4 raid10 raid456 libcrc32c 
async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq raid1 
raid0 multipath linear sha26_mb mcryptd sha256_ssse3 hid ast i2c_algo_bit ttm 
drm_kms_helper syscopyarea sysfillrect sysimgblt mpt3sas fb_sys_fops drm 
raid_class scsi_transport_sas ahci libahci shpchp video
[   18.280241] CPU: 6 PID: 1081 Comm: ethtool Not tainted 4.17.2-041702-generic 
#201806160433
[   18.280242] Hardware name: Foxconn CangJie/CangJie, BIOS CC1F108D 02/26/2014
[   18.280243] RIP: 0010:  (null)
[   18.280243] RSP: 0018:b84bc260b9c0 EFLAGS: 00010246
[   18.280244] RAX:  RBX: 92f987f020f0 RCX: 
[   18.280245] RDX:  RSI: b84bc260b9f8 RDI: 92f987f020f0
[   18.280245] RBP: b8bc260b9e8 R08: 0001 R09: 
[   18.280246] R10: b84bc260bd20 R11:  R12: b84bc260b9f8
[   18.280246] R13: 92f987f008c0 R14: 7ffdb75bec40 R15: 
[   18.280247] FS:  7fc0e8798700() GS:92f99fd8() 
knlGS:
[   18.280248] CS:  0010 DS:  ES:  CR0: 80050033
[   18.280249] CR2:  CR3: 000409b4c003 CR4: 001606e0
[   18.280249] Call Trace:
[   18.280263]  ? bnx2x_config_rss+0x2f/0xd0 [bnx2x]
[   18.280270]  bnx2x_rss+0x1d9/0x210 [bnx2x]
[   18.280276]  bnx2x_set_rxnfc+0x17d/0x380 [bnx2x]
[   18.280279]  ethtool_set_rxnfc+0x9b/0x110
[   18.280281]  ? __do_page_cache_readahead+0x1da/0x2c0
[   18.280283]  ? security_capable+0x3c/0x60
[   18.280284]  dev_ethtool+0350/0x2610
[   18.280286]  ? page_cache_async_readahead+0x71/0x80
[   18.280288]  ? page_add_file_rmap+0x5d/0x220
[   18.280290]  ? inet_ioctl+0x182/0x1a0
[   18.280291]  dev_ioctl+0x203/0x3f0
[   18.280293]  ? dev_ioctl+0x203/0x3f0
[   18.280294]  sock_do_ioctl+0xae/0x150
[   18.280296]  sock_ioctl+0x1e2/0x330
[   18.280296]  ? sock_ioctl+0x1e2/0x330
[   18.280299]  do_vfs_ioctl+0xa8/0x620
[   18.280300]  ? dlci_ioctl_set+0x30/0x30
[   18.280301]  ? do_vfs_ioctl+0xa8/0x620
[   18.280302]  ? handle_mm_fault+0xe3/0x220
[   18.280304]  ksys_ioctl+0x75/0x80
[   18.280305]  __x64_sys_ioctl+0x1a/0x20
[   18.280307]  do_syscall_64+0x5a/0x120
[   18.280309]  entry_SYSCALL_64_aftr_hwframe+0x44/0xa9
[   18.280310] RIP: 0033:0x7fc0e7fba107
[   18.280311] RSP: 002b:7ffdb75beb78 EFLAGS: 0206 ORIG_RAX: 
0010
[   18.280312] RAX: ffda RBX:  RCX: 7fc0e7fba107
[   18.280312] RDX: 7ffdb75bed60 RSI: 8946 RDI: 0003
[   18.280313] RBP: 7ffdb75bed50 R08: 7ffdb75bed60 R09: 0001
[   18.280313] R10: 0541 R11: 0206 R12: 7ffdb75beed0
[   18.280314] R13: 00421020 R14: 0041fe28 R15: 0003
[   18.280315] Code:  Bad RIP value.
[   18.280317] RIP:   (null) RSP: b84bc260b9c0
[  18.280318] CR2: 
[   18.280319] ---[ end trace 5f361db3fb9059f1 ]---

To reproduce this I created a bash script in "/etc/network/if-pre-up.d/" with
these two lines:
ethtool -N $IFACE rx-flow-hash udp4 "sdfn"
ethtool -N $IFACE rx-flow-hash udp6 "sdfn"

The problem here is that rss_obj in bnx2x struct for the device hasn't been
initialized yet, which causes an exception in bnx2x_config_rss() when calling
"r->set_pending(r)" because r->set_pending is NULL. It looks like a lot many
things haven't been initialized at this point, most of that happens in this
function: "bnx2x_init_bp_objs()" which isn't called until ifup. Any thoughts on
how this can be fixed? Would it be possible to safely move bnx2x_init_bp_objs()
to maybe bnx2x_init_one() which runs much before ifup?

Thanks,
Vishwanath


Re: [PATCH net] netfilter: xt_hashlimit: fix lock imbalance

2018-02-14 Thread Vishwanath Pai
On 02/12/2018 11:11 AM, Eric Dumazet wrote:
> From: Eric Dumazet <eduma...@google.com>
> 
> syszkaller found that rcu was not held in hashlimit_mt_common()
> 
> We only need to enable BH at this point.
> 
> Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
> Signed-off-by: Eric Dumazet <eduma...@google.com>
> Reported-by: syzkaller <syzkal...@googlegroups.com>
> ---
>  net/netfilter/xt_hashlimit.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
> index 
> ca6847403ca218c478d208080aab3e3c92a0a615..1fea48b4182f7cd5669b9c5e7100b2735cc590e7
>  100644
> --- a/net/netfilter/xt_hashlimit.c
> +++ b/net/netfilter/xt_hashlimit.c
> @@ -774,7 +774,7 @@ hashlimit_mt_common(const struct sk_buff *skb, struct 
> xt_action_param *par,
>   if (!dh->rateinfo.prev_window &&
>   (dh->rateinfo.current_rate <= dh->rateinfo.burst)) {
>   spin_unlock(>lock);
> - rcu_read_unlock_bh();
> + local_bh_enable();
>   return !(cfg->mode & XT_HASHLIMIT_INVERT);
>   } else {
>   goto overlimit;
> 

Thanks for fixing this.

Acked-by: Vishwanath Pai <v...@akamai.com>

-Vishwanath


[PATCH v2 net-next] net: display hw address of source machine during ipv6 DAD failure

2017-10-30 Thread Vishwanath Pai
This patch updates the error messages displayed in kernel log to include
hwaddress of the source machine that caused ipv6 duplicate address
detection failures.

Examples:

a) When we receive a NA packet from another machine advertising our
address:

ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2001:db8:: on eth0!

b) When we detect DAD failure during address assignment to an interface:

IPv6: eth0: IPv6 duplicate address 2001:db8:: used by 34:ab:cd:56:11:e8
detected!

v2:
Changed %pI6 to %pI6c in ndisc_recv_na()
Chaged the v6 address in the commit message to 2001:db8::

Suggested-by: Igor Lubashev <iluba...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 include/net/addrconf.h | 2 +-
 net/ipv6/addrconf.c| 6 +++---
 net/ipv6/ndisc.c   | 9 +
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 15b5ffd..2a616ea 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -208,7 +208,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr 
*mc_addr,
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
 int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed);
-void addrconf_dad_failure(struct inet6_ifaddr *ifp);
+void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);
 
 bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 const struct in6_addr *src_addr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5a8a102..cfa374c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1987,7 +1987,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
return err;
 }
 
-void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 {
struct inet6_dev *idev = ifp->idev;
struct net *net = dev_net(ifp->idev->dev);
@@ -1997,8 +1997,8 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
return;
}
 
-   net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n",
-ifp->idev->dev->name, >addr);
+   net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM 
detected!\n",
+ifp->idev->dev->name, >addr, 
eth_hdr(skb)->h_source);
 
spin_lock_bh(>lock);
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 266a530..f9c3ffe 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -46,6 +46,7 @@
 #endif
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -822,7 +823,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 * who is doing DAD
 * so fail our DAD process
 */
-   addrconf_dad_failure(ifp);
+   addrconf_dad_failure(skb, ifp);
return;
} else {
/*
@@ -975,7 +976,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
if (ifp) {
if (skb->pkt_type != PACKET_LOOPBACK
&& (ifp->flags & IFA_F_TENTATIVE)) {
-   addrconf_dad_failure(ifp);
+   addrconf_dad_failure(skb, ifp);
return;
}
/* What should we make now? The advertisement
@@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb)
 */
if (skb->pkt_type != PACKET_LOOPBACK)
ND_PRINTK(1, warn,
- "NA: someone advertises our address %pI6 on 
%s!\n",
- >addr, ifp->idev->dev->name);
+ "NA: %pM advertised our address %pI6c on 
%s!\n",
+ eth_hdr(skb)->h_source, >addr, 
ifp->idev->dev->name);
in6_ifa_put(ifp);
return;
}
-- 
1.9.1



Re: [PATCH net-next] net: display hw address of source machine during ipv6 DAD failure

2017-10-30 Thread Vishwanath Pai
On 10/30/2017 04:43 PM, David Ahern wrote:
> On 10/30/17 2:29 PM, Vishwanath Pai wrote:
>> This patch updates the error messages displayed in kernel log to include
>> hwaddress of the source machine that caused ipv6 duplicate address
>> detection failures.
>>
>> Examples:
>>
>> a) When we receive a NA packet from another machine advertising our
>> address:
>>
>> ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2601::2bb4 on eth0!
> 
> your example above does not agree with the format below. You have the
> compressed IPv6 address, yet the format ...
> 
>>
>> b) When we detect DAD failure during address assignment to an interface:
>>
>> IPv6: eth0: IPv6 duplicate address 2601::2b78 used by 34:ab:cd:56:11:e8
>> detected!
>>
>> Suggested-by: Igor Lubashev <iluba...@akamai.com>
>> Signed-off-by: Vishwanath Pai <v...@akamai.com>
>> ---
> 
> 
>> @@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb)
>>   */
>>  if (skb->pkt_type != PACKET_LOOPBACK)
>>  ND_PRINTK(1, warn,
>> -  "NA: someone advertises our address %pI6 on 
>> %s!\n",
>> -  >addr, ifp->idev->dev->name);
>> +  "NA: %pM advertised our address %pI6 on 
>> %s!\n",
> 
> ... is uncompressed. Please make that '%pI6c' instead of pI6 in the line
> above
> 
> 
>> +  eth_hdr(skb)->h_source, >addr, 
>> ifp->idev->dev->name);
>>  in6_ifa_put(ifp);
>>  return;
>>  }
> 
> 
> 

It does print out the full uncompressed IPv6 address. I modified the
message manually by hand while copying it out to the commit message in
order to hide the real IP address of my machines, but did not realize
that it was different from what the kernel would actually print out.

-Vishwanath


[PATCH net-next] net: display hw address of source machine during ipv6 DAD failure

2017-10-30 Thread Vishwanath Pai
This patch updates the error messages displayed in kernel log to include
hwaddress of the source machine that caused ipv6 duplicate address
detection failures.

Examples:

a) When we receive a NA packet from another machine advertising our
address:

ICMPv6: NA: 34:ab:cd:56:11:e8 advertised our address 2601::2bb4 on eth0!

b) When we detect DAD failure during address assignment to an interface:

IPv6: eth0: IPv6 duplicate address 2601::2b78 used by 34:ab:cd:56:11:e8
detected!

Suggested-by: Igor Lubashev <iluba...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 include/net/addrconf.h | 2 +-
 net/ipv6/addrconf.c| 6 +++---
 net/ipv6/ndisc.c   | 9 +
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 15b5ffd..2a616ea 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -208,7 +208,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr 
*mc_addr,
 void ipv6_mc_init_dev(struct inet6_dev *idev);
 void ipv6_mc_destroy_dev(struct inet6_dev *idev);
 int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed);
-void addrconf_dad_failure(struct inet6_ifaddr *ifp);
+void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);
 
 bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
 const struct in6_addr *src_addr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5a8a102..cfa374c 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1987,7 +1987,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
return err;
 }
 
-void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
 {
struct inet6_dev *idev = ifp->idev;
struct net *net = dev_net(ifp->idev->dev);
@@ -1997,8 +1997,8 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
return;
}
 
-   net_info_ratelimited("%s: IPv6 duplicate address %pI6c detected!\n",
-ifp->idev->dev->name, >addr);
+   net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM 
detected!\n",
+ifp->idev->dev->name, >addr, 
eth_hdr(skb)->h_source);
 
spin_lock_bh(>lock);
 
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 266a530..9d6ea9d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -46,6 +46,7 @@
 #endif
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -822,7 +823,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
 * who is doing DAD
 * so fail our DAD process
 */
-   addrconf_dad_failure(ifp);
+   addrconf_dad_failure(skb, ifp);
return;
} else {
/*
@@ -975,7 +976,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
if (ifp) {
if (skb->pkt_type != PACKET_LOOPBACK
&& (ifp->flags & IFA_F_TENTATIVE)) {
-   addrconf_dad_failure(ifp);
+   addrconf_dad_failure(skb, ifp);
return;
}
/* What should we make now? The advertisement
@@ -989,8 +990,8 @@ static void ndisc_recv_na(struct sk_buff *skb)
 */
if (skb->pkt_type != PACKET_LOOPBACK)
ND_PRINTK(1, warn,
- "NA: someone advertises our address %pI6 on 
%s!\n",
- >addr, ifp->idev->dev->name);
+ "NA: %pM advertised our address %pI6 on 
%s!\n",
+ eth_hdr(skb)->h_source, >addr, 
ifp->idev->dev->name);
in6_ifa_put(ifp);
return;
}
-- 
1.9.1



Re: [PATCH] netfilter: ipset: ipset list may return wrong member count for set with timeout

2017-09-11 Thread Vishwanath Pai
Hi Jozsef,

Thank you. Yes, that is true, the count can still be incorrect in the
case of a huge set.

Thanks,
Vishwanath

On 09/11/2017 03:36 PM, Jozsef Kadlecsik wrote:
> Hi,
> 
> Your patch is applied in the ipset git tree and I'm going to push it for 
> kernel inclusion.
> 
> I modified the comment part: the elements counter can still be incorrect 
> in the case of a huge set, because elements might time out during the 
> listing.
> 
> Thanks for your patience!
> 
> Best regards,
> Jozsef
> 
> On Thu, 17 Aug 2017, Vishwanath Pai wrote:
> 
>> Simple testcase:
>>
>> $ ipset create test hash:ip timeout 5
>> $ ipset add test 1.2.3.4
>> $ ipset add test 1.2.2.2
>> $ sleep 5
>>
>> $ ipset l
>> Name: test
>> Type: hash:ip
>> Revision: 5
>> Header: family inet hashsize 1024 maxelem 65536 timeout 5
>> Size in memory: 296
>> References: 0
>> Number of entries: 2
>> Members:
>>
>> We return "Number of entries: 2" but no members are listed. That is
>> because mtype_list runs "ip_set_timeout_expired" and does not list the
>> expired entries, but set->elements is never upated (until mtype_gc
>> cleans it up later).
>>
>> Reviewed-by: Joshua Hunt <joh...@akamai.com>
>> Signed-off-by: Vishwanath Pai <v...@akamai.com>
>> ---
>>  net/netfilter/ipset/ip_set_hash_gen.h | 12 +++-
>>  1 file changed, 11 insertions(+), 1 deletion(-)
>>
>> diff --git a/net/netfilter/ipset/ip_set_hash_gen.h 
>> b/net/netfilter/ipset/ip_set_hash_gen.h
>> index f236c0b..ff3d31c 100644
>> --- a/net/netfilter/ipset/ip_set_hash_gen.h
>> +++ b/net/netfilter/ipset/ip_set_hash_gen.h
>> @@ -1041,12 +1041,22 @@ struct htype {
>>  static int
>>  mtype_head(struct ip_set *set, struct sk_buff *skb)
>>  {
>> -const struct htype *h = set->data;
>> +struct htype *h = set->data;
>>  const struct htable *t;
>>  struct nlattr *nested;
>>  size_t memsize;
>>  u8 htable_bits;
>>  
>> +/* If any members have expired, set->elements will be wrong
>> + * mytype_expire function will update it with the right count.
>> + * we do not hold set->lock here, so grab it first.
>> + */
>> +if (SET_WITH_TIMEOUT(set)) {
>> +spin_lock_bh(>lock);
>> +mtype_expire(set, h);
>> +spin_unlock_bh(>lock);
>> +}
>> +
>>  rcu_read_lock_bh();
>>  t = rcu_dereference_bh_nfnl(h->table);
>>  memsize = mtype_ahash_memsize(h, t) + set->ext_size;
>> -- 
>> 1.9.1
>>
>>
> 
> -
> E-mail  : kad...@blackhole.kfki.hu, kadlecsik.joz...@wigner.mta.hu
> PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
> Address : Wigner Research Centre for Physics, Hungarian Academy of Sciences
>   H-1525 Budapest 114, POB. 49, Hungary
> 



[PATCH v2] netfilter: xt_hashlimit: fix build error caused by 64bit division

2017-09-07 Thread Vishwanath Pai
64bit division causes build/link errors on 32bit architectures. It
prints out error messages like:

ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined!

The value of avg passed through by userspace in BYTE mode cannot exceed
U32_MAX. Which means 64bit division in user2rate_bytes is unnecessary.
To fix this I have changed the type of param 'user' to u32.

Since anything greater than U32_MAX is an invalid input we error out in
hashlimit_mt_check_common() when this is the case.

Changes in v2:
Making return type as u32 would cause an overflow for small
values of 'user' (for example 2, 3 etc). To avoid this I bumped up
'r' to u64 again as well as the return type. This is OK since the
variable that stores the result is u64. We still avoid 64bit
division here since 'user' is u32.

Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 net/netfilter/xt_hashlimit.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10d4823..1c1941e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <lafo...@netfilter.org>");
@@ -527,12 +528,12 @@ static u64 user2rate(u64 user)
}
 }
 
-static u64 user2rate_bytes(u64 user)
+static u64 user2rate_bytes(u32 user)
 {
u64 r;
 
-   r = user ? 0xULL / user : 0xULL;
-   r = (r - 1) << 4;
+   r = user ? U32_MAX / user : U32_MAX;
+   r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
 }
 
@@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.prev_window = 0;
dh->rateinfo.current_rate = 0;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
-   dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg);
+   dh->rateinfo.rate =
+   user2rate_bytes((u32)hinfo->cfg.avg);
if (hinfo->cfg.burst)
dh->rateinfo.burst =
hinfo->cfg.burst * dh->rateinfo.rate;
@@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct 
xt_mtchk_param *par,
 
/* Check for overflow. */
if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
-   if (cfg->avg == 0) {
+   if (cfg->avg == 0 || cfg->avg > U32_MAX) {
pr_info("hashlimit invalid rate\n");
return -ERANGE;
}
-- 
1.9.1



[PATCH] netfilter: xt_hashlimit: fix build error caused by 64bit division

2017-09-07 Thread Vishwanath Pai
64bit division causes build/link errors on 32bit architectures. It
prints out error messages like:

ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined!

The value of avg passed through by userspace in BYTE mode cannot exceed
U32_MAX. Which means 64bit division in user2rate_bytes is unnecessary.

This fix changes the size of both the param as well as return type on
user2rate_bytes to u32.

Since anything greater than U32_MAX is an invalid input we error out in
hashlimit_mt_check_common() when this is the case.

Also fixed warning about const pointer conversion in cfg_copy().

Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 net/netfilter/xt_hashlimit.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10d4823..1d818f1 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <lafo...@netfilter.org>");
@@ -527,12 +528,12 @@ static u64 user2rate(u64 user)
}
 }
 
-static u64 user2rate_bytes(u64 user)
+static u32 user2rate_bytes(u32 user)
 {
-   u64 r;
+   u32 r;
 
-   r = user ? 0xULL / user : 0xULL;
-   r = (r - 1) << 4;
+   r = user ? U32_MAX / user : U32_MAX;
+   r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
 }
 
@@ -588,7 +589,8 @@ static void rateinfo_init(struct dsthash_ent *dh,
dh->rateinfo.prev_window = 0;
dh->rateinfo.current_rate = 0;
if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) {
-   dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg);
+   dh->rateinfo.rate =
+   user2rate_bytes((u32)hinfo->cfg.avg);
if (hinfo->cfg.burst)
dh->rateinfo.burst =
hinfo->cfg.burst * dh->rateinfo.rate;
@@ -870,7 +872,7 @@ static int hashlimit_mt_check_common(const struct 
xt_mtchk_param *par,
 
/* Check for overflow. */
if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) {
-   if (cfg->avg == 0) {
+   if (cfg->avg == 0 || cfg->avg > U32_MAX) {
pr_info("hashlimit invalid rate\n");
return -ERANGE;
}
-- 
1.9.1



Re: [PATCH] netfilter: xt_hashlimit: avoid 64-bit division

2017-09-06 Thread Vishwanath Pai
On 09/06/2017 03:57 PM, Arnd Bergmann wrote:
> 64-bit division is expensive on 32-bit architectures, and
> requires a special function call to avoid a link error like:
> 
> net/netfilter/xt_hashlimit.o: In function `hashlimit_mt_common':
> xt_hashlimit.c:(.text+0x1328): undefined reference to `__aeabi_uldivmod'
> 
> In the case of hashlimit_mt_common, we don't actually need a
> 64-bit operation, we can simply rewrite the function slightly
> to make that clear to the compiler.
> 
> Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
> Signed-off-by: Arnd Bergmann 
> ---
>  net/netfilter/xt_hashlimit.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
> index 10d48234f5f4..50b53d86eef5 100644
> --- a/net/netfilter/xt_hashlimit.c
> +++ b/net/netfilter/xt_hashlimit.c
> @@ -531,7 +531,10 @@ static u64 user2rate_bytes(u64 user)
>  {
>   u64 r;
>  
> - r = user ? 0xULL / user : 0xULL;
> + if (user > 0xULL)
> + return 0;
> +
> + r = user ? 0xULL / (u32)user : 0xULL;
>   r = (r - 1) << 4;
>   return r;
>  }
> 

I have submitted another patch to fix this:
https://patchwork.ozlabs.org/patch/809881/

We have seen this problem before, I was careful not to introduce this
again in the new patch but clearly I overlooked this particular line :(

In the other cases we fixed it by replacing division with div64_u64().

-Vishwanath


Re: [PATCH 1/2] netfilter/xt_hashlimit: new feature/algorithm for xt_hashlimit

2017-09-04 Thread Vishwanath Pai
On 09/04/2017 06:14 AM, Pablo Neira Ayuso wrote:
> Sounds good, applied, thanks.
> 
> A couple of questions: Does it really make sense to expose
> --hashlimit-rate-interval or are you using 1 second always there? I
> always wonder if it makes sense to expose yet another toggle that it's
> not clear to me how the user would take advantage from this.
> 
> Just curious here.


Thank you Pablo.

I generally just let rate-interval to default to 1, but I've added the
toggle to give the user more control on how often to sample the packet
rate. This wasn't an option before since we would always keep "delta" as
1 jiffy, but that is not the case in the new algorithm. Hence I have
exposed it as an option to the user.

Also, it looks like I broke the build on ARM and m68k arch. This is
because of the u64 division issue (sorry about that). I have sent
another patch to fix the problem "[PATCH] netfilter: xt_hashlimit: fix
64 bit division compile error". Please apply and send to net-next.

Thanks,
Vishwanath


[PATCH] netfilter: xt_hashlimit: fix 64 bit division compile error

2017-09-04 Thread Vishwanath Pai
commit bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
introduced a line where we divide two 64bit unsigned integers. This
breaks on ARM processors with the error:

ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined!

We can fix it by using div64_u64 instead.

Fixes: bea74641e378 ("netfilter: xt_hashlimit: add rate match mode")
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 net/netfilter/xt_hashlimit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 10d4823..fece7c2 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -531,7 +531,7 @@ static u64 user2rate_bytes(u64 user)
 {
u64 r;
 
-   r = user ? 0xULL / user : 0xULL;
+   r = user ? div64_u64(0xULL, user) : 0xULL;
r = (r - 1) << 4;
return r;
 }
-- 
1.9.1



[PATCH 2/2] netfilter/libxt_hashlimit: new feature/algorithm for xt_hashlimit

2017-08-18 Thread Vishwanath Pai
This patch adds a new feature to hashlimit that allows matching on the
current packet/byte rate without rate limiting. This can be enabled
with a new flag --hashlimit-rate-match. The match returns true if the
current rate of packets is above/below the user specified value.

The main difference between the existing algorithm and the new one is
that the existing algorithm rate-limits the flow whereas the new algorithm
does not. Instead it *classifies* the flow based on whether it is above or
below a certain rate. I will demonstrate this with an example below. Let
us assume this rule:

iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain

If the packet rate is 15/s, the existing algorithm would ACCEPT 10 packets
every second and send 5 packets to "new_chain".

But with the new algorithm, as long as the rate of 15/s is sustained, all
packets will continue to match and every packet is sent to new_chain.

This new functionality will let us classify different flows based on their
current rate, so that further decisions can be made on them based on what
the current rate is.

This is how the new algorithm works:
We divide time into intervals of 1 (sec/min/hour) as specified by
the user. We keep track of the number of packets/bytes processed in the
current interval. After each interval we reset the counter to 0.

When we receive a packet for match, we look at the packet rate
during the current interval and the previous interval to make a decision:

if [ prev_rate < user and cur_rate < user ]
return Below
else
return Above

Where cur_rate is the number of packets/bytes seen in the current
interval, prev is the number of packets/bytes seen in the previous
interval and 'user' is the rate specified by the user.

We also provide flexibility to the user for choosing the time
interval using the option --hashilmit-interval. For example the user can
keep a low rate like x/hour but still keep the interval as small as 1
second.

To preserve backwards compatibility we have to add this feature in a new
revision, so I've created revision 3 for hashlimit. The two new options
we add are:

--hashlimit-rate-match
--hashlimit-rate-interval

I have updated the help text to add these new options. Also added a few
tests for the new options.

Suggested-by: Igor Lubashev <iluba...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 extensions/libxt_hashlimit.c   | 407 ++---
 extensions/libxt_hashlimit.man |   8 +
 extensions/libxt_hashlimit.t   |   4 +
 include/linux/netfilter/xt_hashlimit.h |  36 ++-
 4 files changed, 414 insertions(+), 41 deletions(-)

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index 9e63e1e..e51ac1a 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -68,10 +68,13 @@ enum {
O_HTABLE_MAX,
O_HTABLE_GCINT,
O_HTABLE_EXPIRE,
+   O_RATEMATCH,
+   O_INTERVAL,
F_BURST = 1 << O_BURST,
F_UPTO  = 1 << O_UPTO,
F_ABOVE = 1 << O_ABOVE,
F_HTABLE_EXPIRE = 1 << O_HTABLE_EXPIRE,
+   F_RATEMATCH = 1 << O_RATEMATCH,
 };
 
 static void hashlimit_mt_help(void)
@@ -95,6 +98,29 @@ static void hashlimit_mt_help(void)
 "\n", XT_HASHLIMIT_BURST);
 }
 
+static void hashlimit_mt_help_v3(void)
+{
+   printf(
+"hashlimit match options:\n"
+"  --hashlimit-uptomax average match rate\n"
+"   [Packets per second unless followed by \n"
+"   /sec /minute /hour /day postfixes]\n"
+"  --hashlimit-above   min average match rate\n"
+"  --hashlimit-mode   mode is a comma-separated list of\n"
+"   dstip,srcip,dstport,srcport (or none)\n"
+"  --hashlimit-srcmask  source address grouping prefix length\n"
+"  --hashlimit-dstmask  destination address grouping prefix 
length\n"
+"  --hashlimit-name   name for /proc/net/ipt_hashlimit\n"
+"  --hashlimit-burst  number to match in a burst, default %u\n"
+"  --hashlimit-htable-size number of hashtable buckets\n"
+"  --hashlimit-htable-max  number of hashtable entries\n"
+"  --hashlimit-htable-gcintervalinterval between garbage collection runs\n"
+"  --hashlimit-htable-expireafter which time are idle entries 
expired?\n"
+"  --hashlimit-rate-match   rate match the flow without rate-limiting 
it\n"
+"  --hashlimit-rate-intervalinterval in seconds for 
hashlimit-rate-match\n"
+"\n", XT_HASHLIMIT_BURST);
+}
+
 #define s struct xt_hashlimit_info
 static const struct xt_option_entry hashlimit_opts[] = {

[PATCH 1/2] netfilter/xt_hashlimit: new feature/algorithm for xt_hashlimit

2017-08-18 Thread Vishwanath Pai
This patch adds a new feature to hashlimit that allows matching on the
current packet/byte rate without rate limiting. This can be enabled
with a new flag --hashlimit-rate-match. The match returns true if the
current rate of packets is above/below the user specified value.

The main difference between the existing algorithm and the new one is
that the existing algorithm rate-limits the flow whereas the new
algorithm does not. Instead it *classifies* the flow based on whether
it is above or below a certain rate. I will demonstrate this with an
example below. Let us assume this rule:

iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain

If the packet rate is 15/s, the existing algorithm would ACCEPT 10
packets every second and send 5 packets to "new_chain".

But with the new algorithm, as long as the rate of 15/s is sustained,
all packets will continue to match and every packet is sent to new_chain.

This new functionality will let us classify different flows based on
their current rate, so that further decisions can be made on them based on
what the current rate is.

This is how the new algorithm works:
We divide time into intervals of 1 (sec/min/hour) as specified by
the user. We keep track of the number of packets/bytes processed in the
current interval. After each interval we reset the counter to 0.

When we receive a packet for match, we look at the packet rate
during the current interval and the previous interval to make a
decision:

if [ prev_rate < user and cur_rate < user ]
return Below
else
return Above

Where cur_rate is the number of packets/bytes seen in the current
interval, prev is the number of packets/bytes seen in the previous
interval and 'user' is the rate specified by the user.

We also provide flexibility to the user for choosing the time
interval using the option --hashilmit-interval. For example the user can
keep a low rate like x/hour but still keep the interval as small as 1
second.

To preserve backwards compatibility we have to add this feature in a new
revision, so I've created revision 3 for hashlimit. The two new options
we add are:

--hashlimit-rate-match
--hashlimit-rate-interval

I have updated the help text to add these new options. Also added a few
tests for the new options.

Suggested-by: Igor Lubashev <iluba...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 include/linux/netfilter/xt_hashlimit.h  |   3 +-
 include/uapi/linux/netfilter/xt_hashlimit.h |  36 +++-
 net/netfilter/xt_hashlimit.c| 275 +---
 3 files changed, 284 insertions(+), 30 deletions(-)

diff --git a/include/linux/netfilter/xt_hashlimit.h 
b/include/linux/netfilter/xt_hashlimit.h
index 074790c..0fc458b 100644
--- a/include/linux/netfilter/xt_hashlimit.h
+++ b/include/linux/netfilter/xt_hashlimit.h
@@ -5,5 +5,6 @@
 
 #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
  XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \
- XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES)
+ XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\
+ XT_HASHLIMIT_RATE_MATCH)
 #endif /*_XT_HASHLIMIT_H*/
diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index 79da349..aa98573 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -19,12 +19,13 @@
 struct xt_hashlimit_htable;
 
 enum {
-   XT_HASHLIMIT_HASH_DIP = 1 << 0,
-   XT_HASHLIMIT_HASH_DPT = 1 << 1,
-   XT_HASHLIMIT_HASH_SIP = 1 << 2,
-   XT_HASHLIMIT_HASH_SPT = 1 << 3,
-   XT_HASHLIMIT_INVERT   = 1 << 4,
-   XT_HASHLIMIT_BYTES= 1 << 5,
+   XT_HASHLIMIT_HASH_DIP   = 1 << 0,
+   XT_HASHLIMIT_HASH_DPT   = 1 << 1,
+   XT_HASHLIMIT_HASH_SIP   = 1 << 2,
+   XT_HASHLIMIT_HASH_SPT   = 1 << 3,
+   XT_HASHLIMIT_INVERT = 1 << 4,
+   XT_HASHLIMIT_BYTES  = 1 << 5,
+   XT_HASHLIMIT_RATE_MATCH = 1 << 6,
 };
 
 struct hashlimit_cfg {
@@ -79,6 +80,21 @@ struct hashlimit_cfg2 {
__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg3 {
+   __u64 avg;  /* Average secs between packets * scale */
+   __u64 burst;/* Period multiplier for upper limit. */
+   __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */
+
+   /* user specified */
+   __u32 size; /* how many buckets */
+   __u32 max;  /* max number of entries */
+   __u32 gc_interval;  /* gc interval */
+   __u32 expire;   /* when do entries expire? */
+
+   __u32 interval;
+   __u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
char nam

[PATCH] netfilter: ipset: ipset list may return wrong member count for set with timeout

2017-08-16 Thread Vishwanath Pai
Simple testcase:

$ ipset create test hash:ip timeout 5
$ ipset add test 1.2.3.4
$ ipset add test 1.2.2.2
$ sleep 5

$ ipset l
Name: test
Type: hash:ip
Revision: 5
Header: family inet hashsize 1024 maxelem 65536 timeout 5
Size in memory: 296
References: 0
Number of entries: 2
Members:

We return "Number of entries: 2" but no members are listed. That is
because mtype_list runs "ip_set_timeout_expired" and does not list the
expired entries, but set->elements is never upated (until mtype_gc
cleans it up later).

Reviewed-by: Joshua Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 net/netfilter/ipset/ip_set_hash_gen.h | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h 
b/net/netfilter/ipset/ip_set_hash_gen.h
index f236c0b..ff3d31c 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -1041,12 +1041,22 @@ struct htype {
 static int
 mtype_head(struct ip_set *set, struct sk_buff *skb)
 {
-   const struct htype *h = set->data;
+   struct htype *h = set->data;
const struct htable *t;
struct nlattr *nested;
size_t memsize;
u8 htable_bits;
 
+   /* If any members have expired, set->elements will be wrong
+* mytype_expire function will update it with the right count.
+* we do not hold set->lock here, so grab it first.
+*/
+   if (SET_WITH_TIMEOUT(set)) {
+   spin_lock_bh(>lock);
+   mtype_expire(set, h);
+   spin_unlock_bh(>lock);
+   }
+
rcu_read_lock_bh();
t = rcu_dereference_bh_nfnl(h->table);
memsize = mtype_ahash_memsize(h, t) + set->ext_size;
-- 
1.9.1



[PATCH] netfilter: ipset: print out warnings generated by commands

2017-03-21 Thread Vishwanath Pai
Warnings are only printed out for IPSET_CMD_TEST. The user won't see
warnings from other commands.

Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 src/ipset.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ipset.c b/src/ipset.c
index 2c4fa10..b0bef7b 100644
--- a/src/ipset.c
+++ b/src/ipset.c
@@ -812,8 +812,8 @@ parse_commandline(int argc, char *argv[])
"Unknown argument %s", argv[1]);
ret = ipset_cmd(session, cmd, restore_line);
D("ret %d", ret);
-   /* Special case for TEST and non-quiet mode */
-   if (cmd == IPSET_CMD_TEST && ipset_session_warning(session)) {
+
+   if (ipset_session_warning(session)) {
if (!ipset_envopt_test(session, IPSET_ENV_QUIET))
fprintf(stderr, "%s", ipset_session_warning(session));
ipset_session_report_reset(session);
-- 
1.9.1



[PATCH 1/2] netfilter: ipset: warn users of list:set that parameter 'size' is ignored

2017-03-21 Thread Vishwanath Pai
Since kernel commit 00590fdd5be0 ("netfilter: ipset: Introduce RCU
locking in list type"), the parameter 'size' has not been in use and
is ignored by the kernel. This is not very apparent to the user. This
commit makes 'size' optional and also warns the user if they try to
specify it. We also don't print it out on 'ipset l'.

I created revision 4 to make this change, revision 3 should work with
older kernels just like before.

Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 lib/ipset_list_set.c | 92 
 1 file changed, 92 insertions(+)

diff --git a/lib/ipset_list_set.c b/lib/ipset_list_set.c
index 45934e7..2d8bc7a 100644
--- a/lib/ipset_list_set.c
+++ b/lib/ipset_list_set.c
@@ -322,6 +322,31 @@ static const struct ipset_arg list_set_create_args3[] = {
{ },
 };
 
+/* Parse commandline arguments */
+static const struct ipset_arg list_set_create_args4[] = {
+   { .name = { "size", NULL },
+ .has_arg = IPSET_OPTIONAL_ARG,.opt = IPSET_OPT_SIZE,
+ .parse = ipset_parse_ignored,
+   },
+   { .name = { "timeout", NULL },
+ .has_arg = IPSET_MANDATORY_ARG,   .opt = IPSET_OPT_TIMEOUT,
+ .parse = ipset_parse_timeout, .print = ipset_print_number,
+   },
+   { .name = { "counters", NULL },
+ .has_arg = IPSET_NO_ARG,  .opt = IPSET_OPT_COUNTERS,
+ .parse = ipset_parse_flag,.print = ipset_print_flag,
+   },
+   { .name = { "comment", NULL },
+ .has_arg = IPSET_NO_ARG,  .opt = IPSET_OPT_CREATE_COMMENT,
+ .parse = ipset_parse_flag,.print = ipset_print_flag,
+   },
+   { .name = { "skbinfo", NULL },
+ .has_arg = IPSET_NO_ARG,  .opt = IPSET_OPT_SKBINFO,
+ .parse = ipset_parse_flag,.print = ipset_print_flag,
+   },
+   { },
+};
+
 static const struct ipset_arg list_set_adt_args3[] = {
{ .name = { "timeout", NULL },
  .has_arg = IPSET_MANDATORY_ARG,   .opt = IPSET_OPT_TIMEOUT,
@@ -426,6 +451,72 @@ static struct ipset_type ipset_list_set3 = {
.usage = list_set_usage3,
.description = "skbinfo support",
 };
+
+static const char list_set_usage4[] =
+"create SETNAME list:set\n"
+"   [timeout VALUE] [counters] [comment]\n"
+"  [skbinfo]\n"
+"addSETNAME NAME [before|after NAME] [timeout VALUE]\n"
+"   [packets VALUE] [bytes VALUE] [comment STRING]\n"
+"  [skbmark VALUE] [skbprio VALUE] [skbqueue VALUE]\n"
+"delSETNAME NAME [before|after NAME]\n"
+"test   SETNAME NAME [before|after NAME]\n\n"
+"where NAME are existing set names.\n";
+
+static struct ipset_type ipset_list_set4 = {
+   .name = "list:set",
+   .alias = { "setlist", NULL },
+   .revision = 4,
+   .family = NFPROTO_UNSPEC,
+   .dimension = IPSET_DIM_ONE,
+   .elem = {
+   [IPSET_DIM_ONE - 1] = {
+   .parse = ipset_parse_setname,
+   .print = ipset_print_name,
+   .opt = IPSET_OPT_NAME
+   },
+   },
+   .compat_parse_elem = ipset_parse_name_compat,
+   .args = {
+   [IPSET_CREATE] = list_set_create_args4,
+   [IPSET_ADD] = list_set_adt_args3,
+   [IPSET_DEL] = list_set_adt_args2,
+   [IPSET_TEST] = list_set_adt_args2,
+   },
+   .mandatory = {
+   [IPSET_CREATE] = 0,
+   [IPSET_ADD] = IPSET_FLAG(IPSET_OPT_NAME),
+   [IPSET_DEL] = IPSET_FLAG(IPSET_OPT_NAME),
+   [IPSET_TEST] = IPSET_FLAG(IPSET_OPT_NAME),
+   },
+   .full = {
+   [IPSET_CREATE] = IPSET_FLAG(IPSET_OPT_SIZE)
+   | IPSET_FLAG(IPSET_OPT_TIMEOUT)
+   | IPSET_FLAG(IPSET_OPT_COUNTERS)
+   | IPSET_FLAG(IPSET_OPT_CREATE_COMMENT)
+   | IPSET_FLAG(IPSET_OPT_SKBINFO),
+   [IPSET_ADD] = IPSET_FLAG(IPSET_OPT_NAME)
+   | IPSET_FLAG(IPSET_OPT_BEFORE)
+   | IPSET_FLAG(IPSET_OPT_NAMEREF)
+   | IPSET_FLAG(IPSET_OPT_TIMEOUT)
+   | IPSET_FLAG(IPSET_OPT_PACKETS)
+   | IPSET_FLAG(IPSET_OPT_BYTES)
+   | IPSET_FLAG(IPSET_OPT_ADT_COMMENT)
+   | IPSET_FLAG(IPSET_OPT_SKBMARK)
+   | IPSET_FLAG(IPSET_OPT_SKBPRIO)
+   | IPSET_FLAG(IPSET_OPT_SKBQUEUE),
+   [IPSET_DEL] = IPSET_FLAG(IPSET_OPT_NAME)
+   | IPSET_FLAG(IPSET_OPT_BEFORE)
+   | IPSET_FLAG(IPSET_OPT_NAMEREF),
+

[PATCH 2/2] netfilter: ipset: warn users of list:set that parameter 'size' is ignored

2017-03-21 Thread Vishwanath Pai
Revision 4 warns the users that the parameter 'size' is ignored. The
kernel module doesn't need any changes, it will work with both the
revisions.

Note that this will not restore old behavior before commit 00590fdd5be0
("netfilter: ipset: Introduce RCU locking in list type") for users of
the older revision. It will be a much bigger change if that is
what we need.

Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>
---
 net/netfilter/ipset/ip_set_list_set.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_list_set.c 
b/net/netfilter/ipset/ip_set_list_set.c
index 178d4eb..d4f820a 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -19,7 +19,8 @@
 #define IPSET_TYPE_REV_MIN 0
 /* 1Counters support added */
 /* 2Comments support added */
-#define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */
+/* 3skbinfo support added */
+#define IPSET_TYPE_REV_MAX 4 /* size argument is ignored */
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Jozsef Kadlecsik <kad...@blackhole.kfki.hu>");
-- 
1.9.1



[PATCH] netfilter: ipset: Null pointer exception in ipset list:set

2017-02-15 Thread Vishwanath Pai
If we use before/after to add an element to an empty list it will cause
a kernel panic.

$> cat crash.restore
create a hash:ip
create b hash:ip
create test list:set timeout 5 size 4
add test b before a

$> ipset -R < crash.restore

Executing the above will crash the kernel.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
---
 net/netfilter/ipset/ip_set_list_set.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_list_set.c 
b/net/netfilter/ipset/ip_set_list_set.c
index 51077c5..178d4eb 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -260,11 +260,14 @@ struct list_set {
else
prev = e;
}
+
+   /* If before/after is used on an empty set */
+   if ((d->before > 0 && !next) ||
+   (d->before < 0 && !prev))
+   return -IPSET_ERR_REF_EXIST;
+
/* Re-add already existing element */
if (n) {
-   if ((d->before > 0 && !next) ||
-   (d->before < 0 && !prev))
-   return -IPSET_ERR_REF_EXIST;
if (!flag_exist)
return -IPSET_ERR_EXIST;
/* Update extensions */
-- 
1.9.1



Re: [PATCH] netfilter: xt_hashlimit: Add missing ULL suffixes for 64-bit constants

2016-10-06 Thread Vishwanath Pai
On 10/06/2016 09:40 AM, Geert Uytterhoeven wrote:
> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
> index 2fab0c65aa94b666..b89b688e9d01a2d1 100644
> --- a/net/netfilter/xt_hashlimit.c
> +++ b/net/netfilter/xt_hashlimit.c
> @@ -431,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
> CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
>  */
>  #define MAX_CPJ_v1 (0x / (HZ*60*60*24))
> -#define MAX_CPJ (0x / (HZ*60*60*24))
> +#define MAX_CPJ (0xULL / (HZ*60*60*24))
>  
>  /* Repeated shift and or gives us all 1s, final shift and add 1 gives
>   * us the power of 2 below the theoretical max, so GCC simply does a
> @@ -473,7 +473,7 @@ static u64 user2credits(u64 user, int revision)
>   return div64_u64(user * HZ * CREDITS_PER_JIFFY_v1,
>XT_HASHLIMIT_SCALE);
>   } else {
> - if (user > 0x / (HZ*CREDITS_PER_JIFFY))
> + if (user > 0xULL / (HZ*CREDITS_PER_JIFFY))
>   return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
>   * HZ * CREDITS_PER_JIFFY;
>  
> -- 1.9.1

Thanks for fixing this.

Acked-by: Vishwanath Pai <v...@akamai.com>


Re: [PATCH net-next v3] netfilter: xt_hashlimit: Fix link error in 32bit arch because of 64bit division

2016-09-30 Thread Vishwanath Pai
On 09/30/2016 01:46 PM, Pablo Neira Ayuso wrote:
> On Thu, Sep 29, 2016 at 01:39:50PM -0400, Vishwanath Pai wrote:
>> v2:
>> Remove unnecessary div64_u64 around constants
>>
>> v3:
>> remove backslashes
>>
>> --
>>
>> Fix link error in 32bit arch because of 64bit division
>>
>> Division of 64bit integers will cause linker error undefined reference
>> to `__udivdi3'. Fix this by replacing divisions with div64_64
> 
> Applied, thanks Pai.
> 
>> Signed-off-by: Vishwanath Pai <v...@akamai.com>
>> Fixes: 11d5f15723c9 ("netfilter: xt_hashlimit: Create revision 2 to ...")
>>
>> ---
> 
> Please, next time place the versioning information here, otherwise git
> am takes the wrong description here.
> 

Thank you. And sorry about, I did not realize git am would put that into
the description.

-Vishwanath


Re: [PATCH 3/3] netfilter: xt_hashlimit: uses div_u64 for division

2016-09-30 Thread Vishwanath Pai
On 09/30/2016 12:38 PM, Eric Dumazet wrote:
> On Fri, 2016-09-30 at 18:05 +0200, Arnd Bergmann wrote:
>> The newly added support for high-resolution pps rates introduced multiple 
>> 64-bit
>> division operations in one function, which fails on all 32-bit architectures:
>>
>> net/netfilter/xt_hashlimit.o: In function `user2credits':
>> xt_hashlimit.c:(.text.user2credits+0x3c): undefined reference to 
>> `__aeabi_uldivmod'
>> xt_hashlimit.c:(.text.user2credits+0x68): undefined reference to 
>> `__aeabi_uldivmod'
>> xt_hashlimit.c:(.text.user2credits+0x88): undefined reference to 
>> `__aeabi_uldivmod'
>>
>> This replaces the division with an explicit call to div_u64 for version 2
>> to documents that this is a slow operation, and reverts back to 32-bit 
>> arguments
>> for the version 1 data to restore the original faster 32-bit division.
>>
>> With both changes combined, we no longer get a link error.
>>
>> Fixes: 11d5f15723c9 ("netfilter: xt_hashlimit: Create revision 2 to support 
>> higher pps rates")
>> Signed-off-by: Arnd Bergmann <a...@arndb.de>
>> ---
>> Vishwanath Pai already sent a patch for this, and I did my version 
>> independently.
>> The difference is that his version also the more expensive division for the
>> version 1 variant that doesn't need it.
>>
>> See also http://patchwork.ozlabs.org/patch/676713/
>> ---
>>  net/netfilter/xt_hashlimit.c | 17 ++---
>>  1 file changed, 10 insertions(+), 7 deletions(-)
>>
>> diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
>> index 44a095ecc7b7..3d5525df6eb3 100644
>> --- a/net/netfilter/xt_hashlimit.c
>> +++ b/net/netfilter/xt_hashlimit.c
>> @@ -464,20 +464,23 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
>>  static u64 user2credits(u64 user, int revision)
>>  {
>>  if (revision == 1) {
>> +u32 user32 = user; /* use 32-bit division */
>> +
> 
> This looks dangerous to me. Have you really tried all possible cases ?
> 
> Caller (even if using revision == 1) does
> user2credits(cfg->avg * cfg->burst, revision);
> 

It does look like we might lose precision here because of 64bit to 32bit
conversion, but I am not sure how much it matters here. Iirc this is how
it used to be before rev2 code.

> Since this is not a fast path, I would prefer to keep the 64bit divide.
> 

Agreed, this code does not get executed too often for us to worry about
div_u64 being slow. And it reverts back to regular division on 64 bit
arch anyways.

> Vishwanath version looks safer.
> 
> 

-Vishwanath


Re: next-20160929 build: 2 failures 4 warnings (next-20160929)

2016-09-29 Thread Vishwanath Pai
On 09/29/2016 02:47 PM, Mark Brown wrote:
> On Thu, Sep 29, 2016 at 12:40:35PM +0100, Build bot for Mark Brown wrote:
> 
> For the past couple of days -next has been failing to build an ARM
> allmodconfig due to:
> 
>>  arm-allmodconfig
>> ERROR: "__aeabi_uldivmod" [net/netfilter/xt_hashlimit.ko] undefined!
> 
> which appears to be triggered by 11d5f15723c9 (netfilter: xt_hashlimit:
> Create revision 2 to support higher pps rates) introducing a division of
> a 64 bit number which should be done using do_div().
> 

I have sent a patch for this a couple of days ago to netdev, it hasn't
made it to net-next yet. Here's the latest one:

[PATCH net-next v3] netfilter: xt_hashlimit: Fix link error in 32bit
arch because of 64bit division

This should fix the link error.

-Vishwanath


[PATCH net-next v3] netfilter: xt_hashlimit: Fix link error in 32bit arch because of 64bit division

2016-09-29 Thread Vishwanath Pai
v2:
Remove unnecessary div64_u64 around constants

v3:
remove backslashes

--

Fix link error in 32bit arch because of 64bit division

Division of 64bit integers will cause linker error undefined reference
to `__udivdi3'. Fix this by replacing divisions with div64_64

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Fixes: 11d5f15723c9 ("netfilter: xt_hashlimit: Create revision 2 to ...")

---
 net/netfilter/xt_hashlimit.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 44a095e..faf65f1 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -467,17 +467,18 @@ static u64 user2credits(u64 user, int revision)
/* If multiplying would overflow... */
if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) *\
-   HZ * CREDITS_PER_JIFFY_v1;
+   return div64_u64(user, XT_HASHLIMIT_SCALE)
+   * HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY_v1) \
-   / XT_HASHLIMIT_SCALE;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY_v1),
+XT_HASHLIMIT_SCALE);
} else {
if (user > 0x / (HZ*CREDITS_PER_JIFFY))
-   return (user / XT_HASHLIMIT_SCALE_v2) *\
-   HZ * CREDITS_PER_JIFFY;
+   return div64_u64(user, XT_HASHLIMIT_SCALE_v2)
+   * HZ * CREDITS_PER_JIFFY;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY),
+XT_HASHLIMIT_SCALE_v2);
}
 }
 
-- 
1.9.1



[PATCH v2] netfilter: xt_hashlimit: Fix link error in 32bit arch because of 64bit division

2016-09-27 Thread Vishwanath Pai
v2:
Remove unnecessary div64_u64 around constants

--

Fix link error in 32bit arch because of 64bit division

Division of 64bit integers will cause linker error undefined reference
to `__udivdi3'. Fix this by replacing divisions with div64_64

Signed-off-by: Vishwanath Pai <v...@akamai.com>

---
 net/netfilter/xt_hashlimit.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 44a095e..200a9d8 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -467,17 +467,18 @@ static u64 user2credits(u64 user, int revision)
/* If multiplying would overflow... */
if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) *\
+   return div64_u64(user, XT_HASHLIMIT_SCALE) *\
HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY_v1) \
-   / XT_HASHLIMIT_SCALE;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY_v1),
+ XT_HASHLIMIT_SCALE);
} else {
if (user > 0x / (HZ*CREDITS_PER_JIFFY))
-   return (user / XT_HASHLIMIT_SCALE_v2) *\
+   return div64_u64(user, XT_HASHLIMIT_SCALE_v2) *\
HZ * CREDITS_PER_JIFFY;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY),
+XT_HASHLIMIT_SCALE_v2);
}
 }
 
-- 
1.9.1



[PATCH] Fix link error in 32bit arch because of 64bit division

2016-09-27 Thread Vishwanath Pai
Fix link error in 32bit arch because of 64bit division

Division of 64bit integers will cause linker error undefined reference
to `__udivdi3'. Fix this by replacing divisions with div64_64

Signed-off-by: Vishwanath Pai <v...@akamai.com>

---
 net/netfilter/xt_hashlimit.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 44a095e..7fc694e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -465,19 +465,20 @@ static u64 user2credits(u64 user, int revision)
 {
if (revision == 1) {
/* If multiplying would overflow... */
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
+   if (user > div64_u64(0x, (HZ*CREDITS_PER_JIFFY_v1)))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) *\
+   return div64_u64(user, XT_HASHLIMIT_SCALE) *\
HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY_v1) \
-   / XT_HASHLIMIT_SCALE;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY_v1),
+ XT_HASHLIMIT_SCALE);
} else {
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY))
-   return (user / XT_HASHLIMIT_SCALE_v2) *\
+   if (user > div64_u64(0x, 
(HZ*CREDITS_PER_JIFFY)))
+   return div64_u64(user, XT_HASHLIMIT_SCALE_v2) *\
HZ * CREDITS_PER_JIFFY;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE_v2;
+   return div64_u64((user * HZ * CREDITS_PER_JIFFY),
+XT_HASHLIMIT_SCALE_v2);
}
 }
 
-- 
1.9.1



Re: [PATCH v3 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-09-27 Thread Vishwanath Pai
On Tue, Sep 27, 2016 at 12:15 AM, Liping Zhang <zlpnob...@gmail.com> wrote:
> Hi Vishwanath,
>
> 2016-09-23 0:43 GMT+08:00 Vishwanath Pai <v...@akamai.com>:
>>
>>  /* Precision saver. */
>> -static u32 user2credits(u32 user)
>> +static u64 user2credits(u64 user, int revision)
>>  {
>> -   /* If multiplying would overflow... */
>> -   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
>> -   /* Divide first. */
>> -   return (user / XT_HASHLIMIT_SCALE_v1) *\
>> -   HZ * CREDITS_PER_JIFFY_v1;
>> +   if (revision == 1) {
>> +   /* If multiplying would overflow... */
>> +   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
>> +   /* Divide first. */
>> +   return (user / XT_HASHLIMIT_SCALE_v1) *\
>> +   HZ * CREDITS_PER_JIFFY_v1;
>> +
>> +   return (user * HZ * CREDITS_PER_JIFFY_v1) \
>> +   / XT_HASHLIMIT_SCALE_v1;
>> +   } else {
>> +   if (user > 0x / (HZ*CREDITS_PER_JIFFY))
>> +   return (user / XT_HASHLIMIT_SCALE) *\
>> +   HZ * CREDITS_PER_JIFFY;
>>
>> -   return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
>> +   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
>> +   }
>>  }
>>
>
> In my memory, 64-bit division operation should be replaced by
> div_u64 or div64_u64, otherwise on some 32-bit architecture
> systems, link error will happen. Something like this:
> ... undefined reference to `__udivdi3'.

I did not know that, thanks for pointing it out. I will send a patch
to fix this.

-Vishwanath


[PATCH v4 2/2] libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps rates

2016-09-26 Thread Vishwanath Pai
v2:
* Added two tests for 100pps support to libxt_hashlimit.t
* cfg_copy returns -EINVAL and all callers of cfg_copy will look
  for   the error value

v3:
Reorder mode, avg and burst in hashlimit_cfg2 to avoid padding

v4:
XT_HASHLIMIT_SCALE for rev2 is now XT_HASHLIMIT_SCALE_v2

--

libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps
rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destory.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 extensions/libxt_hashlimit.c   | 457 ++---
 extensions/libxt_hashlimit.t   |   2 +
 include/linux/netfilter/xt_hashlimit.h |  26 +-
 3 files changed, 394 insertions(+), 91 deletions(-)

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index 99d5e1c..8580179 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -18,12 +18,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 
 #define XT_HASHLIMIT_BURST 5
 #define XT_HASHLIMIT_BURST_MAX_v1  1
+#define XT_HASHLIMIT_BURST_MAX 100
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -150,16 +152,70 @@ static const struct xt_option_entry 
hashlimit_mt_opts_v1[] = {
 };
 #undef s
 
-static uint32_t cost_to_bytes(uint32_t cost)
+#define s struct xt_hashlimit_mtinfo2
+static const struct xt_option_entry hashlimit_mt_opts[] = {
+   {.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT}, /* old name */
+   {.name = "hashlimit-srcmask", .id = O_SRCMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-dstmask", .id = O_DSTMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_STRING},
+   {.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.size)},
+   {.name = "hashlimit-htable-max", .id = O_HTABLE_MAX,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.max)},
+   {.name = "hashlimit-htable-gcinterval", .id = O_HTABLE_GCINT,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.gc_interval)},
+   {.name = "hashlimit-htable-expire", .id = O_HTABLE_EXPIRE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.expire)},
+   {.name = "hashlimit-mode", .id = O_MODE, .type = XTTYPE_STRING},
+   {.name = "hashlimit-name", .id = O_NAME, .type = XTTYPE_STRING,
+.flags = XTOPT_MAND | XTOPT_PUT, XTOPT_POINTER(s, name), .min = 1},
+   XTOPT_TABLEEND,
+};
+#undef s
+
+static int
+cfg_copy(struct hashlimit_cfg2 *to, const void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   } else {
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static uint64_t cost_to_bytes(uint64_t cost)
 {
-   uint32_t r;
+   uint64_t r;
 
r = cost ? UINT32_MAX / cost : UINT32_MAX;
r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
 }
 
-static uint64_t bytes_to_cost(uint32_t bytes)
+static uint64_t bytes_to_cost(uint64_t bytes)
 {
uint32_t r = bytes >> XT_HASHLIMIT_BYTE_SHIFT;
return UINT32_MAX / (r+1);
@@ -180,57 +236,78 @@ static void burst_error_v1(void)
"\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
-static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
+static void burst_error(void)
+{
+   xtab

[PATCH v4 1/2] libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

2016-09-26 Thread Vishwanath Pai
v4:
Do not rename XT_HASHLIMIT_SCALE to  XT_HASHLIMIT_SCALE_v1 this
will break userspace, instead rename XT_HASHLIMIT_SCALE for v2
as XT_HASHLIMIT_SCALE_v2

--

I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the
end of the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 extensions/libxt_hashlimit.c | 78 ++--
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index c5b8d77..99d5e1c 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -23,7 +23,7 @@
 #include 
 
 #define XT_HASHLIMIT_BURST 5
-#define XT_HASHLIMIT_BURST_MAX 1
+#define XT_HASHLIMIT_BURST_MAX_v1  1
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -98,7 +98,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
{.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING},
{.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_UINT32,
-.min = 1, .max = XT_HASHLIMIT_BURST_MAX, .flags = XTOPT_PUT,
+.min = 1, .max = XT_HASHLIMIT_BURST_MAX_v1, .flags = XTOPT_PUT,
 XTOPT_POINTER(s, cfg.burst)},
{.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
 .type = XTTYPE_UINT32, .flags = XTOPT_PUT,
@@ -121,7 +121,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
 #undef s
 
 #define s struct xt_hashlimit_mtinfo1
-static const struct xt_option_entry hashlimit_mt_opts[] = {
+static const struct xt_option_entry hashlimit_mt_opts_v1[] = {
{.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING, .flags = XTOPT_INVERT},
{.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
@@ -174,10 +174,10 @@ static uint32_t get_factor(int chr)
return 1;
 }
 
-static void burst_error(void)
+static void burst_error_v1(void)
 {
xtables_error(PARAMETER_PROBLEM, "bad value for option "
-   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX);
+   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
 static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
@@ -186,8 +186,8 @@ static uint32_t parse_burst(const char *burst, struct 
xt_hashlimit_mtinfo1 *info
char *end;
 
if (!xtables_strtoul(burst, , , 1, UINT32_MAX) ||
-   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX))
-   burst_error();
+   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX_v1))
+   burst_error_v1();
 
v *= get_factor(*end);
if (v > UINT32_MAX)
@@ -272,7 +272,7 @@ static void hashlimit_init(struct xt_entry_match *m)
 
 }
 
-static void hashlimit_mt4_init(struct xt_entry_match *match)
+static void hashlimit_mt4_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -283,7 +283,7 @@ static void hashlimit_mt4_init(struct xt_entry_match *match)
info->cfg.dstmask = 32;
 }
 
-static void hashlimit_mt6_init(struct xt_entry_match *match)
+static void hashlimit_mt6_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -342,7 +342,7 @@ static void hashlimit_parse(struct xt_option_call *cb)
}
 }
 
-static void hashlimit_mt_parse(struct xt_option_call *cb)
+static void hashlimit_mt_parse_v1(struct xt_option_call *cb)
 {
struct xt_hashlimit_mtinfo1 *info = cb->data;
 
@@ -395,7 +395,7 @@ static void hashlimit_check(struct xt_fcheck_call *cb)
info->cfg.expire = udata->mult * 1000; /* from s to msec */
 }
 
-static void hashlimit_mt_check(struct xt_fcheck_call *cb)
+static void hashlimit_mt_check_v1(struct xt_fcheck_call *cb)
 {
const struct hashlimit_mt_udata *udata = cb->udata;
struct xt_hashlimit_mtinfo1 *info = cb->data;
@@ -421,18 +421,18 @@ static void hashlimit_mt_check(struct xt_fcheck_call *cb)
info->cfg.expire = 
XT_HASHLIMIT_BYTE_EXPIRE_BURST * 1000;
}
info->cfg.burst = burst;
-   } else if (info->cfg.burst > XT_HASHLIMIT_BURST_MAX)
-   burst_error();
+   } else if (info->cfg.burst > XT_HASHLIMIT_BURST_MAX_v1)
+   burst_error_v1();
 }
 
-static const struct rates
+static const struct rates_v1
 {
const char *name;
uint32_t mult;
-} rates[] = { { "day", XT_HASHLIMIT_SCALE*24*60*60 },
-   

Re: [PATCH v3 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-09-22 Thread Vishwanath Pai
Thanks for pointing this out, I will reorder the fields to:

struct hashlimit_cfg2 {
__u64 avg;/* Average secs between packets * scale */
__u64 burst;
__u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */

This should fix the hole and avoid padding.

-Vishwanath

On 09/22/2016 12:53 PM, Jan Engelhardt wrote:
> On Thursday 2016-09-22 18:43, Vishwanath Pai wrote:
>> >+struct hashlimit_cfg2 {
>> >+   __u32 mode;   /* bitmask of XT_HASHLIMIT_HASH_* */
>> >+   __u64 avg;/* Average secs between packets * scale */
>> >+   __u64 burst;  /* Period multiplier for upper limit. */
> This would have different sizes between i386 and x86_64,
> necessiting additional compat functions. It should be padded
> or reordered instead.
> 



[PATCH v3 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-09-22 Thread Vishwanath Pai
V2:
Removed the call to BUG() in cfg_copy, we return -EINVAL
instead and all calls to cfg_copy check for this

V3:
change "revision" in the call to cfg_copy inside htable_create to 2
previously this would pass down revision from the function parameter
this is wrong since *cfg here is always hashlimit_cfg2

There is no change in userspace patch so V2 will work

--

netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  23 ++
 net/netfilter/xt_hashlimit.c| 330 ++--
 2 files changed, 285 insertions(+), 68 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index ea8c1c0..be5d2e1 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE_v1 1
+#define XT_HASHLIMIT_SCALE 100llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+   __u32 mode;   /* bitmask of XT_HASHLIMIT_HASH_* */
+   __u64 avg;/* Average secs between packets * scale */
+   __u64 burst;  /* Period multiplier for upper limit. */
+
+   /* user specified */
+   __u32 size; /* how many buckets */
+   __u32 max;  /* max number of entries */
+   __u32 gc_interval;  /* gc interval */
+   __u32 expire;   /* when do entries expire? */
+
+   __u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
char name[IFNAMSIZ];
struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+   char name[NAME_MAX];
+   struct hashlimit_cfg2 cfg;
+
+   /* Used internally by the kernel */
+   struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 6b0ad93..64579f3 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
unsigned long expires;  /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
-   u_int32_t credit;
-   u_int32_t credit_cap, cost;
+   u_int64_t credit;
+   u_int64_t credit_cap, cost;
} rateinfo;
struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
u_int8_t family;
bool rnd_initialized;
 
-   struct hashlimit_cfg1 cfg;  /* config */
+   struct hashlimit_cfg2 cfg;  /* config */
 
/* used internally */
spinlock_t lock;/* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
struct hlist_head hash[0];  /* hashtable itself */
 };
 
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   } else {
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 st

[PATCH v3 1/2] netfilter: Prepare xt_hashlimit.c for revision 2

2016-09-22 Thread Vishwanath Pai
I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the
end of the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  2 +-
 net/netfilter/xt_hashlimit.c| 61 +++--
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db9037..ea8c1c0 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -5,7 +5,7 @@
 #include 
 
 /* timings are in milliseconds. */
-#define XT_HASHLIMIT_SCALE 1
+#define XT_HASHLIMIT_SCALE_v1 1
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1786968..6b0ad93 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops;
+static const struct file_operations dl_file_ops_v1;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-u_int8_t family)
+static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 
*minfo,
+   u_int8_t family)
 {
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
@@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct 
xt_hashlimit_mtinfo1 *minfo,
hinfo->pde = proc_create_data(minfo->name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-   _file_ops, hinfo);
+   _file_ops_v1, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
vfree(hinfo);
@@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
(slowest userspace tool allows), which means
CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
-#define MAX_CPJ (0x / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0x / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
 
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
  * credit_cap is used as a counter that tells us how many times we can
@@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 static u32 user2credits(u32 user)
 {
/* If multiplying would overflow... */
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY))
+   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+   return (user / XT_HASHLIMIT_SCALE_v1) *\
+   HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+   return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
 }
 
 static u32 user2credits_byte(u32 user)
@@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, 
unsigned long now, u32 mode)
return;
}
} else {
-   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
cap = dh->rateinfo.credit_cap;
}
if (dh->rateinfo.credit > cap)
@@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct 
dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
struct xt_hashlimit_htable *hinfo = info->hinfo;
@@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
return false;
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_par

Re: [PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-07-12 Thread Vishwanath Pai
On 07/08/2016 07:54 AM, Pablo Neira Ayuso wrote:
> We have to keep the existing behaviour. Yes, it's broken or ambiguos
> but there may be people outthere relying on this.
> 
> What I think we can do to resolve this scenario that you describe
> abobe is to provide a new option:
> 
> --hashlimit-exclusive
> 
> this turns on a flag in the configuration that results in a bail out
> if any of these two happens:
> 
> 1) there is an existing hashlimit with that name already.
> 2) someone tries to add a hashlimit with a clashing name with no
>--hashlimit-exclusive.
> 
> So the --hashlimit-exclusive just makes sure the user gets a unique
> name and silly things don't happen.
> 
> If not specified, we rely on the existing (broken) behaviour.
> 

Bailing out if another rule of the same name exists is not an option
right now. Case1 and case2 will look exactly the same to the kernel
module, if you look at the dmesg call for case2 hashlimit_mt_check for
the new rule is called first and then the old rule is deleted so as far
as the kernel module is concerned it appears like someone is adding a
new rule with same name as the existing rule.

Adding --hashlimit-exclusive will solve case 1 but it will error out for
case 2 as well. In case 1 the user is indeed trying to add a rule with
the same name as an existing rule and we can bail out, but in case2 we
are merely restoring it to its original state. But the kernel module
won't be able to differentiate between case1 and case2 because we have
no way of telling whether a user is adding another rule or restoring it.
So if a user tries case2 with --hashlimit-exclusive then
iptables-restore will fail which will be totally unexpected.

One thing we can do to fix this is to make procfs file creation
optional. Right now --hashlimit-name is a mandatory option, I think we
can make this optional so that no proc file is created if the option is
not specified. This will work perfectly with --hashlimit-update and with
both case1 and case2.

>>> > > 2) Replace an iptables rule with iptables-restore
>>> > > 
>>> > > $ iptables -A chain1 -m hashlimit --hashlimit-above 200/sec \
>>> > >   --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP
>>> > > 
>>> > > $ iptables-save > /tmp/hashlimit
>>> > > 
>>> > > $ vi /tmp/hashlimit (edit 200/s to 300/s)
>>> > > 
>>> > > $ iptables-restore < /tmp/hashlimit
>>> > > 
>>> > > $ dmesg -c
>>> > > [ 1585.411093] hashlimit_mt_check for rule hashlimit1
>>> > > [ 1585.418948] hashlimit_mt_destroy on hashlimit1
> For this case, you rename your enhance-proc option to:
> 
> --hashlimit-update
> 
> as I proposed, and remove the /proc entry rename logic.
> 

I will remove the proc entry rename logic and change the option to
--hashlimit-update. This will work for both the cases provided we make
the procfs file creation optional. If a user specifies --hashlimit-name
then we fall back to the old behavior.

>>> > > In case 1 there exists two rules with the same name but we cannot have
>>> > > procfs files for both of the rules since they have to exist in the same
>>> > > directory. In case 2 there will be only one rule but there is a small
>>> > > window where two rules with same name exist. We cannot differentiate
>>> > > this from case 1. In both the cases we get the call for
>>> > > hashlimit_mt_check for the new rule before the old rule is deleted.
>>> > > 
>>> > > Without the rename feature I do not know how to correctly handle the
>>> > > scenario where two rules with different parameters but the same name 
>>> > > exist.
> With the explicit new options, we can indeed know where to go. If no
> options are specified, we just keep behaving in the same (broken) way
> that we provide now.
> 
> This is the way hashlimit has been working since day one
> 
> Does this sound good to you?
> 
> Thanks for not giving up on sorting out this problem.

No problem, this turned out to be trickier to solve than I originally
expected. Please let me know what you think about what I proposed above,
I will send a V2 after that.

Thanks,
Vishwanath



Re: [PATCH v2 1/2] libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

2016-07-08 Thread Vishwanath Pai
On 07/08/2016 12:54 PM, Vishwanath Pai wrote:
> On 07/08/2016 12:37 PM, David Laight wrote:
>> If you think some users would still want 32bit limits, then you should
>> (probably) use a _64 suffix for the new functions.
>>
>>  David
> 
> I am proposing a new revision for hashlimit that supports a higher rate
> along with a few other changes/fixes (in separate patches). Hence the
> prefix _v2 for the new functions.
> 
> - Vish
> 

I'm sorry _v1 for the old functions, the new functions (for rev2) don't
have a suffix.


Re: [PATCH v2 1/2] libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

2016-07-08 Thread Vishwanath Pai
On 07/08/2016 12:37 PM, David Laight wrote:
> If you think some users would still want 32bit limits, then you should
> (probably) use a _64 suffix for the new functions.
> 
>   David

I am proposing a new revision for hashlimit that supports a higher rate
along with a few other changes/fixes (in separate patches). Hence the
prefix _v2 for the new functions.

- Vish


[PATCH v2 2/2] libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps rates

2016-07-07 Thread Vishwanath Pai
* Added two tests for 100pps support to libxt_hashlimit.t 
* cfg_copy returns -EINVAL and all callers of cfg_copy will
  look for   the error value

--

libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destory.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 extensions/libxt_hashlimit.c   | 460 ++---
 extensions/libxt_hashlimit.t   |   2 +
 include/linux/netfilter/xt_hashlimit.h |  23 ++
 3 files changed, 394 insertions(+), 91 deletions(-)

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index ad7fb93..1814c29 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -18,12 +18,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 
 #define XT_HASHLIMIT_BURST 5
 #define XT_HASHLIMIT_BURST_MAX_v1  1
+#define XT_HASHLIMIT_BURST_MAX 100
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -150,16 +152,70 @@ static const struct xt_option_entry 
hashlimit_mt_opts_v1[] = {
 };
 #undef s
 
-static uint32_t cost_to_bytes(uint32_t cost)
+#define s struct xt_hashlimit_mtinfo2
+static const struct xt_option_entry hashlimit_mt_opts[] = {
+   {.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT}, /* old name */
+   {.name = "hashlimit-srcmask", .id = O_SRCMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-dstmask", .id = O_DSTMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_STRING},
+   {.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.size)},
+   {.name = "hashlimit-htable-max", .id = O_HTABLE_MAX,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.max)},
+   {.name = "hashlimit-htable-gcinterval", .id = O_HTABLE_GCINT,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.gc_interval)},
+   {.name = "hashlimit-htable-expire", .id = O_HTABLE_EXPIRE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.expire)},
+   {.name = "hashlimit-mode", .id = O_MODE, .type = XTTYPE_STRING},
+   {.name = "hashlimit-name", .id = O_NAME, .type = XTTYPE_STRING,
+.flags = XTOPT_MAND | XTOPT_PUT, XTOPT_POINTER(s, name), .min = 1},
+   XTOPT_TABLEEND,
+};
+#undef s
+
+static int
+cfg_copy(struct hashlimit_cfg2 *to, const void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   } else {
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static uint64_t cost_to_bytes(uint64_t cost)
 {
-   uint32_t r;
+   uint64_t r;
 
r = cost ? UINT32_MAX / cost : UINT32_MAX;
r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
 }
 
-static uint64_t bytes_to_cost(uint32_t bytes)
+static uint64_t bytes_to_cost(uint64_t bytes)
 {
uint32_t r = bytes >> XT_HASHLIMIT_BYTE_SHIFT;
return UINT32_MAX / (r+1);
@@ -180,57 +236,78 @@ static void burst_error_v1(void)
"\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
-static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
+static void burst_error(void)
+{
+   xtables_error(PARAMETER_PROBLEM, "bad value for option "
+   "\"--hashlimit-burst\", or out

[PATCH v2 1/2] libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

2016-07-07 Thread Vishwanath Pai
I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the end of
the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 extensions/libxt_hashlimit.c   | 80 +-
 include/linux/netfilter/xt_hashlimit.h |  2 +-
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index c5b8d77..ad7fb93 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -23,7 +23,7 @@
 #include 
 
 #define XT_HASHLIMIT_BURST 5
-#define XT_HASHLIMIT_BURST_MAX 1
+#define XT_HASHLIMIT_BURST_MAX_v1  1
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -98,7 +98,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
{.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING},
{.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_UINT32,
-.min = 1, .max = XT_HASHLIMIT_BURST_MAX, .flags = XTOPT_PUT,
+.min = 1, .max = XT_HASHLIMIT_BURST_MAX_v1, .flags = XTOPT_PUT,
 XTOPT_POINTER(s, cfg.burst)},
{.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
 .type = XTTYPE_UINT32, .flags = XTOPT_PUT,
@@ -121,7 +121,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
 #undef s
 
 #define s struct xt_hashlimit_mtinfo1
-static const struct xt_option_entry hashlimit_mt_opts[] = {
+static const struct xt_option_entry hashlimit_mt_opts_v1[] = {
{.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING, .flags = XTOPT_INVERT},
{.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
@@ -174,10 +174,10 @@ static uint32_t get_factor(int chr)
return 1;
 }
 
-static void burst_error(void)
+static void burst_error_v1(void)
 {
xtables_error(PARAMETER_PROBLEM, "bad value for option "
-   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX);
+   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
 static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
@@ -186,8 +186,8 @@ static uint32_t parse_burst(const char *burst, struct 
xt_hashlimit_mtinfo1 *info
char *end;
 
if (!xtables_strtoul(burst, , , 1, UINT32_MAX) ||
-   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX))
-   burst_error();
+   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX_v1))
+   burst_error_v1();
 
v *= get_factor(*end);
if (v > UINT32_MAX)
@@ -253,7 +253,7 @@ int parse_rate(const char *rate, uint32_t *val, struct 
hashlimit_mt_udata *ud)
if (!r)
return 0;
 
-   *val = XT_HASHLIMIT_SCALE * ud->mult / r;
+   *val = XT_HASHLIMIT_SCALE_v1 * ud->mult / r;
if (*val == 0)
/*
 * The rate maps to infinity. (1/day is the minimum they can
@@ -272,7 +272,7 @@ static void hashlimit_init(struct xt_entry_match *m)
 
 }
 
-static void hashlimit_mt4_init(struct xt_entry_match *match)
+static void hashlimit_mt4_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -283,7 +283,7 @@ static void hashlimit_mt4_init(struct xt_entry_match *match)
info->cfg.dstmask = 32;
 }
 
-static void hashlimit_mt6_init(struct xt_entry_match *match)
+static void hashlimit_mt6_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -342,7 +342,7 @@ static void hashlimit_parse(struct xt_option_call *cb)
}
 }
 
-static void hashlimit_mt_parse(struct xt_option_call *cb)
+static void hashlimit_mt_parse_v1(struct xt_option_call *cb)
 {
struct xt_hashlimit_mtinfo1 *info = cb->data;
 
@@ -395,7 +395,7 @@ static void hashlimit_check(struct xt_fcheck_call *cb)
info->cfg.expire = udata->mult * 1000; /* from s to msec */
 }
 
-static void hashlimit_mt_check(struct xt_fcheck_call *cb)
+static void hashlimit_mt_check_v1(struct xt_fcheck_call *cb)
 {
const struct hashlimit_mt_udata *udata = cb->udata;
struct xt_hashlimit_mtinfo1 *info = cb->data;
@@ -421,18 +421,18 @@ static void hashlimit_mt_check(struct xt_fcheck_call *cb)
info->cfg.expire = 
XT_HASHLIMIT_BYTE_EXPIRE_BURST * 1000;
}
info->cfg.burst = burst;
-   } else if (info->cfg.burst > XT_HASHLIMIT_BURST_MAX)
-   burst_error();
+   } else if (info->cfg.burst >

[PATCH v2 2/2] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-07-07 Thread Vishwanath Pai
Removed the call to BUG() in cfg_copy, we return -EINVAL
instead and all calls to cfg_copy check for this

--

netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  23 ++
 net/netfilter/xt_hashlimit.c| 330 ++--
 2 files changed, 285 insertions(+), 68 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index ea8c1c0..be5d2e1 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE_v1 1
+#define XT_HASHLIMIT_SCALE 100llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+   __u32 mode;   /* bitmask of XT_HASHLIMIT_HASH_* */
+   __u64 avg;/* Average secs between packets * scale */
+   __u64 burst;  /* Period multiplier for upper limit. */
+
+   /* user specified */
+   __u32 size; /* how many buckets */
+   __u32 max;  /* max number of entries */
+   __u32 gc_interval;  /* gc interval */
+   __u32 expire;   /* when do entries expire? */
+
+   __u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
char name[IFNAMSIZ];
struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+   char name[NAME_MAX];
+   struct hashlimit_cfg2 cfg;
+
+   /* Used internally by the kernel */
+   struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 6b0ad93..69d317c 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
unsigned long expires;  /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
-   u_int32_t credit;
-   u_int32_t credit_cap, cost;
+   u_int64_t credit;
+   u_int64_t credit_cap, cost;
} rateinfo;
struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
u_int8_t family;
bool rnd_initialized;
 
-   struct hashlimit_cfg1 cfg;  /* config */
+   struct hashlimit_cfg2 cfg;  /* config */
 
/* used internally */
spinlock_t lock;/* lock for list_head */
@@ -114,6 +115,30 @@ struct xt_hashlimit_htable {
struct hlist_head hash[0];  /* hashtable itself */
 };
 
+static int
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   } else {
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 static DEFINE_MUTEX(hashlimit_mutex);  /* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
@@ -215,16 +240,18 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_

[PATCH v2 1/2] netfilter: Prepare xt_hashlimit.c for revision 2

2016-07-07 Thread Vishwanath Pai
I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the end of
the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Signed-off-by: Joshua Hunt <joh...@akamai.com>
---
 include/uapi/linux/netfilter/xt_hashlimit.h |  2 +-
 net/netfilter/xt_hashlimit.c| 61 +++--
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db9037..ea8c1c0 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -5,7 +5,7 @@
 #include 
 
 /* timings are in milliseconds. */
-#define XT_HASHLIMIT_SCALE 1
+#define XT_HASHLIMIT_SCALE_v1 1
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1786968..6b0ad93 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops;
+static const struct file_operations dl_file_ops_v1;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-u_int8_t family)
+static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 
*minfo,
+   u_int8_t family)
 {
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
@@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct 
xt_hashlimit_mtinfo1 *minfo,
hinfo->pde = proc_create_data(minfo->name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-   _file_ops, hinfo);
+   _file_ops_v1, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
vfree(hinfo);
@@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
(slowest userspace tool allows), which means
CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
-#define MAX_CPJ (0x / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0x / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
 
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
  * credit_cap is used as a counter that tells us how many times we can
@@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 static u32 user2credits(u32 user)
 {
/* If multiplying would overflow... */
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY))
+   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+   return (user / XT_HASHLIMIT_SCALE_v1) *\
+   HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+   return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
 }
 
 static u32 user2credits_byte(u32 user)
@@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, 
unsigned long now, u32 mode)
return;
}
} else {
-   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
cap = dh->rateinfo.credit_cap;
}
if (dh->rateinfo.credit > cap)
@@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct 
dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
struct xt_hashlimit_htable *hinfo = info->hinfo;
@@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
return false;
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_par

Re: [PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-07-06 Thread Vishwanath Pai
On 07/05/2016 04:13 PM, Vishwanath Pai wrote:
> On 06/25/2016 05:39 AM, Pablo Neira Ayuso wrote:
>> I see, but I'm not convinced about this /proc rename feature.
>>
>> I think the main point of this, as well as other entries in bugzilla
>> related to this, is ability to update an existing hashlimit state.
>>
>> So, I'm not proposing to rename --enhanced-procfs to something else,
>> I think that a different approach consisting on adding a new option
>> like --hashlimit-update that will update the internal state of an
>> existing hashlimit object is just fine for your usecase, right?
>>
>>>> Other than that, we are doing exactly what you said, but creating a new
>>>> entry in the hashtable instead of updating it. The previous entry will
>>>> automatically be removed when the old rule is flushed/deleted.
>> What I'm missing is why we need this /proc rename at all.
> 
> The reason we need the procfs rename feature is because it is broken at
> the moment. Let us assume someone adds two rules with the same name
> (intentionally or otherwise). We cannot prevent them from doing this or
> error out when someone does this because all of this is done in
> hashlimit_mt_check which is called for every iptables rule change, even
> an entirely different rule. I'll demonstrate two scenarios here. I have
> put debug printk statements which prints everytime hashlimit_mt_check is
> called.
> 
> 1) Add two rules with the same name but in different chains
> 
> $ iptables -A chain1 -m hashlimit --hashlimit-above 200/sec \
>   --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP
> 
> $ dmesg -c
> [  103.965578] hashlimit_mt_check for rule hashlimit1
> 
> 
> $ iptables -A chain2 -m hashlimit --hashlimit-above 300/sec \
>--hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP
> 
> $ dmesg -c
> [  114.613758] hashlimit_mt_check for rule hashlimit1
> [  114.621360] hashlimit_mt_check for rule hashlimit1
> [  114.627411] hashlimit_mt_destroy on hashlimit1
> 
> 2) Replace an iptables rule with iptables-restore
> 
> $ iptables -A chain1 -m hashlimit --hashlimit-above 200/sec \
>   --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP
> 
> $ iptables-save > /tmp/hashlimit
> 
> $ vi /tmp/hashlimit (edit 200/s to 300/s)
> 
> $ iptables-restore < /tmp/hashlimit
> 
> $ dmesg -c
> [ 1585.411093] hashlimit_mt_check for rule hashlimit1
> [ 1585.418948] hashlimit_mt_destroy on hashlimit1
> 
> In case 1 there exists two rules with the same name but we cannot have
> procfs files for both of the rules since they have to exist in the same
> directory. In case 2 there will be only one rule but there is a small
> window where two rules with same name exist. We cannot differentiate
> this from case 1. In both the cases we get the call for
> hashlimit_mt_check for the new rule before the old rule is deleted.
> 
> Without the rename feature I do not know how to correctly handle the
> scenario where two rules with different parameters but the same name exist.
> 
> I believe the rest of the patch handles the --hashlimit-update feature
> you mentioned, but instead of updating an existing object it creates a
> new one and the old object is deleted by the call to destroy. The
> hashtable match function is modified to include all parameters of the
> object and not just the name so that we can reuse objects that have the
> exact same features.
> 

There is also another option of removing the procfs files altogether in
revision 2. The files were only used for debugging and any users relying
on the file being present will continue to see it as long as they are in
revision 1.

If we do need these files, we can create another option in rev2 which
explicitly enables procfs but leave them disabled by default. And we can
retain the existing behavior assuming whoever is using it knows its
caveats. Please suggest.

Thanks,
Vishwanath


Re: [PATCH 2/3] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-07-05 Thread Vishwanath Pai
On 06/23/2016 07:16 AM, Pablo Neira Ayuso wrote:
> On Wed, Jun 01, 2016 at 08:11:38PM -0400, Vishwanath Pai wrote:
>> +static void
>> +cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
>> +{
>> +if (revision == 1) {
>> +struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
>> +
>> +to->mode = cfg->mode;
>> +to->avg = cfg->avg;
>> +to->burst = cfg->burst;
>> +to->size = cfg->size;
>> +to->max = cfg->max;
>> +to->gc_interval = cfg->gc_interval;
>> +to->expire = cfg->expire;
>> +to->srcmask = cfg->srcmask;
>> +to->dstmask = cfg->dstmask;
>> +} else if (revision == 2) {
>> +memcpy(to, from, sizeof(struct hashlimit_cfg2));
>> +} else {
>> +BUG();
> 
> BUG here is probably too much, this halts the system. I can see we
> only use this somewhere else in this code. Instead, I'd suggest you
> propagate an error back to userspace if this ever happen.
> 
> I would like to see if this spots any problem with our test
> infrastructure under iptables/.
> 
> Thanks.
> 

copy_cfg is only used internally by the kernel module and the value for
revision is passed to the function by the module itself and not from
userspace. I will remove BUG() and propagate the error back to the
caller, will send a v2.


Re: [PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-07-05 Thread Vishwanath Pai
On 06/25/2016 05:39 AM, Pablo Neira Ayuso wrote:
> I see, but I'm not convinced about this /proc rename feature.
> 
> I think the main point of this, as well as other entries in bugzilla
> related to this, is ability to update an existing hashlimit state.
> 
> So, I'm not proposing to rename --enhanced-procfs to something else,
> I think that a different approach consisting on adding a new option
> like --hashlimit-update that will update the internal state of an
> existing hashlimit object is just fine for your usecase, right?
> 
>> > Other than that, we are doing exactly what you said, but creating a new
>> > entry in the hashtable instead of updating it. The previous entry will
>> > automatically be removed when the old rule is flushed/deleted.
> What I'm missing is why we need this /proc rename at all.

The reason we need the procfs rename feature is because it is broken at
the moment. Let us assume someone adds two rules with the same name
(intentionally or otherwise). We cannot prevent them from doing this or
error out when someone does this because all of this is done in
hashlimit_mt_check which is called for every iptables rule change, even
an entirely different rule. I'll demonstrate two scenarios here. I have
put debug printk statements which prints everytime hashlimit_mt_check is
called.

1) Add two rules with the same name but in different chains

$ iptables -A chain1 -m hashlimit --hashlimit-above 200/sec \
  --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP

$ dmesg -c
[  103.965578] hashlimit_mt_check for rule hashlimit1


$ iptables -A chain2 -m hashlimit --hashlimit-above 300/sec \
   --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP

$ dmesg -c
[  114.613758] hashlimit_mt_check for rule hashlimit1
[  114.621360] hashlimit_mt_check for rule hashlimit1
[  114.627411] hashlimit_mt_destroy on hashlimit1

2) Replace an iptables rule with iptables-restore

$ iptables -A chain1 -m hashlimit --hashlimit-above 200/sec \
  --hashlimit-mode srcip --hashlimit-name hashlimit1 -j DROP

$ iptables-save > /tmp/hashlimit

$ vi /tmp/hashlimit (edit 200/s to 300/s)

$ iptables-restore < /tmp/hashlimit

$ dmesg -c
[ 1585.411093] hashlimit_mt_check for rule hashlimit1
[ 1585.418948] hashlimit_mt_destroy on hashlimit1

In case 1 there exists two rules with the same name but we cannot have
procfs files for both of the rules since they have to exist in the same
directory. In case 2 there will be only one rule but there is a small
window where two rules with same name exist. We cannot differentiate
this from case 1. In both the cases we get the call for
hashlimit_mt_check for the new rule before the old rule is deleted.

Without the rename feature I do not know how to correctly handle the
scenario where two rules with different parameters but the same name exist.

I believe the rest of the patch handles the --hashlimit-update feature
you mentioned, but instead of updating an existing object it creates a
new one and the old object is deleted by the call to destroy. The
hashtable match function is modified to include all parameters of the
object and not just the name so that we can reuse objects that have the
exact same features.


[PATCH v3] netfilter/nflog: nflog-range does not truncate packets (userspace)

2016-06-24 Thread Vishwanath Pai
Added tests to libxt_NFLOG.t for the new option --nflog-size

--

netfilter/nflog: nflog-range does not truncate packets

The option --nflog-range has never worked, but we cannot just fix this
because users might be using this feature option and their behavior would
change. Instead add a new option --nflog-size. This option works the same
way nflog-range should have, and both of them are mutually exclusive. When
someone uses --nflog-range we print a warning message informing them that
this feature has no effect.

To indicate the kernel that the user has set --nflog-size we have to pass a
new flag XT_NFLOG_F_COPY_LEN.

Also updated the man page to reflect the new option and added tests to
extensions/libxt_NFLOG.t

Reported-by: Joe Dollard <jdoll...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/extensions/libxt_NFLOG.c b/extensions/libxt_NFLOG.c
index f611631..8c67066 100644
--- a/extensions/libxt_NFLOG.c
+++ b/extensions/libxt_NFLOG.c
@@ -12,7 +12,10 @@ enum {
O_GROUP = 0,
O_PREFIX,
O_RANGE,
+   O_SIZE,
O_THRESHOLD,
+   F_RANGE = 1 << O_RANGE,
+   F_SIZE = 1 << O_SIZE,
 };
 
 #define s struct xt_nflog_info
@@ -22,7 +25,9 @@ static const struct xt_option_entry NFLOG_opts[] = {
{.name = "nflog-prefix", .id = O_PREFIX, .type = XTTYPE_STRING,
 .min = 1, .flags = XTOPT_PUT, XTOPT_POINTER(s, prefix)},
{.name = "nflog-range", .id = O_RANGE, .type = XTTYPE_UINT32,
-.flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+.excl = F_SIZE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+   {.name = "nflog-size", .id = O_SIZE, .type = XTTYPE_UINT32,
+.excl = F_RANGE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
{.name = "nflog-threshold", .id = O_THRESHOLD, .type = XTTYPE_UINT16,
 .flags = XTOPT_PUT, XTOPT_POINTER(s, threshold)},
XTOPT_TABLEEND,
@@ -33,7 +38,8 @@ static void NFLOG_help(void)
 {
printf("NFLOG target options:\n"
   " --nflog-group NUM  NETLINK group used for 
logging\n"
-  " --nflog-range NUM  Number of byte to copy\n"
+  " --nflog-range NUM  This option has no effect, use 
--nflog-size\n"
+  " --nflog-size NUM   Number of bytes to copy\n"
   " --nflog-threshold NUM  Message threshold of in-kernel 
queue\n"
   " --nflog-prefix STRING  Prefix string for log 
messages\n");
 }
@@ -57,6 +63,18 @@ static void NFLOG_parse(struct xt_option_call *cb)
}
 }
 
+static void NFLOG_check(struct xt_fcheck_call *cb)
+{
+   struct xt_nflog_info *info = cb->data;
+
+   if (cb->xflags & F_RANGE)
+   fprintf(stderr, "warn: --nflog-range has never worked and is no"
+   " longer supported, please use --nflog-size insted\n");
+
+   if (cb->xflags & F_SIZE)
+   info->flags |= XT_NFLOG_F_COPY_LEN;
+}
+
 static void nflog_print(const struct xt_nflog_info *info, char *prefix)
 {
if (info->prefix[0] != '\0') {
@@ -65,7 +83,9 @@ static void nflog_print(const struct xt_nflog_info *info, 
char *prefix)
}
if (info->group)
printf(" %snflog-group %u", prefix, info->group);
-   if (info->len)
+   if (info->len && info->flags & XT_NFLOG_F_COPY_LEN)
+   printf(" %snflog-size %u", prefix, info->len);
+   else if (info->len)
printf(" %snflog-range %u", prefix, info->len);
if (info->threshold != XT_NFLOG_DEFAULT_THRESHOLD)
printf(" %snflog-threshold %u", prefix, info->threshold);
@@ -117,6 +137,7 @@ static struct xtables_target nflog_target = {
.userspacesize  = XT_ALIGN(sizeof(struct xt_nflog_info)),
.help   = NFLOG_help,
.init   = NFLOG_init,
+   .x6_fcheck  = NFLOG_check,
.x6_parse   = NFLOG_parse,
.print  = NFLOG_print,
.save   = NFLOG_save,
diff --git a/extensions/libxt_NFLOG.man b/extensions/libxt_NFLOG.man
index 1b6dbf1..318e630 100644
--- a/extensions/libxt_NFLOG.man
+++ b/extensions/libxt_NFLOG.man
@@ -17,6 +17,9 @@ A prefix string to include in the log message, up to 64 
characters
 long, useful for distinguishing messages in the logs.
 .TP
 \fB\-\-nflog\-range\fP \fIsize\fP
+This option has never worked, use --nflog-size instead
+.TP
+\fB\-\-nflog\-size\fP \fIsize\fP
 The number of bytes to be copied to userspace (only applicable for
 nfnetlink_log). nfnetlink_log instances may specify their own
 range, this option overrides it.
diff --git a/extensions/libxt_NFLOG.t b/extensions/libxt_NFLOG.t
index f976

Re: [PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-06-24 Thread Vishwanath Pai
On 06/23/2016 06:25 AM, Pablo Neira Ayuso wrote:
> On Wed, Jun 01, 2016 at 08:17:59PM -0400, Vishwanath Pai wrote:
>> libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit
>>
>> Add the following iptables rule.
>>
>> $ iptables -A INPUT -m hashlimit --hashlimit-above 200/sec \
>>   --hashlimit-burst 5 --hashlimit-mode srcip --hashlimit-name hashlimit1 \
>>   --hashlimit-htable-expire 3 -j DROP
>>
>> $ iptables-save > save.txt
>>
>> Edit save.txt and change the value of --hashlimit-above to 300:
>>
>> -A INPUT -m hashlimit --hashlimit-above 300/sec --hashlimit-burst 5 \
>> --hashlimit-mode srcip --hashlimit-name hashlimit2 \
>> --hashlimit-htable-expire 3 -j DROP
>>
>> Now restore save.txt
>>
>> $ iptables-restore < save.txt
> 
> In this case, we don't end up with two rules, we actually get one
> single hashlimit rule, given the sequence you provide.
> 
> $ iptables-save > save.txt
> ... edit save.txt
> $ iptables-restore < save.txt
> 

Yes, we end up with just one rule, but the kernel data structure is not
updated. Userspace thinks the value is 300/s but in the kernel it is
still 200/s.

>> Now userspace thinks that the value of --hashlimit-above is 300 but it is
>> actually 200 in the kernel. This happens because when we add multiple
>> hash-limit rules with the same name they will share the same hashtable
>> internally. The kernel module tries to re-use the old hashtable without
>> updating the values.
>>
>> There are multiple problems here:
>> 1) We can add two iptables rules with the same name, but kernel does not
>>handle this well, one procfs file cannot work with two rules
>> 2) If the second rule has no effect because the hashtable has values from
>>rule 1
>> 3) hashtable-restore does not work (as described above)
>>
>> To fix this I have made the following design change:
>> 1) If a second rule is added with the same name as an existing rule,
>>append a number when we create the procfs, for example hashlimit_1,
>>hashlimit_2 etc
>> 2) Two rules will not share the same hashtable unless they are similar in
>>every possible way
>> 3) This behavior has to be forced with a new userspace flag:
>>--hashlimit-ehanced-procfs, if this flag is not passed we default to
>>the old behavior. This is to make sure we do not break existing scripts
>>that rely on the existing behavior.
> 
> We discussed this in netdev0.1, and I think we agreed on adding a new
> option, something like --hashlimit-update that would force an update
> to the existing hashlimit internal state (that is identified by the
> hashlimit name).
> 
> I think the problem here is that you may want to update the internal
> state of an existing hashlimit object, and currently this is not
> actually happening.
> 
> With the explicit --hashlimit-update flag, from the kernel we really
> know that the user wants an update.
> 
> Let me know, thanks.
> 

Yes, I believe you had a discussion about this with Josh Hunt. This
patch does add a new option, but it is called -enhanced-procfs instead.
I am open to renaming this to something else. I chose this name because
this patch will affect the names of the procfs files when multiple rules
with the same name exist. This generally does not happen, but is a side
effect of the way we create these files. In the case of restore example
above - we get the call to "hashlimit_mt_check" for the new rule before
the old rule is deleted, so there is a short window where we have two
rules in the kernel with the same name.

Other than that, we are doing exactly what you said, but creating a new
entry in the hashtable instead of updating it. The previous entry will
automatically be removed when the old rule is flushed/deleted.

Users will see this new behavior only if the new option is passed,
otherwise we default to the old behavior. We are also doing this in rev2
so old userspace tools will not be affected.

Thanks,
Vishwanath



[PATCH v2 1/2] netfilter/nflog: nflog-range does not truncate packets

2016-06-21 Thread Vishwanath Pai
netfilter/nflog: nflog-range does not truncate packets

li->u.ulog.copy_len is currently ignored by the kernel, we should truncate
the packet to either li->u.ulog.copy_len (if set) or copy_range before
sending it to userspace. 0 is a valid input for copy_len, so add a new
flag to indicate whether this was option was specified by the user or not.

Add two flags to indicate whether nflog-size/copy_len was set or not.
XT_NFLOG_F_COPY_LEN is for XT_NFLOG and NFLOG_F_COPY_LEN for nfnetlink_log

On the userspace side, this was initially represented by the option
nflog-range, this will be replaced by --nflog-size now. --nflog-range would
still exist but does not do anything.

Reported-by: Joe Dollard <jdoll...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h
index 57639fc..83d855b 100644
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -12,6 +12,9 @@
 #define NF_LOG_UID 0x08/* Log UID owning local socket */
 #define NF_LOG_MASK0x0f
 
+/* This flag indicates that copy_len field in nf_loginfo is set */
+#define NF_LOG_F_COPY_LEN  0x1
+
 enum nf_log_type {
NF_LOG_TYPE_LOG = 0,
NF_LOG_TYPE_ULOG,
@@ -22,9 +25,13 @@ struct nf_loginfo {
u_int8_t type;
union {
struct {
+   /* copy_len will be used iff you set
+* NF_LOG_F_COPY_LEN in flags
+*/
u_int32_t copy_len;
u_int16_t group;
u_int16_t qthreshold;
+   u_int16_t flags;
} ulog;
struct {
u_int8_t level;
diff --git a/include/uapi/linux/netfilter/xt_NFLOG.h 
b/include/uapi/linux/netfilter/xt_NFLOG.h
index 87b5831..f330707 100644
--- a/include/uapi/linux/netfilter/xt_NFLOG.h
+++ b/include/uapi/linux/netfilter/xt_NFLOG.h
@@ -6,9 +6,13 @@
 #define XT_NFLOG_DEFAULT_GROUP 0x1
 #define XT_NFLOG_DEFAULT_THRESHOLD 0
 
-#define XT_NFLOG_MASK  0x0
+#define XT_NFLOG_MASK  0x1
+
+/* This flag indicates that 'len' field in xt_nflog_info is set*/
+#define XT_NFLOG_F_COPY_LEN0x1
 
 struct xt_nflog_info {
+   /* 'len' will be used iff you set XT_NFLOG_F_COPY_LEN in flags */
__u32   len;
__u16   group;
__u16   threshold;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 11f81c8..cbcfdfb 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -700,10 +700,13 @@ nfulnl_log_packet(struct net *net,
break;
 
case NFULNL_COPY_PACKET:
-   if (inst->copy_range > skb->len)
+   data_len = inst->copy_range;
+   if ((li->u.ulog.flags & NF_LOG_F_COPY_LEN) &&
+   (li->u.ulog.copy_len < data_len))
+   data_len = li->u.ulog.copy_len;
+
+   if (data_len > skb->len)
data_len = skb->len;
-   else
-   data_len = inst->copy_range;
 
size += nla_total_size(data_len);
break;
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a1fa2c8..018eed7 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -33,6 +33,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param 
*par)
li.u.ulog.group  = info->group;
li.u.ulog.qthreshold = info->threshold;
 
+   if (info->flags & XT_NFLOG_F_COPY_LEN)
+   li.u.ulog.flags |= NF_LOG_F_COPY_LEN;
+
nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
  par->out, , info->prefix);
return XT_CONTINUE;


[PATCH v2 2/2] netfilter/nflog: nflog-range does not truncate packets (userspace)

2016-06-21 Thread Vishwanath Pai
netfilter/nflog: nflog-range does not truncate packets

The option --nflog-range has never worked, but we cannot just fix this
because users might be using this feature option and their behavior would
change. Instead add a new option --nflog-size. This option works the same
way nflog-range should have, and both of them are mutually exclusive. When
someone uses --nflog-range we print a warning message informing them that
this feature has no effect.

To indicate the kernel that the user has set --nflog-size we have to pass a
new flag XT_NFLOG_F_COPY_LEN.

Also updated the man page to reflect this.

Reported-by: Joe Dollard <jdoll...@akamai.com>
Reviewed-by: Josh Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/extensions/libxt_NFLOG.c b/extensions/libxt_NFLOG.c
index f611631..8c564a2 100644
--- a/extensions/libxt_NFLOG.c
+++ b/extensions/libxt_NFLOG.c
@@ -12,7 +12,10 @@ enum {
O_GROUP = 0,
O_PREFIX,
O_RANGE,
+   O_SIZE,
O_THRESHOLD,
+   F_RANGE = 1 << O_RANGE,
+   F_SIZE = 1 << O_SIZE,
 };
 
 #define s struct xt_nflog_info
@@ -22,7 +25,9 @@ static const struct xt_option_entry NFLOG_opts[] = {
{.name = "nflog-prefix", .id = O_PREFIX, .type = XTTYPE_STRING,
 .min = 1, .flags = XTOPT_PUT, XTOPT_POINTER(s, prefix)},
{.name = "nflog-range", .id = O_RANGE, .type = XTTYPE_UINT32,
-.flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+.excl = F_SIZE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+   {.name = "nflog-size", .id = O_SIZE, .type = XTTYPE_UINT32,
+.excl = F_RANGE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
{.name = "nflog-threshold", .id = O_THRESHOLD, .type = XTTYPE_UINT16,
 .flags = XTOPT_PUT, XTOPT_POINTER(s, threshold)},
XTOPT_TABLEEND,
@@ -33,7 +38,8 @@ static void NFLOG_help(void)
 {
printf("NFLOG target options:\n"
   " --nflog-group NUM  NETLINK group used for 
logging\n"
-  " --nflog-range NUM  Number of byte to copy\n"
+  " --nflog-range NUM  This option has no effect, use 
--nflog-size\n"
+  " --nflog-size NUM   Number of bytes to copy\n"
   " --nflog-threshold NUM  Message threshold of in-kernel 
queue\n"
   " --nflog-prefix STRING  Prefix string for log 
messages\n");
 }
@@ -57,6 +63,18 @@ static void NFLOG_parse(struct xt_option_call *cb)
}
 }
 
+static void NFLOG_check(struct xt_fcheck_call *cb)
+{
+   struct xt_nflog_info *info = cb->data;
+
+   if (cb->xflags & F_RANGE)
+   fprintf(stderr, "warn: --nflog-range has never worked and is no"
+   " longer supported, please use --nflog-size insted\n");
+
+   if (cb->xflags & F_SIZE)
+   info->flags |= XT_NFLOG_F_COPY_LEN;
+}
+
 static void nflog_print(const struct xt_nflog_info *info, char *prefix)
 {
if (info->prefix[0] != '\0') {
@@ -65,7 +83,9 @@ static void nflog_print(const struct xt_nflog_info *info, 
char *prefix)
}
if (info->group)
printf(" %snflog-group %u", prefix, info->group);
-   if (info->len)
+   if (info->len && info->flags & XT_NFLOG_F_COPY_LEN)
+   printf(" %snflog-size %u", prefix, info->len);
+   else if (info->len)
printf(" %snflog-range %u", prefix, info->len);
if (info->threshold != XT_NFLOG_DEFAULT_THRESHOLD)
printf(" %snflog-threshold %u", prefix, info->threshold);
@@ -117,6 +137,7 @@ static struct xtables_target nflog_target = {
.userspacesize  = XT_ALIGN(sizeof(struct xt_nflog_info)),
.help   = NFLOG_help,
.init   = NFLOG_init,
+   .x6_fcheck  = NFLOG_check,
.x6_parse   = NFLOG_parse,
.print  = NFLOG_print,
.save   = NFLOG_save,
diff --git a/extensions/libxt_NFLOG.man b/extensions/libxt_NFLOG.man
index 1b6dbf1..318e630 100644
--- a/extensions/libxt_NFLOG.man
+++ b/extensions/libxt_NFLOG.man
@@ -17,6 +17,9 @@ A prefix string to include in the log message, up to 64 
characters
 long, useful for distinguishing messages in the logs.
 .TP
 \fB\-\-nflog\-range\fP \fIsize\fP
+This option has never worked, use --nflog-size instead
+.TP
+\fB\-\-nflog\-size\fP \fIsize\fP
 The number of bytes to be copied to userspace (only applicable for
 nfnetlink_log). nfnetlink_log instances may specify their own
 range, this option overrides it.
diff --git a/include/linux/netfilter/xt_NFLOG.h 
b/include/linux/netfilter/xt_NFLOG.h
index 87b5831..f330707 100644
--- a/include/linux/netfilter/xt_NFLOG.h
+++ b/include/linux/netfilter/xt_NF

Re: [PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-17 Thread Vishwanath Pai
On 06/17/2016 07:22 AM, Pablo Neira Ayuso wrote:
> On Wed, Jun 15, 2016 at 03:13:15PM +, Lubashev, Igor wrote:
>> Vish, Pablo,
>>
>> I wonder about the value of sending more data than a client is
>> willing to consume (setting aside the important fact that the client
>> code crashes due to the extra data).
>>
>> It seems that we should either drop the nflog-range parameter from
>> nflog altogether (and just use the len from the client) or allow
>> nflog-range to further *restrict* the number of bytes sent to the
>> client.
>>
>> The "further restrict" logic would make it easier to build iptables
>> rules that vary nflog-range based on some match conditions, so a
>> single client would get different packet length depending on what
>> rules matched.
> 
> Now I understand your usecase. Restricting the size based on match
> conditions sound reasonable to me.
> 
> Why don't you add a new userspace option, eg. --nflog-size, that
> specifies this "further restrict" logic?
> 
> What I'm proposing is:
> 
> 1) If --nflog-range is used, print a message telling: "--nflog-range
>has never worked, ignoring this option."
> 
> 2) If --nflog-size is used, set the size in the structure that is
>passed to the kernel, and apply this "further restrict" logic.
> 
> 3) Add the flag to the kernel that I suggested. This flag is only set
>via --nflog-size.
> 
> Just to clarify: What I'm trying to avoid is breaking the thing for
> users that are using this --nflog-range (even if it doesn't work) and
> then change the behaviour for them.
> 
> With the new option, we really validate that the user is exactly
> asking for this "further restrict" logic that you need.
> 
> let me know, thanks.
> 

Sounds good to me, yes it will definitely change the behavior for users
who are using that parameter (whether intentional or not). I'm OK with
adding a new parameter instead of using --nflog-range. I will send a
patch with these changes.


Re: [PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-15 Thread Vishwanath Pai
On 06/15/2016 08:39 AM, Pablo Neira Ayuso wrote:
> But nlmsg_len should match len in this.
> 
> If we're just sending a part of the packet to userspace, then we
> should adjust nlmsg_len to indicate exactly the netlink message length
> that we're sending to userspace.
> 
> Is your patch triggering this nlmsg_len != len?

The value of len here is how many bytes were returned by recv. We do
send the entire nlmsg_len to userspace, but recv cannot copy the full
packet because the buffer is not big enough to hold this. They only
allocate the buffer assuming that the packet won't be bigger than their
snap len, but we send more data than their snap len and they don't
handle this condition well.


Re: [PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-12 Thread Vishwanath Pai
On 06/09/2016 01:57 PM, Vishwanath Pai wrote:
> On 06/08/2016 08:16 AM, Pablo Neira Ayuso wrote:
>> Looking again at your code:
>>
>> case NFULNL_COPY_PACKET:
>> -   if (inst->copy_range > skb->len)
>> +   data_len = inst->copy_range;
>> +   if (li->u.ulog.copy_len < data_len)
>> +   data_len = li->u.ulog.copy_len;
>>
>> data_len is set to instance's copy_range.
>>
>> But then, if the NFLOG rule indicates smaller copy_len, you use this
>> value. So to my understanding, NFLOG rule prevails over instance's
>> copy_range in what matters, which is to shrink the copy range.
>>
>>>> --nflog-range will not override the per-instance default,
>>>> the only time it would get preference is when its value is lesser than
>>>> the per-instance value. If copy_range is lesser than --nflog-range then
>>>> we retain copy_range.
>>>>
>>>> So basically what we are doing is min(copy_range, nflog-range).
>>>> Just wanted to clarify this, if this is not how it's meant to be
>>>> please let me know.
>>>>
>>>> Also, there is a bug in my patch, li->u.ulog.copy_len can be set to "0"
>>>> from userspace (if --nflog-range is not specified), so we have to check
>>>> for this condition before using the value. I will send a V2 of the patch
>>>> based on your reply.
>> Currently, li->u.ulog.copy_len is set to "0" by default when not
>> specified.
>>
>> But copy_len = 0 is a valid possibility, so this looks a bit more
>> tricky to me to fix since we may need to get flags here to know when
>> this is set.
>>
>> Probably something like:
>>
>> if (li->flags & NF_LOG_F_COPY_RANGE)
>> data_len = li->u.ulog.copy_len;
>> /* Per-instance copy range prevails over global per-rule option. */
>> if (data_len < inst->copy_range)
>> data_len = inst->copy_range;
>> if (data_len > skb->len)
>> data_len = skb->len;
>>
>> Although this would require a bit more code to introduce these flags.
>>
>> You will also need a new flag for xt_NFLOG:
>>
>> #define XT_NFLOG_COPY_LEN   0x2
>>
>> it seems other XT_NFLOG_* flags were already in place.
>>
>> Interesting that nobody noticed this for so long BTW.
> 
> I tried this out, I added two flags: one for XT_NFLOG to notify the
> kernel when --nflog-range is set by the user, and another flag for
> nfnetlink_log to pass this on to nfulnl_log_packet. This design works
> fine but while testing this I found a problem.
> 
> Lets say --nflog-range is set to 200, and the instance copy_range is set
> to 100. According to the code above the final value of data_len will be
> 200 so we try to pack 200 bytes into the skb. But somewhere between
> nfnetlink_log to dumpcap the packet is getting truncated and dumpcap
> doesn't like this:
> 
> $> dumpcap -y NFLOG -i nflog:5 -s 100
> Capturing on 'nflog:5'
> File: /tmp/wireshark_pcapng_nflog-5_20160609133531_pi6MrS
> dumpcap: Error while capturing packets: Message truncated: (got: 228)
> (nlmsg_len: 320)
> Please report this to the Wireshark developers.
> https://bugs.wireshark.org/
> (This is not a crash; please do not report it as such.)
> Packets captured: 0
> Packets received/dropped on interface 'nflog:5': 0/0
> (pcap:0/dumpcap:0/flushed:0/ps_ifdrop:0) (0.0%)
> 
> I'm trying to figure out where the packet is getting truncated.
> 

I found where the problem is. This is the userspace code for libpcap:

do {
len = recv(handle->fd, handle->buffer, handle->bufsize, 0);
if (handle->break_loop) {
handle->break_loop = 0;
return -2;
}
} while ((len == -1) && (errno == EINTR));

   ...

buf = handle->buffer;
while (len >= NLMSG_SPACE(0)) {
const struct nlmsghdr *nlh = (const struct nlmsghdr *) buf;
u_int32_t msg_len;
nftype_t type = OTHER;

if (nlh->nlmsg_len < sizeof(struct nlmsghdr) || len <
nlh->nlmsg_len) {
snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
"Message truncated: (got: %d) (nlmsg_len: %u)", len, nlh->nlmsg_len);
return -1;
}

handle->bufsize is only big enough to accommodate the snaplen specified
by the user in dumpcap. So if we send more data than that then we will
break userspace. One way around this is to send min(li->u.ulog.copy_len,
inst->copy_range). If this is OK then I can send a patch for this,
please suggest.




Re: [PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-09 Thread Vishwanath Pai
On 06/08/2016 08:16 AM, Pablo Neira Ayuso wrote:
> Looking again at your code:
> 
> case NFULNL_COPY_PACKET:
> -   if (inst->copy_range > skb->len)
> +   data_len = inst->copy_range;
> +   if (li->u.ulog.copy_len < data_len)
> +   data_len = li->u.ulog.copy_len;
> 
> data_len is set to instance's copy_range.
> 
> But then, if the NFLOG rule indicates smaller copy_len, you use this
> value. So to my understanding, NFLOG rule prevails over instance's
> copy_range in what matters, which is to shrink the copy range.
> 
>> > --nflog-range will not override the per-instance default,
>> > the only time it would get preference is when its value is lesser than
>> > the per-instance value. If copy_range is lesser than --nflog-range then
>> > we retain copy_range.
>> >
>> > So basically what we are doing is min(copy_range, nflog-range).
>> > Just wanted to clarify this, if this is not how it's meant to be
>> > please let me know.
>> >
>> > Also, there is a bug in my patch, li->u.ulog.copy_len can be set to "0"
>> > from userspace (if --nflog-range is not specified), so we have to check
>> > for this condition before using the value. I will send a V2 of the patch
>> > based on your reply.
> Currently, li->u.ulog.copy_len is set to "0" by default when not
> specified.
> 
> But copy_len = 0 is a valid possibility, so this looks a bit more
> tricky to me to fix since we may need to get flags here to know when
> this is set.
> 
> Probably something like:
> 
> if (li->flags & NF_LOG_F_COPY_RANGE)
> data_len = li->u.ulog.copy_len;
> /* Per-instance copy range prevails over global per-rule option. */
> if (data_len < inst->copy_range)
> data_len = inst->copy_range;
> if (data_len > skb->len)
> data_len = skb->len;
> 
> Although this would require a bit more code to introduce these flags.
> 
> You will also need a new flag for xt_NFLOG:
> 
> #define XT_NFLOG_COPY_LEN   0x2
> 
> it seems other XT_NFLOG_* flags were already in place.
> 
> Interesting that nobody noticed this for so long BTW.

I tried this out, I added two flags: one for XT_NFLOG to notify the
kernel when --nflog-range is set by the user, and another flag for
nfnetlink_log to pass this on to nfulnl_log_packet. This design works
fine but while testing this I found a problem.

Lets say --nflog-range is set to 200, and the instance copy_range is set
to 100. According to the code above the final value of data_len will be
200 so we try to pack 200 bytes into the skb. But somewhere between
nfnetlink_log to dumpcap the packet is getting truncated and dumpcap
doesn't like this:

$> dumpcap -y NFLOG -i nflog:5 -s 100
Capturing on 'nflog:5'
File: /tmp/wireshark_pcapng_nflog-5_20160609133531_pi6MrS
dumpcap: Error while capturing packets: Message truncated: (got: 228)
(nlmsg_len: 320)
Please report this to the Wireshark developers.
https://bugs.wireshark.org/
(This is not a crash; please do not report it as such.)
Packets captured: 0
Packets received/dropped on interface 'nflog:5': 0/0
(pcap:0/dumpcap:0/flushed:0/ps_ifdrop:0) (0.0%)

I'm trying to figure out where the packet is getting truncated.


Re: [PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-07 Thread Vishwanath Pai
On 06/06/2016 06:31 PM, Pablo Neira Ayuso wrote:
> On Wed, Jun 01, 2016 at 08:23:54PM -0400, Vishwanath Pai wrote:
>> netfilter/nflog: nflog-range does not truncate packets
>>
>> The --nflog-range parameter from userspace is ignored in the kernel and
>> the entire packet is sent to the userspace. The per-instance parameter
>> copy_range still works, with this change --nflog-range will have
>> preference over copy_range.
> 
> I think it's reasonable to assume that --nflog-range from the rule
> applies globally to any instance.
> 
> However, per-instance copy_range has prevailed over --nflog-range
> since the beginning, so I would follow a more conservative approach,
> ie. remain copy_range in preference over --nflog-range.
> 
> So I'd suggest you invert this logic.
> 
> Let me know, thanks.
> 

Thanks for reviewing this. I think my comment on the patch was
misleading, we do give preference to copy_range and that is what we
default to. --nflog-range will not override the per-instance default,
the only time it would get preference is when its value is lesser than
the per-instance value. If copy_range is lesser than --nflog-range then
we retain copy_range.

So basically what we are doing is min(copy_range, nflog-range). Just
wanted to clarify this, if this is not how it's meant to be please let
me know.

Also, there is a bug in my patch, li->u.ulog.copy_len can be set to "0"
from userspace (if --nflog-range is not specified), so we have to check
for this condition before using the value. I will send a V2 of the patch
based on your reply.

Thanks,
Vishwanath



[PATCH] netfilter/nflog: nflog-range does not truncate packets

2016-06-01 Thread Vishwanath Pai
netfilter/nflog: nflog-range does not truncate packets

The --nflog-range parameter from userspace is ignored in the kernel and
the entire packet is sent to the userspace. The per-instance parameter
copy_range still works, with this change --nflog-range will have
preference over copy_range.

Signed-off-by: Vishwanath Pai <v...@akamai.com>
Reviewed-by: Joshua Hunt <joh...@akamai.com>

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 4ef1fae..f40ddba 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -680,7 +680,6 @@ nfulnl_log_packet(struct net *net,
if (qthreshold > li->u.ulog.qthreshold)
qthreshold = li->u.ulog.qthreshold;
 
-
switch (inst->copy_mode) {
case NFULNL_COPY_META:
case NFULNL_COPY_NONE:
@@ -688,10 +687,12 @@ nfulnl_log_packet(struct net *net,
break;
 
case NFULNL_COPY_PACKET:
-   if (inst->copy_range > skb->len)
+   data_len = inst->copy_range;
+   if (li->u.ulog.copy_len < data_len)
+   data_len = li->u.ulog.copy_len;
+
+   if (data_len > skb->len)
data_len = skb->len;
-   else
-   data_len = inst->copy_range;
 
size += nla_total_size(data_len);
break;


[PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-06-01 Thread Vishwanath Pai
libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

Add the following iptables rule.

$ iptables -A INPUT -m hashlimit --hashlimit-above 200/sec \
  --hashlimit-burst 5 --hashlimit-mode srcip --hashlimit-name hashlimit1 \
  --hashlimit-htable-expire 3 -j DROP

$ iptables-save > save.txt

Edit save.txt and change the value of --hashlimit-above to 300:

-A INPUT -m hashlimit --hashlimit-above 300/sec --hashlimit-burst 5 \
--hashlimit-mode srcip --hashlimit-name hashlimit2 \
--hashlimit-htable-expire 3 -j DROP

Now restore save.txt

$ iptables-restore < save.txt

Now userspace thinks that the value of --hashlimit-above is 300 but it is
actually 200 in the kernel. This happens because when we add multiple
hash-limit rules with the same name they will share the same hashtable
internally. The kernel module tries to re-use the old hashtable without
updating the values.

There are multiple problems here:
1) We can add two iptables rules with the same name, but kernel does not
   handle this well, one procfs file cannot work with two rules
2) If the second rule has no effect because the hashtable has values from
   rule 1
3) hashtable-restore does not work (as described above)

To fix this I have made the following design change:
1) If a second rule is added with the same name as an existing rule,
   append a number when we create the procfs, for example hashlimit_1,
   hashlimit_2 etc
2) Two rules will not share the same hashtable unless they are similar in
   every possible way
3) This behavior has to be forced with a new userspace flag:
   --hashlimit-ehanced-procfs, if this flag is not passed we default to
   the old behavior. This is to make sure we do not break existing scripts
   that rely on the existing behavior.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index 4193464..ac67875 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -67,6 +67,7 @@ enum {
O_HTABLE_MAX,
O_HTABLE_GCINT,
O_HTABLE_EXPIRE,
+   O_PROCFS,
F_BURST = 1 << O_BURST,
F_UPTO  = 1 << O_UPTO,
F_ABOVE = 1 << O_ABOVE,
@@ -177,6 +178,7 @@ static const struct xt_option_entry hashlimit_mt_opts[] = {
{.name = "hashlimit-mode", .id = O_MODE, .type = XTTYPE_STRING},
{.name = "hashlimit-name", .id = O_NAME, .type = XTTYPE_STRING,
 .flags = XTOPT_MAND | XTOPT_PUT, XTOPT_POINTER(s, name), .min = 1},
+   {.name = "hashlimit-enhanced-procfs", .id = O_PROCFS, .type = 
XTTYPE_NONE},
XTOPT_TABLEEND,
 };
 #undef s
@@ -521,6 +523,9 @@ static void hashlimit_mt_parse(struct xt_option_call *cb)
case O_DSTMASK:
info->cfg.dstmask = cb->val.hlen;
break;
+   case O_PROCFS:
+   info->cfg.flags |= XT_HASHLIMIT_FLAG_PROCFS;
+   break;
}
 }
 
@@ -856,6 +861,9 @@ hashlimit_mt_save(const struct hashlimit_cfg2 *cfg, const 
char* name, unsigned i
printf(" --hashlimit-srcmask %u", cfg->srcmask);
if (cfg->dstmask != dmask)
printf(" --hashlimit-dstmask %u", cfg->dstmask);
+
+   if ((revision == 2) && (cfg->flags & XT_HASHLIMIT_FLAG_PROCFS) )
+printf(" --hashlimit-enhanced-procfs");
 }
 
 static void
diff --git a/extensions/libxt_hashlimit.man b/extensions/libxt_hashlimit.man
index 6aac3f2..0434f03 100644
--- a/extensions/libxt_hashlimit.man
+++ b/extensions/libxt_hashlimit.man
@@ -40,6 +40,9 @@ Like \-\-hashlimit\-srcmask, but for destination addresses.
 \fB\-\-hashlimit\-name\fP \fIfoo\fP
 The name for the /proc/net/ipt_hashlimit/foo entry.
 .TP
+\fB\-\-hashlimit\-enhanced\-procfs\fP
+Append _number to the procfs file when multiple rules with the same name exist
+.TP
 \fB\-\-hashlimit\-htable\-size\fP \fIbuckets\fP
 The number of buckets of the hash table
 .TP
diff --git a/include/linux/netfilter/xt_hashlimit.h 
b/include/linux/netfilter/xt_hashlimit.h
index e493fc1..2954381 100644
--- a/include/linux/netfilter/xt_hashlimit.h
+++ b/include/linux/netfilter/xt_hashlimit.h
@@ -16,6 +16,10 @@
 struct xt_hashlimit_htable;
 
 enum {
+   XT_HASHLIMIT_FLAG_PROCFS = 1
+};
+
+enum {
XT_HASHLIMIT_HASH_DIP = 1 << 0,
XT_HASHLIMIT_HASH_DPT = 1 << 1,
XT_HASHLIMIT_HASH_SIP = 1 << 2,
@@ -74,6 +78,9 @@ struct hashlimit_cfg2 {
__u32 expire;   /* when do entries expire? */
 
__u8 srcmask, dstmask;
+
+   __u8 procfs_suffix;
+   __u32 flags;
 };
 
 struct xt_hashlimit_mtinfo1 {


[PATCH iptables 1/3] libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

2016-06-01 Thread Vishwanath Pai
libxt_hashlimit: Prepare libxt_hashlimit.c for revision 2

I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the end of
the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index c5b8d77..ad7fb93 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -23,7 +23,7 @@
 #include 
 
 #define XT_HASHLIMIT_BURST 5
-#define XT_HASHLIMIT_BURST_MAX 1
+#define XT_HASHLIMIT_BURST_MAX_v1  1
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -98,7 +98,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
{.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING},
{.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_UINT32,
-.min = 1, .max = XT_HASHLIMIT_BURST_MAX, .flags = XTOPT_PUT,
+.min = 1, .max = XT_HASHLIMIT_BURST_MAX_v1, .flags = XTOPT_PUT,
 XTOPT_POINTER(s, cfg.burst)},
{.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
 .type = XTTYPE_UINT32, .flags = XTOPT_PUT,
@@ -121,7 +121,7 @@ static const struct xt_option_entry hashlimit_opts[] = {
 #undef s
 
 #define s struct xt_hashlimit_mtinfo1
-static const struct xt_option_entry hashlimit_mt_opts[] = {
+static const struct xt_option_entry hashlimit_mt_opts_v1[] = {
{.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
 .type = XTTYPE_STRING, .flags = XTOPT_INVERT},
{.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
@@ -174,10 +174,10 @@ static uint32_t get_factor(int chr)
return 1;
 }
 
-static void burst_error(void)
+static void burst_error_v1(void)
 {
xtables_error(PARAMETER_PROBLEM, "bad value for option "
-   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX);
+   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
 static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
@@ -186,8 +186,8 @@ static uint32_t parse_burst(const char *burst, struct 
xt_hashlimit_mtinfo1 *info
char *end;
 
if (!xtables_strtoul(burst, , , 1, UINT32_MAX) ||
-   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX))
-   burst_error();
+   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX_v1))
+   burst_error_v1();
 
v *= get_factor(*end);
if (v > UINT32_MAX)
@@ -253,7 +253,7 @@ int parse_rate(const char *rate, uint32_t *val, struct 
hashlimit_mt_udata *ud)
if (!r)
return 0;
 
-   *val = XT_HASHLIMIT_SCALE * ud->mult / r;
+   *val = XT_HASHLIMIT_SCALE_v1 * ud->mult / r;
if (*val == 0)
/*
 * The rate maps to infinity. (1/day is the minimum they can
@@ -272,7 +272,7 @@ static void hashlimit_init(struct xt_entry_match *m)
 
 }
 
-static void hashlimit_mt4_init(struct xt_entry_match *match)
+static void hashlimit_mt4_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -283,7 +283,7 @@ static void hashlimit_mt4_init(struct xt_entry_match *match)
info->cfg.dstmask = 32;
 }
 
-static void hashlimit_mt6_init(struct xt_entry_match *match)
+static void hashlimit_mt6_init_v1(struct xt_entry_match *match)
 {
struct xt_hashlimit_mtinfo1 *info = (void *)match->data;
 
@@ -342,7 +342,7 @@ static void hashlimit_parse(struct xt_option_call *cb)
}
 }
 
-static void hashlimit_mt_parse(struct xt_option_call *cb)
+static void hashlimit_mt_parse_v1(struct xt_option_call *cb)
 {
struct xt_hashlimit_mtinfo1 *info = cb->data;
 
@@ -395,7 +395,7 @@ static void hashlimit_check(struct xt_fcheck_call *cb)
info->cfg.expire = udata->mult * 1000; /* from s to msec */
 }
 
-static void hashlimit_mt_check(struct xt_fcheck_call *cb)
+static void hashlimit_mt_check_v1(struct xt_fcheck_call *cb)
 {
const struct hashlimit_mt_udata *udata = cb->udata;
struct xt_hashlimit_mtinfo1 *info = cb->data;
@@ -421,18 +421,18 @@ static void hashlimit_mt_check(struct xt_fcheck_call *cb)
info->cfg.expire = 
XT_HASHLIMIT_BYTE_EXPIRE_BURST * 1000;
}
info->cfg.burst = burst;
-   } else if (info->cfg.burst > XT_HASHLIMIT_BURST_MAX)
-   burst_error();
+   } else if (info->cfg.burst > XT_HASHLIMIT_BURST_MAX_v1)
+   burst_error_v1();
 }
 
-static const struct rates
+static const struct rates_v1
 {
const char *name;
uint32_t mu

[PATCH iptables 2/3] libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps rates

2016-06-01 Thread Vishwanath Pai
libxt_hashlimit: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destory.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/extensions/libxt_hashlimit.c b/extensions/libxt_hashlimit.c
index ad7fb93..4193464 100644
--- a/extensions/libxt_hashlimit.c
+++ b/extensions/libxt_hashlimit.c
@@ -24,6 +24,7 @@
 
 #define XT_HASHLIMIT_BURST 5
 #define XT_HASHLIMIT_BURST_MAX_v1  1
+#define XT_HASHLIMIT_BURST_MAX 100
 
 #define XT_HASHLIMIT_BYTE_EXPIRE   15
 #define XT_HASHLIMIT_BYTE_EXPIRE_BURST 60
@@ -150,16 +151,66 @@ static const struct xt_option_entry 
hashlimit_mt_opts_v1[] = {
 };
 #undef s
 
-static uint32_t cost_to_bytes(uint32_t cost)
+#define s struct xt_hashlimit_mtinfo2
+static const struct xt_option_entry hashlimit_mt_opts[] = {
+   {.name = "hashlimit-upto", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit-above", .id = O_ABOVE, .excl = F_UPTO,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT},
+   {.name = "hashlimit", .id = O_UPTO, .excl = F_ABOVE,
+.type = XTTYPE_STRING, .flags = XTOPT_INVERT}, /* old name */
+   {.name = "hashlimit-srcmask", .id = O_SRCMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-dstmask", .id = O_DSTMASK, .type = XTTYPE_PLEN},
+   {.name = "hashlimit-burst", .id = O_BURST, .type = XTTYPE_STRING},
+   {.name = "hashlimit-htable-size", .id = O_HTABLE_SIZE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.size)},
+   {.name = "hashlimit-htable-max", .id = O_HTABLE_MAX,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.max)},
+   {.name = "hashlimit-htable-gcinterval", .id = O_HTABLE_GCINT,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.gc_interval)},
+   {.name = "hashlimit-htable-expire", .id = O_HTABLE_EXPIRE,
+.type = XTTYPE_UINT32, .flags = XTOPT_PUT,
+XTOPT_POINTER(s, cfg.expire)},
+   {.name = "hashlimit-mode", .id = O_MODE, .type = XTTYPE_STRING},
+   {.name = "hashlimit-name", .id = O_NAME, .type = XTTYPE_STRING,
+.flags = XTOPT_MAND | XTOPT_PUT, XTOPT_POINTER(s, name), .min = 1},
+   XTOPT_TABLEEND,
+};
+#undef s
+
+static void
+cfg_copy(struct hashlimit_cfg2 *to, const void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   }
+}
+
+static uint64_t cost_to_bytes(uint64_t cost)
 {
-   uint32_t r;
+   uint64_t r;
 
r = cost ? UINT32_MAX / cost : UINT32_MAX;
r = (r - 1) << XT_HASHLIMIT_BYTE_SHIFT;
return r;
 }
 
-static uint64_t bytes_to_cost(uint32_t bytes)
+static uint64_t bytes_to_cost(uint64_t bytes)
 {
uint32_t r = bytes >> XT_HASHLIMIT_BYTE_SHIFT;
return UINT32_MAX / (r+1);
@@ -180,57 +231,78 @@ static void burst_error_v1(void)
"\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX_v1);
 }
 
-static uint32_t parse_burst(const char *burst, struct xt_hashlimit_mtinfo1 
*info)
+static void burst_error(void)
+{
+   xtables_error(PARAMETER_PROBLEM, "bad value for option "
+   "\"--hashlimit-burst\", or out of range (1-%u).", 
XT_HASHLIMIT_BURST_MAX);
+}
+
+static uint64_t parse_burst(const char *burst, int revision)
 {
uintmax_t v;
char *end;
-
-   if (!xtables_strtoul(burst, , , 1, UINT32_MAX) ||
-   (*end == 0 && v > XT_HASHLIMIT_BURST_MAX_v1))
-   burst_error_v1();
+   uint64_t max = (revision == 1) ? UINT32_MAX : UINT64_MAX;
+   uint64_t burst_max = (revision == 1) ?
+ XT_HASHLIMIT_BURST_MAX_v1 : 
XT_HASHLIMIT_BURST_MAX;
+
+   if (!xtables_strtoul(burst, , , 1, max) ||
+   (*end 

[PATCH 3/3] netfilter: iptables-restore does not work as expected with xt_hashlimit

2016-06-01 Thread Vishwanath Pai
netfilter: iptables-restore does not work as expected with xt_hashlimit

Add the following iptables rule.

$ iptables -A INPUT -m hashlimit --hashlimit-above 200/sec \
  --hashlimit-burst 5 --hashlimit-mode srcip --hashlimit-name hashlimit1 \
  --hashlimit-htable-expire 3 -j DROP

$ iptables-save > save.txt

Edit save.txt and change the value of --hashlimit-above to 300:

-A INPUT -m hashlimit --hashlimit-above 300/sec --hashlimit-burst 5 \
--hashlimit-mode srcip --hashlimit-name hashlimit2 \
--hashlimit-htable-expire 3 -j DROP

Now restore save.txt

$ iptables-restore < save.txt

Now userspace thinks that the value of --hashlimit-above is 300 but it is
actually 200 in the kernel. This happens because when we add multiple
hash-limit rules with the same name they will share the same hashtable
internally. The kernel module tries to re-use the old hashtable without
updating the values.

There are multiple problems here:
1) We can add two iptables rules with the same name, but kernel does not
   handle this well, one procfs file cannot work with two rules
2) If the second rule has no effect because the hashtable has values from
   rule 1
3) hashtable-restore does not work (as described above)

To fix this I have made the following design change:
1) If a second rule is added with the same name as an existing rule,
   append a number when we create the procfs, for example hashlimit_1,
   hashlimit_2 etc
2) Two rules will not share the same hashtable unless they are similar in
   every possible way
3) This behavior has to be forced with a new userspace flag:
   --hashlimit-ehanced-procfs, if this flag is not passed we default to
   the old behavior. This is to make sure we do not break existing scripts
   that rely on the existing behavior.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index be5d2e1..7c7 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -18,6 +18,10 @@
 struct xt_hashlimit_htable;
 
 enum {
+   XT_HASHLIMIT_FLAG_PROCFS = 1
+};
+
+enum {
XT_HASHLIMIT_HASH_DIP = 1 << 0,
XT_HASHLIMIT_HASH_DPT = 1 << 1,
XT_HASHLIMIT_HASH_SIP = 1 << 2,
@@ -76,6 +80,9 @@ struct hashlimit_cfg2 {
__u32 expire;   /* when do entries expire? */
 
__u8 srcmask, dstmask;
+
+   __u8 procfs_suffix;
+   __u32 flags;
 };
 
 struct xt_hashlimit_mtinfo1 {
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 412e6ef..79d6745 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -238,6 +238,74 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
+static int
+generate_procfs_suffix(struct net *net, const char *name,
+  struct hashlimit_cfg2 *cfg, u_int8_t family)
+{
+   struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
+   struct xt_hashlimit_htable *hinfo;
+   unsigned long bitarray = 0;
+
+   hlist_for_each_entry(hinfo, _net->htables, node) {
+   if (!strcmp(name, hinfo->name) && hinfo->family == family)
+   __set_bit(hinfo->cfg.procfs_suffix, );
+   }
+
+   /* Too many name clashes, bail out
+*/
+   if (bitarray == ~0UL)
+   return -1;
+
+   cfg->procfs_suffix = ffz(bitarray);
+
+   return 0;
+}
+
+static bool
+htable_compare(struct hashlimit_cfg2 *newcfg, const char *newname,
+  struct xt_hashlimit_htable *hinfo, u_int8_t family)
+{
+   struct hashlimit_cfg2 *oldcfg = >cfg;
+
+   if (!(newcfg->flags & XT_HASHLIMIT_FLAG_PROCFS)) {
+   return (!strcmp(newname, hinfo->name) &&
+   family == hinfo->family);
+   }
+
+   if (!strcmp(newname, hinfo->name) &&
+   family == hinfo->family &&
+   newcfg->mode == oldcfg->mode &&
+   newcfg->avg == oldcfg->avg &&
+   newcfg->burst == oldcfg->burst &&
+   newcfg->size == oldcfg->size &&
+   newcfg->max == oldcfg->max &&
+   newcfg->gc_interval == oldcfg->gc_interval &&
+   newcfg->expire == oldcfg->expire &&
+   newcfg->srcmask == oldcfg->srcmask &&
+   newcfg->dstmask == oldcfg->dstmask) {
+   return true;
+   }
+
+   return false;
+}
+
+static void
+htable_cfg_init(struct hashlimit_cfg2 *cfg) {
+   if (!cfg->size) {
+   cfg->size = (totalram_pages << PAGE_SHIFT) / 16384 /
+   sizeof(struct list_head);
+   if (totalram_pages &g

[PATCH 2/3] netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

2016-06-01 Thread Vishwanath Pai
netfilter: Create revision 2 of xt_hashlimit to support higher pps rates

Create a new revision for the hashlimit iptables extension module. Rev 2
will support higher pps of upto 1 million, Version 1 supports only 10k.

To support this we have to increase the size of the variables avg and
burst in hashlimit_cfg to 64-bit. Create two new structs hashlimit_cfg2
and xt_hashlimit_mtinfo2 and also create newer versions of all the
functions for match, checkentry and destroy.

Some of the functions like hashlimit_mt, hashlimit_mt_check etc are very
similar in both rev1 and rev2 with only minor changes, so I have split
those functions and moved all the common code to a *_common function.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index ea8c1c0..be5d2e1 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -6,6 +6,7 @@
 
 /* timings are in milliseconds. */
 #define XT_HASHLIMIT_SCALE_v1 1
+#define XT_HASHLIMIT_SCALE 100llu
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
@@ -63,6 +64,20 @@ struct hashlimit_cfg1 {
__u8 srcmask, dstmask;
 };
 
+struct hashlimit_cfg2 {
+   __u32 mode;   /* bitmask of XT_HASHLIMIT_HASH_* */
+   __u64 avg;/* Average secs between packets * scale */
+   __u64 burst;  /* Period multiplier for upper limit. */
+
+   /* user specified */
+   __u32 size; /* how many buckets */
+   __u32 max;  /* max number of entries */
+   __u32 gc_interval;  /* gc interval */
+   __u32 expire;   /* when do entries expire? */
+
+   __u8 srcmask, dstmask;
+};
+
 struct xt_hashlimit_mtinfo1 {
char name[IFNAMSIZ];
struct hashlimit_cfg1 cfg;
@@ -71,4 +86,12 @@ struct xt_hashlimit_mtinfo1 {
struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
 };
 
+struct xt_hashlimit_mtinfo2 {
+   char name[NAME_MAX];
+   struct hashlimit_cfg2 cfg;
+
+   /* Used internally by the kernel */
+   struct xt_hashlimit_htable *hinfo __attribute__((aligned(8)));
+};
+
 #endif /* _UAPI_XT_HASHLIMIT_H */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 6b0ad93..412e6ef 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -57,6 +57,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 
 /* need to declare this at the top */
 static const struct file_operations dl_file_ops_v1;
+static const struct file_operations dl_file_ops;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -86,8 +87,8 @@ struct dsthash_ent {
unsigned long expires;  /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
-   u_int32_t credit;
-   u_int32_t credit_cap, cost;
+   u_int64_t credit;
+   u_int64_t credit_cap, cost;
} rateinfo;
struct rcu_head rcu;
 };
@@ -98,7 +99,7 @@ struct xt_hashlimit_htable {
u_int8_t family;
bool rnd_initialized;
 
-   struct hashlimit_cfg1 cfg;  /* config */
+   struct hashlimit_cfg2 cfg;  /* config */
 
/* used internally */
spinlock_t lock;/* lock for list_head */
@@ -114,6 +115,28 @@ struct xt_hashlimit_htable {
struct hlist_head hash[0];  /* hashtable itself */
 };
 
+static void
+cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision)
+{
+   if (revision == 1) {
+   struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from;
+
+   to->mode = cfg->mode;
+   to->avg = cfg->avg;
+   to->burst = cfg->burst;
+   to->size = cfg->size;
+   to->max = cfg->max;
+   to->gc_interval = cfg->gc_interval;
+   to->expire = cfg->expire;
+   to->srcmask = cfg->srcmask;
+   to->dstmask = cfg->dstmask;
+   } else if (revision == 2) {
+   memcpy(to, from, sizeof(struct hashlimit_cfg2));
+   } else {
+   BUG();
+   }
+}
+
 static DEFINE_MUTEX(hashlimit_mutex);  /* protects htables list */
 static struct kmem_cache *hashlimit_cachep __read_mostly;
 
@@ -215,16 +238,17 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 
*minfo,
-   u_int8_t family)
+static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg,
+const char *name, u_int8_t family,
+struct xt_hashlimit_htable **out_hinfo,
+int revision)
 {
   

[PATCH 1/3] netfilter: Prepare xt_hashlimit.c for revision 2

2016-06-01 Thread Vishwanath Pai
netfilter: Prepare xt_hashlimit.c for revision 2

I am planning to add a revision 2 for the hashlimit xtables module to
support higher packets per second rates. This patch renames all the
functions and variables related to revision 1 by adding _v1 at the end of
the names.

Signed-off-by: Vishwanath Pai <v...@akamai.com>

diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h 
b/include/uapi/linux/netfilter/xt_hashlimit.h
index 6db9037..ea8c1c0 100644
--- a/include/uapi/linux/netfilter/xt_hashlimit.h
+++ b/include/uapi/linux/netfilter/xt_hashlimit.h
@@ -5,7 +5,7 @@
 #include 
 
 /* timings are in milliseconds. */
-#define XT_HASHLIMIT_SCALE 1
+#define XT_HASHLIMIT_SCALE_v1 1
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
  * seconds, or one packet every 59 hours.
  */
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 1786968..6b0ad93 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -56,7 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct 
net *net)
 }
 
 /* need to declare this at the top */
-static const struct file_operations dl_file_ops;
+static const struct file_operations dl_file_ops_v1;
 
 /* hash table crap */
 struct dsthash_dst {
@@ -215,8 +215,8 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct 
dsthash_ent *ent)
 }
 static void htable_gc(struct work_struct *work);
 
-static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
-u_int8_t family)
+static int htable_create_v1(struct net *net, struct xt_hashlimit_mtinfo1 
*minfo,
+   u_int8_t family)
 {
struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
struct xt_hashlimit_htable *hinfo;
@@ -265,7 +265,7 @@ static int htable_create(struct net *net, struct 
xt_hashlimit_mtinfo1 *minfo,
hinfo->pde = proc_create_data(minfo->name, 0,
(family == NFPROTO_IPV4) ?
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
-   _file_ops, hinfo);
+   _file_ops_v1, hinfo);
if (hinfo->pde == NULL) {
kfree(hinfo->name);
vfree(hinfo);
@@ -398,7 +398,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
(slowest userspace tool allows), which means
CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
 */
-#define MAX_CPJ (0x / (HZ*60*60*24))
+#define MAX_CPJ_v1 (0x / (HZ*60*60*24))
 
 /* Repeated shift and or gives us all 1s, final shift and add 1 gives
  * us the power of 2 below the theoretical max, so GCC simply does a
@@ -410,7 +410,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
 #define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
 #define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
 
-#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+#define CREDITS_PER_JIFFY_v1 POW2_BELOW32(MAX_CPJ_v1)
 
 /* in byte mode, the lowest possible rate is one packet/second.
  * credit_cap is used as a counter that tells us how many times we can
@@ -428,11 +428,12 @@ static u32 xt_hashlimit_len_to_chunks(u32 len)
 static u32 user2credits(u32 user)
 {
/* If multiplying would overflow... */
-   if (user > 0x / (HZ*CREDITS_PER_JIFFY))
+   if (user > 0x / (HZ*CREDITS_PER_JIFFY_v1))
/* Divide first. */
-   return (user / XT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+   return (user / XT_HASHLIMIT_SCALE_v1) *\
+   HZ * CREDITS_PER_JIFFY_v1;
 
-   return (user * HZ * CREDITS_PER_JIFFY) / XT_HASHLIMIT_SCALE;
+   return (user * HZ * CREDITS_PER_JIFFY_v1) / XT_HASHLIMIT_SCALE_v1;
 }
 
 static u32 user2credits_byte(u32 user)
@@ -461,7 +462,7 @@ static void rateinfo_recalc(struct dsthash_ent *dh, 
unsigned long now, u32 mode)
return;
}
} else {
-   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY;
+   dh->rateinfo.credit += delta * CREDITS_PER_JIFFY_v1;
cap = dh->rateinfo.credit_cap;
}
if (dh->rateinfo.credit > cap)
@@ -603,7 +604,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct 
dsthash_ent *dh)
 }
 
 static bool
-hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
+hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
const struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
struct xt_hashlimit_htable *hinfo = info->hinfo;
@@ -660,7 +661,7 @@ hashlimit_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
return false;
 }
 
-static int hashlimit_mt_check(const struct xt_mtchk_param *par)
+static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par)
 {
struct net *net = par->net;
struct xt_hashlimit_mtinfo1 *info = par->matchinfo;
@@ -701,

[PATCH v2] netfilter: fix race condition in ipset save, swap and delete

2016-03-14 Thread Vishwanath Pai
I have updated the patch according to comments by Jozsef. Renamed
ref_kernel to ref_netlink, renamed _put/_get functions and updated the
description in commit log.

Thanks,
Vishwanath

--

netfilter: fix race condition in ipset save, swap and delete

This fix adds a new reference counter (ref_netlink) for the struct ip_set.
The other reference counter (ref) can be swapped out by ip_set_swap and we
need a separate counter to keep track of references for netlink events
like dump. Using the same ref counter for dump causes a race condition
which can be demonstrated by the following script:

#!/bin/sh
ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 50 \
counters
ipset create hash_ip2 hash:ip family inet hashsize 30 maxelem 50 \
counters
ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 50 \
counters

ipset save &

ipset swap hash_ip3 hash_ip2
ipset destroy hash_ip3 /* will crash the machine */

Swap will exchange the values of ref so destroy will see ref = 0 instead of
ref = 1. With this fix in place swap will not succeed because ipset save
still has ref_netlink on the set (ip_set_swap doesn't swap ref_netlink).

Both delete and swap will error out if ref_netlink != 0 on the set.

Note: The changes to *_head functions is because previously we would
increment ref whenever we called these functions, we don't do that
anymore.

Reviewed-by: Joshua Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>

--

diff --git a/include/linux/netfilter/ipset/ip_set.h 
b/include/linux/netfilter/ipset/ip_set.h
index 0e1f433..f48b8a6 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -234,6 +234,10 @@ struct ip_set {
spinlock_t lock;
/* References to the set */
u32 ref;
+   /* References to the set for netlink events like dump,
+* ref can be swapped out by ip_set_swap
+*/
+   u32 ref_netlink;
/* The core set type */
struct ip_set_type *type;
/* The type variant doing the real job */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h 
b/net/netfilter/ipset/ip_set_bitmap_gen.h
index b0bc475..2e8e7e5 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
-   nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+   nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
diff --git a/net/netfilter/ipset/ip_set_core.c 
b/net/netfilter/ipset/ip_set_core.c
index 95db43f..a558075 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(_set_ref_lock);
 }
 
+/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
+ * a separate reference counter
+ */
+static inline void
+__ip_set_get_netlink(struct ip_set *set)
+{
+   write_lock_bh(_set_ref_lock);
+   set->ref_netlink++;
+   write_unlock_bh(_set_ref_lock);
+}
+
+static inline void
+__ip_set_put_netlink(struct ip_set *set)
+{
+   write_lock_bh(_set_ref_lock);
+   BUG_ON(set->ref_netlink == 0);
+   set->ref_netlink--;
+   write_unlock_bh(_set_ref_lock);
+}
+
 /* Add, del and test set entries from kernel.
  *
  * The set behind the index must exist and must be referenced
@@ -999,7 +1019,7 @@ static int ip_set_destroy(struct net *net, struct sock 
*ctnl,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
-   if (s && s->ref) {
+   if (s && (s->ref || s->ref_netlink)) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1021,7 +1041,7 @@ static int ip_set_destroy(struct net *net, struct sock 
*ctnl,
if (!s) {
ret = -ENOENT;
goto out;
-   } else if (s->ref) {
+   } else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1168,6 +1188,9 @@ static int ip_set_swap(struct net *net, struct sock 
*ctnl, struct sk_buff *skb,
  from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH;
 
+   if (from->ref_netlink || to->ref_netlink)
+   return -EBUSY;
+
strncpy(from_name, from->name, IPSET_MAXNAMELEN);
strncpy(from->name, to->name

Re: [PATCH] netfilter: fix race condition in ipset save and delete

2016-03-13 Thread Vishwanath Pai
Hi Jozsef,

On 03/13/2016 08:07 AM, Jozsef Kadlecsik wrote:
> Hi,
> 
> On Sat, 12 Mar 2016, Vishwanath Pai wrote:
> 
>> netfilter: fix race condition in ipset save and delete
>>
>> This fix adds a new reference counter (ref_kernel) for the struct ip_set.
>> The other reference counter (ref) is used to track references from the
>> userspace and we need a separate counter to keep track of in-kernel
>> references. Using the same ref counter for both userspace and kernel
>> references causes a race condition which can be demonstrated by the
>> following script:
>>
>> #!/bin/sh
>> ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 50 \
>> counters
>> ipset create hash_ip2 hash:ip family inet hashsize 30 maxelem 50 \
>> counters
>> ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 50 \
>> counters
>>
>> ipset save &
>>
>> ipset swap hash_ip3 hash_ip2
>> ipset destroy hash_ip3 /* will crash the machine */
>>
>> Swap will exchange the values of ref so destroy will see ref = 0 instead of
>> ref = 1. With this fix in place swap will not suceed because ipset save
>> still has ref_kernel on the set (ip_set_swap doesn't swap ref_kernel).
>>
>> Both delete and swap will error out if ref_kernel != 0 on the set.
>>
>> Note: The changes to *_head functions is because previously we would
>> increment ref whenever we called these functions, we don't do that
>> anymore.
> 
> Overall, I agree with your patch, however I disagree with the description 
> and some details. 
> 
> It's a race between dump & swap and then destroy - dump and destroy are 
> safe. The "ref" reference counter *is* kernel related: it keeps track of 
> references from other kernel subsystems (netfilter matches/targets) and 
> from ipset itself when a set is a member of another set. It would be 
> misleading to call "ref" as userspace reference counter.
> 
> The reference counter you introduce is for netlink events (technically 
> just for dump), so it would better be named "ref_netlink" instead of 
> "ref_kernel" (similarly, ip_set_get|put_ref_netlink).
> 
> Please update the patch, the description and resubmit. Thanks!
> 
> Best regards,
> Jozsef
> 

Thanks for reviewing, I will update the patch and send it again.

Thanks,
Vishwanath



[PATCH] netfilter: fix race condition in ipset save and delete

2016-03-12 Thread Vishwanath Pai
netfilter: fix race condition in ipset save and delete

This fix adds a new reference counter (ref_kernel) for the struct ip_set.
The other reference counter (ref) is used to track references from the
userspace and we need a separate counter to keep track of in-kernel
references. Using the same ref counter for both userspace and kernel
references causes a race condition which can be demonstrated by the
following script:

#!/bin/sh
ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 50 \
counters
ipset create hash_ip2 hash:ip family inet hashsize 30 maxelem 50 \
counters
ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 50 \
counters

ipset save &

ipset swap hash_ip3 hash_ip2
ipset destroy hash_ip3 /* will crash the machine */

Swap will exchange the values of ref so destroy will see ref = 0 instead of
ref = 1. With this fix in place swap will not suceed because ipset save
still has ref_kernel on the set (ip_set_swap doesn't swap ref_kernel).

Both delete and swap will error out if ref_kernel != 0 on the set.

Note: The changes to *_head functions is because previously we would
increment ref whenever we called these functions, we don't do that
anymore.

Reviewed-by: Joshua Hunt <joh...@akamai.com>
Signed-off-by: Vishwanath Pai <v...@akamai.com>

--

diff --git a/include/linux/netfilter/ipset/ip_set.h 
b/include/linux/netfilter/ipset/ip_set.h
index 0e1f433..86d86db 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -234,6 +234,9 @@ struct ip_set {
spinlock_t lock;
/* References to the set */
u32 ref;
+   /* the above ref can be swapped out by ip_set_swap and
+  cannot be used to keep track of references within ipset code */
+   u32 ref_kernel;
/* The core set type */
struct ip_set_type *type;
/* The type variant doing the real job */
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h 
b/net/netfilter/ipset/ip_set_bitmap_gen.h
index b0bc475..2e8e7e5 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -95,7 +95,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
-   nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+   nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
diff --git a/net/netfilter/ipset/ip_set_core.c 
b/net/netfilter/ipset/ip_set_core.c
index 95db43f..a055f29 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -497,6 +497,26 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(_set_ref_lock);
 }
 
+/* The above two functions keep track of references from the userspace, the
+ * _internal functions keep track of references in-kernel
+ */
+static inline void
+__ip_set_get_internal(struct ip_set *set)
+{
+   write_lock_bh(_set_ref_lock);
+   set->ref_kernel++;
+   write_unlock_bh(_set_ref_lock);
+}
+
+static inline void
+__ip_set_put_internal(struct ip_set *set)
+{
+   write_lock_bh(_set_ref_lock);
+   BUG_ON(set->ref_kernel == 0);
+   set->ref_kernel--;
+   write_unlock_bh(_set_ref_lock);
+}
+
 /* Add, del and test set entries from kernel.
  *
  * The set behind the index must exist and must be referenced
@@ -999,7 +1019,7 @@ static int ip_set_destroy(struct net *net, struct sock 
*ctnl,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
-   if (s && s->ref) {
+   if (s && (s->ref || s->ref_kernel)) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1021,7 +1041,7 @@ static int ip_set_destroy(struct net *net, struct sock 
*ctnl,
if (!s) {
ret = -ENOENT;
goto out;
-   } else if (s->ref) {
+   } else if (s->ref || s->ref_kernel) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1168,6 +1188,9 @@ static int ip_set_swap(struct net *net, struct sock 
*ctnl, struct sk_buff *skb,
  from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH;
 
+   if (from->ref_kernel || to->ref_kernel)
+   return -EBUSY;
+
strncpy(from_name, from->name, IPSET_MAXNAMELEN);
strncpy(from->name, to->name, IPSET_MAXNAMELEN);
strncpy(to->name, from_name, IPSET_MAXNAMELEN);
@@ -1203,7 +1226,7 @@ ip_set_dump_done(struct netlink_callback *cb)