date:20160923

[PATCH 6/6] mISDN: remove unused function

2016-09-23 Thread Baoyou Xie

We get 1 warning when building kernel with W=1:
drivers/isdn/mISDN/layer2.c:463:1: warning: no previous declaration for 'IsRR' 
[-Wmissing-declarations]

In fact, this function is called by no one and not exported,
so this patch removes it.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/mISDN/layer2.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/isdn/mISDN/layer2.c b/drivers/isdn/mISDN/layer2.c
index f6ab6027..2519510 100644
--- a/drivers/isdn/mISDN/layer2.c
+++ b/drivers/isdn/mISDN/layer2.c
@@ -459,15 +459,6 @@ IsDISC(u_char *data)
return (data[0] & 0xef) == DISC;
 }
 
-inline int
-IsRR(u_char *data, struct layer2 *l2)
-{
-   if (test_bit(FLG_MOD128, &l2->flag))
-   return data[0] == RR;
-   else
-   return (data[0] & 0xf) == 1;
-}
-
 static inline int
 IsSFrame(u_char *data, struct layer2 *l2)
 {
-- 
2.7.4

[PATCH 5/6] mISDN: mark symbols static where possible

2016-09-23 Thread Baoyou Xie

We get a few warnings when building kernel with W=1:
drivers/isdn/mISDN/layer2.c:120:1: warning: no previous declaration for 
'l2headersize' [-Wmissing-declarations]
drivers/isdn/mISDN/layer2.c:127:1: warning: no previous declaration for 
'l2addrsize' [-Wmissing-declarations]
drivers/isdn/mISDN/layer2.c:379:1: warning: no previous declaration for 
'cansend' [-Wmissing-declarations]
drivers/isdn/mISDN/layer2.c:679:1: warning: no previous declaration for 
'stop_t200' [-Wmissing-declarations]


In fact, these functions are only used in the file in which they are
declared and don't need a declaration, but can be made static.
So this patch marks these functions with 'static'.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/mISDN/layer2.c | 44 ++--
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/isdn/mISDN/layer2.c b/drivers/isdn/mISDN/layer2.c
index 5eb380a..f6ab6027 100644
--- a/drivers/isdn/mISDN/layer2.c
+++ b/drivers/isdn/mISDN/layer2.c
@@ -116,14 +116,14 @@ l2m_debug(struct FsmInst *fi, char *fmt, ...)
va_end(va);
 }
 
-inline u_int
+static inline u_int
 l2headersize(struct layer2 *l2, int ui)
 {
return ((test_bit(FLG_MOD128, &l2->flag) && (!ui)) ? 2 : 1) +
(test_bit(FLG_LAPD, &l2->flag) ? 2 : 1);
 }
 
-inline u_int
+static inline u_int
 l2addrsize(struct layer2 *l2)
 {
return test_bit(FLG_LAPD, >flag) ? 2 : 1;
@@ -375,7 +375,7 @@ ReleaseWin(struct layer2 *l2)
   "isdnl2 freed %d skbuffs in release\n", cnt);
 }
 
-inline unsigned int
+static inline unsigned int
 cansend(struct layer2 *l2)
 {
unsigned int p1;
@@ -387,7 +387,7 @@ cansend(struct layer2 *l2)
return (p1 < l2->window) && !test_bit(FLG_PEER_BUSY, >flag);
 }
 
-inline void
+static inline void
 clear_exception(struct layer2 *l2)
 {
test_and_clear_bit(FLG_ACK_PEND, >flag);
@@ -435,25 +435,25 @@ enqueue_ui(struct layer2 *l2, struct sk_buff *skb)
dev_kfree_skb(skb);
 }
 
-inline int
+static inline int
 IsUI(u_char *data)
 {
return (data[0] & 0xef) == UI;
 }
 
-inline int
+static inline int
 IsUA(u_char *data)
 {
return (data[0] & 0xef) == UA;
 }
 
-inline int
+static inline int
 IsDM(u_char *data)
 {
return (data[0] & 0xef) == DM;
 }
 
-inline int
+static inline int
 IsDISC(u_char *data)
 {
return (data[0] & 0xef) == DISC;
@@ -468,7 +468,7 @@ IsRR(u_char *data, struct layer2 *l2)
return (data[0] & 0xf) == 1;
 }
 
-inline int
+static inline int
 IsSFrame(u_char *data, struct layer2 *l2)
 {
register u_char d = *data;
@@ -478,7 +478,7 @@ IsSFrame(u_char *data, struct layer2 *l2)
return ((d & 0xf3) == 1) && ((d & 0x0c) != 0x0c);
 }
 
-inline int
+static inline int
 IsSABME(u_char *data, struct layer2 *l2)
 {
u_char d = data[0] & ~0x10;
@@ -486,20 +486,20 @@ IsSABME(u_char *data, struct layer2 *l2)
return test_bit(FLG_MOD128, >flag) ? d == SABME : d == SABM;
 }
 
-inline int
+static inline int
 IsREJ(u_char *data, struct layer2 *l2)
 {
return test_bit(FLG_MOD128, >flag) ?
data[0] == REJ : (data[0] & 0xf) == REJ;
 }
 
-inline int
+static inline int
 IsFRMR(u_char *data)
 {
return (data[0] & 0xef) == FRMR;
 }
 
-inline int
+static inline int
 IsRNR(u_char *data, struct layer2 *l2)
 {
return test_bit(FLG_MOD128, >flag) ?
@@ -645,13 +645,13 @@ send_uframe(struct layer2 *l2, struct sk_buff *skb, 
u_char cmd, u_char cr)
 }
 
 
-inline u_char
+static inline u_char
 get_PollFlag(struct layer2 *l2, struct sk_buff *skb)
 {
return skb->data[l2addrsize(l2)] & 0x10;
 }
 
-inline u_char
+static inline u_char
 get_PollFlagFree(struct layer2 *l2, struct sk_buff *skb)
 {
u_char PF;
@@ -661,28 +661,28 @@ get_PollFlagFree(struct layer2 *l2, struct sk_buff *skb)
return PF;
 }
 
-inline void
+static inline void
 start_t200(struct layer2 *l2, int i)
 {
mISDN_FsmAddTimer(>t200, l2->T200, EV_L2_T200, NULL, i);
test_and_set_bit(FLG_T200_RUN, >flag);
 }
 
-inline void
+static inline void
 restart_t200(struct layer2 *l2, int i)
 {
mISDN_FsmRestartTimer(>t200, l2->T200, EV_L2_T200, NULL, i);
test_and_set_bit(FLG_T200_RUN, >flag);
 }
 
-inline void
+static inline void
 stop_t200(struct layer2 *l2, int i)
 {
if (test_and_clear_bit(FLG_T200_RUN, &l2->flag))
mISDN_FsmDelTimer(&l2->t200, i);
 }
 
-inline void
+static inline void
 st5_dl_release_l2l3(struct layer2 *l2)
 {
int pr;
@@ -694,7 +694,7 @@ st5_dl_release_l2l3(struct layer2 *l2)
l2up_create(l2, pr, 0, NULL);
 }
 
-inline void
+static inline void
 lapb_dl_release_l2l3(struct layer2 *l2, int f)
 {
if (test_bit(FLG_LAPB, >flag))
@@ -1129,7 +1129,7 @@ enquiry_cr(struct layer2 *l2, u_char typ, u_char cr, 
u_char pf)
enqueue_super(l2, skb);
 }
 
-inline void
+static inline void
 enquiry_response(struct layer2 *l2)
 {
if

[PATCH 4/6] isdn/hisax: clean function declaration in hscx.c up

2016-09-23 Thread Baoyou Xie

We get 1 warning when building kernel with W=1:
drivers/isdn/hisax/hscx.c:175:1: warning: no previous prototype for 
'open_hscxstate' [-Wmissing-prototypes]

In fact, this function is declared in
drivers/isdn/hisax/elsa_ser.c, but it should be
declared in a header file so that it can be recognized in other files.

So this patch moves the declaration into drivers/isdn/hisax/hscx.h.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/hisax/elsa_ser.c | 2 +-
 drivers/isdn/hisax/hscx.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/isdn/hisax/elsa_ser.c b/drivers/isdn/hisax/elsa_ser.c
index a2a358c..34ccc18 100644
--- a/drivers/isdn/hisax/elsa_ser.c
+++ b/drivers/isdn/hisax/elsa_ser.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include "hscx.h"
 
 #define MAX_MODEM_BUF  256
 #define WAKEUP_CHARS   (MAX_MODEM_BUF / 2)
@@ -419,7 +420,6 @@ static void rs_interrupt_elsa(struct IsdnCardState *cs)
 #endif
 }
 
-extern int open_hscxstate(struct IsdnCardState *cs, struct BCState *bcs);
 extern void modehscx(struct BCState *bcs, int mode, int bc);
 extern void hscx_l2l1(struct PStack *st, int pr, void *arg);
 
diff --git a/drivers/isdn/hisax/hscx.h b/drivers/isdn/hisax/hscx.h
index 1148b4b..fa7bf16 100644
--- a/drivers/isdn/hisax/hscx.h
+++ b/drivers/isdn/hisax/hscx.h
@@ -39,3 +39,4 @@ extern void modehscx(struct BCState *bcs, int mode, int bc);
 extern void clear_pending_hscx_ints(struct IsdnCardState *cs);
 extern void inithscx(struct IsdnCardState *cs);
 extern void inithscxisac(struct IsdnCardState *cs, int part);
+int open_hscxstate(struct IsdnCardState *cs, struct BCState *bcs);
-- 
2.7.4

[PATCH 3/6] isdn/hisax: add function declarations

2016-09-23 Thread Baoyou Xie

We get a few warnings when building kernel with W=1:
drivers/isdn/hisax/teles3.c:273:5: warning: no previous prototype for 
'setup_teles3' [-Wmissing-prototypes]
drivers/isdn/hisax/s0box.c:213:5: warning: no previous prototype for 
'setup_s0box' [-Wmissing-prototypes]
drivers/isdn/hisax/bkm_a4t.c:325:5: warning: no previous prototype for 
'setup_bkm_a4t' [-Wmissing-prototypes]
drivers/isdn/hisax/w6692.c:996:5: warning: no previous prototype for 
'setup_w6692' [-Wmissing-prototypes]


In fact, these functions need to be declared in some header files.

So this patch adds function declarations in drivers/isdn/hisax/hisax.h.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/hisax/config.c | 60 -
 drivers/isdn/hisax/hisax.h  | 60 +
 2 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c
index bf04d2a..5335c8b 100644
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -460,42 +460,14 @@ __setup("hisax=", HiSax_setup);
 extern int setup_teles0(struct IsdnCard *card);
 #endif
 
-#if CARD_TELES3
-extern int setup_teles3(struct IsdnCard *card);
-#endif
-
-#if CARD_S0BOX
-extern int setup_s0box(struct IsdnCard *card);
-#endif
-
-#if CARD_TELESPCI
-extern int setup_telespci(struct IsdnCard *card);
-#endif
-
 #if CARD_AVM_A1
 extern int setup_avm_a1(struct IsdnCard *card);
 #endif
 
-#if CARD_AVM_A1_PCMCIA
-extern int setup_avm_a1_pcmcia(struct IsdnCard *card);
-#endif
-
-#if CARD_FRITZPCI
-extern int setup_avm_pcipnp(struct IsdnCard *card);
-#endif
-
-#if CARD_ELSA
-extern int setup_elsa(struct IsdnCard *card);
-#endif
-
 #if CARD_IX1MICROR2
 extern int setup_ix1micro(struct IsdnCard *card);
 #endif
 
-#if CARD_DIEHLDIVA
-extern int setup_diva(struct IsdnCard *card);
-#endif
-
 #if CARD_ASUSCOM
 extern int setup_asuscom(struct IsdnCard *card);
 #endif
@@ -504,10 +476,6 @@ extern int setup_asuscom(struct IsdnCard *card);
 extern int setup_TeleInt(struct IsdnCard *card);
 #endif
 
-#if CARD_SEDLBAUER
-extern int setup_sedlbauer(struct IsdnCard *card);
-#endif
-
 #if CARD_SPORTSTER
 extern int setup_sportster(struct IsdnCard *card);
 #endif
@@ -524,18 +492,6 @@ extern int setup_netjet_s(struct IsdnCard *card);
 extern int setup_hfcs(struct IsdnCard *card);
 #endif
 
-#if CARD_HFC_PCI
-extern int setup_hfcpci(struct IsdnCard *card);
-#endif
-
-#if CARD_HFC_SX
-extern int setup_hfcsx(struct IsdnCard *card);
-#endif
-
-#if CARD_NICCY
-extern int setup_niccy(struct IsdnCard *card);
-#endif
-
 #if CARD_ISURF
 extern int setup_isurf(struct IsdnCard *card);
 #endif
@@ -544,22 +500,6 @@ extern int setup_isurf(struct IsdnCard *card);
 extern int setup_saphir(struct IsdnCard *card);
 #endif
 
-#if CARD_BKM_A4T
-extern int setup_bkm_a4t(struct IsdnCard *card);
-#endif
-
-#if CARD_SCT_QUADRO
-extern int setup_sct_quadro(struct IsdnCard *card);
-#endif
-
-#if CARD_GAZEL
-extern int setup_gazel(struct IsdnCard *card);
-#endif
-
-#if CARD_W6692
-extern int setup_w6692(struct IsdnCard *card);
-#endif
-
 #if CARD_NETJET_U
 extern int setup_netjet_u(struct IsdnCard *card);
 #endif
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 6ead6314..7e1d2a6 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -1350,3 +1350,63 @@ static inline struct pci_dev 
*hisax_find_pci_device(unsigned int vendor,
 }
 
 #endif
+
+#if CARD_TELES3
+int setup_teles3(struct IsdnCard *card);
+#endif
+
+#if CARD_TELESPCI
+int setup_telespci(struct IsdnCard *card);
+#endif
+
+#if CARD_S0BOX
+int setup_s0box(struct IsdnCard *card);
+#endif
+
+#if CARD_AVM_A1_PCMCIA
+int setup_avm_a1_pcmcia(struct IsdnCard *card);
+#endif
+
+#if CARD_FRITZPCI
+int setup_avm_pcipnp(struct IsdnCard *card);
+#endif
+
+#if CARD_ELSA
+int setup_elsa(struct IsdnCard *card);
+#endif
+
+#if CARD_DIEHLDIVA
+int setup_diva(struct IsdnCard *card);
+#endif
+
+#if CARD_SEDLBAUER
+int setup_sedlbauer(struct IsdnCard *card);
+#endif
+
+#if CARD_HFC_PCI
+int setup_hfcpci(struct IsdnCard *card);
+#endif
+
+#if CARD_HFC_SX
+int setup_hfcsx(struct IsdnCard *card);
+#endif
+
+#if CARD_NICCY
+int setup_niccy(struct IsdnCard *card);
+#endif
+
+#if CARD_BKM_A4T
+int setup_bkm_a4t(struct IsdnCard *card);
+#endif
+
+#if CARD_SCT_QUADRO
+int setup_sct_quadro(struct IsdnCard *card);
+#endif
+
+#if CARD_GAZEL
+int setup_gazel(struct IsdnCard *card);
+#endif
+
+#if CARD_W6692
+int setup_w6692(struct IsdnCard *card);
+#endif
-- 
2.7.4

[PATCH 2/6] isdn/hardware/eicon: add missing header dependencies

2016-09-23 Thread Baoyou Xie

We get 1 warning when building kernel with W=1:
drivers/isdn/hardware/eicon/diva.c:655:6: warning: no previous prototype for 
'xdiFreeFile' [-Wmissing-prototypes]

In fact, this function is declared in
drivers/isdn/hardware/eicon/helpers.h,
so this patch adds missing header dependencies.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/hardware/eicon/diva.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/isdn/hardware/eicon/diva.c 
b/drivers/isdn/hardware/eicon/diva.c
index 9693add..ad32552 100644
--- a/drivers/isdn/hardware/eicon/diva.c
+++ b/drivers/isdn/hardware/eicon/diva.c
@@ -17,6 +17,7 @@
 #include "xdi_adapter.h"
 #include "diva_pci.h"
 #include "diva.h"
+#include "helpers.h"
 
 #ifdef CONFIG_ISDN_DIVAS_PRIPCI
 #include "os_pri.h"
-- 
2.7.4

[PATCH 1/6] isdn/eicon: add function declarations

2016-09-23 Thread Baoyou Xie

We get a few warnings when building kernel with W=1:
drivers/isdn/hardware/eicon/diddfunc.c:95:12: warning: no previous prototype 
for 'diddfunc_init' [-Wmissing-prototypes]
drivers/isdn/hardware/eicon/s_4bri.c:128:6: warning: no previous prototype for 
'start_qBri_hardware' [-Wmissing-prototypes]
drivers/isdn/hardware/eicon/idifunc.c:243:12: warning: no previous prototype 
for 'idifunc_init' [-Wmissing-prototypes]
drivers/isdn/hardware/eicon/capifunc.c:217:6: warning: no previous prototype 
for 'api_remove_complete' [-Wmissing-prototypes]


In fact, these functions need to be declared in some header files.

So this patch adds function declarations in
drivers/isdn/hardware/eicon/di_defs.h,
drivers/isdn/hardware/eicon/capifunc.h,
drivers/isdn/hardware/eicon/xdi_adapter.h.

Signed-off-by: Baoyou Xie 
---
 drivers/isdn/hardware/eicon/capifunc.c|  3 --
 drivers/isdn/hardware/eicon/capifunc.h|  1 +
 drivers/isdn/hardware/eicon/di_defs.h | 46 +++
 drivers/isdn/hardware/eicon/diva.c|  2 --
 drivers/isdn/hardware/eicon/diva_didd.c   |  5 
 drivers/isdn/hardware/eicon/divacapi.h|  6 
 drivers/isdn/hardware/eicon/divamnt.c |  4 ---
 drivers/isdn/hardware/eicon/divasi.c  |  3 --
 drivers/isdn/hardware/eicon/divasmain.c   |  5 
 drivers/isdn/hardware/eicon/divasproc.c   |  2 --
 drivers/isdn/hardware/eicon/idifunc.c |  2 --
 drivers/isdn/hardware/eicon/message.c | 11 
 drivers/isdn/hardware/eicon/mntfunc.c |  5 
 drivers/isdn/hardware/eicon/os_4bri.c | 13 -
 drivers/isdn/hardware/eicon/os_bri.c  |  7 -
 drivers/isdn/hardware/eicon/os_pri.c  |  6 
 drivers/isdn/hardware/eicon/um_idi.c  |  5 
 drivers/isdn/hardware/eicon/xdi_adapter.h |  6 
 18 files changed, 59 insertions(+), 73 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/capifunc.c 
b/drivers/isdn/hardware/eicon/capifunc.c
index 7a0bdbd..869b98e 100644
--- a/drivers/isdn/hardware/eicon/capifunc.c
+++ b/drivers/isdn/hardware/eicon/capifunc.c
@@ -55,9 +55,6 @@ static void diva_release_appl(struct capi_ctr *, __u16);
 static char *diva_procinfo(struct capi_ctr *);
 static u16 diva_send_message(struct capi_ctr *,
 diva_os_message_buffer_s *);
-extern void diva_os_set_controller_struct(struct capi_ctr *);
-
-extern void DIVA_DIDD_Read(DESCRIPTOR *, int);
 
 /*
  * debug
diff --git a/drivers/isdn/hardware/eicon/capifunc.h 
b/drivers/isdn/hardware/eicon/capifunc.h
index e96c45b..4bd0f20 100644
--- a/drivers/isdn/hardware/eicon/capifunc.h
+++ b/drivers/isdn/hardware/eicon/capifunc.h
@@ -36,5 +36,6 @@ typedef struct _diva_card {
  */
 int init_capifunc(void);
 void finit_capifunc(void);
+void diva_os_set_controller_struct(struct capi_ctr *);
 
 #endif /* __CAPIFUNC_H__ */
diff --git a/drivers/isdn/hardware/eicon/di_defs.h 
b/drivers/isdn/hardware/eicon/di_defs.h
index a5094d2..ed744aa 100644
--- a/drivers/isdn/hardware/eicon/di_defs.h
+++ b/drivers/isdn/hardware/eicon/di_defs.h
@@ -179,3 +179,49 @@ typedef void (IDI_CALL_LINK_T 
*didd_adapter_change_callback_t)(void IDI_CALL_ENT
 #define DI_VOICE_OVER_IP  0x0800 /* Voice over IP support */
 typedef void (IDI_CALL_LINK_T *_IDI_CALL)(void *, ENTITY *);
 #endif
+
+int diddfunc_init(void);
+void diddfunc_finit(void);
+
+void DIVA_DIDD_Read(void *, int);
+
+int divasfunc_init(int dbgmask);
+void divasfunc_exit(void);
+irqreturn_t diva_os_irq_wrapper(int irq, void *context);
+void diva_xdi_display_adapter_features(int card);
+int create_divas_proc(void);
+void remove_divas_proc(void);
+void prepare_maestra_functions(PISDN_ADAPTER IoAdapter);
+void start_qBri_hardware(PISDN_ADAPTER IoAdapter);
+int qBri_FPGA_download(PISDN_ADAPTER IoAdapter);
+void prepare_qBri_functions(PISDN_ADAPTER IoAdapter);
+void prepare_qBri2_functions(PISDN_ADAPTER IoAdapter);
+
+void prepare_pri_functions(PISDN_ADAPTER IoAdapter);
+void prepare_pri2_functions(PISDN_ADAPTER IoAdapter);
+
+int diva_os_copy_to_user(void *os_handle, void __user *dst,
+const void *src, int length);
+int diva_os_copy_from_user(void *os_handle, void *dst,
+  const void __user *src, int length);
+
+int mntfunc_init(int *, void **, unsigned long);
+void mntfunc_finit(void);
+int maint_read_write(void __user *buf, int count);
+
+void diva_os_wakeup_read(void *os_context);
+void diva_os_wakeup_close(void *os_context);
+
+int idifunc_init(void);
+void idifunc_finit(void);
+
+void diva_user_mode_idi_remove_adapter(int);
+int diva_user_mode_idi_create_adapter(const DESCRIPTOR *, int);
+
+void divas_get_version(char *);
+void diva_get_vserial_number(PISDN_ADAPTER IoAdapter, char *buf);
+
+byte MapController(byte);
+
+int fax_head_line_time(char *buffer);
+void api_remove_complete(void);
diff --git a/drivers/isdn/hardware/eicon/diva.c 
b/drivers/isdn/hardware/eicon/diva.c
index d91dd58..9693add 100644
--- a/drivers/isdn/hardware/eicon/diva.c
+++

RE: Alignment issues with freescale FEC driver

2016-09-23 Thread Andy Duan

From: David Miller  Sent: Saturday, September 24, 2016 
10:46 AM
> To: e...@nelint.com
> Cc: and...@lunn.ch; eduma...@google.com; Andy Duan
> ; ota...@ossystems.com.br;
> netdev@vger.kernel.org; troy.ki...@boundarydevices.com;
> rmk+ker...@arm.linux.org.uk; cjb.sw.nos...@gmail.com; linux-arm-
> ker...@lists.infradead.org
> Subject: Re: Alignment issues with freescale FEC driver
> 
> From: Eric Nelson 
> Date: Fri, 23 Sep 2016 11:35:17 -0700
> 
> > From the i.MX6DQ reference manual, bit 7 of ENET_RACC says this:
> >
> > "RX FIFO Shift-16
> >
> > When this field is set, the actual frame data starts at bit 16 of the
> > first word read from the RX FIFO aligning the Ethernet payload on a
> > 32-bit boundary."
> >
> > Same for the i.MX6UL.
> >
> > I'm not sure what it will take to use this, but it seems to be exactly
> > what we're looking for.
> 
> +1

RACC[SHIFT16] just instructs the MAC to write two additional bytes in front of 
each frame received into the RX FIFO to align
the Ethernet payload on a 32-bit boundary.
Eric's patch "net: fec: support RRACC_SHIFT16 to align IP header" works fine.

The alignment issue was introduced by commits 1b7bde6d6 and c259c132a
in the net-next tree. Before these commits, there was no alignment issue.

How to fix the issue:
Solution1:  to enable HW RRACC_SHIFT16 feature (test pass):
Eric's patch  "net: fec: support RRACC_SHIFT16 to align IP header".
Solution2: include the correct prefetch() header (test pass):
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -59,7 +59,7 @@
 #include 
#include 
#include 
-#include 
+#include 
Solution3: use __netdev_alloc_skb_ip_align() instead of netdev_alloc_skb(). 
 Or: still use the previous method before commit 1b7bde6d6:
skb = netdev_alloc_skb(ndev, pkt_len - 4 + NET_IP_ALIGN);
skb_reserve(skb, NET_IP_ALIGN);

Comparing these solutions:
In terms of software effort and performance, I think these solutions are similar.  Enabling
RRACC_SHIFT16 doesn't bring any extra advantage.

Correct me if I am wrong. Thanks.

Regards,
Andy

Re: [Intel-wired-lan] [PATCH net-next v2 2/2] i40e: fix setting debug parameter early

2016-09-23 Thread Alexander Duyck

On Fri, Sep 23, 2016 at 6:30 AM, Stefan Assmann  wrote:
> pf->msg_enable is a bitmask, therefore assigning the value of the
> "debug" parameter is wrong. It is initialized again later in
> i40e_sw_init() so it didn't cause any problem, except that we missed
> early debug messages. Moved the initialization and assigned
> pf->hw.debug_mask the bitmask as that's what the driver actually uses
> in i40e_debug(). Otherwise the debug parameter is just a noop.
>
> Fixes: 5b5faa4 ("i40e: enable debug earlier")
>
> Signed-off-by: Stefan Assmann 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 56369761..f972f0d 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -8498,11 +8498,6 @@ static int i40e_sw_init(struct i40e_pf *pf)
> int err = 0;
> int size;
>
> -   pf->msg_enable = netif_msg_init(debug,
> -   NETIF_MSG_DRV|
> -   NETIF_MSG_PROBE  |
> -   NETIF_MSG_LINK);
> -
> /* Set default capability flags */
> pf->flags = I40E_FLAG_RX_CSUM_ENABLED |
> I40E_FLAG_MSI_ENABLED |
> @@ -10812,10 +10807,13 @@ static int i40e_probe(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
> mutex_init(>aq.asq_mutex);
> mutex_init(>aq.arq_mutex);
>
> -   if (debug != -1) {
> -   pf->msg_enable = pf->hw.debug_mask;
> -   pf->msg_enable = debug;
> -   }
> +   /* enable debug prints if requested */
> +   pf->msg_enable = netif_msg_init(debug,
> +   NETIF_MSG_DRV   |
> +   NETIF_MSG_PROBE |
> +   NETIF_MSG_LINK);
> +   if (debug != -1)
> +   pf->hw.debug_mask = pf->msg_enable;
>
> /* do a special CORER for clearing PXE mode once at init */
> if (hw->revision_id == 0 &&

The patch is broken, mainly because the code was already broken.  The
flags in pf->hw.debug_mask are in no way related to pf->msg_enable.
For now just use the default mask provided to populate
pf->hw.debug_mask and then the msg_enable portion is fine.

- Alex

Re: [PATCH] Revert "net: ethernet: bcmgenet: use phydev from struct net_device"

2016-09-23 Thread David Miller

From: Jaedon Shin 
Date: Sat, 24 Sep 2016 06:08:19 +0900

> This reverts commit 62469c76007e ("net: ethernet: bcmgenet: use phydev
> from struct net_device")
> 
> without this patch, we call twice bcmgenet_mii_reset, and that is intended:
> - first time from bcmgenet_power_up() to make sure the PHY is initialized
>   *before* we get to initialize the UniMAC, this is critical
> - second time from bcmgenet_mii_probe(), through the normal phy_init_hw()
> 
> with this patch, we only get to call bcmgenet_mii_reset once, in
> bcmgenet_mii_probe() because the first time in bcmgenet_power_up(),
> dev->phydev is NULL, because of a prior call to phy_disconnect() in
> bcmgenet_close(), unfortunately, there has been MAC activity, so the PHY
> gets in a bad state
> 
> Signed-off-by: Jaedon Shin 

This is needed by the ksettings commit that happened right after this
one, so if you want this reverted you have to revert both commits.

Re: [Intel-wired-lan] [PATCH net-next v2 1/2] i40e: remove superfluous I40E_DEBUG_USER statement

2016-09-23 Thread Alexander Duyck

On Fri, Sep 23, 2016 at 6:30 AM, Stefan Assmann  wrote:
> This debug statement is confusing and never set in the code. Any debug
> output should be guarded by the proper I40E_DEBUG_* statement which can
> be enabled via the debug module parameter or ethtool.
> Remove or convert the I40E_DEBUG_USER cases to I40E_DEBUG_INIT.
>
> v2: re-add setting the debug_mask in i40e_set_msglevel() so that the
> debug level can still be altered via ethtool msglvl.
>
> Signed-off-by: Stefan Assmann 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_common.c  |  3 ---
>  drivers/net/ethernet/intel/i40e/i40e_debugfs.c |  6 -
>  drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  3 +--
>  drivers/net/ethernet/intel/i40e/i40e_main.c| 35 
> +-
>  drivers/net/ethernet/intel/i40e/i40e_type.h|  2 --
>  5 files changed, 18 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
> b/drivers/net/ethernet/intel/i40e/i40e_common.c
> index 2154a34..8ccb09c 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_common.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
> @@ -3207,9 +3207,6 @@ static void i40e_parse_discover_capabilities(struct 
> i40e_hw *hw, void *buff,
> break;
> case I40E_AQ_CAP_ID_MSIX:
> p->num_msix_vectors = number;
> -   i40e_debug(hw, I40E_DEBUG_INIT,
> -  "HW Capability: MSIX vector count = %d\n",
> -  p->num_msix_vectors);
> break;
> case I40E_AQ_CAP_ID_VF_MSIX:
> p->num_msix_vectors_vf = number;

I'm assuming this is dropped because you considered it redundant with
the dump in i40e_get_capabilities.  If so it would have been nice to
see this called out in your patch description somewhere as it doesn't
jibe with the rest of the patch since you are stripping something that
is using I40E_DEBUG_INIT.

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
> b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
> index 05cf9a7..e9c6f1c 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
> @@ -1210,12 +1210,6 @@ static ssize_t i40e_dbg_command_write(struct file 
> *filp,
> u32 level;
> cnt = sscanf(_buf[10], "%i", );
> if (cnt) {
> -   if (I40E_DEBUG_USER & level) {
> -   pf->hw.debug_mask = level;
> -   dev_info(>pdev->dev,
> -"set hw.debug_mask = 0x%08x\n",
> -pf->hw.debug_mask);
> -   }
> pf->msg_enable = level;
> dev_info(>pdev->dev, "set msg_enable = 0x%08x\n",
>  pf->msg_enable);

From what I can tell the interface is completely redundant as ethtool
can already do this.  I'd say it is okay to just remove this command
and section entirely from the debugfs interface.

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
> b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
> index 1835186..02f55ab 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
> @@ -987,8 +987,7 @@ static void i40e_set_msglevel(struct net_device *netdev, 
> u32 data)
> struct i40e_netdev_priv *np = netdev_priv(netdev);
> struct i40e_pf *pf = np->vsi->back;
>
> -   if (I40E_DEBUG_USER & data)
> -   pf->hw.debug_mask = data;
> +   pf->hw.debug_mask = data;
> pf->msg_enable = data;
>  }
>

So the way I view this is that I40E_DEBUG_USER appears to be a flag
that is being used to differentiate between some proprietary flags and
the standard msg level.  The problem is that msg_enable and debug_mask
are playing off of two completely different bit definitions.  For
example how much sense does it make for NETIF_F_MSG_TX_DONE to map to
I40E_DEBUG_DCB.  If anything what should probably happen here is
instead of dropping the if there probably needs to be an else.

This is one of many things on my list of items to fix since I have
come back to Intel.  It is just a matter of finding the time.
Basically what I would really prefer to see here is us move all of the
flags in i40e_debug_mask so that we didn't have any overlap with the
NETIF_F_MSG_* flags unless there is a relation between the two.

> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 61b0fc4..56369761 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -6665,16 +6665,19 @@ static int i40e_get_capabilities(struct i40e_pf *pf)
> }
> } while (err);
>

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread David Miller

From: Eric Nelson 
Date: Fri, 23 Sep 2016 11:35:17 -0700

> From the i.MX6DQ reference manual, bit 7 of ENET_RACC says this:
> 
> "RX FIFO Shift-16
> 
> When this field is set, the actual frame data starts at bit 16 of the first
> word read from the RX FIFO aligning the Ethernet payload on a
> 32-bit boundary."
> 
> Same for the i.MX6UL.
> 
> I'm not sure what it will take to use this, but it seems to be exactly
> what we're looking for.

+1

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread David Miller

From: Eric Nelson 
Date: Fri, 23 Sep 2016 10:33:29 -0700

> Since the hardware requires longword alignment for its DMA transfers,
> aligning the IP header will require a memcpy, right?

I wish hardware designers didn't do this.

There is no conflict between DMA alignment and properly offsetting
the packet data by two bytes.

All hardware designers have to do is allow 2 padding bytes to be
emitted by the chip before the actual packet data.

Then the longword or whatever DMA transfer alignment is met
whilst still giving the necessary flexibility for where the
packet data lands.

Re: [PATCH net-next] net/vxlan: Avoid unaligned access in vxlan_build_skb()

2016-09-23 Thread Alexander Duyck

On Fri, Sep 23, 2016 at 4:41 PM, Sowmini Varadhan
 wrote:
> On (09/23/16 10:38), Alexander Duyck wrote:
>>
>> So basically what I was thinking is we do something like reserving
>> NET_IP_ALIGN and continue writing headers to skb->data, but we force
>> the tracking for the inner headers into frag[0] so that we can keep
>> the inner headers aligned without messing up the alignment for outer
>> headers.  In theory the inner offset and all that would still be
>> functional but might need a few tweaks.  You could probably even use
>> the skb->encapsulation bit to indicate you are doing this.  You could
>> almost think of it as us doing something like the inverse of
>> pskb_pull_tail.  The general idea here is we want to actually leave
>> the data in skb->data, but just reference it from frag[0] so that we
>> don't accidentally pull in the 2 byte padding for alignment when
>> transmitting the frame.
>
> yes, I think something along this line could do the trick.. I tried
> hacking it a bit today for vxlan, and it could be extended for all
> these encaps protocols. Let me fix/test this more next week, maybe
> we can discuss in Tokyo.

Agreed.  Keep in mind we only really need it for the architectures
that need to set NET_IP_ALIGN so we may want to end up wrapping the
code in ifndef checks for HAVE_EFFICIENT_UNALIGNED_ACCESS.

- Alex

Re: [PATCH net-next] net/vxlan: Avoid unaligned access in vxlan_build_skb()

2016-09-23 Thread Sowmini Varadhan

On (09/23/16 10:38), Alexander Duyck wrote:
> 
> So basically what I was thinking is we do something like reserving
> NET_IP_ALIGN and continue writing headers to skb->data, but we force
> the tracking for the inner headers into frag[0] so that we can keep
> the inner headers aligned without messing up the alignment for outer
> headers.  In theory the inner offset and all that would still be
> functional but might need a few tweaks.  You could probably even use
> the skb->encapsulation bit to indicate you are doing this.  You could
> almost think of it as us doing something like the inverse of
> pskb_pull_tail.  The general idea here is we want to actually leave
> the data in skb->data, but just reference it from frag[0] so that we
> don't accidentally pull in the 2 byte padding for alignment when
> transmitting the frame.

yes, I think something along this line could do the trick.. I tried
hacking it a bit today for vxlan, and it could be extended for all
these encaps protocols. Let me fix/test this more next week, maybe
we can discuss in Tokyo.

--Sowmini

Re: Modification to skb->queue_mapping affecting performance

2016-09-23 Thread Michael Ma

2016-09-16 15:00 GMT-07:00 Michael Ma :
> 2016-09-16 12:53 GMT-07:00 Eric Dumazet :
>> On Fri, 2016-09-16 at 10:57 -0700, Michael Ma wrote:
>>
>>> This is actually the problem - if flows from different RX queues are
>>> switched to the same RX queue in IFB, they'll use different processor
>>> context with the same tasklet, and the processor context of different
>>> tasklets might be the same. So multiple tasklets in IFB compete for
>>> the same core when queue is switched.
>>>
>>> The following simple fix proved this - with this change even switching
>>> the queue won't affect small packet bandwidth/latency anymore:
>>>
>>> in ifb.c:
>>>
>>> -   struct ifb_q_private *txp = dp->tx_private + 
>>> skb_get_queue_mapping(skb);
>>> +   struct ifb_q_private *txp = dp->tx_private +
>>> (smp_processor_id() % dev->num_tx_queues);
>>>
>>> This should be more efficient since we're not sending the task to a
>>> different processor, instead we try to queue the packet to an
>>> appropriate tasklet based on the processor ID. Will this cause any
>>> packet out-of-order problem? If packets from the same flow are queued
>>> to the same RX queue due to RSS, and processor affinity is set for RX
>>> queues, I assume packets from the same flow will end up in the same
>>> core when tasklet is scheduled. But I might have missed some uncommon
>>> cases here... Would appreciate if anyone can provide more insights.
>>
>> Wait, don't you have proper smp affinity for the RX queues on your NIC ?
>>
>> ( Documentation/networking/scaling.txt RSS IRQ Configuration )
>>
> Yes - what I was trying to say is that this change will be more
> efficient than using smp_call_function_single() to schedule the
> tasklet to a different processor.
>
> RSS IRQ should be set properly already. The issue here is that I'll
> need to switch the queue mapping for NIC RX to a different TXQ on IFB,
> which allows me to classify the flows at the IFB TXQ layer and avoid
> qdisc lock contention.
>
> When that switch happens, ideally processor core shouldn't be switched
> because all the thread context isn't changed. The work in tasklet
> should be scheduled to the same processor as well. That's why I tried
> this change. Also conceptually IFB is a software device which should
> be able to schedule its workload independent from how NIC is
> configured for the interrupt handling.
>
>> A driver ndo_start_xmit() MUST use skb_get_queue_mapping(skb), because
>> the driver queue is locked before ndo_start_xmit())  (for non
>> NETIF_F_LLTX drivers at least)
>>
>
> Thanks a lot for pointing out this! I was expecting this kind of
> guidance... Then the options would be:
>
> 1. Use smp_call_function_single() to schedule the tasklet to a core
> statically mapped to the IFB TXQ, which is very similar to how TX/RX
> IRQ is configured.

This actually won't help with the throughput because ultimately load
will still be concentrated to some particular cores after packets are
concentrated to a TXQ due to queue level classification.

> 2. As you suggested below add some additional action to do the
> rescheduling before entering IFB - for example when receiving the
> packet we could just use RSS to redirect to the desired RXQ, however
> this doesn't seem to be easy, especially compared with the way how
> mqprio chooses the queue. The challenge here is that IFB queue
> selection is based on queue_mapping when skb arrives at IFB and core
> selection is based on RXQ on NIC and so it's also based on
> queue_mapping when skb arrives at NIC. Then these two queue_mappings
> must be the same so that there is no core conflict of processing two
> TXQs of IFB. Then this essentially means we have to change queue
> mapping of the NIC on the receiver side which can't be achieved using
> TC.
>

I tried to explore this further - there is actually XPS on ifb which
can be used to specify the processor cores that will be used to
process each TXQ of ifb, however the problem is similar as above -
eventually I'll have a few cores processing these queues instead of
having all the cores processing together with relatively light
contention. And this again reduces the throughput. So there isn't a
good place to do this. The ultimate problem is that we're trying to
workaround the qdisc spin lock problem by leveraging the independence
of TXQs, but at the same time after qdisc phase we also want to
maximize the utilization of cores across whatever TXQs that are used.

>> In case of ifb, __skb_queue_tail(&txp->rq, skb); could corrupt the skb
>> list.
>>
>> In any case, you could have an action to do this before reaching IFB.
>>
>>
>>

So here is another solution - for packets coming from the NIC ingress
path the context is already a tasklet and there is no need of starting
another tasklet based on the queue selected, right? All the RQ
handling and netif_tx_stop/wakeup stuff in ifb module is unnecessary
in this case. Then we can just do transmit/receive in ifb_xmit()

Re: [PATCH 2/3] bpf powerpc: implement support for tail calls

2016-09-23 Thread Daniel Borkmann


On 09/23/2016 10:35 PM, Naveen N. Rao wrote:

Tail calls allow JIT'ed eBPF programs to call into other JIT'ed eBPF
programs. This can be achieved either by:
(1) retaining the stack setup by the first eBPF program and having all
subsequent eBPF programs re-using it, or,
(2) by unwinding/tearing down the stack and having each eBPF program
deal with its own stack as it sees fit.

To ensure that this does not create loops, there is a limit to how many
tail calls can be done (currently 32). This requires the JIT'ed code to
maintain a count of the number of tail calls done so far.

Approach (1) is simple, but requires every eBPF program to have (almost)
the same prologue/epilogue, regardless of whether they need it. This is
inefficient for small eBPF programs which may not sometimes need a
prologue at all. As such, to minimize impact of tail call
implementation, we use approach (2) here which needs each eBPF program
in the chain to use its own prologue/epilogue. This is not ideal when
many tail calls are involved and when all the eBPF programs in the chain
have similar prologue/epilogue. However, the impact is restricted to
programs that do tail calls. Individual eBPF programs are not affected.

We maintain the tail call count in a fixed location on the stack and
updated tail call count values are passed in through this. The very
first eBPF program in a chain sets this up to 0 (the first 2
instructions). Subsequent tail calls skip the first two eBPF JIT
instructions to maintain the count. For programs that don't do tail
calls themselves, the first two instructions are NOPs.

Signed-off-by: Naveen N. Rao 


Thanks for adding support, Naveen, that's really great! I think 2) seems
fine as well in this context as prologue size can vary quite a bit here,
and depending on program types likelihood of tail call usage as well (but
I wouldn't expect deep nesting). Thanks a lot!

[PATCH] Net Driver: Add Cypress GX3 VID=04b4 PID=3610.

2016-09-23 Thread chris.roth

From: Chris Roth 

From: Allan Chou 

Add support for Cypress GX3 SuperSpeed to Gigabit Ethernet
Bridge Controller (Vendor=04b4 ProdID=3610).

Patch verified on x64 linux kernel 4.7.4 system with the
Kensington SD4600P USB-C Universal Dock with Power, which uses the
Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.

A similar patch was signed-off and tested-by Allan Chou
 on 2015-12-01.

Allan verified his similar patch on x86 Linux kernel 4.1.6 system
with Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.

Tested-by: Allan Chou 
Tested-by: Chris Roth 

Signed-off-by: Allan Chou 
Signed-off-by: Chris Roth 
---
 drivers/net/usb/ax88179_178a.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index e6338c1..8a6675d 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1656,6 +1656,19 @@ static const struct driver_info ax88178a_info = {
.tx_fixup = ax88179_tx_fixup,
 };
 
+static const struct driver_info cypress_GX3_info = {
+   .description = "Cypress GX3 SuperSpeed to Gigabit Ethernet Controller",
+   .bind = ax88179_bind,
+   .unbind = ax88179_unbind,
+   .status = ax88179_status,
+   .link_reset = ax88179_link_reset,
+   .reset = ax88179_reset,
+   .stop = ax88179_stop,
+   .flags = FLAG_ETHER | FLAG_FRAMING_AX,
+   .rx_fixup = ax88179_rx_fixup,
+   .tx_fixup = ax88179_tx_fixup,
+};
+
 static const struct driver_info dlink_dub1312_info = {
.description = "D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter",
.bind = ax88179_bind,
@@ -1718,6 +1731,10 @@ static const struct usb_device_id products[] = {
USB_DEVICE(0x0b95, 0x178a),
.driver_info = (unsigned long)&ax88178a_info,
 }, {
+   /* Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller */
+   USB_DEVICE(0x04b4, 0x3610),
+   .driver_info = (unsigned long)&cypress_GX3_info,
+}, {
/* D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter */
USB_DEVICE(0x2001, 0x4a00),
.driver_info = (unsigned long)&dlink_dub1312_info,
-- 
2.7.4

[PATCH] netfilter: don't permit unprivileged writes to global state via sysctls

2016-09-23 Thread Jann Horn

This prevents the modification of nf_conntrack_max in unprivileged network
namespaces. For unprivileged network namespaces, ip_conntrack_max is kept
as a readonly sysctl in order to minimize potential compatibility issues.

This patch should apply cleanly to the net tree.

Signed-off-by: Jann Horn 
---
 net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c 
b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index ae1a71a..a639e94 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -358,6 +358,9 @@ static int ipv4_init_net(struct net *net)
if (!in->ctl_table)
return -ENOMEM;
 
+   if (net->user_ns != &init_user_ns)
+   in->ctl_table[0].mode = 0444;
+
in->ctl_table[0].data = &nf_conntrack_max;
in->ctl_table[1].data = &net->ct.count;
in->ctl_table[2].data = &nf_conntrack_htable_size;
-- 
2.1.4

[PATCH] hv_netvsc: fix comments

2016-09-23 Thread sthemmin

From: Stephen Hemminger 

Typos and spelling errors. Also remove old comment from staging era.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/hyperv_net.h |7 +++
 1 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 284b97b..d7c1cc6 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -433,7 +433,7 @@ struct nvsp_1_message_revoke_send_buffer {
  */
 struct nvsp_1_message_send_rndis_packet {
/*
-* This field is specified by RNIDS. They assume there's two different
+* This field is specified by RNDIS. They assume there's two different
 * channels of communication. However, the Network VSP only has one.
 * Therefore, the channel travels with the RNDIS packet.
 */
@@ -578,7 +578,7 @@ struct nvsp_5_send_indirect_table {
/* The number of entries in the send indirection table */
u32 count;
 
-   /* The offset of the send indireciton table from top of this struct.
+   /* The offset of the send indirection table from top of this struct.
 * The send indirection table tells which channel to put the send
 * traffic on. Each entry is a channel number.
 */
@@ -733,7 +733,6 @@ struct netvsc_device {
struct nvsp_message channel_init_pkt;
 
struct nvsp_message revoke_packet;
-   /* unsigned char HwMacAddr[HW_MACADDR_LEN]; */
 
struct vmbus_channel *chn_table[VRSS_CHANNEL_MAX];
u32 send_table[VRSS_SEND_TAB_SIZE];
@@ -1238,7 +1237,7 @@ struct rndis_message {
u32 ndis_msg_type;
 
/* Total length of this message, from the beginning */
-   /* of the sruct rndis_message, in bytes. */
+   /* of the struct rndis_message, in bytes. */
u32 msg_len;
 
/* Actual message */
-- 
1.7.4.1

Re: [PATCH net-next] Documentation: devicetree: fix typo in MediaTek ethernet device-tree binding

2016-09-23 Thread Rob Herring

On Fri, Sep 23, 2016 at 02:09:32PM +0800, sean.w...@mediatek.com wrote:
> From: Sean Wang 
> 
> fix typo in
> Documentation/devicetree/bindings/net/mediatek-net.txt
> 
> Cc: devicet...@vger.kernel.org
> Reported-by: Sergei Shtylyov 
> Signed-off-by: Sean Wang 
> ---
>  Documentation/devicetree/bindings/net/mediatek-net.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Acked-by: Rob Herring

Re: [PATCH net-next v2] Documentation: devicetree: revise ethernet device-tree binding about TRGMII

2016-09-23 Thread Rob Herring

On Fri, Sep 23, 2016 at 02:04:09PM +0800, sean.w...@mediatek.com wrote:
> From: Sean Wang 
> 
> add phy-mode "trgmii" to
> Documentation/devicetree/bindings/net/ethernet.txt
> 
> Cc: devicet...@vger.kernel.org
> Reported-by: Sergei Shtylyov 
> Signed-off-by: Sean Wang 
> ---
>  Documentation/devicetree/bindings/net/ethernet.txt | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Acked-by: Rob Herring

Re: [PATCH 3/3] bpf powerpc: add support for bpf constant blinding

2016-09-23 Thread Daniel Borkmann


On 09/23/2016 10:35 PM, Naveen N. Rao wrote:

In line with similar support for other architectures by Daniel Borkmann.

'MOD Default X' from test_bpf without constant blinding:
84 bytes emitted from JIT compiler (pass:3, flen:7)
d58a4688 + :
0:  nop
4:  nop
8:  std r27,-40(r1)
c:  std r28,-32(r1)
   10:  xor r8,r8,r8
   14:  xor r28,r28,r28
   18:  mr  r27,r3
   1c:  li  r8,66
   20:  cmpwi   r28,0
   24:  bne 0x0030
   28:  li  r8,0
   2c:  b   0x0044
   30:  divwu   r9,r8,r28
   34:  mullw   r9,r28,r9
   38:  subf    r8,r9,r8
   3c:  rotlwi  r8,r8,0
   40:  li  r8,66
   44:  ld  r27,-40(r1)
   48:  ld  r28,-32(r1)
   4c:  mr  r3,r8
   50:  blr

... and with constant blinding:
140 bytes emitted from JIT compiler (pass:3, flen:11)
dbd6ab24 + :
0:  nop
4:  nop
8:  std r27,-40(r1)
c:  std r28,-32(r1)
   10:  xor r8,r8,r8
   14:  xor r28,r28,r28
   18:  mr  r27,r3
   1c:  lis r2,-22834
   20:  ori r2,r2,36083
   24:  rotlwi  r2,r2,0
   28:  xori    r2,r2,36017
   2c:  xoris   r2,r2,42702
   30:  rotlwi  r2,r2,0
   34:  mr  r8,r2
   38:  rotlwi  r8,r8,0
   3c:  cmpwi   r28,0
   40:  bne 0x004c
   44:  li  r8,0
   48:  b   0x007c
   4c:  divwu   r9,r8,r28
   50:  mullw   r9,r28,r9
   54:  subf    r8,r9,r8
   58:  rotlwi  r8,r8,0
   5c:  lis r2,-17137
   60:  ori r2,r2,39065
   64:  rotlwi  r2,r2,0
   68:  xori    r2,r2,39131
   6c:  xoris   r2,r2,48399
   70:  rotlwi  r2,r2,0
   74:  mr  r8,r2
   78:  rotlwi  r8,r8,0
   7c:  ld  r27,-40(r1)
   80:  ld  r28,-32(r1)
   84:  mr  r3,r8
   88:  blr

Signed-off-by: Naveen N. Rao 


Acked-by: Daniel Borkmann

Re: [PATCH net] i40e: fix call of ndo_dflt_bridge_getlink()

2016-09-23 Thread Jeff Kirsher

On Fri, 2016-09-23 at 11:12 +0200, Nicolas Dichtel wrote:
> Le 19/09/2016 à 18:14, Nicolas Dichtel a écrit :
> > From: Huaibin Wang 
> > 
> > Order of arguments is wrong.
> > The wrong code has been introduced by commit 7d4f8d871ab1, but is
> compiled
> > only since commit 9df70b66418e.
> > 
> > Note that this may break netlink dumps.
> > 
> > Fixes: 9df70b66418e ("i40e: Remove incorrect #ifdef's")
> > Fixes: 7d4f8d871ab1 ("switchdev; add VLAN support for port's
> bridge_getlink")
> > CC: Scott Feldman 
> > CC: Carolyn Wyborny 
> > CC: Catherine Sullivan 
> > Signed-off-by: Huaibin Wang 
> > Signed-off-by: Nicolas Dichtel 
> Hi Jeff,
> 
> any news about this patch? David has marked it "awaiting upstream" on the
> patchwork, so I understand it should go to your tree.

Yes, it needs to go through my tree.   Please send it to the
intel-wired-lan@lists.osuosl.org mailing list, that way I can track it through
our patchwork projects.  Also you can trim Scott Feldman and Catherine Sullivan
from the CC list.

http://patchwork.ozlabs.org/project/intel-wired-lan/list/

signature.asc
Description: This is a digitally signed message part

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread John Fastabend

On 16-09-23 01:45 PM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 13:25:10 -0700, John Fastabend wrote:
>> On 16-09-23 01:17 PM, Jakub Kicinski wrote:
>>> On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:  
>>>> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:  
>>  [...]  
>>  [...]  
>>>>
>>>> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
>>>> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
>>>> queue.
>>>> Also, it is not passed all the way to the driver specific xmit routine.  
>>>> Doesn't it require
>>>> changing all the driver xmit routines if we want to pass this parameter?
>>>>
>>  [...]  
>>>>
>>>> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
>>>> we would like
>>>> to use the PF queues for the xmit.
>>>> I was also looking into some way of passing the port id via skb 
>>>> parameter to the
>>>> dev_queue_xmit() call so that the PF xmit routine can do a directed 
>>>> transmit to a specifc VF.
>>>> Is skb->cb an option to pass this info?
>>>> dst_metadata approach would work  too if it is acceptable.  
>>>
>>> I don't think we can trust skb->cb to be set to anything meaningful
>>> when the skb is received by the lower device. 
>>
>> Agreed. I wouldn't recommend using skb->cb. How about passing it through
>> dev_queue_xmit_accel() through to the driver?
>>
>> If you pass the metadata through the dev_queue_xmit_accel() handle tx
>> queue  selection would work using normal mechanisms (xps, select_queue,
>> cls  hook, etc.). If you wanted to pick some specific queue based on
>> policy the policy could be loaded into one of those hooks.
> 
> Do you mean without extending how accel is handled by
> dev_queue_xmit_accel() today?  If my goal is to not have extra HW
> queues then I don't see how I could mux in the lower dev without extra
> locking (as I tried to explain two emails ago).  Sorry for being slow
> here :(
> 

Not slow here I think I was overly optimistic...

Yeh let me try this, roughly the current flow is,

   dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv);
   __dev_queue_xmit(skb, accel_priv);
   netdev_pick_tx(dev, skb, accel_priv);
ndo_select_queue(dev, skb, accel_priv, ...);
   [...]
   q->enqueue();
   [...]
   dev_hard_start_xmit();
   [...]

So in this flow the VFR netdev driver handles its xmit routine by
calling dev_queue_xmit_accel after setting skb->dev to the physical
device and passing a cookie via accel that the select_queue() routine
can use to pick a tx queue. The rest of the stack q->enqueue() and
friends will ensure that locking and qdisc is handled correctly.

But accel_priv was lost at queue selection and so its not being passed
down to the driver so no way to set your descriptor bits or whatever
needed to push to the VF. I was sort of thinking we could map it from
the select_queue routine but I can't figure out how to do that either.

The metadata idea doesn't seem that bad now that I've spent some more
time going through it. Either that or hijack some field in the skb but
I think that might be worse than the proposal here.

I'm trying to think up some other alternative now and will let you know
if I think of anything clever but got nothing at the moment.

.John

Re: [PATCH] softirq: let ksoftirqd do its job

2016-09-23 Thread Peter Zijlstra

On Fri, Sep 23, 2016 at 06:51:04PM +0200, Jesper Dangaard Brouer wrote:

> This is your git tree, right:
>  https://git.kernel.org/cgit/linux/kernel/git/peterz/queue.git/
> 
> Doesn't look like you pushed it yet, or do I need to look at a specific
> branch?

I mainly work from a local quilt queue which I feed to mingo. I
occasionally push out to get build-bot coverage or have people look at
bits I poked together.

That said, I'll try and do a push later tonight.

Do note however, that git tree is a complete wipe and rebuild, don't
expect any kind of continuity from it.

[PATCH] Revert "net: ethernet: bcmgenet: use phydev from struct net_device"

2016-09-23 Thread Jaedon Shin

This reverts commit 62469c76007e ("net: ethernet: bcmgenet: use phydev
from struct net_device")

without this patch, we call twice bcmgenet_mii_reset, and that is intended:
- first time from bcmgenet_power_up() to make sure the PHY is initialized
  *before* we get to initialize the UniMAC, this is critical
- second time from bcmgenet_mii_probe(), through the normal phy_init_hw()

with this patch, we only get to call bcmgenet_mii_reset once, in
bcmgenet_mii_probe() because the first time in bcmgenet_power_up(),
dev->phydev is NULL, because of a prior call to phy_disconnect() in
bcmgenet_close(), unfortunately, there has been MAC activity, so the PHY
gets in a bad state

Signed-off-by: Jaedon Shin 
---
 drivers/net/ethernet/broadcom/genet/bcmgenet.c | 45 ++
 drivers/net/ethernet/broadcom/genet/bcmgenet.h |  1 +
 drivers/net/ethernet/broadcom/genet/bcmmii.c   | 24 +++---
 3 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c 
b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
index 8d4f8495dbb3..541456398dfb 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
@@ -453,25 +453,29 @@ static inline void bcmgenet_rdma_ring_writel(struct 
bcmgenet_priv *priv,
 static int bcmgenet_get_settings(struct net_device *dev,
 struct ethtool_cmd *cmd)
 {
+   struct bcmgenet_priv *priv = netdev_priv(dev);
+
if (!netif_running(dev))
return -EINVAL;
 
-   if (!dev->phydev)
+   if (!priv->phydev)
return -ENODEV;
 
-   return phy_ethtool_gset(dev->phydev, cmd);
+   return phy_ethtool_gset(priv->phydev, cmd);
 }
 
 static int bcmgenet_set_settings(struct net_device *dev,
 struct ethtool_cmd *cmd)
 {
+   struct bcmgenet_priv *priv = netdev_priv(dev);
+
if (!netif_running(dev))
return -EINVAL;
 
-   if (!dev->phydev)
+   if (!priv->phydev)
return -ENODEV;
 
-   return phy_ethtool_sset(dev->phydev, cmd);
+   return phy_ethtool_sset(priv->phydev, cmd);
 }
 
 static int bcmgenet_set_rx_csum(struct net_device *dev,
@@ -937,7 +941,7 @@ static int bcmgenet_get_eee(struct net_device *dev, struct 
ethtool_eee *e)
e->eee_active = p->eee_active;
e->tx_lpi_timer = bcmgenet_umac_readl(priv, UMAC_EEE_LPI_TIMER);
 
-   return phy_ethtool_get_eee(dev->phydev, e);
+   return phy_ethtool_get_eee(priv->phydev, e);
 }
 
 static int bcmgenet_set_eee(struct net_device *dev, struct ethtool_eee *e)
@@ -954,7 +958,7 @@ static int bcmgenet_set_eee(struct net_device *dev, struct 
ethtool_eee *e)
if (!p->eee_enabled) {
bcmgenet_eee_enable_set(dev, false);
} else {
-   ret = phy_init_eee(dev->phydev, 0);
+   ret = phy_init_eee(priv->phydev, 0);
if (ret) {
netif_err(priv, hw, dev, "EEE initialization failed\n");
return ret;
@@ -964,12 +968,14 @@ static int bcmgenet_set_eee(struct net_device *dev, 
struct ethtool_eee *e)
bcmgenet_eee_enable_set(dev, true);
}
 
-   return phy_ethtool_set_eee(dev->phydev, e);
+   return phy_ethtool_set_eee(priv->phydev, e);
 }
 
 static int bcmgenet_nway_reset(struct net_device *dev)
 {
-   return genphy_restart_aneg(dev->phydev);
+   struct bcmgenet_priv *priv = netdev_priv(dev);
+
+   return genphy_restart_aneg(priv->phydev);
 }
 
 /* standard ethtool support functions. */
@@ -996,13 +1002,12 @@ static struct ethtool_ops bcmgenet_ethtool_ops = {
 static int bcmgenet_power_down(struct bcmgenet_priv *priv,
enum bcmgenet_power_mode mode)
 {
-   struct net_device *ndev = priv->dev;
int ret = 0;
u32 reg;
 
switch (mode) {
case GENET_POWER_CABLE_SENSE:
-   phy_detach(ndev->phydev);
+   phy_detach(priv->phydev);
break;
 
case GENET_POWER_WOL_MAGIC:
@@ -1063,6 +1068,7 @@ static void bcmgenet_power_up(struct bcmgenet_priv *priv,
 /* ioctl handle special commands that are not present in ethtool. */
 static int bcmgenet_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
+   struct bcmgenet_priv *priv = netdev_priv(dev);
int val = 0;
 
if (!netif_running(dev))
@@ -1072,10 +1078,10 @@ static int bcmgenet_ioctl(struct net_device *dev, 
struct ifreq *rq, int cmd)
case SIOCGMIIPHY:
case SIOCGMIIREG:
case SIOCSMIIREG:
-   if (!dev->phydev)
+   if (!priv->phydev)
val = -ENODEV;
else
-   val = phy_mii_ioctl(dev->phydev, rq, cmd);
+   val = phy_mii_ioctl(priv->phydev, rq, cmd);
break;
 
default:
@@ -2458,7 +2464,6 @@

Re: [net-next 5/5] PCI: disable FLR for 82579 device

2016-09-23 Thread Jeff Kirsher

On Fri, 2016-09-23 at 09:01 -0500, Bjorn Helgaas wrote:
> On Thu, Sep 22, 2016 at 11:39:01PM -0700, Jeff Kirsher wrote:
> > 
> > From: Sasha Neftin 
> > 
> > 82579 has a problem reattaching itself after the device is detached.
> > The bug was reported by Redhat. The suggested fix is to disable
> > FLR capability in PCIe configuration space.
> > 
> > Reproduction:
> > Attach the device to a VM, then detach and try to attach again.
> > 
> > Fix:
> > Disable FLR capability to prevent the 82579 from hanging.
> 
> Is there a bugzilla or other reference URL to include here?  Should
> this be marked for stable?

So the author is in Israel, meaning it is their weekend now.  I do not
believe Sasha monitors email over the weekend, so a response to your
questions won't happen for a few days.

I tried searching my archives for more information, but had no luck finding
any additional information.

> > Signed-off-by: Sasha Neftin 
> > Tested-by: Aaron Brown 
> > Signed-off-by: Jeff Kirsher 
> > ---
> >  drivers/pci/quirks.c | 21 +
> >  1 file changed, 21 insertions(+)
> > 
> > diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
> > index 44e0ff3..59fba6e 100644
> > --- a/drivers/pci/quirks.c
> > +++ b/drivers/pci/quirks.c
> > @@ -4431,3 +4431,24 @@ static void quirk_intel_qat_vf_cap(struct
> > pci_dev *pdev)
> >     }
> >  }
> >  DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x443,
> > quirk_intel_qat_vf_cap);
> > +/*
> > + * Workaround FLR issues for 82579
> > + * This code disables the FLR (Function Level Reset) via PCIe, in
> > order
> > + * to workaround a bug found while using device passthrough, where the
> > + * interface would become non-responsive.
> > + * NOTE: the FLR bit is Read/Write Once (RWO) in config space, so if
> > + * the BIOS or kernel writes this register * then this workaround will
> > + * not work.
> 
> This doesn't sound like a root cause.  Is the issue a hardware
> erratum?  Linux PCI core bug?  VFIO bug?  Device firmware bug?
> 
> The changelog suggests that the problem only affects passthrough,
> which suggests some sort of kernel bug related to how passthrough is
> implemented.
> 
> > 
> > + */
> > +static void quirk_intel_flr_cap_dis(struct pci_dev *dev)
> > +{
> > +   int pos = pci_find_capability(dev, PCI_CAP_ID_AF);
> > +   if (pos) {
> > +   u8 cap;
> > +   pci_read_config_byte(dev, pos + PCI_AF_CAP, &cap);
> > +   cap = cap & (~PCI_AF_CAP_FLR);
> > +   pci_write_config_byte(dev, pos + PCI_AF_CAP, cap);
> > +   }
> > +}
> > +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1502,
> > quirk_intel_flr_cap_dis);
> > +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x1503,
> > quirk_intel_flr_cap_dis);
> > -- 
> > 2.7.4
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> > the body of a message to majord...@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: This is a digitally signed message part

Re: [PATCH] net: bcmgenet: Fix EPHY reset in power up

2016-09-23 Thread Jaedon Shin

Hi Florian,

> On 24 Sep 2016, at 1:54 AM, Florian Fainelli  wrote:
> 
> On 09/23/2016 08:04 AM, Jaedon Shin wrote:
>> Hi Andrew,
>> 
>> On 23 Sep 2016, at 11:06 PM, Andrew Lunn  wrote:
>>> 
>>> On Fri, Sep 23, 2016 at 10:20:04PM +0900, Jaedon Shin wrote:
>>>> The bcmgenet_mii_reset() is always not running in power up sequence
>>>> after 'commit 62469c76007e ("net: ethernet: bcmgenet: use phydev from
>>>> struct net_device")'. This'll show extremely high latency and duplicate
>>>> packets while interface down and up repeatedly.
>>>>
>>>> For now, adds again a private phydev for mii reset when runs power up to
>>>> open interface.
>>> 
>>> Hi Jaedon
>>> 
>>> How does this fix the issue? It sounds like you are papering over the
>>> crack, not truly fixing it.
>>> 
>>>  Andrew
>> 
>> Yes, it feels like a workaround, but I think it is needed for the v4.8
>> stable version. If we find a better way to initialize the internal PHY
>> after re-opening the interface, this patch will be dropped.
> 
> I can observe the faulting behavior with 4.8-rc7 that the link below
> fixed initially:
> 
> # ping fainelli-linux
> PING fainelli-linux (10.112.156.244): 56 data bytes
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.352 ms
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.472 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.496 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.517 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.536 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.557 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=752.448 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.291 ms
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.421 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.444 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.464 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.483 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.505 ms (DUP!)
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=24.964 ms (DUP!)
> 
> If we revert this patch, we indeed get the normal and expected behavior
> back:
> 
> # ping fainelli-linux
> PING fainelli-linux (10.112.156.244): 56 data bytes
> 64 bytes from 10.112.156.244: seq=0 ttl=61 time=0.417 ms
> 64 bytes from 10.112.156.244: seq=1 ttl=61 time=0.415 ms
> 64 bytes from 10.112.156.244: seq=2 ttl=61 time=0.424 ms
> 
> Actually, the key thing is this:
> 
> - without Philippe's patch we call twice bcmgenet_mii_reset, and that is
> intended:
>   - first time from bcmgenet_power_up() to make sure the PHY is
> initialized *before* we get to initialize the UniMAC, this is critical
>   - second time from bcmgenet_mii_probe(), through the normal 
> phy_init_hw()
> 
> - with Philippe's patch, we only get to call bcmgenet_mii_reset once, in
> bcmgenet_mii_probe() because the first time in bcmgenet_power_up(),
> dev->phydev is NULL, because of a prior call to phy_disconnect() in
> bcmgenet_close(), unfortunately, there has been MAC activity, so the PHY
> gets in a bad state
> 
> Jaedon, feel free to use the explanation above, and send a plain revert
> of commit 62469c76007e11428e2ee3c6de90cbe74b588d44.
> 

Will send revert patch.

Thanks,
Jaedon

> Thanks!
> 
> Thanks!
> -- 
> Florian

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread Jakub Kicinski

On Fri, 23 Sep 2016 13:25:10 -0700, John Fastabend wrote:
> On 16-09-23 01:17 PM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:  
> >> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:  
>  [...]  
>  [...]  
> >>
> >> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
> >> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
> >> queue.
> >> Also, it is not passed all the way to the driver specific xmit routine.  
> >> Doesn't it require
> >> changing all the driver xmit routines if we want to pass this parameter?
> >>  
>  [...]  
> >>
> >> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
> >> we would like
> >> to use the PF queues for the xmit.
> >> I was also looking into some way of passing the port id via skb 
> >> parameter to the
> >> dev_queue_xmit() call so that the PF xmit routine can do a directed 
> >> transmit to a specifc VF.
> >> Is skb->cb an option to pass this info?
> >> dst_metadata approach would work  too if it is acceptable.  
> > 
> > I don't think we can trust skb->cb to be set to anything meaningful
> > when the skb is received by the lower device. 
> 
> Agreed. I wouldn't recommend using skb->cb. How about passing it through
> dev_queue_xmit_accel() through to the driver?
> 
> If you pass the metadata through the dev_queue_xmit_accel() handle tx
> queue  selection would work using normal mechanisms (xps, select_queue,
> cls  hook, etc.). If you wanted to pick some specific queue based on
> policy the policy could be loaded into one of those hooks.

Do you mean without extending how accel is handled by
dev_queue_xmit_accel() today?  If my goal is to not have extra HW
queues then I don't see how I could mux in the lower dev without extra
locking (as I tried to explain two emails ago).  Sorry for being slow
here :(

[PATCH 1/2] bpf samples: fix compiler errors with sockex2 and sockex3

2016-09-23 Thread Naveen N. Rao

These samples fail to compile as 'struct flow_keys' conflicts with
definition in net/flow_dissector.h. Fix the same by renaming the
structure used in the sample.

Signed-off-by: Naveen N. Rao 
---
 samples/bpf/sockex2_kern.c | 10 +-
 samples/bpf/sockex3_kern.c |  8 
 samples/bpf/sockex3_user.c |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/samples/bpf/sockex2_kern.c b/samples/bpf/sockex2_kern.c
index ba0e177..44e5846 100644
--- a/samples/bpf/sockex2_kern.c
+++ b/samples/bpf/sockex2_kern.c
@@ -14,7 +14,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto;
 };
 
-struct flow_keys {
+struct bpf_flow_keys {
__be32 src;
__be32 dst;
union {
@@ -59,7 +59,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, 
__u64 off)
 }
 
 static inline __u64 parse_ip(struct __sk_buff *skb, __u64 nhoff, __u64 
*ip_proto,
-struct flow_keys *flow)
+struct bpf_flow_keys *flow)
 {
__u64 verlen;
 
@@ -83,7 +83,7 @@ static inline __u64 parse_ip(struct __sk_buff *skb, __u64 
nhoff, __u64 *ip_proto
 }
 
 static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 nhoff, __u64 
*ip_proto,
-  struct flow_keys *flow)
+  struct bpf_flow_keys *flow)
 {
*ip_proto = load_byte(skb,
  nhoff + offsetof(struct ipv6hdr, nexthdr));
@@ -96,7 +96,7 @@ static inline __u64 parse_ipv6(struct __sk_buff *skb, __u64 
nhoff, __u64 *ip_pro
return nhoff;
 }
 
-static inline bool flow_dissector(struct __sk_buff *skb, struct flow_keys 
*flow)
+static inline bool flow_dissector(struct __sk_buff *skb, struct bpf_flow_keys 
*flow)
 {
__u64 nhoff = ETH_HLEN;
__u64 ip_proto;
@@ -198,7 +198,7 @@ struct bpf_map_def SEC("maps") hash_map = {
 SEC("socket2")
 int bpf_prog2(struct __sk_buff *skb)
 {
-   struct flow_keys flow;
+   struct bpf_flow_keys flow;
struct pair *value;
u32 key;
 
diff --git a/samples/bpf/sockex3_kern.c b/samples/bpf/sockex3_kern.c
index 41ae2fd..95907f8 100644
--- a/samples/bpf/sockex3_kern.c
+++ b/samples/bpf/sockex3_kern.c
@@ -61,7 +61,7 @@ struct vlan_hdr {
__be16 h_vlan_encapsulated_proto;
 };
 
-struct flow_keys {
+struct bpf_flow_keys {
__be32 src;
__be32 dst;
union {
@@ -88,7 +88,7 @@ static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, 
__u64 off)
 }
 
 struct globals {
-   struct flow_keys flow;
+   struct bpf_flow_keys flow;
 };
 
 struct bpf_map_def SEC("maps") percpu_map = {
@@ -114,14 +114,14 @@ struct pair {
 
 struct bpf_map_def SEC("maps") hash_map = {
.type = BPF_MAP_TYPE_HASH,
-   .key_size = sizeof(struct flow_keys),
+   .key_size = sizeof(struct bpf_flow_keys),
.value_size = sizeof(struct pair),
.max_entries = 1024,
 };
 
 static void update_stats(struct __sk_buff *skb, struct globals *g)
 {
-   struct flow_keys key = g->flow;
+   struct bpf_flow_keys key = g->flow;
struct pair *value;
 
value = bpf_map_lookup_elem(&hash_map, &key);
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index d4184ab..3fcfd8c4 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -7,7 +7,7 @@
 #include 
 #include 
 
-struct flow_keys {
+struct bpf_flow_keys {
__be32 src;
__be32 dst;
union {
@@ -49,7 +49,7 @@ int main(int argc, char **argv)
(void) f;
 
for (i = 0; i < 5; i++) {
-   struct flow_keys key = {}, next_key;
+   struct bpf_flow_keys key = {}, next_key;
struct pair value;
 
sleep(1);
-- 
2.9.3

[PATCH 2/2] bpf samples: update tracex5 sample to use __seccomp_filter

2016-09-23 Thread Naveen N. Rao

seccomp_phase1() does not exist anymore. Instead, update sample to use
__seccomp_filter(). While at it, set max locked memory to unlimited.

Signed-off-by: Naveen N. Rao 
---
I am not completely sure if __seccomp_filter is the right place to hook
in. This works for me though. Please review.

Thanks,
Naveen


 samples/bpf/tracex5_kern.c | 16 +++-
 samples/bpf/tracex5_user.c |  3 +++
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
index f95f232..fd12d71 100644
--- a/samples/bpf/tracex5_kern.c
+++ b/samples/bpf/tracex5_kern.c
@@ -19,20 +19,18 @@ struct bpf_map_def SEC("maps") progs = {
.max_entries = 1024,
 };
 
-SEC("kprobe/seccomp_phase1")
+SEC("kprobe/__seccomp_filter")
 int bpf_prog1(struct pt_regs *ctx)
 {
-   struct seccomp_data sd;
-
-   bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx));
+   int sc_nr = (int)PT_REGS_PARM1(ctx);
 
/* dispatch into next BPF program depending on syscall number */
-   bpf_tail_call(ctx, &progs, sd.nr);
+   bpf_tail_call(ctx, &progs, sc_nr);
 
/* fall through -> unknown syscall */
-   if (sd.nr >= __NR_getuid && sd.nr <= __NR_getsid) {
+   if (sc_nr >= __NR_getuid && sc_nr <= __NR_getsid) {
char fmt[] = "syscall=%d (one of get/set uid/pid/gid)\n";
-   bpf_trace_printk(fmt, sizeof(fmt), sd.nr);
+   bpf_trace_printk(fmt, sizeof(fmt), sc_nr);
}
return 0;
 }
@@ -42,7 +40,7 @@ PROG(__NR_write)(struct pt_regs *ctx)
 {
struct seccomp_data sd;
 
-   bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx));
+   bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
if (sd.args[2] == 512) {
char fmt[] = "write(fd=%d, buf=%p, size=%d)\n";
bpf_trace_printk(fmt, sizeof(fmt),
@@ -55,7 +53,7 @@ PROG(__NR_read)(struct pt_regs *ctx)
 {
struct seccomp_data sd;
 
-   bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM1(ctx));
+   bpf_probe_read(&sd, sizeof(sd), (void *)PT_REGS_PARM2(ctx));
if (sd.args[2] > 128 && sd.args[2] <= 1024) {
char fmt[] = "read(fd=%d, buf=%p, size=%d)\n";
bpf_trace_printk(fmt, sizeof(fmt),
diff --git a/samples/bpf/tracex5_user.c b/samples/bpf/tracex5_user.c
index a04dd3c..36b5925 100644
--- a/samples/bpf/tracex5_user.c
+++ b/samples/bpf/tracex5_user.c
@@ -6,6 +6,7 @@
 #include 
 #include "libbpf.h"
 #include "bpf_load.h"
+#include <sys/resource.h>
 
 /* install fake seccomp program to enable seccomp code path inside the kernel,
  * so that our kprobe attached to seccomp_phase1() can be triggered
@@ -27,8 +28,10 @@ int main(int ac, char **argv)
 {
FILE *f;
char filename[256];
+   struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
 
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+   setrlimit(RLIMIT_MEMLOCK, &r);
 
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
-- 
2.9.3

Re: [PATCH net v2] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Lance Richardson

> From: "Sergei Shtylyov" 
> To: "Lance Richardson" , netdev@vger.kernel.org
> Cc: "shmulik ladkani" , jb...@redhat.com
> Sent: Friday, September 23, 2016 4:01:15 PM
> Subject: Re: [PATCH net v2] ip6_gre: fix flowi6_proto value in 
> ip6gre_xmit_other()
> 
> Hello.
> 
> On 09/23/2016 10:50 PM, Lance Richardson wrote:
> 
> > Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
> > xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.
> >
> > Up until now, ip6gre_xmit_other() has set flowi6_proto to a bogus value.
> > This affected output route lookup for packets sent on an ip6gretap device
> > in cases where routing was dependent on the value of flowi6_proto.
> >
> > Since the correct proto is already set in the tunnel flowi6 template via
> > commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
> > path."), simply delete the line setting the incorrect flowi6_proto value.
> >
> > Suggested-by: Jiri Benc 
> > Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")
> 
> That "commit" isn't needed here, this tag has a standardized format.
> Hopefully, can be fixed while applying...

Thanks for pointing that out, I mistakenly added that "commit" after
checkpatch.pl complained about not having "commit" before the hashes
in the log. Hoping it can be fixed when applying as well.

> 
> > Reviewed-by: Shmulik Ladkani 
> > Signed-off-by: Lance Richardson 
> [...]
> 
> MBR, Sergei
> 
>

[PATCH 3/3] bpf powerpc: add support for bpf constant blinding

2016-09-23 Thread Naveen N. Rao

In line with similar support for other architectures by Daniel Borkmann.

'MOD Default X' from test_bpf without constant blinding:
84 bytes emitted from JIT compiler (pass:3, flen:7)
d58a4688 + :
   0:   nop
   4:   nop
   8:   std r27,-40(r1)
   c:   std r28,-32(r1)
  10:   xor r8,r8,r8
  14:   xor r28,r28,r28
  18:   mr  r27,r3
  1c:   li  r8,66
  20:   cmpwi   r28,0
  24:   bne 0x0030
  28:   li  r8,0
  2c:   b   0x0044
  30:   divwu   r9,r8,r28
  34:   mullw   r9,r28,r9
  38:   subf    r8,r9,r8
  3c:   rotlwi  r8,r8,0
  40:   li  r8,66
  44:   ld  r27,-40(r1)
  48:   ld  r28,-32(r1)
  4c:   mr  r3,r8
  50:   blr

... and with constant blinding:
140 bytes emitted from JIT compiler (pass:3, flen:11)
dbd6ab24 + :
   0:   nop
   4:   nop
   8:   std r27,-40(r1)
   c:   std r28,-32(r1)
  10:   xor r8,r8,r8
  14:   xor r28,r28,r28
  18:   mr  r27,r3
  1c:   lis r2,-22834
  20:   ori r2,r2,36083
  24:   rotlwi  r2,r2,0
  28:   xori    r2,r2,36017
  2c:   xoris   r2,r2,42702
  30:   rotlwi  r2,r2,0
  34:   mr  r8,r2
  38:   rotlwi  r8,r8,0
  3c:   cmpwi   r28,0
  40:   bne 0x004c
  44:   li  r8,0
  48:   b   0x007c
  4c:   divwu   r9,r8,r28
  50:   mullw   r9,r28,r9
  54:   subf    r8,r9,r8
  58:   rotlwi  r8,r8,0
  5c:   lis r2,-17137
  60:   ori r2,r2,39065
  64:   rotlwi  r2,r2,0
  68:   xori    r2,r2,39131
  6c:   xoris   r2,r2,48399
  70:   rotlwi  r2,r2,0
  74:   mr  r8,r2
  78:   rotlwi  r8,r8,0
  7c:   ld  r27,-40(r1)
  80:   ld  r28,-32(r1)
  84:   mr  r3,r8
  88:   blr

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit64.h  |  9 +
 arch/powerpc/net/bpf_jit_comp64.c | 36 +---
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 038e00b..62fa758 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -39,10 +39,10 @@
 #ifndef __ASSEMBLY__
 
 /* BPF register usage */
-#define SKB_HLEN_REG   (MAX_BPF_REG + 0)
-#define SKB_DATA_REG   (MAX_BPF_REG + 1)
-#define TMP_REG_1  (MAX_BPF_REG + 2)
-#define TMP_REG_2  (MAX_BPF_REG + 3)
+#define SKB_HLEN_REG   (MAX_BPF_JIT_REG + 0)
+#define SKB_DATA_REG   (MAX_BPF_JIT_REG + 1)
+#define TMP_REG_1  (MAX_BPF_JIT_REG + 2)
+#define TMP_REG_2  (MAX_BPF_JIT_REG + 3)
 
 /* BPF to ppc register mappings */
 static const int b2p[] = {
@@ -62,6 +62,7 @@ static const int b2p[] = {
/* frame pointer aka BPF_REG_10 */
[BPF_REG_FP] = 31,
/* eBPF jit internal registers */
+   [BPF_REG_AX] = 2,
[SKB_HLEN_REG] = 25,
[SKB_DATA_REG] = 26,
[TMP_REG_1] = 9,
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 3ec29d6..0fe98a5 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -974,21 +974,37 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
int pass;
int flen;
struct bpf_binary_header *bpf_hdr;
+   struct bpf_prog *org_fp = fp;
+   struct bpf_prog *tmp_fp;
+   bool bpf_blinded = false;
 
if (!bpf_jit_enable)
-   return fp;
+   return org_fp;
+
+   tmp_fp = bpf_jit_blind_constants(org_fp);
+   if (IS_ERR(tmp_fp))
+   return org_fp;
+
+   if (tmp_fp != org_fp) {
+   bpf_blinded = true;
+   fp = tmp_fp;
+   }
 
flen = fp->len;
addrs = kzalloc((flen+1) * sizeof(*addrs), GFP_KERNEL);
-   if (addrs == NULL)
-   return fp;
+   if (addrs == NULL) {
+   fp = org_fp;
+   goto out;
+   }
+
+   memset(&cgctx, 0, sizeof(struct codegen_context));
 
-   cgctx.idx = 0;
-   cgctx.seen = 0;
/* Scouting faux-generate pass 0 */
-   if (bpf_jit_build_body(fp, 0, &cgctx, addrs))
+   if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
/* We hit something illegal or unsupported. */
+   fp = org_fp;
goto out;
+   }
 
/*
 * Pretend to build prologue, given the features we've seen.  This will
@@ -1003,8 +1019,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 
bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4,
bpf_jit_fill_ill_insns);
-   if (!bpf_hdr)
+   if (!bpf_hdr) {
+   fp = org_fp;
goto out;
+   }
 
code_base = (u32 *)(image + FUNCTION_DESCR_SIZE);
 
@@ -1041,6 +1059,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 
 out:
kfree(addrs);
+
+   if (bpf_blinded)
+   bpf_jit_prog_release_other(fp, fp == org_fp ? tmp_fp : org_fp);
+
return fp;
 }
 
-- 
2.9.3

[PATCH 2/3] bpf powerpc: implement support for tail calls

2016-09-23 Thread Naveen N. Rao

Tail calls allow JIT'ed eBPF programs to call into other JIT'ed eBPF
programs. This can be achieved either by:
(1) retaining the stack setup by the first eBPF program and having all
subsequent eBPF programs re-using it, or,
(2) by unwinding/tearing down the stack and having each eBPF program
deal with its own stack as it sees fit.

To ensure that this does not create loops, there is a limit to how many
tail calls can be done (currently 32). This requires the JIT'ed code to
maintain a count of the number of tail calls done so far.

Approach (1) is simple, but requires every eBPF program to have (almost)
the same prologue/epilogue, regardless of whether they need it. This is
inefficient for small eBPF programs which may sometimes not need a
prologue at all. As such, to minimize impact of tail call
implementation, we use approach (2) here which needs each eBPF program
in the chain to use its own prologue/epilogue. This is not ideal when
many tail calls are involved and when all the eBPF programs in the chain
have similar prologue/epilogue. However, the impact is restricted to
programs that do tail calls. Individual eBPF programs are not affected.

We maintain the tail call count in a fixed location on the stack and
updated tail call count values are passed in through this. The very
first eBPF program in a chain sets this up to 0 (the first 2
instructions). Subsequent tail calls skip the first two eBPF JIT
instructions to maintain the count. For programs that don't do tail
calls themselves, the first two instructions are NOPs.

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h |   2 +
 arch/powerpc/net/bpf_jit.h|   2 +
 arch/powerpc/net/bpf_jit64.h  |   1 +
 arch/powerpc/net/bpf_jit_comp64.c | 149 +++---
 4 files changed, 126 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 127ebf5..54ff8ce 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -236,6 +236,7 @@
 #define PPC_INST_STWU  0x9400
 #define PPC_INST_MFLR  0x7c0802a6
 #define PPC_INST_MTLR  0x7c0803a6
+#define PPC_INST_MTCTR 0x7c0903a6
 #define PPC_INST_CMPWI 0x2c00
 #define PPC_INST_CMPDI 0x2c20
 #define PPC_INST_CMPW  0x7c00
@@ -250,6 +251,7 @@
 #define PPC_INST_SUB   0x7c50
 #define PPC_INST_BLR   0x4e800020
 #define PPC_INST_BLRL  0x4e800021
+#define PPC_INST_BCTR  0x4e800420
 #define PPC_INST_MULLD 0x7c0001d2
 #define PPC_INST_MULLW 0x7c0001d6
 #define PPC_INST_MULHWU0x7c16
diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index d5301b6..89f7007 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -40,6 +40,8 @@
 #define PPC_BLR()  EMIT(PPC_INST_BLR)
 #define PPC_BLRL() EMIT(PPC_INST_BLRL)
 #define PPC_MTLR(r)EMIT(PPC_INST_MTLR | ___PPC_RT(r))
+#define PPC_BCTR() EMIT(PPC_INST_BCTR)
+#define PPC_MTCTR(r)   EMIT(PPC_INST_MTCTR | ___PPC_RT(r))
 #define PPC_ADDI(d, a, i)  EMIT(PPC_INST_ADDI | ___PPC_RT(d) |   \
 ___PPC_RA(a) | IMM_L(i))
 #define PPC_MR(d, a)   PPC_OR(d, a, a)
diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index a1645d7..038e00b 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -88,6 +88,7 @@ DECLARE_LOAD_FUNC(sk_load_byte);
 #define SEEN_FUNC  0x1000 /* might call external helpers */
 #define SEEN_STACK 0x2000 /* uses BPF stack */
 #define SEEN_SKB   0x4000 /* uses sk_buff */
+#define SEEN_TAILCALL  0x8000 /* uses tail calls */
 
 struct codegen_context {
/*
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 5f8c91f..3ec29d6 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "bpf_jit64.h"
 
@@ -77,6 +78,11 @@ static int bpf_jit_stack_local(struct codegen_context *ctx)
return -(BPF_PPC_STACK_SAVE + 16);
 }
 
+static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx)
+{
+   return bpf_jit_stack_local(ctx) + 8;
+}
+
 static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
 {
if (reg >= BPF_PPC_NVR_MIN && reg < 32)
@@ -102,33 +108,25 @@ static void bpf_jit_emit_skb_loads(u32 *image, struct 
codegen_context *ctx)
PPC_BPF_LL(b2p[SKB_DATA_REG], 3, offsetof(struct sk_buff, data));
 }
 
-static void bpf_jit_emit_func_call(u32 *image, struct codegen_context *ctx, 
u64 func)
+static void bpf_jit_build_prologue(u32 *image, struct

[PATCH 1/3] bpf powerpc: introduce accessors for using the tmp local stack space

2016-09-23 Thread Naveen N. Rao

While at it, ensure that the location of the local save area is
consistent whether or not we setup our own stackframe. This property is
utilised in the next patch that adds support for tail calls.

Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit64.h  | 16 +---
 arch/powerpc/net/bpf_jit_comp64.c | 79 ++-
 2 files changed, 55 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 5046d6f..a1645d7 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -16,22 +16,25 @@
 
 /*
  * Stack layout:
+ * Ensure the top half (upto local_tmp_var) stays consistent
+ * with our redzone usage.
  *
  * [   prev sp ] <-
  * [   nv gpr save area] 8*8   |
+ * [tail_call_cnt  ] 8 |
+ * [local_tmp_var  ] 8 |
  * fp (r31) -->[   ebpf stack space] 512   |
- * [  local/tmp var space  ] 16|
  * [ frame header  ] 32/112|
  * sp (r1) --->[stack pointer  ] --
  */
 
-/* for bpf JIT code internal usage */
-#define BPF_PPC_STACK_LOCALS   16
 /* for gpr non volatile registers BPG_REG_6 to 10, plus skb cache registers */
 #define BPF_PPC_STACK_SAVE (8*8)
+/* for bpf JIT code internal usage */
+#define BPF_PPC_STACK_LOCALS   16
 /* Ensure this is quadword aligned */
-#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + BPF_PPC_STACK_LOCALS + \
-MAX_BPF_STACK + BPF_PPC_STACK_SAVE)
+#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + MAX_BPF_STACK + \
+BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE)
 
 #ifndef __ASSEMBLY__
 
@@ -65,6 +68,9 @@ static const int b2p[] = {
[TMP_REG_2] = 10
 };
 
+/* PPC NVR range -- update this if we ever use NVRs below r24 */
+#define BPF_PPC_NVR_MIN24
+
 /* Assembly helpers */
 #define DECLARE_LOAD_FUNC(func)u64 func(u64 r3, u64 r4);   
\
u64 func##_negative_offset(u64 r3, u64 r4); 
\
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 6073b78..5f8c91f 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -58,6 +58,35 @@ static inline bool bpf_has_stack_frame(struct 
codegen_context *ctx)
return ctx->seen & SEEN_FUNC || bpf_is_seen_register(ctx, BPF_REG_FP);
 }
 
+/*
+ * When not setting up our own stackframe, the redzone usage is:
+ *
+ * [   prev sp ] <-
+ * [ ...   ]   |
+ * sp (r1) --->[stack pointer  ] --
+ * [   nv gpr save area] 8*8
+ * [tail_call_cnt  ] 8
+ * [local_tmp_var  ] 8
+ * [   unused red zone ] 208 bytes protected
+ */
+static int bpf_jit_stack_local(struct codegen_context *ctx)
+{
+   if (bpf_has_stack_frame(ctx))
+   return STACK_FRAME_MIN_SIZE + MAX_BPF_STACK;
+   else
+   return -(BPF_PPC_STACK_SAVE + 16);
+}
+
+static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
+{
+   if (reg >= BPF_PPC_NVR_MIN && reg < 32)
+   return (bpf_has_stack_frame(ctx) ? BPF_PPC_STACKFRAME : 0)
+   - (8 * (32 - reg));
+
+   pr_err("BPF JIT is asking about unknown registers");
+   BUG();
+}
+
 static void bpf_jit_emit_skb_loads(u32 *image, struct codegen_context *ctx)
 {
/*
@@ -100,9 +129,8 @@ static void bpf_jit_emit_func_call(u32 *image, struct 
codegen_context *ctx, u64
 static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 {
int i;
-   bool new_stack_frame = bpf_has_stack_frame(ctx);
 
-   if (new_stack_frame) {
+   if (bpf_has_stack_frame(ctx)) {
/*
 * We need a stack frame, but we don't necessarily need to
 * save/restore LR unless we call other functions
@@ -122,9 +150,7 @@ static void bpf_jit_build_prologue(u32 *image, struct 
codegen_context *ctx)
 */
for (i = BPF_REG_6; i <= BPF_REG_10; i++)
if (bpf_is_seen_register(ctx, i))
-   PPC_BPF_STL(b2p[i], 1,
-   (new_stack_frame ? BPF_PPC_STACKFRAME : 0) -
-   (8 * (32 - b2p[i])));
+   PPC_BPF_STL(b2p[i], 1, bpf_jit_stack_offsetof(ctx, 
b2p[i]));
 
/*
 * Save additional non-volatile regs if we cache skb
@@ -132,22 +158,21 @@ static void bpf_jit_build_prologue(u32 *image, struct 
codegen_context *ctx)
 */
if (ctx->seen & SEEN_SKB) {
PPC_BPF_STL(b2p[SKB_HLEN_REG], 1,

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread John Fastabend

On 16-09-23 01:17 PM, Jakub Kicinski wrote:
> On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:
>> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:
>>> On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:  
 Yep, I like the idea in general. I had a slightly different approach in
 mind though. If you look at __dev_queue_xmit() there is a void
 accel_priv pointer (gather you found this based on your commit note).
 My take was we could extend this a bit so it can be used by the VFR
 devices and they could do a dev_queue_xmit_accel(). In this way there is
 no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
 accel logic needs to be extended to push the priv pointer all the way
 through the xmit routine of the target netdev though. This should look
 a lot like the macvlan accelerated xmit device path without the
 switching logic.

 Of course maybe the name would be extended to dev_queue_xmit_extended()
 or something.

 So the flow on ingress would be,

1. pkt_received_by_PF_netdev
2. PF_netdev reads some tag off packet/descriptor and sets correct
   skb->dev field. This is needed so stack "sees" packets from
   correct VF ports.
3. packet passed up to stack.

 I guess it is a bit "zombie" like on the receive path because the packet
 is never actually handled by VF netdev code per se and on egress can
 traverse both the VFR and PF netdevs qdiscs. But on the other hand the
 VFR netdevs and PF netdevs are all in the same driver. Plus using a
 queue per VFR is a bit of a waste as it's not needed and also hardware
 may not have any mechanism to push VF traffic onto a rx queue.

 On egress,

1. VFR xmit is called
2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
   for the lower netdev
3. lower netdev sends out the packet.

 Again we don't need to waste any queues for each VFR and the VFR can be
 a LLTX device. In this scheme I think you avoid much of the changes in
 your patch and keep it all contained in the driver. Any thoughts?  
>>
>> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
>> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
>> queue.
>> Also, it is not passed all the way to the driver specific xmit routine.  
>> Doesn't it require
>> changing all the driver xmit routines if we want to pass this parameter?
>>
>>> Goes without saying that you have a much better understanding of packet
>>> scheduling so please bear with me :)  My target model is that I have
>>> n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
>>> fallback traffic over those same queues.  So no new HW queues are used
>>> for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
>>> "bastard hw queues" which actually TX for a separate software device.
>>>
>>> My understanding was that I can rework this model to have software
>>> queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
>>> queues (#hw queues == #PF queues) but then when the driver sees a
>>> packet on sw-only VFR queue it has to pick one of the PF queues (which
>>> one?), lock PF software queue to own it, and only then can it
>>> transmit.  With the dst_metadata there is no need for extra locking or
>>> queue selection.  
>>
>> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
>> we would like
>> to use the PF queues for the xmit.
>> I was also looking into some way of passing the port id via skb 
>> parameter to the
>> dev_queue_xmit() call so that the PF xmit routine can do a directed 
>> transmit to a specific VF.
>> Is skb->cb an option to pass this info?
>> dst_metadata approach would work  too if it is acceptable.
> 
> I don't think we can trust skb->cb to be set to anything meaningful
> when the skb is received by the lower device. 
> 

Agreed. I wouldn't recommend using skb->cb. How about passing it through
dev_queue_xmit_accel() through to the driver?

If you pass the metadata through the dev_queue_xmit_accel() handle tx
queue  selection would work using normal mechanisms (xps, select_queue,
cls  hook, etc.). If you wanted to pick some specific queue based on
policy the policy could be loaded into one of those hooks.

.John

[PATCH] nfp: bpf: improve handling for disabled BPF syscall

2016-09-23 Thread Arnd Bergmann

I stumbled over a new warning during randconfig testing,
with CONFIG_BPF_SYSCALL disabled:

drivers/net/ethernet/netronome/nfp/nfp_net_offload.c: In function 
'nfp_net_bpf_offload':
drivers/net/ethernet/netronome/nfp/nfp_net_offload.c:263:3: error: '*((void 
*)+4)' may be used uninitialized in this function 
[-Werror=maybe-uninitialized]
drivers/net/ethernet/netronome/nfp/nfp_net_offload.c:263:3: error: 
'res.n_instr' may be used uninitialized in this function 
[-Werror=maybe-uninitialized]

As far as I can tell, this is a false positive caused by the compiler
getting confused about a function that is partially inlined, but it's
easy to avoid while improving the code:

The nfp_bpf_jit() stub helper for that configuration is unusual as it
is defined in a header file but not marked 'static inline'. By moving
the compile-time check into the caller using the IS_ENABLED() macro,
we can remove that stub and simplify the nfp_net_bpf_offload_prepare()
function enough to unconfuse the compiler.

Fixes: 7533fdc0f77f ("nfp: bpf: add hardware bpf offload")
Signed-off-by: Arnd Bergmann 
---
 drivers/net/ethernet/netronome/nfp/nfp_bpf.h | 10 --
 drivers/net/ethernet/netronome/nfp/nfp_net_offload.c |  3 +++
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h 
b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
index fc220cd04115..87aa8a3e9112 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_bpf.h
@@ -192,20 +192,10 @@ struct nfp_bpf_result {
bool dense_mode;
 };
 
-#ifdef CONFIG_BPF_SYSCALL
 int
 nfp_bpf_jit(struct bpf_prog *filter, void *prog, enum nfp_bpf_action_type act,
unsigned int prog_start, unsigned int prog_done,
unsigned int prog_sz, struct nfp_bpf_result *res);
-#else
-int
-nfp_bpf_jit(struct bpf_prog *filter, void *prog, enum nfp_bpf_action_type act,
-   unsigned int prog_start, unsigned int prog_done,
-   unsigned int prog_sz, struct nfp_bpf_result *res)
-{
-   return -ENOTSUPP;
-}
-#endif
 
 int nfp_prog_verify(struct nfp_prog *nfp_prog, struct bpf_prog *prog);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
index 43f42f842eda..8acfb631a0ea 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_offload.c
@@ -148,6 +148,9 @@ nfp_net_bpf_offload_prepare(struct nfp_net *nn,
unsigned int max_mtu;
int ret;
 
+   if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
+   return -ENOTSUPP;
+
ret = nfp_net_bpf_get_act(nn, cls_bpf);
if (ret < 0)
return ret;
-- 
2.9.0

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Uwe Kleine-König

Hello Russell,

On Fri, Sep 23, 2016 at 07:37:25PM +0100, Russell King - ARM Linux wrote:
> On Fri, Sep 23, 2016 at 11:26:18AM -0700, Eric Nelson wrote:
> > So the question is: should we just live with this and acknowledge a
> > performance penalty of bad alignment or do something about it?
> 
> Well, I've no interest in trying to do anything with the FEC driver
> anymore, as I'll just generate another big patch stack which won't
> make it into the kernel in a timely fashion - my last attempt at
> improving the FEC driver was dogged with conflicting changes and I
> gave up with it in the end.  I ended up spending a full cycle
> rebasing, re-testing, and re-evaluating their performance only to find
> that I'd missed the merge window again, and other conflicting changes
> got merged which meant that I had to start from the beginning again.

I'm not included in the set of people who are responsible to review and
merge fec patches, but I'd be surprised if you couldn't get an exclusive
lock for that driver. Something like: After 4.X-rc1 the fec isn't
touched any more until you got your series ready for the 4.X+1 merge
window. Of course some fixes might have to go in, but these hopefully
won't disturb much.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread Jakub Kicinski

On Fri, 23 Sep 2016 10:22:59 -0700, Samudrala, Sridhar wrote:
> On 9/23/2016 8:29 AM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:  
> >> Yep, I like the idea in general. I had a slightly different approach in
> >> mind though. If you look at __dev_queue_xmit() there is a void
> >> accel_priv pointer (gather you found this based on your commit note).
> >> My take was we could extend this a bit so it can be used by the VFR
> >> devices and they could do a dev_queue_xmit_accel(). In this way there is
> >> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
> >> accel logic needs to be extended to push the priv pointer all the way
> >> through the xmit routine of the target netdev though. This should look
> >> a lot like the macvlan accelerated xmit device path without the
> >> switching logic.
> >>
> >> Of course maybe the name would be extended to dev_queue_xmit_extended()
> >> or something.
> >>
> >> So the flow on ingress would be,
> >>
> >>1. pkt_received_by_PF_netdev
> >>2. PF_netdev reads some tag off packet/descriptor and sets correct
> >>   skb->dev field. This is needed so stack "sees" packets from
> >>   correct VF ports.
> >>3. packet passed up to stack.
> >>
> >> I guess it is a bit "zombie" like on the receive path because the packet
> >> is never actually handled by VF netdev code per se and on egress can
> >> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
> >> VFR netdevs and PF netdevs are all in the same driver. Plus using a
> >> queue per VFR is a bit of a waste as it's not needed and also hardware
> >> may not have any mechanism to push VF traffic onto a rx queue.
> >>
> >> On egress,
> >>
> >>1. VFR xmit is called
> >>2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
> >>   for the lower netdev
> >>3. lower netdev sends out the packet.
> >>
> >> Again we don't need to waste any queues for each VFR and the VFR can be
> >> a LLTX device. In this scheme I think you avoid much of the changes in
> >> your patch and keep it all contained in the driver. Any thoughts?  
> 
> The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
> to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
> queue.
> Also, it is not passed all the way to the driver specific xmit routine.  
> Doesn't it require
> changing all the driver xmit routines if we want to pass this parameter?
> 
> > Goes without saying that you have a much better understanding of packet
> > scheduling so please bear with me :)  My target model is that I have
> > n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
> > fallback traffic over those same queues.  So no new HW queues are used
> > for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
> > "bastard hw queues" which actually TX for a separate software device.
> >
> > My understanding was that I can rework this model to have software
> > queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
> > queues (#hw queues == #PF queues) but then when the driver sees a
> > packet on sw-only VFR queue it has to pick one of the PF queues (which
> > one?), lock PF software queue to own it, and only then can it
> > transmit.  With the dst_metadata there is no need for extra locking or
> > queue selection.  
> 
> Yes.  The VFPR netdevs don't have any HW queues associated with them and 
> we would like
> to use the PF queues for the xmit.
> I was also looking into some way of passing the port id via skb 
> parameter to the
> dev_queue_xmit() call so that the PF xmit routine can do a directed 
> transmit to a specific VF.
> Is skb->cb an option to pass this info?
> dst_metadata approach would work  too if it is acceptable.

I don't think we can trust skb->cb to be set to anything meaningful
when the skb is received by the lower device.

[PATCH] mlx5: Add ndo_poll_controller() implementation

2016-09-23 Thread Calvin Owens

This implements ndo_poll_controller in net_device_ops for mlx5, which is
necessary to use netconsole with this driver.

Signed-off-by: Calvin Owens 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2459c7f..439476f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2786,6 +2786,20 @@ static void mlx5e_tx_timeout(struct net_device *dev)
schedule_work(&priv->tx_timeout_work);
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Fake "interrupt" called by netpoll (eg netconsole) to send skbs without
+ * reenabling interrupts.
+ */
+static void mlx5e_netpoll(struct net_device *dev)
+{
+   struct mlx5e_priv *priv = netdev_priv(dev);
+   int i, nr_sq = priv->params.num_channels * priv->params.num_tc;
+
+   for (i = 0; i < nr_sq; i++)
+   napi_schedule(priv->txq_to_sq_map[i]->cq.napi);
+}
+#endif
+
 static const struct net_device_ops mlx5e_netdev_ops_basic = {
.ndo_open= mlx5e_open,
.ndo_stop= mlx5e_close,
@@ -2805,6 +2819,9 @@ static const struct net_device_ops mlx5e_netdev_ops_basic 
= {
.ndo_rx_flow_steer   = mlx5e_rx_flow_steer,
 #endif
.ndo_tx_timeout  = mlx5e_tx_timeout,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+   .ndo_poll_controller = mlx5e_netpoll,
+#endif
 };
 
 static const struct net_device_ops mlx5e_netdev_ops_sriov = {
@@ -2836,6 +2853,9 @@ static const struct net_device_ops mlx5e_netdev_ops_sriov 
= {
.ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
.ndo_get_vf_stats= mlx5e_get_vf_stats,
.ndo_tx_timeout  = mlx5e_tx_timeout,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+   .ndo_poll_controller = mlx5e_netpoll,
+#endif
 };
 
 static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
-- 
2.9.3

[PATCH 2/3] mac80211: Export fq memory limit information in debugfs

2016-09-23 Thread Toke Høiland-Jørgensen

Add memory limit, usage and overlimit counter to per-PHY 'aqm' debugfs
file.

Signed-off-by: Toke Høiland-Jørgensen 
---
 net/mac80211/debugfs.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 8ca62b6..f56e2f4 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -89,13 +89,19 @@ static ssize_t aqm_read(struct file *file,
"R fq_flows_cnt %u\n"
"R fq_backlog %u\n"
"R fq_overlimit %u\n"
+   "R fq_overmemory %u\n"
"R fq_collisions %u\n"
+   "R fq_memory_usage %u\n"
+   "RW fq_memory_limit %u\n"
"RW fq_limit %u\n"
"RW fq_quantum %u\n",
fq->flows_cnt,
fq->backlog,
+   fq->overmemory,
fq->overlimit,
fq->collisions,
+   fq->memory_usage,
+   fq->memory_limit,
fq->limit,
fq->quantum);
 
@@ -128,6 +134,8 @@ static ssize_t aqm_write(struct file *file,
 
if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
return count;
+   else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 
1)
+   return count;
else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1)
return count;
 
-- 
2.9.3

[PATCH 0/3] Add memory limits to fq.h and mac80211 TXQ

2016-09-23 Thread Toke Høiland-Jørgensen

This is a series of small patches to avoid OOM conditions on small
wireless devices with the mac80211 intermediate TXQ structure. The
current default limit in fq.h translates to up to 16 Mbytes of memory
usage, which can be fatal to a device with 32 MBytes of total RAM.

Rather than just change the fq_limit, this ports the memory limit
mechanism from the fq_codel qdisc. The second patch in the series just
adds the new fields to the mac80211 'aqm' debugfs file.

The third patch changes mac80211 to set a lower memory limit for non-VHT
devices. The assumption is that (a) for 802.11n and lower 4 Mbytes of
total queue (2048 packets, 64 max-size aggregates) is plenty, and so it
is safe to simply limit the queue size. And (b) that VHT-capable devices
are usually installed in systems equipped with more system memory.

Toke Høiland-Jørgensen (3):
  fq.h: Port memory limit mechanism from fq_codel
  mac80211: Export fq memory limit information in debugfs
  mac80211: Set lower memory limit for non-VHT devices

 include/net/fq.h   |  3 +++
 include/net/fq_impl.h  |  7 ++-
 net/mac80211/debugfs.c |  8 
 net/mac80211/tx.c  | 18 ++
 4 files changed, 35 insertions(+), 1 deletion(-)

-- 
2.9.3

base-commit: fb2a3d5c7c85cb6e8bc88192be919b4ef8d6e630

[PATCH 3/3] mac80211: Set lower memory limit for non-VHT devices

2016-09-23 Thread Toke Høiland-Jørgensen

Small devices can run out of memory from queueing too many packets. If
VHT is not supported by the PHY, having more than 4 MBytes of total
queue in the TXQ intermediate queues is not needed, and so we can safely
limit the memory usage in these cases and avoid OOM.

Signed-off-by: Toke Høiland-Jørgensen 
---
 net/mac80211/tx.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1ff08be..82f41fc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1434,6 +1434,8 @@ int ieee80211_txq_setup_flows(struct ieee80211_local 
*local)
struct fq *fq = &local->fq;
int ret;
int i;
+   bool supp_vht = false;
+   enum nl80211_band band;
 
if (!local->ops->wake_tx_queue)
return 0;
@@ -1442,6 +1444,22 @@ int ieee80211_txq_setup_flows(struct ieee80211_local 
*local)
if (ret)
return ret;
 
+   /*
+* If the hardware doesn't support VHT, it is safe to limit the maximum
+* queue size. 4 Mbytes is 64 max-size aggregates in 802.11n.
+*/
+   for (band = 0; band < NUM_NL80211_BANDS; band++) {
+   struct ieee80211_supported_band *sband;
+
+   sband = local->hw.wiphy->bands[band];
+   if (!sband)
+   continue;
+
+   supp_vht = supp_vht || sband->vht_cap.vht_supported;
+   }
+   if (!supp_vht)
+   fq->memory_limit = 4 << 20; /* 4 Mbytes */
+
codel_params_init(&local->cparams);
local->cparams.interval = MS2TIME(100);
local->cparams.target = MS2TIME(20);
-- 
2.9.3

[PATCH 1/3] fq.h: Port memory limit mechanism from fq_codel

2016-09-23 Thread Toke Høiland-Jørgensen

The reusable fairness queueing implementation (fq.h) lacks the memory
usage limit that the fq_codel qdisc has. This means that small
devices (e.g. WiFi routers) can run out of memory when flooded with a
large number of packets. This ports the memory limit feature from
fq_codel to fq.h.

Signed-off-by: Toke Høiland-Jørgensen 
---
 include/net/fq.h  | 3 +++
 include/net/fq_impl.h | 7 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/net/fq.h b/include/net/fq.h
index 268b490..6d8521a 100644
--- a/include/net/fq.h
+++ b/include/net/fq.h
@@ -72,9 +72,12 @@ struct fq {
u32 flows_cnt;
u32 perturbation;
u32 limit;
+   u32 memory_limit;
+   u32 memory_usage;
u32 quantum;
u32 backlog;
u32 overlimit;
+   u32 overmemory;
u32 collisions;
 };
 
diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h
index 163f3ed..4e6131c 100644
--- a/include/net/fq_impl.h
+++ b/include/net/fq_impl.h
@@ -29,6 +29,7 @@ static struct sk_buff *fq_flow_dequeue(struct fq *fq,
tin->backlog_packets--;
flow->backlog -= skb->len;
fq->backlog--;
+   fq->memory_usage -= skb->truesize;
 
if (flow->backlog == 0) {
list_del_init(&flow->backlogchain);
@@ -154,6 +155,7 @@ static void fq_tin_enqueue(struct fq *fq,
flow->backlog += skb->len;
tin->backlog_bytes += skb->len;
tin->backlog_packets++;
+   fq->memory_usage += skb->truesize;
fq->backlog++;
 
fq_recalc_backlog(fq, tin, flow);
@@ -166,7 +168,7 @@ static void fq_tin_enqueue(struct fq *fq,
 
__skb_queue_tail(&flow->queue, skb);
 
-   if (fq->backlog > fq->limit) {
+   if (fq->backlog > fq->limit || fq->memory_usage > fq->memory_limit) {
flow = list_first_entry_or_null(&fq->backlogs,
struct fq_flow,
backlogchain);
@@ -181,6 +183,8 @@ static void fq_tin_enqueue(struct fq *fq,
 
flow->tin->overlimit++;
fq->overlimit++;
+   if (fq->memory_usage > fq->memory_limit)
+   fq->overmemory++;
}
 }
 
@@ -251,6 +255,7 @@ static int fq_init(struct fq *fq, int flows_cnt)
fq->perturbation = prandom_u32();
fq->quantum = 300;
fq->limit = 8192;
+   fq->memory_limit = 16 << 20; /* 16 MBytes */
 
fq->flows = kcalloc(fq->flows_cnt, sizeof(fq->flows[0]), GFP_KERNEL);
if (!fq->flows)
-- 
2.9.3

Re: [PATCH net v2] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Sergei Shtylyov


Hello.

On 09/23/2016 10:50 PM, Lance Richardson wrote:


Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.

Up until now, ip6gre_xmit_other() has set flowi6_proto to a bogus value.
This affected output route lookup for packets sent on an ip6gretap device
in cases where routing was dependent on the value of flowi6_proto.

Since the correct proto is already set in the tunnel flowi6 template via
commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
path."), simply delete the line setting the incorrect flowi6_proto value.

Suggested-by: Jiri Benc 
Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")


   That "commit" isn't needed here, this tag has a standardized format. 
Hopefully, can be fixed while applying...



Reviewed-by: Shmulik Ladkani 
Signed-off-by: Lance Richardson 

[...]

MBR, Sergei

Re: [PATCH net] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Shmulik Ladkani

On Fri, 23 Sep 2016 15:52:24 -0400 (EDT) Lance Richardson  
wrote:
> > From: "Shmulik Ladkani" 
> > Suggesting to add:
> > 
> > Up until now, 'ip6gre_xmit_other' has set flowi6_proto to a bogus value.
> > This affects output route lookup upon xmit of non ipv4/ipv6 packets on a
> > ip6gretap device, in cases where routing depends on flowi6_proto.
> >   
> 
> Added in v2, taking some editorial license (please let me know if I mangled it
> too badly).

Thanks Lance, looks good.

Re: [PATCH net] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Lance Richardson

> From: "Shmulik Ladkani" 
> To: "Lance Richardson" 
> Cc: netdev@vger.kernel.org
> Sent: Friday, September 23, 2016 3:00:36 PM
> Subject: Re: [PATCH net] ip6_gre: fix flowi6_proto value in 
> ip6gre_xmit_other()
> 
> On Fri, 23 Sep 2016 12:54:59 -0400 Lance Richardson 
> wrote:
> > Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
> > xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.
> 
> Suggesting to add:
> 
> Up until now, 'ip6gre_xmit_other' has set flowi6_proto to a bogus value.
> This affects output route lookup upon xmit of non ipv4/ipv6 packets on a
> ip6gretap device, in cases where routing depends on flowi6_proto.
> 

Added in v2, taking some editorial license (please let me know if I mangled it
too badly).

Thanks,

   Lance

> > Since the correct proto is already set in the tunnel flowi6 template via
> > commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
> > path."), simply delete the line setting the incorrect flowi6_proto value.
> > 
> > Suggested-by: Jiri Benc 
> > Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")
> > Signed-off-by: Lance Richardson 
> 
> Reviewed-by: Shmulik Ladkani 
>

[PATCH net v2] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Lance Richardson

Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.

Up until now, ip6gre_xmit_other() has set flowi6_proto to a bogus value.
This affected output route lookup for packets sent on an ip6gretap device
in cases where routing was dependent on the value of flowi6_proto.

Since the correct proto is already set in the tunnel flowi6 template via
commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
path."), simply delete the line setting the incorrect flowi6_proto value.

Suggested-by: Jiri Benc 
Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")
Reviewed-by: Shmulik Ladkani 
Signed-off-by: Lance Richardson 
---
v2: expanded commit description as suggested by Shmulik Ladkani.

 net/ipv6/ip6_gre.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 704274c..edc3daa 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -648,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct 
net_device *dev)
encap_limit = t->parms.encap_limit;
 
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-   fl6.flowi6_proto = skb->protocol;
 
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
-- 
2.5.5

Re: [PATCH] realtek: Add switch variable to 'switch case not processed' messages

2016-09-23 Thread Larry Finger


On 09/23/2016 01:27 PM, Joe Perches wrote:

Help along debugging by showing what switch/case variable is not
being processed in these messages.

Signed-off-by: Joe Perches 


Acked-by: Larry Finger 

Thanks,

Larry

Re: [PATCH] realtek: Add switch variable to 'switch case not processed' messages

2016-09-23 Thread Joe Perches

On Fri, 2016-09-23 at 13:59 -0500, Larry Finger wrote:
> I'm not familiar with the %#x format. What does it do?

Outputs SPECIAL prefix, it's the same as "0x%x"

lib/vsprintf.c:
#define SPECIAL 64  /* prefix hex with "0x", octal with "0" */

Re: [PATCH] Net Driver: Add Cypress GX3 VID=04b4 PID=3610.

2016-09-23 Thread Greg KH

On Fri, Sep 23, 2016 at 12:24:50PM -0600, chris.r...@usask.ca wrote:
> From: Chris Roth 
> 
> Add support for Cypress GX3 SuperSpeed to Gigabit Ethernet
> Bridge Controller (Vendor=04b4 ProdID=3610).
> 
> Patch verified on x64 linux kernel 4.7.4 system with the
> Kensington SD4600P USB-C Universal Dock with Power, which uses the
> Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.
> 
> A similar patch was signed-off and tested-by Allan Chou
>  on 2015-12-01.

Then you should put a "From: " line at the top of this patch that looks
like:
From: Allan Chou 
at the top of the patch and then a blank line, and put the signed-off-by
back in the patch as well, with yours below it.  The file
Documentation/SubmittingPatches should show you how to do this.

Can you try again?

thanks,

greg k-h

Re: [PATCH net] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Shmulik Ladkani

On Fri, 23 Sep 2016 12:54:59 -0400 Lance Richardson  wrote:
> Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
> xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.

Suggesting to add:

Up until now, 'ip6gre_xmit_other' has set flowi6_proto to a bogus value.
This affects output route lookup upon xmit of non ipv4/ipv6 packets on a
ip6gretap device, in cases where routing depends on flowi6_proto.

> Since the correct proto is already set in the tunnel flowi6 template via
> commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
> path."), simply delete the line setting the incorrect flowi6_proto value.
> 
> Suggested-by: Jiri Benc 
> Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")
> Signed-off-by: Lance Richardson 

Reviewed-by: Shmulik Ladkani

Re: [PATCH] realtek: Add switch variable to 'switch case not processed' messages

2016-09-23 Thread Larry Finger


On 09/23/2016 01:27 PM, Joe Perches wrote:

Help along debugging by showing what switch/case variable is not
being processed in these messages.

Signed-off-by: Joe Perches 


Joe,

You beat me to the patch. No problem as this one looks OK; however, I'm not 
familiar with the %#x format. What does it do?


Larry


---
 drivers/net/wireless/realtek/rtlwifi/core.c  |  3 ++-
 drivers/net/wireless/realtek/rtlwifi/pci.c   |  3 ++-
 drivers/net/wireless/realtek/rtlwifi/ps.c|  2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/phy.c | 10 ++
 .../wireless/realtek/rtlwifi/rtl8192c/fw_common.c|  4 ++--
 .../wireless/realtek/rtlwifi/rtl8192c/phy_common.c   |  8 +---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c  |  7 ---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/phy.c |  7 ++-
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/phy.c |  7 ++-
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/phy.c | 15 +++
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/phy.c | 10 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/phy.c |  5 +++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/phy.c | 10 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c  | 10 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/phy.c | 12 +++-
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c | 20 ++--
 38 files changed, 128 insertions(+), 123 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index 7aee5ebb1..f95760c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -765,7 +765,8 @@ static int rtl_op_config(struct ieee80211_hw *hw, u32 
changed)
mac->bw_40 = false;
mac->bw_80 = false;
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not 
processed\n",
+channel_type);
break;
}
}
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index d12586d..0dfa9ea 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -179,7 +179,8 @@ static void _rtl_pci_update_default_setting(struct 
ieee80211_hw *hw)
break;
default:
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not processed\n",
+rtlpci->const_support_pciaspm);
break;
}

diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c 
b/drivers/net/wireless/realtek/rtlwifi/ps.c
index 9a64f9b..18d979a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/ps.c
+++ b/drivers/net/wireless/realtek/rtlwifi/ps.c
@@ -151,7 +151,7 @@ static bool rtl_ps_set_rf_state(struct ieee80211_hw *hw,

default:
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not processed\n", state_toset);
break;
}

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c

Re: [PATCH] netns: move {inc,dec}_net_namespaces into #ifdef

2016-09-23 Thread Eric W. Biederman

Arnd Bergmann  writes:

> With the newly enforced limit on the number of namespaces,
> we get a build warning if CONFIG_NETNS is disabled:
>
> net/core/net_namespace.c:273:13: error: 'dec_net_namespaces' defined but not 
> used [-Werror=unused-function]
> net/core/net_namespace.c:268:24: error: 'inc_net_namespaces' defined but not 
> used [-Werror=unused-function]
>
> This moves the two added functions inside the #ifdef that guards
> their callers.
>
> Fixes: 703286608a22 ("netns: Add a limit on the number of net namespaces")
> Signed-off-by: Arnd Bergmann 

Applied to my namespace tree thanks.

This was a hold over from calling those functions in a different location.

Eric

> ---
>  net/core/net_namespace.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
>
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index d0eb13d3226b..989434f36f96 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -265,16 +265,6 @@ struct net *get_net_ns_by_id(struct net *net, int id)
>   return peer;
>  }
>  
> -static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
> -{
> - return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
> -}
> -
> -static void dec_net_namespaces(struct ucounts *ucounts)
> -{
> - dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
> -}
> -
>  /*
>   * setup_net runs the initializers for the network namespace object.
>   */
> @@ -319,6 +309,16 @@ static __net_init int setup_net(struct net *net, struct 
> user_namespace *user_ns)
>  
>  
>  #ifdef CONFIG_NET_NS
> +static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
> +{
> + return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
> +}
> +
> +static void dec_net_namespaces(struct ucounts *ucounts)
> +{
> + dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
> +}
> +
>  static struct kmem_cache *net_cachep;
>  static struct workqueue_struct *netns_wq;

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Thanks Russell,

On 09/23/2016 11:37 AM, Russell King - ARM Linux wrote:
> On Fri, Sep 23, 2016 at 11:26:18AM -0700, Eric Nelson wrote:
>> So the question is: should we just live with this and acknowledge a
>> performance penalty of bad alignment or do something about it?
> 
> Well, I've no interest in trying to do anything with the FEC driver
> anymore, as I'll just generate another big patch stack which won't
> make it into the kernel in a timely fashion - my last attempt at
> improving the FEC driver was dogged with conflicting changes and I
> gave up with it in the end.  I ended up spending a full cycle
> rebasing, re-testing, and re-evaluating their performance only to find
> that I'd missed the merge window again, and other conflicting changes
> got merged which meant that I had to start from the beginning again.
> 

That's sad. I recall reading your notes on that patch series and it was
a model for how to structure and document a patch set.

I hadn't noticed that you abandoned it and it's frustrating that the
merge process prevented your efforts from being used.

I'm also disheartened to hear your frustration about getting things
pushed up-stream and the entire Linux community should take note.

>> I'm not sure the cost (or the details) of Eric's proposed fix of allocating
>> and copying the header to another skb.
> 
> I had a quick look at this, and although Eric's idea may be a good
> idea, it doesn't contain enough details for me to be able to
> implement it - eg, I've no idea how to attach the 128-byte skb to the
> beginning of a previously allocated skb containing the rest of the
> packet.  I've just looked through linux/skbuff.h and I can't see
> anything that takes two sk_buff's that would do the job.
> 
> However, I don't think that's necessary in this case, because the
> iMX6 FEC supports the 16-bit alignment of the packet, if only it was
> enabled in hardware and the driver caters for it.
> 

Right. If the hardware supports placing things at a suitable address,
that's the right approach.

I'll try to review your earlier patch set and at least find a way to address
the alignment issues.

I'm a bit booked until LinuxCon but will try to get something out soon.

Regards,


Eric

[PATCH] Net Driver: Add Cypress GX3 VID=04b4 PID=3610.

2016-09-23 Thread chris.roth

From: Chris Roth 

Add support for Cypress GX3 SuperSpeed to Gigabit Ethernet
Bridge Controller (Vendor=04b4 ProdID=3610).

Patch verified on x64 linux kernel 4.7.4 system with the
Kensington SD4600P USB-C Universal Dock with Power, which uses the
Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.

A similar patch was signed-off and tested-by Allan Chou
 on 2015-12-01.

Allan verified his similar patch on x86 Linux kernel 4.1.6 system
with Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller.

Signed-off-by: Chris Roth 
---
 drivers/net/usb/ax88179_178a.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c
index e6338c1..8a6675d 100644
--- a/drivers/net/usb/ax88179_178a.c
+++ b/drivers/net/usb/ax88179_178a.c
@@ -1656,6 +1656,19 @@ static const struct driver_info ax88178a_info = {
.tx_fixup = ax88179_tx_fixup,
 };
 
+static const struct driver_info cypress_GX3_info = {
+   .description = "Cypress GX3 SuperSpeed to Gigabit Ethernet Controller",
+   .bind = ax88179_bind,
+   .unbind = ax88179_unbind,
+   .status = ax88179_status,
+   .link_reset = ax88179_link_reset,
+   .reset = ax88179_reset,
+   .stop = ax88179_stop,
+   .flags = FLAG_ETHER | FLAG_FRAMING_AX,
+   .rx_fixup = ax88179_rx_fixup,
+   .tx_fixup = ax88179_tx_fixup,
+};
+
 static const struct driver_info dlink_dub1312_info = {
.description = "D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter",
.bind = ax88179_bind,
@@ -1718,6 +1731,10 @@ static const struct usb_device_id products[] = {
USB_DEVICE(0x0b95, 0x178a),
.driver_info = (unsigned long)&ax88178a_info,
 }, {
+   /* Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller */
+   USB_DEVICE(0x04b4, 0x3610),
+   .driver_info = (unsigned long)&cypress_GX3_info,
+}, {
/* D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter */
USB_DEVICE(0x2001, 0x4a00),
.driver_info = (unsigned long)&dlink_dub1312_info,
-- 
2.7.4

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Thanks Russell,

On 09/23/2016 11:30 AM, Russell King - ARM Linux wrote:
> On Fri, Sep 23, 2016 at 08:13:01PM +0200, Andrew Lunn wrote:
>>> Since the hardware requires longword alignment for its DMA transfers,
>>> aligning the IP header will require a memcpy, right?
>>
>> The vf610 FEC has an SHIFT16 bit in register ENETx_TACC, which inserts
>> two padding bits on transmit. ENETx_RACC has the same.
>>
>> What about your hardware?
> 
> The iMX6 FEC also has that ability - as part of my FEC patch stack from
> ages ago, I implemented support for it.
> 
>   "net:fec: implement almost zero-copy receive path"
> 
> in my public fec-testing branch.
> 
> That patch stack is sadly now totally dead and I've no interest in
> reviving it myself.  There was some interest from others in taking my
> patch stack over, but that went quiet.
> 

I'll take a look and hopefully revive at least part of the patch set.

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Russell King - ARM Linux

On Fri, Sep 23, 2016 at 11:26:18AM -0700, Eric Nelson wrote:
> So the question is: should we just live with this and acknowledge a
> performance penalty of bad alignment or do something about it?

Well, I've no interest in trying to do anything with the FEC driver
anymore, as I'll just generate another big patch stack which won't
make it into the kernel in a timely fashion - my last attempt at
improving the FEC driver was dogged with conflicting changes and I
gave up with it in the end.  I ended up spending a full cycle
rebasing, re-testing, and re-evaluating their performance only to find
that I'd missed the merge window again, and other conflicting changes
got merged which meant that I had to start from the beginning again.

> I'm not sure the cost (or the details) of Eric's proposed fix of allocating
> and copying the header to another skb.

I had a quick look at this, and although Eric's idea may be a good
idea, it doesn't contain enough details for me to be able to
implement it - eg, I've no idea how to attach the 128-byte skb to the
beginning of a previously allocated skb containing the rest of the
packet.  I've just looked through linux/skbuff.h and I can't see
anything that takes two sk_buff's that would do the job.

However, I don't think that's necessary in this case, because the
iMX6 FEC supports the 16-bit alignment of the packet, if only it was
enabled in hardware and the driver caters for it.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Russell King - ARM Linux

On Fri, Sep 23, 2016 at 08:13:01PM +0200, Andrew Lunn wrote:
> > Since the hardware requires longword alignment for its DMA transfers,
> > aligning the IP header will require a memcpy, right?
> 
> The vf610 FEC has an SHIFT16 bit in register ENETx_TACC, which inserts
> two padding bits on transmit. ENETx_RACC has the same.
> 
> What about your hardware?

The iMX6 FEC also has that ability - as part of my FEC patch stack from
ages ago, I implemented support for it.

  "net:fec: implement almost zero-copy receive path"

in my public fec-testing branch.

That patch stack is sadly now totally dead and I've no interest in
reviving it myself.  There was some interest from others in taking my
patch stack over, but that went quiet.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Thanks Andrew.

On 09/23/2016 11:13 AM, Andrew Lunn wrote:
>> Since the hardware requires longword alignment for its DMA transfers,
>> aligning the IP header will require a memcpy, right?
> 
> The vf610 FEC has an SHIFT16 bit in register ENETx_TACC, which inserts
> two padding bits on transmit. ENETx_RACC has the same.
> 
> What about your hardware?
> 

You got me with the RTFM!

From the i.MX6DQ reference manual, bit 7 of ENET_RACC says this:

"RX FIFO Shift-16

When this field is set, the actual frame data starts at bit 16 of the first
word read from the RX FIFO aligning the Ethernet payload on a
32-bit boundary."

Same for the i.MX6UL.

I'm not sure what it will take to use this, but it seems to be exactly
what we're looking for.

[PATCH] realtek: Add switch variable to 'switch case not processed' messages

2016-09-23 Thread Joe Perches

Help along debugging by showing what switch/case variable is not
being processed in these messages.

Signed-off-by: Joe Perches 
---
 drivers/net/wireless/realtek/rtlwifi/core.c  |  3 ++-
 drivers/net/wireless/realtek/rtlwifi/pci.c   |  3 ++-
 drivers/net/wireless/realtek/rtlwifi/ps.c|  2 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/phy.c | 10 ++
 .../wireless/realtek/rtlwifi/rtl8192c/fw_common.c|  4 ++--
 .../wireless/realtek/rtlwifi/rtl8192c/phy_common.c   |  8 +---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/hw.c  |  7 ---
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ce/phy.c |  7 ++-
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/hw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192cu/phy.c |  7 ++-
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192de/phy.c | 15 +++
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192ee/phy.c | 10 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8192se/phy.c |  5 +++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723ae/phy.c | 10 ++
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/hw.c  | 10 +-
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8723be/phy.c | 12 +++-
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/fw.c  |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/hw.c  |  9 +
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/led.c |  4 ++--
 drivers/net/wireless/realtek/rtlwifi/rtl8821ae/phy.c | 20 ++--
 38 files changed, 128 insertions(+), 123 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c 
b/drivers/net/wireless/realtek/rtlwifi/core.c
index 7aee5ebb1..f95760c 100644
--- a/drivers/net/wireless/realtek/rtlwifi/core.c
+++ b/drivers/net/wireless/realtek/rtlwifi/core.c
@@ -765,7 +765,8 @@ static int rtl_op_config(struct ieee80211_hw *hw, u32 
changed)
mac->bw_40 = false;
mac->bw_80 = false;
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not 
processed\n",
+channel_type);
break;
}
}
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index d12586d..0dfa9ea 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -179,7 +179,8 @@ static void _rtl_pci_update_default_setting(struct 
ieee80211_hw *hw)
break;
default:
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not processed\n",
+rtlpci->const_support_pciaspm);
break;
}
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/ps.c 
b/drivers/net/wireless/realtek/rtlwifi/ps.c
index 9a64f9b..18d979a 100644
--- a/drivers/net/wireless/realtek/rtlwifi/ps.c
+++ b/drivers/net/wireless/realtek/rtlwifi/ps.c
@@ -151,7 +151,7 @@ static bool rtl_ps_set_rf_state(struct ieee80211_hw *hw,
 
default:
RT_TRACE(rtlpriv, COMP_ERR, DBG_EMERG,
-"switch case not processed\n");
+"switch case %#x not processed\n", state_toset);
break;
}
 
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c 
b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c
index 6291256..5360d53 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c
+++

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Thanks Russell,

On 09/23/2016 10:37 AM, Russell King - ARM Linux wrote:
> On Fri, Sep 23, 2016 at 10:19:50AM -0700, Eric Nelson wrote:
>> Oddly, it does prevent the vast majority (90%+) of the alignment errors.
>>
>> I believe this is because the compiler is generating an ldm instruction
>> when the ntohl() call is used, but I'm stumped about why these aren't
>> generating faults:

After looking at it, I have to think that the code that reads iph->id
is just hit more frequently than the other code in this routine.

> 
> ldm generates alignment faults when the address is not aligned to a
> 32-bit boundary.  ldr on ARMv6+ does not.
> 
>> I don't think that's the case.
>>
>> # CONFIG_IPV6_GRE is not set
>>
>> Hmm... Instrumenting the kernel, it seems that iphdr **is** aligned on
>> a 4-byte boundary.
>>
>> Does the ldm instruction require 8-byte alignment?
>>
>> There's definitely a compiler-version dependency involved here,
>> since using gcc 4.9 also reduced the number of faults dramatically.
> 
> Well, I don't think it's that gcc related:
> 

I can only say that I noticed a dramatic drop in the number of faults, and
didn't see the inet_gro_receive reported in /proc/cpu/alignment with gcc 4.9
when trying to identify the issue.

> User:   0
> System: 312855 (ip6_route_input+0x6c/0x1e0)
> Skipped:0
> Half:   0
> Word:   0
> DWord:  2
> Multi:  312853
> 
> c06d8998 :
> c06d89ac:   e1a04000mov r4, r0
> c06d89b0:   e1d489b4ldrhr8, [r4, #148]  ; 0x94
> c06d89b8:   e594a0a0ldr sl, [r4, #160]  ; 0xa0
> c06d89cc:   e08ac008add ip, sl, r8
> c06d89d4:   e28c3018add r3, ip, #24
> c06d89dc:   e28c7008add r7, ip, #8
> c06d89e4:   e893000fldm r3, {r0, r1, r2, r3}
> c06d89ec:   e24be044sub lr, fp, #68 ; 0x44
> c06d89f4:   e24b5054sub r5, fp, #84 ; 0x54
> c06d89fc:   e885000fstm r5, {r0, r1, r2, r3}
> c06d8a04:   e897000fldm r7, {r0, r1, r2, r3}
> c06d8a10:   e88e000fstm lr, {r0, r1, r2, r3}
> 
> This is from:
> 
> struct flowi6 fl6 = {
> .flowi6_iif = l3mdev_fib_oif(skb->dev),
> .daddr = iph->daddr,
> .saddr = iph->saddr,
> .flowlabel = ip6_flowinfo(iph),
> .flowi6_mark = skb->mark,
> .flowi6_proto = iph->nexthdr,
> };
> 
> specifically, I suspect, the saddr and daddr initialisations.
> 
> There's not much to get away from this - the FEC on iMX requires a
> 16-byte alignment for DMA addresses, which violates the network
> stack's requirement for the ethernet packet to be received with a
> two byte offset.  So the IP header (and IPv6 headers) will always
> be mis-aligned in memory, which leads to a huge number of alignment
> faults.
> 
> There's not much getting away from this - the problem is not in the
> networking stack, but the FEC hardware/network driver.  See:
> 
> struct  fec_enet_private *fep = netdev_priv(ndev);
> int off;
> 
> off = ((unsigned long)skb->data) & fep->rx_align;
> if (off)
> skb_reserve(skb, fep->rx_align + 1 - off);
> 
> bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, 
> skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));
> 
> in fec_enet_new_rxbdp().
> 

So the question is: should we just live with this and acknowledge a
performance penalty of bad alignment or do something about it?

I'm not sure the cost (or the details) of Eric's proposed fix of allocating
and copying the header to another skb.

The original report was of bad network performance, but I haven't
been able to see an impact doing some simple tests using wget
and SSH.

Re: [PATCH net-next 08/15] rxrpc: Fix call timer

2016-09-23 Thread David Howells

Sergei Shtylyov  wrote:

> > +   if (call->timer.expires != t || !timer_pending(&call->timer)) {
> > mod_timer(&call->timer, t);
> > }
> 
>CodingStyle: {} not needed now.

See patch 11.

David

Re: [PATCH net-next 07/15] rxrpc: Fix accidental cancellation of scheduled resend by ACK parser

2016-09-23 Thread David Howells

Sergei Shtylyov  wrote:

> > case RXRPC_ACK_TYPE_NACK:
> > if (anno_type == RXRPC_TX_ANNO_NAK)
> > continue;
> > +   if (anno_type == RXRPC_TX_ANNO_RETRANS)
> > +   continue;
> 
>Why not fold the above 2 *if*s together? Or use *else if* at least?

I have a pending patch that adds something between them.

David

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Andrew Lunn

> Since the hardware requires longword alignment for its DMA transfers,
> aligning the IP header will require a memcpy, right?

The vf610 FEC has a SHIFT16 bit in register ENETx_TACC, which inserts
two padding bytes on transmit. ENETx_RACC has the same.

What about your hardware?

 Andrew

Re: [PATCH RFC 0/2] ila: ilarouter bpf code for tc and xdp

2016-09-23 Thread Jesper Dangaard Brouer


On Fri, 23 Sep 2016 10:16:33 -0700 Alexei Starovoitov  wrote:

> From: Aaron Yue 
> 
> Jesper,
> 
> here is old email and cover letter that didn't make it to the list
> due to vger outage (I guess).
> The verifier patch that Aaron is talking about has landed long ago.
> 
> The dataplane of ILA router is very short and simple.

Yes, looks very simple indeed! Cool! :-)


> Control plane is very different matter. It's not ready for prime time yet.
> 
> --
> 
> This patch contains the tc and xdp implementation of kernelspace bpf code.
> It requires userspace to insert to the ILA bpf maps, in tc's case, the 
> precomputed ILA mappings, and in xdp's case, both the precomputed ILA
> mappings and the MAC address.
> 
> The xdp bpf code also requires a verifier patch to allow direct map access
> from the packet (will be patched in by Alexei).
> 
> Aaron Yue (2):
>   samples/bpf: ilarouter for tc
>   samples/bpf: ilarouter for xdp
> 
>  samples/bpf/Makefile|   2 +
>  samples/bpf/ila.h   |  80 
>  samples/bpf/ilarouter_tc.c  | 124 
> 
>  samples/bpf/ilarouter_xdp.c |  88 +++
>  samples/bpf/inet_helper.h   |  38 ++
>  5 files changed, 332 insertions(+)
>  create mode 100644 samples/bpf/ila.h
>  create mode 100644 samples/bpf/ilarouter_tc.c
>  create mode 100644 samples/bpf/ilarouter_xdp.c
>  create mode 100644 samples/bpf/inet_helper.h

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH net-next 08/15] rxrpc: Fix call timer

2016-09-23 Thread Sergei Shtylyov


On 09/23/2016 06:16 PM, David Howells wrote:


Fix the call timer in the following ways:

 (1) If call->resend_at or call->ack_at are before or equal to the current
 time, then ignore that timeout.

 (2) If call->expire_at is before or equal to the current time, then don't
 set the timer at all (possibly we should queue the call).

 (3) Don't skip modifying the timer if timer_pending() is true.  This
 indicates that the timer is working, not that it has expired and is
 running/waiting to run its expiry handler.

Also call rxrpc_set_timer() to start the call timer going rather than
calling add_timer().

Signed-off-by: David Howells 
---

 net/rxrpc/call_event.c  |   25 ++---
 net/rxrpc/call_object.c |4 ++--
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3a7f90a2659c..8bc5c8e37ab4 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -28,24 +28,27 @@ void rxrpc_set_timer(struct rxrpc_call *call)
 {
unsigned long t, now = jiffies;

-   _enter("{%ld,%ld,%ld:%ld}",
-  call->ack_at - now, call->resend_at - now, call->expire_at - now,
-  call->timer.expires - now);
-
read_lock_bh(&call->state_lock);

if (call->state < RXRPC_CALL_COMPLETE) {
-   t = call->ack_at;
-   if (time_before(call->resend_at, t))
+   t = call->expire_at;
+   if (time_before_eq(t, now))
+   goto out;
+
+   if (time_after(call->resend_at, now) &&
+   time_before(call->resend_at, t))
t = call->resend_at;
-   if (time_before(call->expire_at, t))
-   t = call->expire_at;
-   if (!timer_pending(&call->timer) ||
-   time_before(t, call->timer.expires)) {
-   _debug("set timer %ld", t - now);
+
+   if (time_after(call->ack_at, now) &&
+   time_before(call->ack_at, t))
+   t = call->ack_at;
+
+   if (call->timer.expires != t || !timer_pending(&call->timer)) {
mod_timer(&call->timer, t);
}


   CodingStyle: {} not needed now.

[...]

MBR, Sergei

Re: [PATCH net-next 07/15] rxrpc: Fix accidental cancellation of scheduled resend by ACK parser

2016-09-23 Thread Sergei Shtylyov


Hello.

On 09/23/2016 06:16 PM, David Howells wrote:


When rxrpc_input_soft_acks() is parsing the soft-ACKs from an ACK packet,
it updates the Tx packet annotations in the annotation buffer.  If a
soft-ACK is an ACK, then we overwrite unack'd, nak'd or to-be-retransmitted
states and that is fine; but if the soft-ACK is an NACK, we overwrite the
to-be-retransmitted with a nak - which isn't.

Instead, we need to let any scheduled retransmission stand if the packet
was NAK'd.

Note that we don't reissue a resend if the annotation is in the
to-be-retransmitted state because someone else must've scheduled the
resend already.

Signed-off-by: David Howells 
---

 net/rxrpc/input.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 06027b6d9c19..d3d69ab1f0a1 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -479,6 +479,8 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, 
u8 *acks,
case RXRPC_ACK_TYPE_NACK:
if (anno_type == RXRPC_TX_ANNO_NAK)
continue;
+   if (anno_type == RXRPC_TX_ANNO_RETRANS)
+   continue;


   Why not fold the above 2 *if*s together? Or use *else if* at least?

MBR, Sergei

Re: device-tree support for writing to phy registers?

2016-09-23 Thread Florian Fainelli

On 09/23/2016 10:36 AM, Tim Harvey wrote:
> On Fri, Sep 23, 2016 at 9:29 AM, Florian Fainelli  
> wrote:
>> On 09/23/2016 08:40 AM, Tim Harvey wrote:
>>> Greetings,
>>>
>>> I've got a TI DP83867 GbE phy that requires some register writes to
>>> configure its refclock output. Is there a generic device-tree API for
>>> writing to raw registers or is that something that would need to be
>>> added to a specific phy driver with a device-tree binding?
>>
>> There are no standard properties that indicate how to write to register
>> from Device Tree (unfortunately there are non-standard ones that allow this
>> to happen, e.g: marvell,reg-init), because that would mean that Device
>> Tree acts as some kind of firmware/binary interface, which is a bit of a
>> stretch. Some bindings may indicate how to write to registers in a way
>> that accepts an address = value pair, but quite frankly, this is
>> absolutely horrible and not controllable nor easily transferable from
>> one model of device to the other, strongly discouraged.
>>
>>> There is a
>>> DP83867 phy driver but it doesn't contain anything related to
>>> configuring its CLKOUT via register 0x170.
>>
>> Then, I guess you should add a set of properties and corresponding code
>> reading these properties that would result in getting the register
>> programmed with the values you need.
>>
> 
> Florian,
> 
> agreed - this seems like the right thing to do and takes care of the
> important detail about power-management you mention below.
> 
> Are there any phy drivers you know of that do any CLKOUT configuration
> that I could use as inspiration for dt prop names?

The micrel binding has some clock related configuration:

Documentation/devicetree/bindings/net/micrel.txt

could be used as an inspirational source ;)

> 
> Thanks,
> 
> Tim
> 
>>>
>>> Alternatively, is it generally considered 'ok' to take care of this in
>>> the bootloader and not provide the MAC driver the gpio for phy-reset
>>> so that bootloader configuration persists through the kernel?
>>
>> It depends on what your platform does, punting on the bootloader is
>> usually fine, but also breaks nicely when you start implementing power
>> management in the kernel properly (e.g: deep sleep states) and you are
>> not calling back into the bootloader, yet your hardware lost its state
>> between power transitions.
>>
>> --
>> Florian


-- 
Florian

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Russell King - ARM Linux

On Fri, Sep 23, 2016 at 10:19:50AM -0700, Eric Nelson wrote:
> Oddly, it does prevent the vast majority (90%+) of the alignment errors.
> 
> I believe this is because the compiler is generating an ldm instruction
> when the ntohl() call is used, but I'm stumped about why these aren't
> generating faults:

ldm generates alignment faults when the address is not aligned to a
32-bit boundary.  ldr on ARMv6+ does not.

> I don't think that's the case.
> 
> # CONFIG_IPV6_GRE is not set
> 
> Hmm... Instrumenting the kernel, it seems that iphdr **is** aligned on
> a 4-byte boundary.
> 
> Does the ldm instruction require 8-byte alignment?
> 
> There's definitely a compiler-version dependency involved here,
> since using gcc 4.9 also reduced the number of faults dramatically.

Well, I don't think it's that gcc related:

User:   0
System: 312855 (ip6_route_input+0x6c/0x1e0)
Skipped:0
Half:   0
Word:   0
DWord:  2
Multi:  312853

c06d8998 :
c06d89ac:   e1a04000mov r4, r0
c06d89b0:   e1d489b4ldrhr8, [r4, #148]  ; 0x94
c06d89b8:   e594a0a0ldr sl, [r4, #160]  ; 0xa0
c06d89cc:   e08ac008add ip, sl, r8
c06d89d4:   e28c3018add r3, ip, #24
c06d89dc:   e28c7008add r7, ip, #8
c06d89e4:   e893000fldm r3, {r0, r1, r2, r3}
c06d89ec:   e24be044sub lr, fp, #68 ; 0x44
c06d89f4:   e24b5054sub r5, fp, #84 ; 0x54
c06d89fc:   e885000fstm r5, {r0, r1, r2, r3}
c06d8a04:   e897000fldm r7, {r0, r1, r2, r3}
c06d8a10:   e88e000fstm lr, {r0, r1, r2, r3}

This is from:

struct flowi6 fl6 = {
.flowi6_iif = l3mdev_fib_oif(skb->dev),
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
.flowi6_mark = skb->mark,
.flowi6_proto = iph->nexthdr,
};

specifically, I suspect, the saddr and daddr initialisations.

There's not much to get away from this - the FEC on iMX requires a
16-byte alignment for DMA addresses, which violates the network
stack's requirement for the ethernet packet to be received with a
two byte offset.  So the IP header (and IPv6 headers) will always
be mis-aligned in memory, which leads to a huge number of alignment
faults.

There's not much getting away from this - the problem is not in the
networking stack, but the FEC hardware/network driver.  See:

struct  fec_enet_private *fep = netdev_priv(ndev);
int off;

off = ((unsigned long)skb->data) & fep->rx_align;
if (off)
skb_reserve(skb, fep->rx_align + 1 - off);

bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, 
skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));

in fec_enet_new_rxbdp().

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

Re: [PATCH net-next] net/vxlan: Avoid unaligned access in vxlan_build_skb()

2016-09-23 Thread Alexander Duyck

On Fri, Sep 23, 2016 at 10:20 AM, Sowmini Varadhan
 wrote:
> On (09/23/16 07:17), Alexander Duyck wrote:
>> >> Is this basically about, e.g., putting the vxlanhdr in its own
>> >> skb_frag_t, or something else?
>> >
>> > Yes, and this way skb_header_pointer() is forced to do a memcpy.
>   :
>> For Tx it all becomes a bit trickier since it would likely require us
>> to shift the frags up by 1 when we go from outer headers to inner
>> headers.
>
> here's how I thought through this so far, based on what I'm seeing for
> mld_newpack/vxlan (not sure if this can be extended for all the
> other tunnelling cases as well)..
>
> today the skb is set up so that we reserve LL_RESERVED_SPACE
> in the headroom, and vxlan sets up needed headroom for
> (outer_ether + ip + udp + vxlan + inner_ether). Instead, if it
> set up the needed_headroom for just (outer_ether, ip, udp) and
> we had something like a "needed_fragroom" in the net_device,
> maybe we could set up the skb so that we don't have to shift the frags
> by 1.
>
> Drawbacks: this ends up with every skb going through vxlan etc being
> non-linear, so it is a lot of churn for several functions (e.g.,
> even mld_newpack() cannot just skb_put() things around). Also
> this probably gets quickly messy if we are dealing with multiple
> encapsulations (even in the simple vxlan case we have
> vxlan + inner mac/ip/etc)
>
> BTW, I wonder if there is a small vxlan bug here- are we
> accounting for the outer_ether twice in LL_RESERVED_SPACE: once in
> ->hard_header_len, and once in ->needed_headroom?
>
>> One thought I had on that is that we could possibly avoid
>> having to do any allocation and could just take a reference on the
>> head_frag if that is what we are using.  Then we just add a 2 byte pad
>> and resume writing headers in place and the pointer offsets for the
>> inner headers would remain valid, though they would be past the point
>> of skb->tail.
>
> I am not sure I follow, can you elaborate? Doesn't this also assume
> that every skb is necessarily non-linear?

So basically what I was thinking is we do something like reserving
NET_IP_ALIGN and continue writing headers to skb->data, but we force
the tracking for the inner headers into frag[0] so that we can keep
the inner headers aligned without messing up the alignment for outer
headers.  In theory the inner offset and all that would still be
functional but might need a few tweaks.  You could probably even use
the skb->encapsulation bit to indicate you are doing this.  You could
almost think of it as us doing something like the inverse of
pskb_pull_tail.  The general idea here is we want to actually leave
the data in skb->data, but just reference it from frag[0] so that we
don't accidentally pull in the 2 byte padding for alignment when
transmitting the frame.

Re: device-tree support for writing to phy registers?

2016-09-23 Thread Tim Harvey

On Fri, Sep 23, 2016 at 9:29 AM, Florian Fainelli  wrote:
> On 09/23/2016 08:40 AM, Tim Harvey wrote:
>> Greetings,
>>
>> I've got a TI DP83867 GbE phy that requires some register writes to
>> configure its refclock output. Is there a generic device-tree API for
>> writing to raw registers or is that something that would need to be
>> added to a specific phy driver with a device-tree binding?
>
> There are no standard properties that indicate how to write to register
> from Device Tree (unfortunately there are non-standard ones that allow this
> to happen, e.g: marvell,reg-init), because that would mean that Device
> Tree acts as some kind of firmware/binary interface, which is a bit of a
> stretch. Some bindings may indicate how to write to registers in a way
> that accepts an address = value pair, but quite frankly, this is
> absolutely horrible and not controllable nor easily transferable from
> one model of device to the other, strongly discouraged.
>
>> There is a
>> DP83867 phy driver but it doesn't contain anything related to
>> configuring its CLKOUT via register 0x170.
>
> Then, I guess you should add a set of properties and corresponding code
> reading these properties that would result in getting the register
> programmed with the values you need.
>

Florian,

agreed - this seems like the right thing to do and takes care of the
important detail about power-management you mention below.

Are there any phy drivers you know of that do any CLKOUT configuration
that I could use as inspiration for dt prop names?

Thanks,

Tim

>>
>> Alternatively, is it generally considered 'ok' to take care of this in
>> the bootloader and not provide the MAC driver the gpio for phy-reset
>> so that bootloader configuration persists through the kernel?
>
> It depends on what your platform does, punting on the bootloader is
> usually fine, but also breaks nicely when you start implementing power
> management in the kernel properly (e.g: deep sleep states) and you are
> not calling back into the bootloader, yet your hardware lost its state
> between power transitions.
>
> --
> Florian

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Hi Eric,

On 09/23/2016 10:19 AM, Eric Nelson wrote:
> Thanks Eric,
> 
> On 09/23/2016 09:54 AM, Eric Dumazet wrote:
>> On Fri, Sep 23, 2016 at 9:43 AM, Eric Nelson  wrote:
>>>
>>> Hello all,
>>>
>>> We're seeing alignment issues from the ethernet stack on an i.MX6UL board:
>>>
>>>
> 
> 
> 
>>>
>>> - id = ntohl(*(__be32 *)&iph->id);
>>> - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
>>> - id >>= 16;
>>> + id = ntohs(*(__be16 *)&iph->id);
>>> + frag = ntohs(*(__be16 *)&iph->frag_off);
>>> + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (frag &
>>> ~IP_DF));
>>>
>>> for (p = *head; p; p = p->next) {
>>> struct iphdr *iph2;
>>>
>>
>> This solves nothing, because a few lines after you'll have yet another
>> unaligned access :
>>
> 
> Oddly, it does prevent the vast majority (90%+) of the alignment errors.
> 
> I believe this is because the compiler is generating an ldm instruction
> when the ntohl() call is used, but I'm stumped about why these aren't
> generating faults:
> 
>> ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
>> ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
>>
>> So you might have one less problematic access, out of hundreds of them
>> all over the places.
>>
>> Really the problem is that whole stack depends on the assumption that
>> IP headers are aligned on arches that care
>> (ie where NET_IP_ALIGN == 2)
>>
>> If your build does have NET_IP_ALIGN = 2 and you get a fault here, it
>> might be because of a buggy driver.
>>
> 
> NET_IP_ALIGN is set to 2.
> 
>> The other known case is some GRE encapsulations that break the
>> assumption, and this is discussed somewhere else.
>>
> I don't think that's the case.
> 
> # CONFIG_IPV6_GRE is not set
> 
> Hmm... Instrumenting the kernel, it seems that iphdr **is** aligned on
> a 4-byte boundary.
> 

No. That was wrong.

The iphdr is aligned at offsets of 14 from the ethernet frame, which itself
is longword aligned.

I mistakenly tested before the call to skb_gro_header_slow(), when
iph was NULL.

After putting a test in the right place, I'm seeing an address of
888a364e for the first un-aligned packet.

Since the hardware requires longword alignment for its DMA transfers,
aligning the IP header will require a memcpy, right?

You hinted at a solution in this post:

http://www.spinics.net/lists/netdev/msg213166.html

Are you aware of another driver doing this that could be used as
a reference?

Please advise,


Eric

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread Samudrala, Sridhar




On 9/23/2016 8:29 AM, Jakub Kicinski wrote:

On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:

On 16-09-23 05:55 AM, Jakub Kicinski wrote:

On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:

On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:

So if I understand that correctly, this would need some "shared netdev"
which would effectively serve only as a sink for all port netdevices to
tx packets to. On RX, this would be completely avoided. This lower
device looks like half zombie to me.

Looks more like a quarter zombie. Even tx would not be allowed unless
going through one of the ports, as all skbs without
METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
possible to attach qdisc to the "lower" netdevice and it would actually
have an effect. On rx this netdevice would be ignored completely. This
is very weird behavior.
  

I don't like it :( I wonder if the
solution would not be possible without this lower netdev.

I agree. This approach doesn't sound correct. The skbs should not be
requeued.

Thanks for the responses!

Nice timing we were just thinking about this.


I think SR-IOV NICs are coming at this problem from a different angle,
we already have a big, feature-full per-port netdevs and now we want to
spawn representators for VFs to handle fallback traffic.  This patch
would help us mux VFR traffic on all the queues of the physical port
netdevs (the ones which were already present in legacy mode, that's the
lower device).

Yep, I like the idea in general. I had a slightly different approach in
mind though. If you look at __dev_queue_xmit() there is a void
accel_priv pointer (gather you found this based on your commit note).
My take was we could extend this a bit so it can be used by the VFR
devices and they could do a dev_queue_xmit_accel(). In this way there is
no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
accel logic needs to be extended to push the priv pointer all the way
through the xmit routine of the target netdev though. This should look
a lot like the macvlan accelerated xmit device path without the
switching logic.

Of course maybe the name would be extended to dev_queue_xmit_extended()
or something.

So the flow on ingress would be,

   1. pkt_received_by_PF_netdev
   2. PF_netdev reads some tag off packet/descriptor and sets correct
  skb->dev field. This is needed so stack "sees" packets from
  correct VF ports.
   3. packet passed up to stack.

I guess it is a bit "zombie" like on the receive path because the packet
is never actually handled by VF netdev code per se and on egress can
traverse both the VFR and PF netdevs qdiscs. But on the other hand the
VFR netdevs and PF netdevs are all in the same driver. Plus using a
queue per VFR is a bit of a waste as it's not needed and also hardware
may not have any mechanism to push VF traffic onto a rx queue.

On egress,

   1. VFR xmit is called
   2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
  for the lower netdev
   3. lower netdev sends out the packet.

Again we don't need to waste any queues for each VFR and the VFR can be
a LLTX device. In this scheme I think you avoid much of the changes in
your patch and keep it all contained in the driver. Any thoughts?


The 'accel' parameter in dev_queue_xmit_accel() is currently only passed
to ndo_select_queue() via netdev_pick_tx() and is used to select the tx 
queue.
Also, it is not passed all the way to the driver specific xmit routine.  
Doesn't it require

changing all the driver xmit routines if we want to pass this parameter?


Goes without saying that you have a much better understanding of packet
scheduling so please bear with me :)  My target model is that I have
n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
fallback traffic over those same queues.  So no new HW queues are used
for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
"bastard hw queues" which actually TX for a separate software device.

My understanding was that I can rework this model to have software
queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
queues (#hw queues == #PF queues) but then when the driver sees a
packet on sw-only VFR queue it has to pick one of the PF queues (which
one?), lock PF software queue to own it, and only then can it
transmit.  With the dst_metadata there is no need for extra locking or
queue selection.


Yes.  The VFPR netdevs don't have any HW queues associated with them and 
we would like

to use the PF queues for the xmit.
I was also looking into some way of passing the port id via skb 
parameter to the
dev_queue_xmit() call so that the PF xmit routine can do a directed 
transmit to a specific VF.

Is skb->cb an option to pass this info?
dst_metadata approach would work  too if it is acceptable.





To address 'I wonder if the solution can be done without this lower
netdev' I think it can be but it creates two issues which I'm not sure
have a

Re: [PATCH net-next] net/vxlan: Avoid unaligned access in vxlan_build_skb()

2016-09-23 Thread Sowmini Varadhan

On (09/23/16 07:17), Alexander Duyck wrote:
> >> Is this basically about, e.g., putting the vxlanhdr in its own
> >> skb_frag_t, or something else?
> >
> > Yes, and this way skb_header_pointer() is forced to do a memcpy.
  :
> For Tx it all becomes a bit trickier since it would likely require us
> to shift the frags up by 1 when we go from outer headers to inner
> headers.  

here's how I thought through this so far, based on what I'm seeing for 
mld_newpack/vxlan (not sure if this can be extended for all the 
other tunnelling cases as well)..

today the skb is set up so that we reserve LL_RESERVED_SPACE
in the headroom, and vxlan sets up needed headroom for 
(outer_ether + ip + udp + vxlan + inner_ether). Instead, if it
set up the needed_headroom for just (outer_ether, ip, udp) and
we had something like a "needed_fragroom" in the net_device, 
maybe we could set up the skb so that we don't have to shift the frags
by 1. 

Drawbacks: this ends up with every skb going through vxlan etc being
non-linear, so it is a lot of churn for several functions (e.g.,
even mld_newpack() cannot just skb_put() things around). Also
this probably gets quickly messy if we are dealing with multiple 
encapsulations (even in the simple vxlan case we have 
vxlan + inner mac/ip/etc)

BTW, I wonder if there is a small vxlan bug here- are we
accounting for the outer_ether twice in LL_RESERVED_SPACE: once in 
->hard_header_len, and once in ->needed_headroom?

> One thought I had on that is that we could possibly avoid
> having to do any allocation and could just take a reference on the
> head_frag if that is what we are using.  Then we just add a 2 byte pad
> and resume writing headers in place and the pointer offsets for the
> inner headers would remain valid, though they would be past the point
> of skb->tail.

I am not sure I follow, can you elaborate? Doesn't this also assume
that every skb is necessarily non-linear?

--Sowmini

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Thanks Eric,

On 09/23/2016 09:54 AM, Eric Dumazet wrote:
> On Fri, Sep 23, 2016 at 9:43 AM, Eric Nelson  wrote:
>>
>> Hello all,
>>
>> We're seeing alignment issues from the ethernet stack on an i.MX6UL board:
>>
>>



>>
>> - id = ntohl(*(__be32 *)&iph->id);
>> - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
>> - id >>= 16;
>> + id = ntohs(*(__be16 *)&iph->id);
>> + frag = ntohs(*(__be16 *)&iph->frag_off);
>> + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (frag &
>> ~IP_DF));
>>
>> for (p = *head; p; p = p->next) {
>> struct iphdr *iph2;
>>
> 
> This solves nothing, because a few lines after you'll have yet another
> unaligned access :
> 

Oddly, it does prevent the vast majority (90%+) of the alignment errors.

I believe this is because the compiler is generating an ldm instruction
when the ntohl() call is used, but I'm stumped about why these aren't
generating faults:

> ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
> ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
> 
> So you might have one less problematic access, out of hundreds of them
> all over the places.
> 
> Really the problem is that whole stack depends on the assumption that
> IP headers are aligned on arches that care
> (ie where NET_IP_ALIGN == 2)
> 
> If your build does have NET_IP_ALIGN = 2 and you get a fault here, it
> might be because of a buggy driver.
> 

NET_IP_ALIGN is set to 2.

> The other known case is some GRE encapsulations that break the
> assumption, and this is discussed somewhere else.
> 
I don't think that's the case.

# CONFIG_IPV6_GRE is not set

Hmm... Instrumenting the kernel, it seems that iphdr **is** aligned on
a 4-byte boundary.

Does the ldm instruction require 8-byte alignment?

There's definitely a compiler-version dependency involved here,
since using gcc 4.9 also reduced the number of faults dramatically.

[PATCH RFC 1/2] samples/bpf: ilarouter for tc

2016-09-23 Thread Alexei Starovoitov

From: Aaron Yue 

From: Aaron Yue 

Requires a userspace program to insert ila mappings to the ila map.

Signed-off-by: Aaron Yue 
Signed-off-by: Aaron Yue 
---
 samples/bpf/Makefile   |   1 +
 samples/bpf/ila.h  |  80 +
 samples/bpf/ilarouter_tc.c | 124 +
 samples/bpf/inet_helper.h  |  38 ++
 4 files changed, 243 insertions(+)
 create mode 100644 samples/bpf/ila.h
 create mode 100644 samples/bpf/ilarouter_tc.c
 create mode 100644 samples/bpf/inet_helper.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..15e19bb 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -74,6 +74,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += ilarouter_tc.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
diff --git a/samples/bpf/ila.h b/samples/bpf/ila.h
new file mode 100644
index 000..39a11f8
--- /dev/null
+++ b/samples/bpf/ila.h
@@ -0,0 +1,80 @@
+#ifndef _SIR_H
+#define _SIR_H
+
+#include 
+#include 
+#include 
+
+#define SIR_T_LOCAL 0x1
+#define SIR_T_VIRTUAL 0x3
+
+struct in6_addr_sir {
+   __be64 prefix;
+   __be64 identifier_c_type;
+} __packed;
+
+struct in6_addr_ila {
+   __be64 locator;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+   __u8 identifier:4,
+c:1,
+type:3;
+   __u8  identifier2;
+   __be16 identifier3;
+   __be16 identifier4;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+   __be32 type:3,
+  c:1,
+  identifier:28;
+   __be16 identifier2;
+#else
+#error "Fix asm/byteorder.h"
+#endif
+   __be16 checksum;
+} __packed;
+
+struct sirhdr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+   __u16 traffic_class:4,
+   version:4,
+   flow_label:4,
+   traffic_class2:4;
+   __be16 flow_label2;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+   __u32 version:4,
+ traffic_class:8,
+ flow_label:20;
+#else
+#error "Fix asm/byteorder.h"
+#endif
+   __be16 payload_length;
+   __u8   next_header;
+   __u8   hop_limit;
+
+   struct in6_addr source_address;
+   struct in6_addr_sir destination_address;
+} __packed;
+
+struct ilahdr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+   __u16 traffic_class:4,
+   version:4,
+   flow_label:4,
+   traffic_class2:4;
+   __be16 flow_label2;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+   __u32 version:4,
+ traffic_class:8,
+ flow_label:20;
+#else
+#error "Fix asm/byteorder.h"
+#endif
+   __be16 payload_length;
+   __u8   next_header;
+   __u8   hop_limit;
+
+   struct in6_addr source_address;
+   struct in6_addr_ila destination_address;
+} __packed;
+
+#endif
diff --git a/samples/bpf/ilarouter_tc.c b/samples/bpf/ilarouter_tc.c
new file mode 100644
index 000..277322e
--- /dev/null
+++ b/samples/bpf/ilarouter_tc.c
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define MAP_SIZE (1 << 20)
+
+#define KBUILD_MODNAME "ilarouter"
+#include 
+#include 
+#include 
+#include "ila.h"
+#include "inet_helper.h"
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+unsigned int version SEC("version") = 1;
+
+struct bpf_map_def SEC("maps") ila_lookup_map = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(struct in6_addr),
+   .value_size = sizeof(struct in6_addr),
+   .max_entries = MAP_SIZE,
+};
+
+#define IPV6_DEST_OFF (ETH_HLEN + offsetof(struct ipv6hdr, daddr))
+
+struct addr {
+   __u64 addr_hi;
+   __u64 addr_lo;
+} __packed;
+
+SEC("classifier")
+int ila_lookup(struct __sk_buff *skb)
+{
+   unsigned long dataptr = (unsigned long)skb->data;
+   struct ethhdr *eth;
+   struct ipv6hdr *sir;
+   struct addr *pkt_addr;
+   struct addr stack_addr;
+   struct addr *reply;
+#ifdef DEBUG
+   char lookup_request[] = "Lookup request for sir: %llx, iden: %llx\n";
+   char lookup_fail[] = "Lookup failed\n";
+   char lookup_success[] = "Lookup success. hi: %llx, lo: %llx\n";
+#endif
+
+   /* Invalid packet: length too short
+* compiler optimization/verifier bypass: this way it won't assume
+* that we copied over a pkt_ptr,
+* which has register range of 0 (from (r1 + 0))
+*/
+   if (dataptr + sizeof(struct ethhdr) +
+   sizeof(struct ipv6hdr) > skb->data_end)
+   goto redirect;
+
+   /* Ethernet header */
+   eth = (struct ethhdr *)dataptr;
+
+   /* Irrelevant packet: not IPv6 */
+   if (eth->h_proto != htons(ETH_P_IPV6))
+   goto redirect;
+
+

[PATCH RFC 2/2] samples/bpf: ilarouter for xdp

2016-09-23 Thread Alexei Starovoitov

From: Aaron Yue 

From: Aaron Yue 

Requires a userspace program to insert ila mappings and mac addresses to
the ila map. Needs a verifier patch to directly allow access to the pkt
from the bpf map.

Signed-off-by: Aaron Yue 
Signed-off-by: Aaron Yue 
---
 samples/bpf/Makefile|  1 +
 samples/bpf/ilarouter_xdp.c | 88 +
 2 files changed, 89 insertions(+)
 create mode 100644 samples/bpf/ilarouter_xdp.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 15e19bb..827e6e8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -75,6 +75,7 @@ always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
 always += ilarouter_tc.o
+always += ilarouter_xdp.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
diff --git a/samples/bpf/ilarouter_xdp.c b/samples/bpf/ilarouter_xdp.c
new file mode 100644
index 000..24749c4
--- /dev/null
+++ b/samples/bpf/ilarouter_xdp.c
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#define MAP_SIZE (1 << 20)
+
+#define KBUILD_MODNAME "ilarouter"
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+struct ila_addr {
+   u64 addr_hi;
+   u64 addr_lo;
+} __packed;
+
+struct ila_info {
+   struct ila_addr addr;
+   u16 mac[3];
+} __packed;
+
+char _license[] SEC("license") = "GPL";
+unsigned int version SEC("version") = 1;
+
+struct bpf_map_def SEC("map_ila_lookup_map") ila_lookup_map = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(struct in6_addr),
+   .value_size = sizeof(struct ila_info),
+   .max_entries = MAP_SIZE,
+};
+
+SEC("xdp_ila_lookup")
+int ila_lookup(struct xdp_md *ctx)
+{
+   unsigned long dataptr = (unsigned long)ctx->data;
+   struct ethhdr *eth;
+   struct ipv6hdr *sir;
+   struct ila_addr *pkt_addr;
+   struct ila_info *reply;
+   u16 *dst_mac;
+
+   /* Invalid packet: length too short
+* compiler optimization/verifier bypass:
+* this way it won't assume that we copied over a pkt_ptr,
+* which has register range of 0 (from (r1 + 0))
+*/
+   if (dataptr + sizeof(struct ethhdr) + sizeof(struct ipv6hdr) >
+   (unsigned long)ctx->data_end)
+   return XDP_PASS;
+
+   /* Ethernet header */
+   eth = (struct ethhdr *)dataptr;
+
+   /* Irrelevant packet: not IPv6 */
+   if (eth->h_proto != htons(ETH_P_IPV6))
+   return XDP_PASS;
+
+   /* Sir Address header */
+   sir = (struct ipv6hdr *)(dataptr + sizeof(struct ethhdr));
+
+   /* We don't have to check for C bit or Type, since
+* userspace mapping inserts guarantees that only valid values
+* will be inserted into the map in network byte-order.
+* Hence, a lookup fail implies either C bit/Type is invalid,
+* or mapping does not exist, in both cases we pass the packet without
+* modifications.
+*/
+   pkt_addr = (struct ila_addr *)&(sir->daddr);
+   reply = bpf_map_lookup_elem(&ila_lookup_map, pkt_addr);
+
+   if (!reply)
+   return XDP_PASS;
+
+   pkt_addr->addr_hi = reply->addr.addr_hi;
+   pkt_addr->addr_lo = reply->addr.addr_lo;
+
+   dst_mac = (u16 *)eth;
+   dst_mac[0] = reply->mac[0];
+   dst_mac[1] = reply->mac[1];
+   dst_mac[2] = reply->mac[2];
+
+   return XDP_TX;
+}
+
-- 
2.8.0.rc2

[PATCH RFC 0/2] ila: ilarouter bpf code for tc and xdp

2016-09-23 Thread Alexei Starovoitov

From: Aaron Yue 

Jesper,

here is old email and cover letter that didn't make it to the list
due to vger outage (I guess).
The verifier patch that Aaron is talking about has landed long ago.

The dataplane of ILA router is very short and simple.
Control plane is very different matter. It's not ready for prime time yet.

--

This patch contains the tc and xdp implementation of kernelspace bpf code.
It requires userspace to insert to the ILA bpf maps, in tc's case, the 
precomputed ILA mappings, and in xdp's case, both the precomputed ILA
mappings and the MAC address.

The xdp bpf code also requires a verifier patch to allow direct map access
from the packet (will be patched in by Alexei).

Aaron Yue (2):
  samples/bpf: ilarouter for tc
  samples/bpf: ilarouter for xdp

 samples/bpf/Makefile|   2 +
 samples/bpf/ila.h   |  80 
 samples/bpf/ilarouter_tc.c  | 124 
 samples/bpf/ilarouter_xdp.c |  88 +++
 samples/bpf/inet_helper.h   |  38 ++
 5 files changed, 332 insertions(+)
 create mode 100644 samples/bpf/ila.h
 create mode 100644 samples/bpf/ilarouter_tc.c
 create mode 100644 samples/bpf/ilarouter_xdp.c
 create mode 100644 samples/bpf/inet_helper.h

-- 
2.8.0.rc2

[PATCH net] ip6_gre: fix flowi6_proto value in ip6gre_xmit_other()

2016-09-23 Thread Lance Richardson

Similar to commit 3be07244b733 ("ip6_gre: fix flowi6_proto value in
xmit path"), set flowi6_proto to IPPROTO_GRE for output route lookup.
Since the correct proto is already set in the tunnel flowi6 template via
commit 252f3f5a1189 ("ip6_gre: Set flowi6_proto as IPPROTO_GRE in xmit
path."), simply delete the line setting the incorrect flowi6_proto value.

Suggested-by: Jiri Benc 
Fixes: commit c12b395a4664 ("gre: Support GRE over IPv6")
Signed-off-by: Lance Richardson 
---
 net/ipv6/ip6_gre.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 704274c..edc3daa 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -648,7 +648,6 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct 
net_device *dev)
encap_limit = t->parms.encap_limit;
 
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
-   fl6.flowi6_proto = skb->protocol;
 
err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
if (err)
-- 
2.5.5

Re: [PATCH] net: bcmgenet: Fix EPHY reset in power up

2016-09-23 Thread Florian Fainelli

On 09/23/2016 08:04 AM, Jaedon Shin wrote:
> Hi Andrew,
> 
> On 23 Sep 2016, at 11:06 PM, Andrew Lunn  wrote:
>>
>> On Fri, Sep 23, 2016 at 10:20:04PM +0900, Jaedon Shin wrote:
>>> The bcmgenet_mii_reset() is always not running in power up sequence
>>> after 'commit 62469c76007e ("net: ethernet: bcmgenet: use phydev from
>>> struct net_device")'. This'll show extremely high latency and duplicate
>>> packets while interface down and up repeatedly.
>>>
>>> For now, adds again a private phydev for mii reset when runs power up to
>>> open interface.
>>
>> Hi Jaedon
>>
>> How does this fix the issue? It sounds like you are papering over the
>> crack, not truly fixing it.
>>
>>   Andrew
> 
> Yes, it feels like a workaround, but I think it is needed for the v4.8 stable
> version. If we find a better way to get the internal PHY initialized
> after the interface is re-opened, this patch will be dropped.

I can observe the faulting behavior with 4.8-rc7 that the link below
fixed initially:

# ping fainelli-linux
PING fainelli-linux (10.112.156.244): 56 data bytes
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.352 ms
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.472 ms (DUP!)
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.496 ms (DUP!)
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.517 ms (DUP!)
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.536 ms (DUP!)
64 bytes from 10.112.156.244: seq=1 ttl=61 time=1.557 ms (DUP!)
64 bytes from 10.112.156.244: seq=1 ttl=61 time=752.448 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.291 ms
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.421 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.444 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.464 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.483 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=1.505 ms (DUP!)
64 bytes from 10.112.156.244: seq=2 ttl=61 time=24.964 ms (DUP!)

If we revert this patch, we indeed get the normal and expected behavior
back:

# ping fainelli-linux
PING fainelli-linux (10.112.156.244): 56 data bytes
64 bytes from 10.112.156.244: seq=0 ttl=61 time=0.417 ms
64 bytes from 10.112.156.244: seq=1 ttl=61 time=0.415 ms
64 bytes from 10.112.156.244: seq=2 ttl=61 time=0.424 ms

Actually, the key thing is this:

- without Philippe's patch we call twice bcmgenet_mii_reset, and that is
intended:
- first time from bcmgenet_power_up() to make sure the PHY is
initialized *before* we get to initialize the UniMAC, this is critical
- second time from bcmgenet_mii_probe(), through the normal 
phy_init_hw()

- with Philippe's patch, we only get to call bcmgenet_mii_reset once, in
bcmgenet_mii_probe() because the first time in bcmgenet_power_up(),
dev->phydev is NULL, because of a prior call to phy_disconnect() in
bcmgenet_close(), unfortunately, there has been MAC activity, so the PHY
gets in a bad state

Jaedon, feel free to use the explanation above, and send a plain revert
of commit 62469c76007e11428e2ee3c6de90cbe74b588d44.

Thanks!

Thanks!
-- 
Florian

Re: Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Dumazet

On Fri, Sep 23, 2016 at 9:43 AM, Eric Nelson  wrote:
>
> Hello all,
>
> We're seeing alignment issues from the ethernet stack on an i.MX6UL board:
>
> root@mx6ul:~# cat /proc/cpu/alignment
> User: 0
> System: 470010 (inet_gro_receive+0x104/0x278)
>
> This seems to be related to the ip header alignment, and there
> was much discussion in mailing list threads [1] and [2].
>
> In particular, Russell referred to a patch here, but I haven't been
> able to find it:
> https://lists.linaro.org/pipermail/linaro-toolchain/2012-October/002844.html
>
> Eric Dumazet also suggested a path toward fixing it, but I don't quite
> understand the suggestion:
> http://www.spinics.net/lists/netdev/msg213166.htm
>
> The immediate problem is addressed by just reading the id and frag_offs
> fields in the iphdr structure as shown in this patch:
>
> commit 98810abc911b1286a7e4a2ebdfbad66f12fae19d
> Author: Eric Nelson 
> Date: Fri Sep 23 08:26:03 2016 -0700
>
> net: ipv4: af_inet: don't read multiple 16-bit iphdr fields as a 32-bit
> value
>
> Change-Id: Idc7122c22c13ca078be31907d30ab1c3148ba807
> Signed-off-by: Eric Nelson 
>
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 0cc98b1..c17ef6e 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1301,6 +1301,7 @@ static struct sk_buff **inet_gro_receive(struct
> sk_buff **head,
> unsigned int hlen;
> unsigned int off;
> unsigned int id;
> + unsigned int frag;
> int flush = 1;
> int proto;
>
> @@ -1326,9 +1327,9 @@ static struct sk_buff **inet_gro_receive(struct
> sk_buff **head,
> if (unlikely(ip_fast_csum((u8 *)iph, 5)))
> goto out_unlock;
>
> - id = ntohl(*(__be32 *)&iph->id);
> - flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
> - id >>= 16;
> + id = ntohs(*(__be16 *)&iph->id);
> + frag = ntohs(*(__be16 *)&iph->frag_off);
> + flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (frag &
> ~IP_DF));
>
> for (p = *head; p; p = p->next) {
> struct iphdr *iph2;
>

This solves nothing, because a few lines after you'll have yet another
unaligned access :


((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {

So you might have one less problematic access, out of hundreds of them
all over the places.

Really the problem is that whole stack depends on the assumption that
IP headers are aligned on arches that care
(ie where NET_IP_ALIGN == 2)

If your build does have NET_IP_ALIGN = 2 and you get a fault here, it
might be because of a buggy driver.

The other known case is some GRE encapsulations that break the
assumption, and this is discussed somewhere else.

Re: [PATCH] softirq: let ksoftirqd do its job

2016-09-23 Thread Jesper Dangaard Brouer

On Fri, 23 Sep 2016 13:53:33 +0200
Peter Zijlstra  wrote:

> On Fri, Sep 23, 2016 at 01:35:59PM +0200, Daniel Borkmann wrote:
> > On 09/02/2016 08:39 AM, David Miller wrote:  
> > >
> > >I'm just kind of assuming this won't go through my tree, but I can take
> > >it if that's what everyone agrees to.  
> > 
> > Was this actually picked up somewhere in the mean time?  
> 
> I can queue it for tip. In fact, I've just done so to avoid losing it.
> If anybody else wants it holler.

Good that you are picking this up! It is a very important fix, at least
for networking.

This is your git tree, right:
 https://git.kernel.org/cgit/linux/kernel/git/peterz/queue.git/

Doesn't look like you pushed it yet, or do I need to look at a specific
branch?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH] fs/select: add vmalloc fallback for select(2)

2016-09-23 Thread Jason Baron


Hi,

On 09/23/2016 03:24 AM, Nicholas Piggin wrote:

On Fri, 23 Sep 2016 14:42:53 +0800
"Hillf Danton"  wrote:



The select(2) syscall performs a kmalloc(size, GFP_KERNEL) where size grows
with the number of fds passed. We had a customer report page allocation
failures of order-4 for this allocation. This is a costly order, so it might
easily fail, as the VM expects such allocation to have a lower-order fallback.

Such trivial fallback is vmalloc(), as the memory doesn't have to be
physically contiguous. Also the allocation is temporary for the duration of the
syscall, so it's unlikely to stress vmalloc too much.

Note that the poll(2) syscall seems to use a linked list of order-0 pages, so
it doesn't need this kind of fallback.


How about something like this? (untested)

Eric isn't wrong about vmalloc sucking :)

Thanks,
Nick


---
  fs/select.c | 57 +++--
  1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 8ed9da5..3b4834c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -555,6 +555,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set 
__user *outp,
void *bits;
int ret, max_fds;
unsigned int size;
+   size_t nr_bytes;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
@@ -576,21 +577,39 @@ int core_sys_select(int n, fd_set __user *inp, fd_set 
__user *outp,
 * since we used fdset we need to allocate memory in units of
 * long-words.
 */
-   size = FDS_BYTES(n);
+   ret = -ENOMEM;
bits = stack_fds;
-   if (size > sizeof(stack_fds) / 6) {
-   /* Not enough space in on-stack array; must use kmalloc */
+   size = FDS_BYTES(n);
+   nr_bytes = 6 * size;
+
+   if (unlikely(nr_bytes > PAGE_SIZE)) {
+   /* Avoid multi-page allocation if possible */
ret = -ENOMEM;
-   bits = kmalloc(6 * size, GFP_KERNEL);
-   if (!bits)
-   goto out_nofds;
+   fds.in = kmalloc(size, GFP_KERNEL);
+   fds.out = kmalloc(size, GFP_KERNEL);
+   fds.ex = kmalloc(size, GFP_KERNEL);
+   fds.res_in = kmalloc(size, GFP_KERNEL);
+   fds.res_out = kmalloc(size, GFP_KERNEL);
+   fds.res_ex = kmalloc(size, GFP_KERNEL);
+
+   if (!(fds.in && fds.out && fds.ex &&
+   fds.res_in && fds.res_out && fds.res_ex))
+   goto out;
+   } else {
+   if (nr_bytes > sizeof(stack_fds)) {
+   /* Not enough space in on-stack array */
+   if (nr_bytes > PAGE_SIZE * 2)


The 'if' looks extraneous?

Also, I wonder if we can just avoid some allocations altogether by
checking if the user fd_set pointers are NULL? That can avoid failures :)


Thanks,

-Jason

Alignment issues with freescale FEC driver

2016-09-23 Thread Eric Nelson

Hello all,

We're seeing alignment issues from the ethernet stack on an i.MX6UL board:

root@mx6ul:~# cat /proc/cpu/alignment
User: 0
System: 470010 (inet_gro_receive+0x104/0x278)

This seems to be related to the ip header alignment, and there
was much discussion in mailing list threads [1] and [2].

In particular, Russell referred to a patch here, but I haven't been
able to find it:
https://lists.linaro.org/pipermail/linaro-toolchain/2012-October/002844.html

Eric Dumazet also suggested a path toward fixing it, but I don't quite
understand the suggestion:
http://www.spinics.net/lists/netdev/msg213166.htm

The immediate problem is addressed by just reading the id and frag_offs
fields in the iphdr structure as shown in this patch:

commit 98810abc911b1286a7e4a2ebdfbad66f12fae19d
Author: Eric Nelson 
Date: Fri Sep 23 08:26:03 2016 -0700

net: ipv4: af_inet: don't read multiple 16-bit iphdr fields as a 32-bit
value

Change-Id: Idc7122c22c13ca078be31907d30ab1c3148ba807
Signed-off-by: Eric Nelson 

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0cc98b1..c17ef6e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1301,6 +1301,7 @@ static struct sk_buff **inet_gro_receive(struct
sk_buff **head,
unsigned int hlen;
unsigned int off;
unsigned int id;
+ unsigned int frag;
int flush = 1;
int proto;

@@ -1326,9 +1327,9 @@ static struct sk_buff **inet_gro_receive(struct
sk_buff **head,
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
goto out_unlock;

- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
- id >>= 16;
+ id = ntohs(*(__be16 *)&iph->id);
+ frag = ntohs(*(__be16 *)&iph->frag_off);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (frag &
~IP_DF));

for (p = *head; p; p = p->next) {
struct iphdr *iph2;


The reading of both fields in one "ntohl" seems obfuscated at best and
certainly worthy of a comment about the optimization but I understand
from other notes that the fundamental problem is that the IP header should
be aligned on a 4-byte boundary and that's not possible without a memcpy.

I'd like to hear suggestions about how we can address this.

Regards,


Eric

[1] - http://www.spinics.net/lists/netdev/msg213114.html
[2] -
https://lists.linaro.org/pipermail/linaro-toolchain/2012-October/002828.html

Re: device-tree support for writing to phy registers?

2016-09-23 Thread Florian Fainelli

On 09/23/2016 08:40 AM, Tim Harvey wrote:
> Greetings,
> 
> I've got a TI DP83867 GbE phy that requires some register writes to
> configure its refclock output. Is there a generic device-tree API for
> writing to raw registers or is that something that would need to be
> added to a specific phy driver with a device-tree binding?

There are no standard properties that indicate how to write to register
from Device Tree (unfortunately there are non standard that allow this
to happen, e.g: marvell,reg-init), because that would mean that Device
Tree acts as some kind of firmware/binary interface, which is a bit of a
stretch. Some bindings may indicate how to write to registers in a way
that accepts an address = value pair, but quite frankly, this is
absolutely horrible and not controllable nor easily transferable from
one model of device to the other, strongly discouraged.

> There is a
> DP83867 phy driver but it doesn't contain anything related to
> configuring its CLKOUT via register 0x170.

Then, I guess you should add a set of properties and corresponding code
reading these properties that would result in getting the register
programmed with the values you need.

> 
> Alternatively, is it generally considered 'ok' to take care of this in
> the bootloader and not provide the MAC driver the gpio for phy-reset
> so that bootloader configuration persists through the kernel?

It depends on what your platform does, punting on the bootloader is
usually fine, but also breaks nicely when you start implementing power
management in the kernel properly (e.g: deep sleep states) and you are
not calling back into the bootloader, yet your hardware lost its state
between power transitions.

-- 
Florian

[PATCH] netns: move {inc,dec}_net_namespaces into #ifdef

2016-09-23 Thread Arnd Bergmann

With the newly enforced limit on the number of namespaces,
we get a build warning if CONFIG_NETNS is disabled:

net/core/net_namespace.c:273:13: error: 'dec_net_namespaces' defined but not 
used [-Werror=unused-function]
net/core/net_namespace.c:268:24: error: 'inc_net_namespaces' defined but not 
used [-Werror=unused-function]

This moves the two added functions inside the #ifdef that guards
their callers.

Fixes: 703286608a22 ("netns: Add a limit on the number of net namespaces")
Signed-off-by: Arnd Bergmann 
---
 net/core/net_namespace.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index d0eb13d3226b..989434f36f96 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -265,16 +265,6 @@ struct net *get_net_ns_by_id(struct net *net, int id)
return peer;
 }
 
-static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
-{
-   return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
-}
-
-static void dec_net_namespaces(struct ucounts *ucounts)
-{
-   dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
-}
-
 /*
  * setup_net runs the initializers for the network namespace object.
  */
@@ -319,6 +309,16 @@ static __net_init int setup_net(struct net *net, struct 
user_namespace *user_ns)
 
 
 #ifdef CONFIG_NET_NS
+static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
+{
+   return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
+}
+
+static void dec_net_namespaces(struct ucounts *ucounts)
+{
+   dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
+}
+
 static struct kmem_cache *net_cachep;
 static struct workqueue_struct *netns_wq;
 
-- 
2.9.0

Re: [PATCH net-next 4/4] net/sched: act_mirred: Implement ingress actions

2016-09-23 Thread Shmulik Ladkani

On Fri, 23 Sep 2016 08:48:33 -0400 Jamal Hadi Salim  wrote:
> > Even today, one may create loops using existing 'egress redirect',
> > e.g. this ridiculously erroneous construct:
> >
> >  # ip l add v0 type veth peer name v0p
> >  # tc filter add dev v0p parent : basic \
> > action mirred egress redirect dev v0
> 
> I think we actually recover from this one by eventually
> dropping (there's a ttl field).

[off topic]

Don't know about that :) cpu fan got very noisy, 3 of 4 cores at 100%,
and after one second I got:

# ip -s l show type veth
16: v0p@v0:  mtu 1500 qdisc noqueue state UP 
mode DEFAULT group default qlen 1000
link/ether a2:64:ff:10:dd:85 brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast   
71660305923 469890864 0   0   0   0   
TX: bytes  packets  errors  dropped carrier collsns 
3509   24   0   0   0   0   
17: v0@v0p:  mtu 1500 qdisc noqueue state UP 
mode DEFAULT group default qlen 1000
link/ether 52:a2:34:f6:7c:ec brd ff:ff:ff:ff:ff:ff
RX: bytes  packets  errors  dropped overrun mcast   
3509   24   0   0   0   0   
TX: bytes  packets  errors  dropped carrier collsns 
71660713017 469893555 0   0   0   0

> The other question is what to set skb->dev and skb->iif?
> Some information will be lost if you move around netdevs a
> bit.

[back to topic]

Good point.

Similarly to all constructs injecting skbs to device rx (bond/team,
vlan, macvlan, tunnels, ifb, __dev_forward_skb callers, etc..), we are
obligated to assign 'skb2->dev' as the new rx device.

Regarding 'skb2->skb_iif', original act_mirred code already has:

skb2->skb_iif = skb->dev->ifindex;   <--- THIS IS ORIG DEV IIF
skb2->dev = dev; <--- THIS IS TARGET DEV
err = dev_queue_xmit(skb2);

I'm preserving this; OTOH the suggested modification in the patch is

-   err = dev_queue_xmit(skb2);
+   if (tcf_mirred_act_direction(m->tcfm_eaction) & AT_EGRESS)
+   err = dev_queue_xmit(skb2);
+   else
+   netif_receive_skb(skb2);

now, the call to 'netif_receive_skb' will eventually override skb_iif to
the target RX dev's index, upon entry to __netif_receive_skb_core.

I think this IS the expected behavior - as done by other "rx injection"
constructs.

My doubts were around whether we should call 'dev_forward_skb' instead
of 'netif_receive_skb'.
The former does some things I assumed we're not interested in, like
testing 'is_skb_forwardable' and re-running 'eth_type_trans'.
OTOH, it DOES scrub the skb.
Maybe we should scrub it as well prior to the netif_receive_skb call?

Thanks,
Shmulik

device-tree support for writing to phy registers?

2016-09-23 Thread Tim Harvey

Greetings,

I've got a TI DP83867 GbE phy that requires some register writes to
configure its refclock output. Is there a generic device-tree API for
writing to raw registers or is that something that would need to be
added to a specific phy driver with a device-tree binding? There is a
DP83867 phy driver but it doesn't contain anything related to
configuring its CLKOUT via register 0x170.

Alternatively, is it generally considered 'ok' to take care of this in
the bootloader and not provide the MAC driver the gpio for phy-reset
so that bootloader configuration persists through the kernel?

Regards,

Tim

Re: [RFC] net: store port/representative id in metadata_dst

2016-09-23 Thread Jakub Kicinski

On Fri, 23 Sep 2016 07:23:26 -0700, John Fastabend wrote:
> On 16-09-23 05:55 AM, Jakub Kicinski wrote:
> > On Fri, 23 Sep 2016 11:06:09 +0200, Jiri Benc wrote:  
> >> On Fri, 23 Sep 2016 08:34:29 +0200, Jiri Pirko wrote:  
> >>> So if I understand that correctly, this would need some "shared netdev"
> >>> which would effectively serve only as a sink for all port netdevices to
> >>> tx packets to. On RX, this would be completely avoided. This lower
> >>> device looks like half zombie to me.
> >>
> >> Looks more like a quarter zombie. Even tx would not be allowed unless
> >> going through one of the ports, as all skbs without
> >> METADATA_HW_PORT_MUX metadata_dst would be dropped. But it would be
> >> possible to attach qdisc to the "lower" netdevice and it would actually
> >> have an effect. On rx this netdevice would be ignored completely. This
> >> is very weird behavior.
> >>  
> >>> I don't like it :( I wonder if the
> >>> solution would not be possible without this lower netdev.
> >>
> >> I agree. This approach doesn't sound correct. The skbs should not be
> >> requeued.  
> > 
> > Thanks for the responses!  
> 
> Nice timing we were just thinking about this.
> 
> > 
> > I think SR-IOV NICs are coming at this problem from a different angle,
> > we already have a big, feature-full per-port netdevs and now we want to
> > spawn representators for VFs to handle fallback traffic.  This patch
> > would help us mux VFR traffic on all the queues of the physical port
> > netdevs (the ones which were already present in legacy mode, that's the
> > lower device).  
> 
> Yep, I like the idea in general. I had a slightly different approach in
> mind though. If you look at __dev_queue_xmit() there is a void
> accel_priv pointer (gather you found this based on your commit note).
> My take was we could extend this a bit so it can be used by the VFR
> devices and they could do a dev_queue_xmit_accel(). In this way there is
> no need to touch /net/core/{filter, dst, ip_tunnel}.c etc. Maybe the
> accel logic needs to be extended to push the priv pointer all the way
> through the xmit routine of the target netdev though. This should look
> a lot like the macvlan accelerated xmit device path without the
> switching logic.
> 
> Of course maybe the name would be extended to dev_queue_xmit_extended()
> or something.
> 
> So the flow on ingress would be,
> 
>   1. pkt_received_by_PF_netdev
>   2. PF_netdev reads some tag off packet/descriptor and sets correct
>  skb->dev field. This is needed so stack "sees" packets from
>  correct VF ports.
>   3. packet passed up to stack.
> 
> I guess it is a bit "zombie" like on the receive path because the packet
> is never actually handled by VF netdev code per se and on egress can
> traverse both the VFR and PF netdevs qdiscs. But on the other hand the
> VFR netdevs and PF netdevs are all in the same driver. Plus using a
> queue per VFR is a bit of a waste as it's not needed and also hardware
> may not have any mechanism to push VF traffic onto a rx queue.
> 
> On egress,
> 
>   1. VFR xmit is called
>   2. VFR xmit calls dev_queue_xmit_accel() with some meta-data if needed
>  for the lower netdev
>   3. lower netdev sends out the packet.
> 
> Again we don't need to waste any queues for each VFR and the VFR can be
> a LLTX device. In this scheme I think you avoid much of the changes in
> your patch and keep it all contained in the driver. Any thoughts?

Goes without saying that you have a much better understanding of packet
scheduling so please bear with me :)  My target model is that I have
n_cpus x "n_tc/prio" queues on the PF and I want to transmit the
fallback traffic over those same queues.  So no new HW queues are used
for VFRs at all.  This is a reverse of macvlan offload which AFAICT has
"bastard hw queues" which actually TX for a separate software device.

My understanding was that I can rework this model to have software
queues for VFRs (#sw queues == #PF queues + #VFRs) but no extra HW
queues (#hw queues == #PF queues) but then when the driver sees a
packet on sw-only VFR queue it has to pick one of the PF queues (which
one?), lock PF software queue to own it, and only then can it
transmit.  With the dst_metadata there is no need for extra locking or
queue selection.

> To address 'I wonder if the solution can be done without this lower
> netdev' I think it can be but it creates two issues which I'm not sure
> have a good solution.
> 
> Without a lowerdev we either (a) give each VFR its own queue which I
> don't like because it complicates mgmt and uses resources or (b) we
> implicitly share queues. The latter could be fine it just looks a bit
> cleaner IMO to make it explicit.
> 
> With regard to VF-PF flow rules if we allow matching on ingress port
> then can all your flow rules be pushed through the PF netdevices or
> if you want any of the VFR netdevs? After all I expect the flow rule
> table is actually a shared resource between all attached ports.

Re: [PATCH V3 1/3] Documentation: devicetree: add qca8k binding

2016-09-23 Thread Rob Herring

On Thu, Sep 15, 2016 at 04:26:39PM +0200, John Crispin wrote:
> Add device-tree binding for ar8xxx switch families.
> 
> Cc: devicet...@vger.kernel.org
> Signed-off-by: John Crispin 
> ---
> Changes in V2
> * fixup example to include phy nodes and corresponding phandles
> * add a note explaining why we need to phy nodes
> 
> Changes in V3
> * add note stating that the cpu port is always 0
> 
>  .../devicetree/bindings/net/dsa/qca8k.txt  |   89 
> 
>  1 file changed, 89 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/dsa/qca8k.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/dsa/qca8k.txt 
> b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
> new file mode 100644
> index 000..9c67ee4
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/dsa/qca8k.txt
> @@ -0,0 +1,89 @@
> +* Qualcomm Atheros QCA8xxx switch family
> +
> +Required properties:
> +
> +- compatible: should be "qca,qca8337"
> +- #size-cells: must be 0
> +- #address-cells: must be 1
> +
> +Subnodes:
> +
> +The integrated switch subnode should be specified according to the binding
> +described in dsa/dsa.txt. As the QCA8K switches do not have a N:N mapping of
> +port and PHY id, each subnode describing a port needs to have a valid phandle
> +referencing the internal PHY connected to it. The CPU port of this switch is
> +always port 0.
> +
> +Example:
> +
> +
> +  {
> + phy_port1: phy@0 {
> + reg = <0>;
> + };
> +
> + phy_port2: phy@1 {
> + reg = <1>;
> + };
> +
> + phy_port3: phy@2 {
> + reg = <2>;
> + };
> +
> + phy_port4: phy@3 {
> + reg = <3>;
> + };
> +
> + phy_port5: phy@4 {
> + reg = <4>;
> + };
> +
> + switch0@0 {

The unit address here is the mdio device address and should be unique. 
You have 2 devices at 0.

> + compatible = "qca,qca8337";
> + #address-cells = <1>;
> + #size-cells = <0>;
> +
> + reg = <0>;

Not documented.

> +
> + ports {
> + #address-cells = <1>;
> + #size-cells = <0>;
> + port@0 {
> + reg = <0>;
> + label = "cpu";
> + ethernet = <>;
> + phy-mode = "rgmii";
> + };
> +
> + port@1 {
> + reg = <1>;
> + label = "lan1";
> + phy-handle = <&phy_port1>;
> + };
> +
> + port@2 {
> + reg = <2>;
> + label = "lan2";
> + phy-handle = <&phy_port2>;
> + };
> +
> + port@3 {
> + reg = <3>;
> + label = "lan3";
> + phy-handle = <&phy_port3>;
> + };
> +
> + port@4 {
> + reg = <4>;
> + label = "lan4";
> + phy-handle = <&phy_port4>;
> + };
> +
> + port@5 {
> + reg = <5>;
> + label = "wan";
> + phy-handle = <&phy_port5>;
> + };
> + };
> + };
> + };
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

[PATCH net-next 10/15] rxrpc: Don't call the tx_ack tracepoint if don't generate an ACK

2016-09-23 Thread David Howells

rxrpc_send_call_packet() is invoking the tx_ack tracepoint before it checks
whether there's an ACK to transmit (another thread may jump in and transmit
it).

Fix this by only invoking the tracepoint if we get a valid ACK to transmit.

Further, only allocate a serial number if we're going to actually transmit
something.

Signed-off-by: David Howells 
---

 net/rxrpc/output.c |   26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 282cb1e36d06..5c1e008a5323 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -80,9 +80,6 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_call *call,
pkt->ackinfo.rwind  = htonl(call->rx_winsize);
pkt->ackinfo.jumbo_max  = htonl(jmax);
 
-   trace_rxrpc_tx_ack(call, hard_ack + 1, serial, call->ackr_reason,
-  top - hard_ack);
-
*ackp++ = 0;
*ackp++ = 0;
*ackp++ = 0;
@@ -119,8 +116,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
return -ENOMEM;
}
 
-   serial = atomic_inc_return(>serial);
-
msg.msg_name= >peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
msg.msg_control = NULL;
@@ -131,7 +126,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
pkt->whdr.cid   = htonl(call->cid);
pkt->whdr.callNumber= htonl(call->call_id);
pkt->whdr.seq   = 0;
-   pkt->whdr.serial= htonl(serial);
pkt->whdr.type  = type;
pkt->whdr.flags = conn->out_clientflag;
pkt->whdr.userStatus= 0;
@@ -157,14 +151,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 
type)
 
spin_unlock_bh(>lock);
 
-   _proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
-  serial,
-  ntohs(pkt->ack.maxSkew),
-  ntohl(pkt->ack.firstPacket),
-  ntohl(pkt->ack.previousPacket),
-  ntohl(pkt->ack.serial),
-  rxrpc_acks(pkt->ack.reason),
-  pkt->ack.nAcks);
 
iov[0].iov_len += sizeof(pkt->ack) + n;
iov[1].iov_base = >ackinfo;
@@ -176,7 +162,6 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 type)
case RXRPC_PACKET_TYPE_ABORT:
abort_code = call->abort_code;
pkt->abort_code = htonl(abort_code);
-   _proto("Tx ABORT %%%u { %d }", serial, abort_code);
iov[0].iov_len += sizeof(pkt->abort_code);
len += sizeof(pkt->abort_code);
ioc = 1;
@@ -188,6 +173,17 @@ int rxrpc_send_call_packet(struct rxrpc_call *call, u8 
type)
goto out;
}
 
+   serial = atomic_inc_return(>serial);
+   pkt->whdr.serial = htonl(serial);
+   switch (type) {
+   case RXRPC_PACKET_TYPE_ACK:
+   trace_rxrpc_tx_ack(call,
+  ntohl(pkt->ack.firstPacket),
+  ntohl(pkt->ack.serial),
+  pkt->ack.reason, pkt->ack.nAcks);
+   break;
+   }
+
if (ping) {
call->ackr_ping = serial;
smp_wmb();

[PATCH net-next 03/15] rxrpc: Make sure sendmsg() is woken on call completion

2016-09-23 Thread David Howells

Make sure that sendmsg() gets woken up if the call it is waiting for
completes abnormally.

Signed-off-by: David Howells 
---

 net/rxrpc/ar-internal.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index b13754a6dd7a..808ab750dc6b 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -758,6 +758,7 @@ static inline bool __rxrpc_set_call_completion(struct 
rxrpc_call *call,
call->error = error;
call->completion = compl,
call->state = RXRPC_CALL_COMPLETE;
+   wake_up(>waitq);
return true;
}
return false;

[PATCH net-next 05/15] rxrpc: Use before_eq() and friends to compare serial numbers

2016-09-23 Thread David Howells

before_eq() and friends should be used to compare serial numbers (when not
checking for (non)equality) rather than casting to int, subtracting and
checking the result.

Signed-off-by: David Howells 
---

 net/rxrpc/input.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index cbb5d53f09d7..06027b6d9c19 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -578,7 +578,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct 
sk_buff *skb,
}
 
/* Discard any out-of-order or duplicate ACKs. */
-   if ((int)sp->hdr.serial - (int)call->acks_latest <= 0) {
+   if (before_eq(sp->hdr.serial, call->acks_latest)) {
_debug("discard ACK %d <= %d",
   sp->hdr.serial, call->acks_latest);
return;

[PATCH net-next 11/15] rxrpc: Add a tracepoint for the call timer

2016-09-23 Thread David Howells

Add a tracepoint to log call timer initiation, setting and expiry.

Signed-off-by: David Howells 
---

 include/trace/events/rxrpc.h |   36 
 net/rxrpc/ar-internal.h  |   13 -
 net/rxrpc/call_event.c   |7 ---
 net/rxrpc/call_object.c  |6 --
 net/rxrpc/misc.c |8 
 net/rxrpc/sendmsg.c  |2 +-
 6 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index e8f2afbbe0bf..57322897d745 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -414,6 +414,42 @@ TRACE_EVENT(rxrpc_rtt_rx,
  __entry->avg)
);
 
+TRACE_EVENT(rxrpc_timer,
+   TP_PROTO(struct rxrpc_call *call, enum rxrpc_timer_trace why,
+unsigned long now),
+
+   TP_ARGS(call, why, now),
+
+   TP_STRUCT__entry(
+   __field(struct rxrpc_call *,call
)
+   __field(enum rxrpc_timer_trace, why 
)
+   __field(unsigned long,  now 
)
+   __field(unsigned long,  expire_at   
)
+   __field(unsigned long,  ack_at  
)
+   __field(unsigned long,  resend_at   
)
+   __field(unsigned long,  timer   
)
+),
+
+   TP_fast_assign(
+   __entry->call   = call;
+   __entry->why= why;
+   __entry->now= now;
+   __entry->expire_at  = call->expire_at;
+   __entry->ack_at = call->ack_at;
+   __entry->resend_at  = call->resend_at;
+   __entry->timer  = call->timer.expires;
+  ),
+
+   TP_printk("c=%p %s now=%lx x=%ld a=%ld r=%ld t=%ld",
+ __entry->call,
+ rxrpc_timer_traces[__entry->why],
+ __entry->now,
+ __entry->expire_at - __entry->now,
+ __entry->ack_at - __entry->now,
+ __entry->resend_at - __entry->now,
+ __entry->timer - __entry->now)
+   );
+
 #endif /* _TRACE_RXRPC_H */
 
 /* This part must be outside protection */
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index a494d56eb236..e564eca75985 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -678,6 +678,17 @@ enum rxrpc_rtt_rx_trace {
 
 extern const char rxrpc_rtt_rx_traces[rxrpc_rtt_rx__nr_trace][5];
 
+enum rxrpc_timer_trace {
+   rxrpc_timer_begin,
+   rxrpc_timer_expired,
+   rxrpc_timer_set_for_ack,
+   rxrpc_timer_set_for_resend,
+   rxrpc_timer_set_for_send,
+   rxrpc_timer__nr_trace
+};
+
+extern const char rxrpc_timer_traces[rxrpc_timer__nr_trace][8];
+
 extern const char *const rxrpc_pkts[];
 extern const char *rxrpc_acks(u8 reason);
 
@@ -707,7 +718,7 @@ int rxrpc_reject_call(struct rxrpc_sock *);
 /*
  * call_event.c
  */
-void rxrpc_set_timer(struct rxrpc_call *);
+void rxrpc_set_timer(struct rxrpc_call *, enum rxrpc_timer_trace);
 void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool);
 void rxrpc_process_call(struct work_struct *);
 
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 8bc5c8e37ab4..90e970ba048a 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -24,7 +24,7 @@
 /*
  * Set the timer
  */
-void rxrpc_set_timer(struct rxrpc_call *call)
+void rxrpc_set_timer(struct rxrpc_call *call, enum rxrpc_timer_trace why)
 {
unsigned long t, now = jiffies;
 
@@ -45,6 +45,7 @@ void rxrpc_set_timer(struct rxrpc_call *call)
 
if (call->timer.expires != t || !timer_pending(>timer)) {
mod_timer(>timer, t);
+   trace_rxrpc_timer(call, why, now);
}
}
 
@@ -120,7 +121,7 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 
ack_reason,
_debug("deferred ACK %ld < %ld", expiry, call->ack_at - now);
if (time_before(ack_at, call->ack_at)) {
call->ack_at = ack_at;
-   rxrpc_set_timer(call);
+   rxrpc_set_timer(call, rxrpc_timer_set_for_ack);
}
}
 }
@@ -293,7 +294,7 @@ recheck_state:
goto recheck_state;
}
 
-   rxrpc_set_timer(call);
+   rxrpc_set_timer(call, rxrpc_timer_set_for_resend);
 
/* other events may have been raised since we started checking */
if (call->events && call->state < RXRPC_CALL_COMPLETE) {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index f2fadf667e19..a53f4c2c0025 100644

[PATCH net-next 09/15] rxrpc: Pass the last Tx packet marker in the annotation buffer

2016-09-23 Thread David Howells

When the last packet of data to be transmitted on a call is queued, tx_top
is set and then the RXRPC_CALL_TX_LAST flag is set.  Unfortunately, this
leaves a race in the ACK processing side of things because the flag affects
the interpretation of tx_top and also allows us to start receiving reply
data before we've finished transmitting.

To fix this, make the following changes:

 (1) rxrpc_queue_packet() now sets a marker in the annotation buffer
 instead of setting the RXRPC_CALL_TX_LAST flag.

 (2) rxrpc_rotate_tx_window() detects the marker and sets the flag in the
 same context as the routines that use it.

 (3) rxrpc_end_tx_phase() is simplified to just shift the call state.
 The Tx window must have been rotated before calling to discard the
 last packet.

 (4) rxrpc_receiving_reply() is added to handle the arrival of the first
 DATA packet of a reply to a client call (which is an implicit ACK of
 the Tx phase).

 (5) The last part of rxrpc_input_ack() is reordered to perform Tx
 rotation, then soft-ACK application and then to end the phase if we've
 rotated the last packet.  In the event of a terminal ACK, the soft-ACK
 application will be skipped as nAcks should be 0.

 (6) rxrpc_input_ackall() now has to rotate as well as ending the phase.

In addition:

 (7) Alter the transmit tracepoint to log the rotation of the last packet.

 (8) Remove the no-longer relevant queue_reqack tracepoint note.  The
 ACK-REQUESTED packet header flag is now set as needed when we actually
 transmit the packet and may vary by retransmission.

Signed-off-by: David Howells 
---

 net/rxrpc/ar-internal.h |7 ++-
 net/rxrpc/input.c   |  102 +++
 net/rxrpc/misc.c|3 +
 net/rxrpc/sendmsg.c |   14 +++---
 4 files changed, 81 insertions(+), 45 deletions(-)

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 9e3ba4dc9578..a494d56eb236 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -508,7 +508,9 @@ struct rxrpc_call {
 #define RXRPC_TX_ANNO_NAK  2
 #define RXRPC_TX_ANNO_RETRANS  3
 #define RXRPC_TX_ANNO_MASK 0x03
-#define RXRPC_TX_ANNO_RESENT   0x04
+#define RXRPC_TX_ANNO_LAST 0x04
+#define RXRPC_TX_ANNO_RESENT   0x08
+
 #define RXRPC_RX_ANNO_JUMBO0x3f/* Jumbo subpacket number + 1 
if not zero */
 #define RXRPC_RX_ANNO_JLAST0x40/* Set if last element of a 
jumbo packet */
 #define RXRPC_RX_ANNO_VERIFIED 0x80/* Set if verified and 
decrypted */
@@ -621,9 +623,10 @@ extern const char 
rxrpc_call_traces[rxrpc_call__nr_trace][4];
 enum rxrpc_transmit_trace {
rxrpc_transmit_wait,
rxrpc_transmit_queue,
-   rxrpc_transmit_queue_reqack,
rxrpc_transmit_queue_last,
rxrpc_transmit_rotate,
+   rxrpc_transmit_rotate_last,
+   rxrpc_transmit_await_reply,
rxrpc_transmit_end,
rxrpc_transmit__nr_trace
 };
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index d3d69ab1f0a1..fb3e2f6afa3b 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -59,6 +59,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, 
rxrpc_seq_t to)
 {
struct sk_buff *skb, *list = NULL;
int ix;
+   u8 annotation;
 
spin_lock(>lock);
 
@@ -66,16 +67,22 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, 
rxrpc_seq_t to)
call->tx_hard_ack++;
ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK;
skb = call->rxtx_buffer[ix];
+   annotation = call->rxtx_annotations[ix];
rxrpc_see_skb(skb, rxrpc_skb_tx_rotated);
call->rxtx_buffer[ix] = NULL;
call->rxtx_annotations[ix] = 0;
skb->next = list;
list = skb;
+
+   if (annotation & RXRPC_TX_ANNO_LAST)
+   set_bit(RXRPC_CALL_TX_LAST, >flags);
}
 
spin_unlock(>lock);
 
-   trace_rxrpc_transmit(call, rxrpc_transmit_rotate);
+   trace_rxrpc_transmit(call, (test_bit(RXRPC_CALL_TX_LAST, >flags) ?
+   rxrpc_transmit_rotate_last :
+   rxrpc_transmit_rotate));
wake_up(>waitq);
 
while (list) {
@@ -92,42 +99,65 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, 
rxrpc_seq_t to)
  * This occurs when we get an ACKALL packet, the first DATA packet of a reply,
  * or a final ACK packet.
  */
-static bool rxrpc_end_tx_phase(struct rxrpc_call *call, const char *abort_why)
+static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
+  const char *abort_why)
 {
-   _enter("");
-
-   switch (call->state) {
-   case RXRPC_CALL_CLIENT_RECV_REPLY:
-   return true;
-   case RXRPC_CALL_CLIENT_AWAIT_REPLY:
-   case RXRPC_CALL_SERVER_AWAIT_ACK:
-   break;
-

[PATCH net-next 00/15] rxrpc: Bug fixes and tracepoints

2016-09-23 Thread David Howells


Here are a bunch of bug fixes:

 (1) Need to set the timestamp on a Tx packet before queueing it to avoid
 trouble with the retransmission function.

 (2) Don't send an ACK at the end of the service reply transmission; it's
 the responsibility of the client to send an ACK to close the call.
 The service can resend the last DATA packet or send a PING ACK.

 (3) Wake sendmsg() on abnormal call termination.

 (4) Use ktime_add_ms() not ktime_add_ns() to add millisecond offsets.

 (5) Use before_eq() & co. to compare serial numbers (which may wrap).

 (6) Start the resend timer on DATA packet transmission.

 (7) Don't accidentally cancel a retransmission upon receiving a NACK.

 (8) Fix the call timer setting function to deal with timeouts that are now
 or past.

 (9) Don't use a flag to communicate the presence of the last packet in the
 Tx buffer from sendmsg to the input routines where ACK and DATA
 reception is handled.  The problem is that there's a window between
 queueing the last packet for transmission and setting the flag in
 which ACKs or reply DATA packets can arrive, causing apparent state
 machine violation issues.

 Instead use the annotation buffer to mark the last packet and pick up
 and set the flag in the input routines.

(10) Don't call the tx_ack tracepoint and don't allocate a serial number if
 someone else nicked the ACK we were about to transmit.

There are also new tracepoints and one altered tracepoint used to track
down the above bugs:

(11) Call timer tracepoint.

(12) Data Tx tracepoint (and adjustments to ACK tracepoint).

(13) Injected Rx packet loss tracepoint.

(14) Ack proposal tracepoint.

(15) Retransmission selection tracepoint.

The patches can be found here also:


http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-rewrite

Tagged thusly:

git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
rxrpc-rewrite-20160923

David
---
David Howells (15):
  rxrpc: Preset timestamp on Tx sk_buffs
  rxrpc: Don't send an ACK at the end of service call response transmission
  rxrpc: Make sure sendmsg() is woken on call completion
  rxrpc: Should be using ktime_add_ms() not ktime_add_ns()
  rxrpc: Use before_eq() and friends to compare serial numbers
  rxrpc: Need to start the resend timer on initial transmission
  rxrpc: Fix accidental cancellation of scheduled resend by ACK parser
  rxrpc: Fix call timer
  rxrpc: Pass the last Tx packet marker in the annotation buffer
  rxrpc: Don't call the tx_ack tracepoint if don't generate an ACK
  rxrpc: Add a tracepoint for the call timer
  rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint
  rxrpc: Add a tracepoint to log injected Rx packet loss
  rxrpc: Add tracepoint for ACK proposal
  rxrpc: Add a tracepoint to log which packets will be retransmitted


 include/rxrpc/packet.h   |1 
 include/trace/events/rxrpc.h |  174 --
 net/rxrpc/ar-internal.h  |   45 ++-
 net/rxrpc/call_event.c   |   57 --
 net/rxrpc/call_object.c  |8 +-
 net/rxrpc/conn_event.c   |5 -
 net/rxrpc/input.c|  136 +
 net/rxrpc/misc.c |   41 +++---
 net/rxrpc/output.c   |   32 
 net/rxrpc/recvmsg.c  |5 -
 net/rxrpc/sendmsg.c  |   28 +--
 11 files changed, 405 insertions(+), 127 deletions(-)

[PATCH net-next 12/15] rxrpc: Add data Tx tracepoint and adjust Tx ACK tracepoint

2016-09-23 Thread David Howells

Add a tracepoint to log transmission of DATA packets (including loss
injection).

Adjust the ACK transmission tracepoint to include the packet serial number
and to line this up with the DATA transmission display.

Signed-off-by: David Howells 
---

 include/trace/events/rxrpc.h |   50 +++---
 net/rxrpc/conn_event.c   |5 ++--
 net/rxrpc/output.c   |5 +++-
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 57322897d745..6001bf93dc79 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -256,33 +256,67 @@ TRACE_EVENT(rxrpc_rx_ack,
  __entry->n_acks)
);
 
+TRACE_EVENT(rxrpc_tx_data,
+   TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq,
+rxrpc_serial_t serial, u8 flags, bool lose),
+
+   TP_ARGS(call, seq, serial, flags, lose),
+
+   TP_STRUCT__entry(
+   __field(struct rxrpc_call *,call)
+   __field(rxrpc_seq_t,seq )
+   __field(rxrpc_serial_t, serial  )
+   __field(u8, flags   )
+   __field(bool,   lose)
+),
+
+   TP_fast_assign(
+   __entry->call = call;
+   __entry->seq = seq;
+   __entry->serial = serial;
+   __entry->flags = flags;
+   __entry->lose = lose;
+  ),
+
+   TP_printk("c=%p DATA %08x q=%08x fl=%02x%s",
+ __entry->call,
+ __entry->serial,
+ __entry->seq,
+ __entry->flags,
+ __entry->lose ? " *LOSE*" : "")
+   );
+
 TRACE_EVENT(rxrpc_tx_ack,
-   TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t first,
-rxrpc_serial_t serial, u8 reason, u8 n_acks),
+   TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial,
+rxrpc_seq_t ack_first, rxrpc_serial_t ack_serial,
+u8 reason, u8 n_acks),
 
-   TP_ARGS(call, first, serial, reason, n_acks),
+   TP_ARGS(call, serial, ack_first, ack_serial, reason, n_acks),
 
TP_STRUCT__entry(
__field(struct rxrpc_call *,call)
-   __field(rxrpc_seq_t,first   )
__field(rxrpc_serial_t, serial  )
+   __field(rxrpc_seq_t,ack_first   )
+   __field(rxrpc_serial_t, ack_serial  )
__field(u8, reason  )
__field(u8, n_acks  )
 ),
 
TP_fast_assign(
__entry->call = call;
-   __entry->first = first;
__entry->serial = serial;
+   __entry->ack_first = ack_first;
+   __entry->ack_serial = ack_serial;
__entry->reason = reason;
__entry->n_acks = n_acks;
   ),
 
-   TP_printk("c=%p %s f=%08x r=%08x n=%u",
+   TP_printk(" c=%p ACK  %08x %s f=%08x r=%08x n=%u",
  __entry->call,
- rxrpc_acks(__entry->reason),
- __entry->first,
  __entry->serial,
+ rxrpc_acks(__entry->reason),
+ __entry->ack_first,
+ __entry->ack_serial,
  __entry->n_acks)
);
 
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 75a15a4c74c3..a1cf1ec5f29e 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -98,9 +98,6 @@ static void rxrpc_conn_retransmit_call(struct 
rxrpc_connection *conn,
pkt.info.rwind  = htonl(rxrpc_rx_window_size);
pkt.info.jumbo_max  = htonl(rxrpc_rx_jumbo_max);
len += sizeof(pkt.ack) + sizeof(pkt.info);
-
-   trace_rxrpc_tx_ack(NULL, chan->last_seq, 0,
-  RXRPC_ACK_DUPLICATE, 0);
break;
}
 
@@ -122,6 +119,8 @@ static void rxrpc_conn_retransmit_call(struct 
rxrpc_connection *conn,
_proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
break;
case RXRPC_PACKET_TYPE_ACK:
+   trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
+  RXRPC_ACK_DUPLICATE, 0);
_proto("Tx ACK %%%u [re]", serial);
break;
}
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c

[PATCH net-next 01/15] rxrpc: Preset timestamp on Tx sk_buffs

2016-09-23 Thread David Howells

Set the timestamp on sk_buffs holding packets to be transmitted before
queueing them because the moment the packet is on the queue it can be seen
by the retransmission algorithm - which may see a completely random
timestamp.

If the retransmission algorithm sees such a timestamp, it may retransmit
the packet and, in future, tell the congestion management algorithm that
the retransmit timer expired.

Signed-off-by: David Howells 
---

 net/rxrpc/sendmsg.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index ca7c3be60ad2..ca3811bfbd17 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -99,6 +99,11 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, 
struct sk_buff *skb,
 
ASSERTCMP(seq, ==, call->tx_top + 1);
 
+   /* We have to set the timestamp before queueing as the retransmit
+* algorithm can see the packet as soon as we queue it.
+*/
+   skb->tstamp = ktime_get_real();
+
ix = seq & RXRPC_RXTX_BUFF_MASK;
rxrpc_get_skb(skb, rxrpc_skb_tx_got);
call->rxtx_annotations[ix] = RXRPC_TX_ANNO_UNACK;

[PATCH net-next 13/15] rxrpc: Add a tracepoint to log injected Rx packet loss

2016-09-23 Thread David Howells

Add a tracepoint to log received packets that get discarded due to Rx
packet loss.

Signed-off-by: David Howells 
---

 include/trace/events/rxrpc.h |   21 +
 net/rxrpc/input.c|   11 +--
 2 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 6001bf93dc79..9413b17ba04b 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -484,6 +484,27 @@ TRACE_EVENT(rxrpc_timer,
  __entry->timer - __entry->now)
);
 
+TRACE_EVENT(rxrpc_rx_lose,
+   TP_PROTO(struct rxrpc_skb_priv *sp),
+
+   TP_ARGS(sp),
+
+   TP_STRUCT__entry(
+   __field_struct(struct rxrpc_host_header,hdr 
)
+),
+
+   TP_fast_assign(
+   memcpy(&__entry->hdr, >hdr, sizeof(__entry->hdr));
+  ),
+
+   TP_printk("%08x:%08x:%08x:%04x %08x %08x %02x %02x %s *LOSE*",
+ __entry->hdr.epoch, __entry->hdr.cid,
+ __entry->hdr.callNumber, __entry->hdr.serviceId,
+ __entry->hdr.serial, __entry->hdr.seq,
+ __entry->hdr.type, __entry->hdr.flags,
+ __entry->hdr.type <= 15 ? rxrpc_pkts[__entry->hdr.type] : 
"?UNK")
+   );
+
 #endif /* _TRACE_RXRPC_H */
 
 /* This part must be outside protection */
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index fb3e2f6afa3b..19b1e189f5dc 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -837,20 +837,19 @@ void rxrpc_data_ready(struct sock *udp_sk)
skb_orphan(skb);
sp = rxrpc_skb(skb);
 
+   /* dig out the RxRPC connection details */
+   if (rxrpc_extract_header(sp, skb) < 0)
+   goto bad_message;
+
if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
static int lose;
if ((lose++ & 7) == 7) {
+   trace_rxrpc_rx_lose(sp);
rxrpc_lose_skb(skb, rxrpc_skb_rx_lost);
return;
}
}
 
-   _net("Rx UDP packet from %08x:%04hu",
-ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
-
-   /* dig out the RxRPC connection details */
-   if (rxrpc_extract_header(sp, skb) < 0)
-   goto bad_message;
trace_rxrpc_rx_packet(sp);
 
_net("Rx RxRPC %s ep=%x call=%x:%x",

1 2 3 >

1 - 100 of 214 matches

Mail list logo