from:"Eric Dumazet"

Re: [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure

2024-02-29 Thread Eric Dumazet

On Thu, Feb 29, 2024 at 3:53 PM Eric Dumazet  wrote:
>
> On Thu, Feb 29, 2024 at 3:47 PM Jakub Kicinski  wrote:
> >
> > On Thu, 29 Feb 2024 09:55:22 +0100 Eric Dumazet wrote:
> > > I do not see other solution than this, otherwise we have to add more
> > > pollution to include/linux/netdevice.h
> >
> > Right :(
> >
> > > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > > index 
> > > a9c973b92294bb110cf3cd336485972127b01b58..40797ea80bc6273cae6b7773d0a3e47459a72150
> > > 100644
> > > --- a/include/linux/netdevice.h
> > > +++ b/include/linux/netdevice.h
> > > @@ -2469,7 +2469,7 @@ struct net_device {
> > > struct devlink_port *devlink_port;
> > >
> > >  #if IS_ENABLED(CONFIG_DPLL)
> > > -   struct dpll_pin __rcu   *dpll_pin;
> > > +   void __rcu *dpll_pin;
> > >  #endif
> >
> > If DPLL wants to hide its type definitions the helpers must live
> > in dpll? IOW move netdev_dpll_pin() to drivers/dpll/dpll_core.c
>
> Oh for some reason I thought this stuff was a module.
>
> Otherwise, why having dpll 'core' helpers in net/core/dev.c

So, we would have something a bit convoluted like :

diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
index 
4c2bb27c99fe4e517b0d92c4ae3db83a679c7d11..241db366b2c74ae749f49612d86176b2f8f479c1
100644
--- a/drivers/dpll/dpll_core.c
+++ b/drivers/dpll/dpll_core.c
@@ -42,6 +42,11 @@ struct dpll_pin_registration {
void *priv;
 };

+struct dpll_pin *netdev_dpll_pin(const struct net_device *dev)
+{
+   return rcu_dereference_rtnl(dev->dpll_pin);
+}
+
 struct dpll_device *dpll_device_get_by_id(int id)
 {
if (xa_get_mark(_device_xa, id, DPLL_REGISTERED))
diff --git a/include/linux/dpll.h b/include/linux/dpll.h
index 
4ec2fe9caf5a3f284afd0cfe4fc7c2bf42cbbc60..c60591308ae80fb99aa5abb5832b9a228473a916
100644
--- a/include/linux/dpll.h
+++ b/include/linux/dpll.h
@@ -169,13 +169,13 @@ int dpll_device_change_ntf(struct dpll_device *dpll);

 int dpll_pin_change_ntf(struct dpll_pin *pin);

+#if !IS_ENABLED(CONFIG_DPLL)
 static inline struct dpll_pin *netdev_dpll_pin(const struct net_device *dev)
 {
-#if IS_ENABLED(CONFIG_DPLL)
-   return rcu_dereference_rtnl(dev->dpll_pin);
-#else
return NULL;
-#endif
 }
+#else
+struct dpll_pin *netdev_dpll_pin(const struct net_device *dev);
+#endif

 #endif

Re: [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure

2024-02-29 Thread Eric Dumazet

On Thu, Feb 29, 2024 at 3:47 PM Jakub Kicinski  wrote:
>
> On Thu, 29 Feb 2024 09:55:22 +0100 Eric Dumazet wrote:
> > I do not see other solution than this, otherwise we have to add more
> > pollution to include/linux/netdevice.h
>
> Right :(
>
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 
> > a9c973b92294bb110cf3cd336485972127b01b58..40797ea80bc6273cae6b7773d0a3e47459a72150
> > 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -2469,7 +2469,7 @@ struct net_device {
> > struct devlink_port *devlink_port;
> >
> >  #if IS_ENABLED(CONFIG_DPLL)
> > -   struct dpll_pin __rcu   *dpll_pin;
> > +   void __rcu *dpll_pin;
> >  #endif
>
> If DPLL wants to hide its type definitions the helpers must live
> in dpll? IOW move netdev_dpll_pin() to drivers/dpll/dpll_core.c

Oh for some reason I thought this stuff was a module.

Otherwise, why having dpll 'core' helpers in net/core/dev.c

Re: [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure

2024-02-29 Thread Eric Dumazet

On Thu, Feb 29, 2024 at 9:04 AM Tasmiya Nalatwad
 wrote:
>
> Greetings,
>
> I have tried the patch provided below. Moving struct to file
> "net/core/rtnetlink.c" is not resolving the problem. Please find the
> below traces.
>
> --- Traces ---
>
> In file included from ./include/linux/rbtree.h:24,
>   from ./include/linux/mm_types.h:11,
>   from ./include/linux/mmzone.h:22,
>   from ./include/linux/gfp.h:7,
>   from ./include/linux/umh.h:4,
>   from ./include/linux/kmod.h:9,
>   from ./include/linux/module.h:17,
>   from net/core/rtnetlink.c:17:
> net/core/rtnetlink.c: In function ‘netdev_dpll_pin’:
> ./include/linux/rcupdate.h:439:9: error: dereferencing pointer to
> incomplete type ‘struct dpll_pin’
>typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
>   ^
> ./include/linux/rcupdate.h:587:2: note: in expansion of macro
> ‘__rcu_dereference_check’
>__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
>^~~
> ./include/linux/rtnetlink.h:70:2: note: in expansion of macro
> ‘rcu_dereference_check’
>rcu_dereference_check(p, lockdep_rtnl_is_held())
>^
> net/core/rtnetlink.c:1059:15: note: in expansion of macro
> ‘rcu_dereference_rtnl’
>  return rcu_dereference_rtnl(dev->dpll_pin);
> ^~~~
>CC  crypto/algboss.o
> net/core/rtnetlink.c:1063:1: error: control reaches end of non-void
> function [-Werror=return-type]
>   }
>   ^
>CC  crypto/authenc.o
>CC  crypto/authencesn.o
>CC  crypto/af_alg.o
>CC  crypto/algif_hash.o
>CC  crypto/algif_skcipher.o
>CC  crypto/algif_rng.o
>CC  crypto/algif_aead.o
>AR  arch/powerpc/kernel/built-in.a
> cc1: some warnings being treated as errors
> make[4]: *** [scripts/Makefile.build:243: net/core/rtnetlink.o] Error 1
> make[4]: *** Waiting for unfinished jobs
>CC  lib/kobject_uevent.o
>AR  drivers/net/mdio/built-in.a
>AR  net/802/built-in.a
>AR  drivers/connector/built-in.a
>CC  lib/vsprintf.o
>AR  ipc/built-in.a
>AR  net/nsh/built-in.a
>CC  lib/dynamic_debug.o
> In file included from ./arch/powerpc/include/generated/asm/rwonce.h:1,
>   from ./include/linux/compiler.h:251,
>   from ./include/linux/instrumented.h:10,
>   from ./include/linux/uaccess.h:6,
>   from net/core/dev.c:71:
> net/core/dev.c: In function ‘netdev_dpll_pin_assign’:
> ./include/linux/rcupdate.h:462:36: error: dereferencing pointer to
> incomplete type ‘struct dpll_pin’
>   #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
>  ^~~~
> ./include/asm-generic/rwonce.h:55:33: note: in definition of macro
> ‘__WRITE_ONCE’
>*(volatile typeof(x) *)&(x) = (val);\
>   ^~~
> ./arch/powerpc/include/asm/barrier.h:76:2: note: in expansion of macro
> ‘WRITE_ONCE’
>WRITE_ONCE(*p, v);  \
>^~
> ./include/asm-generic/barrier.h:172:55: note: in expansion of macro
> ‘__smp_store_release’
>   #define smp_store_release(p, v) do { kcsan_release();
> __smp_store_release(p, v); } while (0)
> ^~~
> ./include/linux/rcupdate.h:503:3: note: in expansion of macro
> ‘smp_store_release’
> smp_store_release(, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
> ^
> ./include/linux/rcupdate.h:503:25: note: in expansion of macro
> ‘RCU_INITIALIZER’
> smp_store_release(, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
>   ^~~
> net/core/dev.c:9081:2: note: in expansion of macro ‘rcu_assign_pointer’
>rcu_assign_pointer(dev->dpll_pin, dpll_pin);
>^~
>
> On 2/28/24 20:13, Eric Dumazet wrote:
> > On Wed, Feb 28, 2024 at 3:07 PM Vadim Fedorenko
> >  wrote:
> >> On 28/02/2024 11:09, Tasmiya Nalatwad wrote:
> >>> Greetings,
> >>>
> >>> [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure
> >>>
> >>> Reverting below commit fixes the issue
> >>>
> >>> commit 0d60d8df6f493bb46bf5db40d39dd60a1bafdd4e
> >>>   dpll: rely on rcu for netdev_dpll_pin()
> >>>
> >>> --- Traces ---
> >>>
> >>> ./include/linux/dpll.h: In function ‘netdev_dpll_pin’:
> >>> ./include/linux/rcupdate.h:439:9: error: dereferencing pointer to
> >>> incomplete type ‘struct dpll_pin’
> >&g

Re: [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure

2024-02-28 Thread Eric Dumazet

On Wed, Feb 28, 2024 at 3:07 PM Vadim Fedorenko
 wrote:
>
> On 28/02/2024 11:09, Tasmiya Nalatwad wrote:
> > Greetings,
> >
> > [revert 0d60d8df6f49] [net/net-next] [6.8-rc5] Build Failure
> >
> > Reverting below commit fixes the issue
> >
> > commit 0d60d8df6f493bb46bf5db40d39dd60a1bafdd4e
> >  dpll: rely on rcu for netdev_dpll_pin()
> >
> > --- Traces ---
> >
> > ./include/linux/dpll.h: In function ‘netdev_dpll_pin’:
> > ./include/linux/rcupdate.h:439:9: error: dereferencing pointer to
> > incomplete type ‘struct dpll_pin’
> >typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
> >   ^
> > ./include/linux/rcupdate.h:587:2: note: in expansion of macro
> > ‘__rcu_dereference_check’
> >__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
> >^~~
> > ./include/linux/rtnetlink.h:70:2: note: in expansion of macro
> > ‘rcu_dereference_check’
> >rcu_dereference_check(p, lockdep_rtnl_is_held())
> >^
> > ./include/linux/dpll.h:175:9: note: in expansion of macro
> > ‘rcu_dereference_rtnl’
> >return rcu_dereference_rtnl(dev->dpll_pin);
> >   ^~~~
> > make[4]: *** [scripts/Makefile.build:243: drivers/dpll/dpll_core.o] Error 1
> > make[4]: *** Waiting for unfinished jobs
> >AR  net/mpls/built-in.a
> >AR  net/l3mdev/built-in.a
> > In file included from ./include/linux/rbtree.h:24,
> >   from ./include/linux/mm_types.h:11,
> >   from ./include/linux/mmzone.h:22,
> >   from ./include/linux/gfp.h:7,
> >   from ./include/linux/umh.h:4,
> >   from ./include/linux/kmod.h:9,
> >   from ./include/linux/module.h:17,
> >   from drivers/dpll/dpll_netlink.c:9:
> > ./include/linux/dpll.h: In function ‘netdev_dpll_pin’:
> > ./include/linux/rcupdate.h:439:9: error: dereferencing pointer to
> > incomplete type ‘struct dpll_pin’
> >typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
> >   ^
> > ./include/linux/rcupdate.h:587:2: note: in expansion of macro
> > ‘__rcu_dereference_check’
> >__rcu_dereference_check((p), __UNIQUE_ID(rcu), \
> >^~~
> > ./include/linux/rtnetlink.h:70:2: note: in expansion of macro
> > ‘rcu_dereference_check’
> >rcu_dereference_check(p, lockdep_rtnl_is_held())
> >^
> > ./include/linux/dpll.h:175:9: note: in expansion of macro
> > ‘rcu_dereference_rtnl’
> >return rcu_dereference_rtnl(dev->dpll_pin);
> >   ^~~~
> > make[4]: *** [scripts/Makefile.build:243: drivers/dpll/dpll_netlink.o]
> > Error 1
> > make[3]: *** [scripts/Makefile.build:481: drivers/dpll] Error 2
> > make[3]: *** Waiting for unfinished jobs
> > In file included from ./arch/powerpc/include/generated/asm/rwonce.h:1,
> >   from ./include/linux/compiler.h:251,
> >   from ./include/linux/instrumented.h:10,
> >   from ./include/linux/uaccess.h:6,
> >   from net/core/dev.c:71:
> > net/core/dev.c: In function ‘netdev_dpll_pin_assign’:
> > ./include/linux/rcupdate.h:462:36: error: dereferencing pointer to
> > incomplete type ‘struct dpll_pin’
> >   #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)
> >  ^~~~
> > ./include/asm-generic/rwonce.h:55:33: note: in definition of macro
> > ‘__WRITE_ONCE’
> >*(volatile typeof(x) *)&(x) = (val);\
> >   ^~~
> > ./arch/powerpc/include/asm/barrier.h:76:2: note: in expansion of macro
> > ‘WRITE_ONCE’
> >WRITE_ONCE(*p, v);  \
> >^~
> > ./include/asm-generic/barrier.h:172:55: note: in expansion of macro
> > ‘__smp_store_release’
> >   #define smp_store_release(p, v) do { kcsan_release();
> > __smp_store_release(p, v); } while (0)
> > ^~~
> > ./include/linux/rcupdate.h:503:3: note: in expansion of macro
> > ‘smp_store_release’
> > smp_store_release(, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
> > ^
> > ./include/linux/rcupdate.h:503:25: note: in expansion of macro
> > ‘RCU_INITIALIZER’
> > smp_store_release(, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
> >   ^~~
> > net/core/dev.c:9081:2: note: in expansion of macro ‘rcu_assign_pointer’
> >rcu_assign_pointer(dev->dpll_pin, dpll_pin);
> >^~
> > make[4]: *** [scripts/Makefile.build:243: net/core/dev.o] Error 1
> > make[4]: *** Waiting for unfinished jobs
> >AR  drivers/net/ethernet/built-in.a
> >AR  drivers/net/built-in.a
> >AR  net/dcb/built-in.a
> >AR  net/netlabel/built-in.a
> >AR  net/strparser/built-in.a
> >AR  net/handshake/built-in.a
> >GEN lib/test_fortify.log
> >AR  net/8021q/built-in.a
> >AR  net/nsh/built-in.a
> >AR  net/unix/built-in.a
> >CC  lib/string.o
> >AR

Re: [net-next PATCH v2 4/4] netdev: use napi_schedule bool instead of napi_schedule_prep/__napi_schedule

2023-10-09 Thread Eric Dumazet

On Sun, Oct 8, 2023 at 8:27 PM Christian Marangi  wrote:
>
> On Sun, Oct 08, 2023 at 09:08:41AM +0200, Eric Dumazet wrote:
> > On Fri, Oct 6, 2023 at 8:49 PM Christian Marangi  
> > wrote:
> > >
> > > On Thu, Oct 05, 2023 at 06:16:26PM +0200, Eric Dumazet wrote:
> > > > On Tue, Oct 3, 2023 at 8:36 PM Christian Marangi  
> > > > wrote:
> > > > >
> > > > > Replace if condition of napi_schedule_prep/__napi_schedule and use 
> > > > > bool
> > > > > from napi_schedule directly where possible.
> > > > >
> > > > > Signed-off-by: Christian Marangi 
> > > > > ---
> > > > >  drivers/net/ethernet/atheros/atlx/atl1.c | 4 +---
> > > > >  drivers/net/ethernet/toshiba/tc35815.c   | 4 +---
> > > > >  drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 4 +---
> > > > >  3 files changed, 3 insertions(+), 9 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c 
> > > > > b/drivers/net/ethernet/atheros/atlx/atl1.c
> > > > > index 02aa6fd8ebc2..a9014d7932db 100644
> > > > > --- a/drivers/net/ethernet/atheros/atlx/atl1.c
> > > > > +++ b/drivers/net/ethernet/atheros/atlx/atl1.c
> > > > > @@ -2446,7 +2446,7 @@ static int atl1_rings_clean(struct napi_struct 
> > > > > *napi, int budget)
> > > > >
> > > > >  static inline int atl1_sched_rings_clean(struct atl1_adapter* 
> > > > > adapter)
> > > > >  {
> > > > > -   if (!napi_schedule_prep(>napi))
> > > > > +   if (!napi_schedule(>napi))
> > > > > /* It is possible in case even the RX/TX ints are 
> > > > > disabled via IMR
> > > > >  * register the ISR bits are set anyway (but do not 
> > > > > produce IRQ).
> > > > >  * To handle such situation the napi functions used 
> > > > > to check is
> > > > > @@ -2454,8 +2454,6 @@ static inline int atl1_sched_rings_clean(struct 
> > > > > atl1_adapter* adapter)
> > > > >  */
> > > > > return 0;
> > > > >
> > > > > -   __napi_schedule(>napi);
> > > > > -
> > > > > /*
> > > > >  * Disable RX/TX ints via IMR register if it is
> > > > >  * allowed. NAPI handler must reenable them in same
> > > > > diff --git a/drivers/net/ethernet/toshiba/tc35815.c 
> > > > > b/drivers/net/ethernet/toshiba/tc35815.c
> > > > > index 14cf6ecf6d0d..a8b8a0e13f9a 100644
> > > > > --- a/drivers/net/ethernet/toshiba/tc35815.c
> > > > > +++ b/drivers/net/ethernet/toshiba/tc35815.c
> > > > > @@ -1436,9 +1436,7 @@ static irqreturn_t tc35815_interrupt(int irq, 
> > > > > void *dev_id)
> > > > > if (!(dmactl & DMA_IntMask)) {
> > > > > /* disable interrupts */
> > > > > tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
> > > > > -   if (napi_schedule_prep(>napi))
> > > > > -   __napi_schedule(>napi);
> > > > > -   else {
> > > > > +   if (!napi_schedule(>napi)) {
> > > > > printk(KERN_ERR "%s: interrupt taken in 
> > > > > poll\n",
> > > > >dev->name);
> > > > > BUG();
> > > >
> > > > Hmmm... could you also remove this BUG() ? I think this code path can 
> > > > be taken
> > > > if some applications are using busy polling.
> > > >
> > > > Or simply rewrite this with the traditional
> > > >
> > > > if (napi_schedule_prep(>napi)) {
> > > >/* disable interrupts */
> > > >tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
> > > > __napi_schedule(>napi);
> > > > }
> > > >
> > > >
> > >
> > > Mhhh is it safe to do so? I mean it seems very wrong to print a warning
> > > and BUG() instead of disabling the interrupt only if napi can be
> > > scheduled... Maybe is very old code? The more I see this the more I see
> > > problem... (randomly disabling the interrupt and then make the kernel
> > > die)
> >
> > I am pretty sure this BUG() can be hit these days with busy polling or
> > setting gro_flush_timeout.
> >
> > I wish we could remove these bugs before someone copy-paste them.
> >
> > Again, this is orthogonal, I might simply stop doing reviews if this
> > is not useful.
>
> They are very useful and thanks a lot for them! I'm asking these as to
> understand how to proceed. I have in queue 2 other series that depends
> on this and I'm just asking info on how to speedup the progress on this!
>
> Soo think I have to send v3 with the suggested change and BUG() dropped?
> Happy to do everything to fix and improve this series!

I think that your patch series is all about doing cleanups,
so I suggested adding another cleanup/fix,
and this can be done independently.

I doubt this matters, this code has probably not been used for quite a
long time...

Re: [net-next PATCH v2 4/4] netdev: use napi_schedule bool instead of napi_schedule_prep/__napi_schedule

2023-10-08 Thread Eric Dumazet

On Fri, Oct 6, 2023 at 8:49 PM Christian Marangi  wrote:
>
> On Thu, Oct 05, 2023 at 06:16:26PM +0200, Eric Dumazet wrote:
> > On Tue, Oct 3, 2023 at 8:36 PM Christian Marangi  
> > wrote:
> > >
> > > Replace if condition of napi_schedule_prep/__napi_schedule and use bool
> > > from napi_schedule directly where possible.
> > >
> > > Signed-off-by: Christian Marangi 
> > > ---
> > >  drivers/net/ethernet/atheros/atlx/atl1.c | 4 +---
> > >  drivers/net/ethernet/toshiba/tc35815.c   | 4 +---
> > >  drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 4 +---
> > >  3 files changed, 3 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c 
> > > b/drivers/net/ethernet/atheros/atlx/atl1.c
> > > index 02aa6fd8ebc2..a9014d7932db 100644
> > > --- a/drivers/net/ethernet/atheros/atlx/atl1.c
> > > +++ b/drivers/net/ethernet/atheros/atlx/atl1.c
> > > @@ -2446,7 +2446,7 @@ static int atl1_rings_clean(struct napi_struct 
> > > *napi, int budget)
> > >
> > >  static inline int atl1_sched_rings_clean(struct atl1_adapter* adapter)
> > >  {
> > > -   if (!napi_schedule_prep(>napi))
> > > +   if (!napi_schedule(>napi))
> > > /* It is possible in case even the RX/TX ints are 
> > > disabled via IMR
> > >  * register the ISR bits are set anyway (but do not 
> > > produce IRQ).
> > >  * To handle such situation the napi functions used to 
> > > check is
> > > @@ -2454,8 +2454,6 @@ static inline int atl1_sched_rings_clean(struct 
> > > atl1_adapter* adapter)
> > >  */
> > > return 0;
> > >
> > > -   __napi_schedule(>napi);
> > > -
> > > /*
> > >  * Disable RX/TX ints via IMR register if it is
> > >  * allowed. NAPI handler must reenable them in same
> > > diff --git a/drivers/net/ethernet/toshiba/tc35815.c 
> > > b/drivers/net/ethernet/toshiba/tc35815.c
> > > index 14cf6ecf6d0d..a8b8a0e13f9a 100644
> > > --- a/drivers/net/ethernet/toshiba/tc35815.c
> > > +++ b/drivers/net/ethernet/toshiba/tc35815.c
> > > @@ -1436,9 +1436,7 @@ static irqreturn_t tc35815_interrupt(int irq, void 
> > > *dev_id)
> > > if (!(dmactl & DMA_IntMask)) {
> > > /* disable interrupts */
> > > tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
> > > -   if (napi_schedule_prep(>napi))
> > > -   __napi_schedule(>napi);
> > > -   else {
> > > +   if (!napi_schedule(>napi)) {
> > > printk(KERN_ERR "%s: interrupt taken in poll\n",
> > >dev->name);
> > > BUG();
> >
> > Hmmm... could you also remove this BUG() ? I think this code path can be 
> > taken
> > if some applications are using busy polling.
> >
> > Or simply rewrite this with the traditional
> >
> > if (napi_schedule_prep(>napi)) {
> >/* disable interrupts */
> >tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
> > __napi_schedule(>napi);
> > }
> >
> >
>
> Mhhh is it safe to do so? I mean it seems very wrong to print a warning
> and BUG() instead of disabling the interrupt only if napi can be
> scheduled... Maybe is very old code? The more I see this the more I see
> problem... (randomly disabling the interrupt and then make the kernel
> die)

I am pretty sure this BUG() can be hit these days with busy polling or
setting gro_flush_timeout.

I wish we could remove these bugs before someone copy-paste them.

Again, this is orthogonal, I might simply stop doing reviews if this
is not useful.

Re: [net-next PATCH v2 3/4] netdev: replace napi_reschedule with napi_schedule

2023-10-08 Thread Eric Dumazet

On Fri, Oct 6, 2023 at 8:52 PM Christian Marangi  wrote:
>
> On Thu, Oct 05, 2023 at 06:41:03PM +0200, Eric Dumazet wrote:
> > On Thu, Oct 5, 2023 at 6:32 PM Jakub Kicinski  wrote:
> > >
> > > On Thu, 5 Oct 2023 18:11:56 +0200 Eric Dumazet wrote:
> > > > OK, but I suspect some users of napi_reschedule() might not be 
> > > > race-free...
> > >
> > > What's the race you're thinking of?
> >
> > This sort of thing... the race is in fl_starving() though...
> >
> > diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c
> > b/drivers/net/ethernet/chelsio/cxgb4/sge.c
> > index 98dd78551d89..b5ff2e1a9975 100644
> > --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
> > +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
> > @@ -4261,7 +4261,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
> >
> > if (fl_starving(adap, fl)) {
> > rxq = container_of(fl, struct sge_eth_rxq, 
> > fl);
> > -   if (napi_reschedule(>rspq.napi))
> > +   if (napi_schedule(>rspq.napi))
> > fl->starving++;
> > else
> > set_bit(id, s->starving_fl);
>
> Ehhh problem is that this is a simple rename so if any race is present,
> it's already there and not caused by this rename :(
>
> Don't know maybe this is out of scope and should be investigated with a
> bug report?
>
> Maybe this should be changed to prep/__schedule to prevent any kind of
> race? But doing so doesn't prevent any kind of ""starving""?
>

I gave a "Reviewed-by: Eric Dumazet ", meaning
your patch was ok for me.

My remark was orthogonal, I am not asking you to act on it ;)

Re: [net-next PATCH v2 3/4] netdev: replace napi_reschedule with napi_schedule

2023-10-05 Thread Eric Dumazet

On Thu, Oct 5, 2023 at 6:32 PM Jakub Kicinski  wrote:
>
> On Thu, 5 Oct 2023 18:11:56 +0200 Eric Dumazet wrote:
> > OK, but I suspect some users of napi_reschedule() might not be race-free...
>
> What's the race you're thinking of?

This sort of thing... the race is in fl_starving() though...

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c
b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 98dd78551d89..b5ff2e1a9975 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -4261,7 +4261,7 @@ static void sge_rx_timer_cb(struct timer_list *t)

if (fl_starving(adap, fl)) {
rxq = container_of(fl, struct sge_eth_rxq, fl);
-   if (napi_reschedule(>rspq.napi))
+   if (napi_schedule(>rspq.napi))
fl->starving++;
else
set_bit(id, s->starving_fl);

Re: [net-next PATCH v2 4/4] netdev: use napi_schedule bool instead of napi_schedule_prep/__napi_schedule

2023-10-05 Thread Eric Dumazet

On Tue, Oct 3, 2023 at 8:36 PM Christian Marangi  wrote:
>
> Replace if condition of napi_schedule_prep/__napi_schedule and use bool
> from napi_schedule directly where possible.
>
> Signed-off-by: Christian Marangi 
> ---
>  drivers/net/ethernet/atheros/atlx/atl1.c | 4 +---
>  drivers/net/ethernet/toshiba/tc35815.c   | 4 +---
>  drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 4 +---
>  3 files changed, 3 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c 
> b/drivers/net/ethernet/atheros/atlx/atl1.c
> index 02aa6fd8ebc2..a9014d7932db 100644
> --- a/drivers/net/ethernet/atheros/atlx/atl1.c
> +++ b/drivers/net/ethernet/atheros/atlx/atl1.c
> @@ -2446,7 +2446,7 @@ static int atl1_rings_clean(struct napi_struct *napi, 
> int budget)
>
>  static inline int atl1_sched_rings_clean(struct atl1_adapter* adapter)
>  {
> -   if (!napi_schedule_prep(>napi))
> +   if (!napi_schedule(>napi))
> /* It is possible in case even the RX/TX ints are disabled 
> via IMR
>  * register the ISR bits are set anyway (but do not produce 
> IRQ).
>  * To handle such situation the napi functions used to check 
> is
> @@ -2454,8 +2454,6 @@ static inline int atl1_sched_rings_clean(struct 
> atl1_adapter* adapter)
>  */
> return 0;
>
> -   __napi_schedule(>napi);
> -
> /*
>  * Disable RX/TX ints via IMR register if it is
>  * allowed. NAPI handler must reenable them in same
> diff --git a/drivers/net/ethernet/toshiba/tc35815.c 
> b/drivers/net/ethernet/toshiba/tc35815.c
> index 14cf6ecf6d0d..a8b8a0e13f9a 100644
> --- a/drivers/net/ethernet/toshiba/tc35815.c
> +++ b/drivers/net/ethernet/toshiba/tc35815.c
> @@ -1436,9 +1436,7 @@ static irqreturn_t tc35815_interrupt(int irq, void 
> *dev_id)
> if (!(dmactl & DMA_IntMask)) {
> /* disable interrupts */
> tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
> -   if (napi_schedule_prep(>napi))
> -   __napi_schedule(>napi);
> -   else {
> +   if (!napi_schedule(>napi)) {
> printk(KERN_ERR "%s: interrupt taken in poll\n",
>dev->name);
> BUG();

Hmmm... could you also remove this BUG() ? I think this code path can be taken
if some applications are using busy polling.

Or simply rewrite this with the traditional

if (napi_schedule_prep(>napi)) {
   /* disable interrupts */
   tc_writel(dmactl | DMA_IntMask, >DMA_Ctl);
__napi_schedule(>napi);
}



> diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c 
> b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
> index 23b5a0adcbd6..146bc7bd14fb 100644
> --- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
> +++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
> @@ -1660,9 +1660,7 @@ irqreturn_t iwl_pcie_irq_rx_msix_handler(int irq, void 
> *dev_id)
> IWL_DEBUG_ISR(trans, "[%d] Got interrupt\n", entry->entry);
>
> local_bh_disable();
> -   if (napi_schedule_prep(>napi))
> -   __napi_schedule(>napi);
> -   else
> +   if (!napi_schedule(>napi))
> iwl_pcie_clear_irq(trans, entry->entry);

Same remark here about twisted logic.

> local_bh_enable();
>
> --
> 2.40.1
>

Re: [net-next PATCH v2 3/4] netdev: replace napi_reschedule with napi_schedule

2023-10-05 Thread Eric Dumazet

On Tue, Oct 3, 2023 at 8:36 PM Christian Marangi  wrote:
>
> Now that napi_schedule return a bool, we can drop napi_reschedule that
> does the same exact function. The function comes from a very old commit
> bfe13f54f502 ("ibm_emac: Convert to use napi_struct independent of struct
> net_device") and the purpose is actually deprecated in favour of
> different logic.
>
> Convert every user of napi_reschedule to napi_schedule.
>
> Signed-off-by: Christian Marangi 
> Acked-by: Jeff Johnson  # ath10k
> Acked-by: Nick Child  # ibm
> Acked-by: Marc Kleine-Budde  # for can/dev/rx-offload.c

OK, but I suspect some users of napi_reschedule() might not be race-free...

Reviewed-by: Eric Dumazet

Re: [net-next PATCH v2 1/4] netdev: replace simple napi_schedule_prep/__napi_schedule to napi_schedule

2023-10-05 Thread Eric Dumazet

On Tue, Oct 3, 2023 at 8:36 PM Christian Marangi  wrote:
>
> Replace drivers that still use napi_schedule_prep/__napi_schedule
> with napi_schedule helper as it does the same exact check and call.
>
> Signed-off-by: Christian Marangi 

Reviewed-by: Eric Dumazet

Re: [net-next PATCH 2/4] netdev: make napi_schedule return bool on NAPI successful schedule

2023-10-03 Thread Eric Dumazet

On Mon, Oct 2, 2023 at 5:10 PM Christian Marangi  wrote:
>
> Change napi_schedule to return a bool on NAPI successful schedule. This
> might be useful for some driver to do additional step after a NAPI ahs

This might be useful for some drivers to do additional steps after a
NAPI has been scheduled.

> been scheduled.
>
> Signed-off-by: Christian Marangi 

Yeah, I guess you forgot to mention I suggested this patch ...

Reviewed-by: Eric Dumazet

Re: [PATCH v6 11/12] mm/vmalloc: Hugepage vmalloc mappings

2020-08-21 Thread Eric Dumazet

On 8/21/20 8:12 AM, Nicholas Piggin wrote:
> Support huge page vmalloc mappings. Config option HAVE_ARCH_HUGE_VMALLOC
> enables support on architectures that define HAVE_ARCH_HUGE_VMAP and
> supports PMD sized vmap mappings.
> 
> vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or
> larger, and fall back to small pages if that was unsuccessful.
> 
> Allocations that do not use PAGE_KERNEL prot are not permitted to use huge
> pages, because not all callers expect this (e.g., module allocations vs
> strict module rwx).
> 
> This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node
> POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
> 
> This can result in more internal fragmentation and memory overhead for a
> given allocation, an option nohugevmalloc is added to disable at boot.
> 
>

Thanks for working on this stuff, I tried something similar in the past,
but could not really do more than a hack.
( https://lkml.org/lkml/2016/12/21/285 )

Note that __init alloc_large_system_hash() is used at boot time,
when NUMA policy is spreading allocations over all NUMA nodes.

This means that on a dual node system, a hash table should be 50/50 spread.

With your patch, if a hashtable is exactly the size of one huge page,
the location of this hashtable will be not balanced, this might have some
unwanted impact.

Thanks !

Re: net-next branch fails to build on my P8 CI system

2019-11-12 Thread Eric Dumazet




On 11/11/19 9:58 PM, Abdul Haleem wrote:
> Greeting's
> 
> I see a build failure for net-next branch on my Power 8 system
> 
> 13:25:10 ERROR| [stderr] ./include/linux/u64_stats_sync.h: In function 
> 64_stats_read�:
> 13:25:10 ERROR| [stderr] ./include/linux/u64_stats_sync.h:80:2: warning: 
> passing argument 1 of 鈥榣ocal_read鈥� discards 鈥榗onst鈥� qualifier from pointer 
> target type [enabled by default]
> 13:25:10 ERROR| [stderr]   return local64_read(>v);
> 13:25:10 ERROR| [stderr]   ^
> 13:25:10 ERROR| [stderr] In file included from 
> ./include/asm-generic/local64.h:22:0,
> 13:25:10 ERROR| [stderr]  from 
> ./arch/powerpc/include/generated/asm/local64.h:1,
> 13:25:10 ERROR| [stderr]  from 
> ./include/linux/u64_stats_sync.h:72,
> 13:25:10 ERROR| [stderr]  from 
> ./include/linux/cgroup-defs.h:20,
> 13:25:10 ERROR| [stderr]  from ./include/linux/cgroup.h:28,
> 13:25:10 ERROR| [stderr]  from 
> ./include/linux/memcontrol.h:13,
> 13:25:10 ERROR| [stderr]  from ./include/linux/swap.h:9,
> 13:25:10 ERROR| [stderr]  from ./include/linux/suspend.h:5,
> 13:25:10 ERROR| [stderr]  from init/do_mounts.c:7:
> 13:25:10 ERROR| [stderr] ./arch/powerpc/include/asm/local.h:20:24: note: 
> expected 鈥榮truct local_t *鈥� but argument is of type 鈥榗onst struct local_t *鈥�
> 13:25:10 ERROR| [stderr]  static __inline__ long local_read(local_t *l)
> 13:25:10 ERROR| [stderr] ^
> 13:25:11 ERROR| [stderr] In file included from 
> ./include/linux/cgroup-defs.h:20:0,
> 13:25:11 ERROR| [stderr]  from ./include/linux/cgroup.h:28,
> 13:25:11 ERROR| [stderr]  from ./include/linux/hugetlb.h:9,
> 13:25:11 ERROR| [stderr]  from 
> arch/powerpc/kvm/book3s_64_vio_hv.c:16:
> 13:25:11 ERROR| [stderr] ./include/linux/u64_stats_sync.h: In function 
> 鈥榰64_stats_read鈥�:
> 13:25:11 ERROR| [stderr] ./include/linux/u64_stats_sync.h:80:2: error: 
> passing argument 1 of 鈥榣ocal_read鈥� discards 鈥榗onst鈥� qualifier from pointer 
> target type [-Werror]
> 13:25:11 ERROR| [stderr]   return local64_read(>v);
> 13:25:11 ERROR| [stderr]   ^
> 
> I see some recent code changes here
> 
> 9dfd871481c8e9c512938e9ce632beed645363e0 Merge branch 'u64_stats_t'
> fd2f4737870eb866537fbbffa2b59414b9b0c0a2 net: use u64_stats_t in struct
> pcpu_lstats
> 5260dd3ed1ff7eba39251b28977e4d8950e2f099 tun: switch to u64_stats_t
> 316580b69d0a7a5063af47438b626bc47cbd u64_stats: provide u64_stats_t
> type
> 
> 

Fix has been sent few days ago.

For some reason the linuxppc-dev@lists.ozlabs.org  list has not received the 
patch.

>From 47c47befdcf31fb8498c9e630bb8e0dc3ef88079 Mon Sep 17 00:00:00 2001
From: Eric Dumazet 
Date: Fri, 8 Nov 2019 06:04:35 -0800
Subject: [PATCH] powerpc: add const qual to local_read() parameter

A patch in net-next triggered a compile error on powerpc.

This seems reasonable to relax powerpc local_read() requirements.

Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type")
Signed-off-by: Eric Dumazet 
Reported-by: kbuild test robot 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/local.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index 
fdd00939270bf08113b537a090d6a6e34a048361..bc4bd19b7fc235b80ec1132f44409b6fe1057975
 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -17,7 +17,7 @@ typedef struct
 
 #define LOCAL_INIT(i)  { (i) }
 
-static __inline__ long local_read(local_t *l)
+static __inline__ long local_read(const local_t *l)
 {
return READ_ONCE(l->v);
 }
-- 
2.24.0.432.g9d3f5f5b63-goog

Re: linux-next: build warning after merge of the net-next tree

2019-11-10 Thread Eric Dumazet




On 11/10/19 5:39 PM, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the net-next tree, today's linux-next build (powerpc
> ppc64_defconfig) produced this warning:
> 
> In file included from ./arch/powerpc/include/generated/asm/local64.h:1,
>  from include/linux/u64_stats_sync.h:72,
>  from include/linux/cgroup-defs.h:20,
>  from include/linux/cgroup.h:28,
>  from include/linux/memcontrol.h:13,
>  from include/linux/swap.h:9,
>  from include/linux/suspend.h:5,
>  from arch/powerpc/kernel/asm-offsets.c:23:
> include/linux/u64_stats_sync.h: In function 'u64_stats_read':
> include/asm-generic/local64.h:30:37: warning: passing argument 1 of 
> 'local_read' discards 'const' qualifier from pointer target type 
> [-Wdiscarded-qualifiers]
>30 | #define local64_read(l)  local_read(&(l)->a)
>   | ^~~
> include/linux/u64_stats_sync.h:80:9: note: in expansion of macro 
> 'local64_read'
>80 |  return local64_read(>v);
>   | ^~~~
> In file included from include/asm-generic/local64.h:22,
>  from ./arch/powerpc/include/generated/asm/local64.h:1,
>  from include/linux/u64_stats_sync.h:72,
>  from include/linux/cgroup-defs.h:20,
>  from include/linux/cgroup.h:28,
>  from include/linux/memcontrol.h:13,
>  from include/linux/swap.h:9,
>  from include/linux/suspend.h:5,
>  from arch/powerpc/kernel/asm-offsets.c:23:
> arch/powerpc/include/asm/local.h:20:44: note: expected 'local_t *' {aka 
> 'struct  *'} but argument is of type 'const local_t *' {aka 'const 
> struct  *'}
>20 | static __inline__ long local_read(local_t *l)
>   |   ~^
> 
> Introduced by commit
> 
>   316580b69d0a ("u64_stats: provide u64_stats_t type")
> 
> Powerpc folks: is there some reason that local_read() cannot take a
> const argument?
> 
> I have added this patch (which builds fine) for today:
> 
> From: Stephen Rothwell 
> Date: Mon, 11 Nov 2019 12:32:24 +1100
> Subject: [PATCH] powerpc: local_read() should take a const local_t argument
> 
> Signed-off-by: Stephen Rothwell 
> ---
>  arch/powerpc/include/asm/local.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/local.h 
> b/arch/powerpc/include/asm/local.h
> index fdd00939270b..bc4bd19b7fc2 100644
> --- a/arch/powerpc/include/asm/local.h
> +++ b/arch/powerpc/include/asm/local.h
> @@ -17,7 +17,7 @@ typedef struct
>  
>  #define LOCAL_INIT(i){ (i) }
>  
> -static __inline__ long local_read(local_t *l)
> +static __inline__ long local_read(const local_t *l)
>  {
>   return READ_ONCE(l->v);
>  }
> 

I have sent this patch two days ago, I do not believe I had any answer from ppc 
maintainers.

>From 47c47befdcf31fb8498c9e630bb8e0dc3ef88079 Mon Sep 17 00:00:00 2001
From: Eric Dumazet 
Date: Fri, 8 Nov 2019 06:04:35 -0800
Subject: [PATCH] powerpc: add const qual to local_read() parameter

A patch in net-next triggered a compile error on powerpc.

This seems reasonable to relax powerpc local_read() requirements.

Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type")
Signed-off-by: Eric Dumazet 
Reported-by: kbuild test robot 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/local.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/local.h b/arch/powerpc/include/asm/local.h
index 
fdd00939270bf08113b537a090d6a6e34a048361..bc4bd19b7fc235b80ec1132f44409b6fe1057975
 100644
--- a/arch/powerpc/include/asm/local.h
+++ b/arch/powerpc/include/asm/local.h
@@ -17,7 +17,7 @@ typedef struct
 
 #define LOCAL_INIT(i)  { (i) }
 
-static __inline__ long local_read(local_t *l)
+static __inline__ long local_read(const local_t *l)
 {
return READ_ONCE(l->v);
 }
-- 
2.24.0.432.g9d3f5f5b63-goog

Re: [RFC PATCH] lib: Introduce generic __cmpxchg_u64() and use it where needed

2018-11-01 Thread Eric Dumazet




On 11/01/2018 09:32 AM, Peter Zijlstra wrote:

>> Anyhow, if the atomic maintainers are willing to stand up and state for
>> the record that the atomic counters are guaranteed to wrap modulo 2^n
>> just like unsigned integers, then I'm happy to take Paul's patch.
> 
> I myself am certainly relying on it.


Could we get uatomic_t support maybe ?

This reminds me of this so silly patch :/

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=adb03115f4590baa280ddc440a8eff08a6be0cb7

Re: [mainline][bisected 55469b][ppc] build error at drivers/net/ethernet/mellanox/mlx4/en_rx.c

2018-10-30 Thread Eric Dumazet

On Mon, Oct 29, 2018 at 11:43 PM Abdul Haleem
 wrote:
>
> Greeting's
>
> mainline build fails on my powerpc with commit 55469b : drivers: net:
> remove  inclusion when not needed
>
> Machine type: PowerPC power 8 bare-metal
> kernel : 4.19.0
> gcc :  4.8.5
> config attached
>

Thanks for the report, but this patch is not in 4.19 kernel ? It is
scheduled for 4.20 only.

Anyway, mlx4 needs to include   directly, instead of
assuming it is indirectly included by another file.

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-19 Thread Eric Dumazet




On 06/19/2018 03:32 PM, Andreas Schwab wrote:
> On Jun 19 2018, Eric Dumazet  wrote:
> 
>> diff --git a/drivers/net/ethernet/sun/sungem.c 
>> b/drivers/net/ethernet/sun/sungem.c
>> index 
>> 7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..672d6748ab44f0890e92d5ca55d6ff6834c20dc9
>>  100644
>> --- a/drivers/net/ethernet/sun/sungem.c
>> +++ b/drivers/net/ethernet/sun/sungem.c
>> @@ -60,8 +60,7 @@
>>  #include 
>>  #include "sungem.h"
>>  
>> -/* Stripping FCS is causing problems, disabled for now */
>> -#undef STRIP_FCS
>> +#define STRIP_FCS
>>  
>>  #define DEFAULT_MSG(NETIF_MSG_DRV  | \
>>  NETIF_MSG_PROBE| \
>> @@ -435,7 +434,7 @@ static int gem_rxmac_reset(struct gem *gp)
>> writel(desc_dma & 0x, gp->regs + RXDMA_DBLOW);
>> writel(RX_RING_SIZE - 4, gp->regs + RXDMA_KICK);
>> val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) |
>> -  ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128);
>> +  (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128);
>> writel(val, gp->regs + RXDMA_CFG);
>> if (readl(gp->regs + GREG_BIFCFG) & GREG_BIFCFG_M66EN)
>> writel(((5 & RXDMA_BLANK_IPKTS) |
>> @@ -857,6 +856,14 @@ static int gem_rx(struct gem *gp, int work_to_do)
>>  
>> csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
>> 0x);
>> skb->csum = csum_unfold(csum);
>> +   {
>> +   __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - 
>> ETH_HLEN, 0);
>> +   if (csum != csum_fold(rsum) && net_ratelimit())
>> +   pr_err("sungem wrong csum : %x/%x, len %u bytes\n",
>> +   csum, csum_fold(rsum), len);
>> +   print_hex_dump(KERN_ERR, "raw data: ", 
>> DUMP_PREFIX_OFFSET,
>> +   16, 1, skb->data, len, true);
>> +   }
>> skb->ip_summed = CHECKSUM_COMPLETE;
>> skb->protocol = eth_type_trans(skb, gp->dev);
>>  
>> @@ -1761,7 +1768,7 @@ static void gem_init_dma(struct gem *gp)
>> writel(0, gp->regs + TXDMA_KICK);
>>  
>> val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) |
>> -  ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128);
>> +  (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128);
>> writel(val, gp->regs + RXDMA_CFG);
>>  
>> writel(desc_dma >> 32, gp->regs + RXDMA_DBHI);
> 
> With that patch I still get the wrong csum messages, but no longer the
> hw csum failure messages (tested on a PowerMac G5).
> 
> [  662.659767] sungem: sungem wrong csum : 8359/7ca6, len 86 bytes, 
> c001fee9cc02
> [  662.659775] raw data: : 00 0d 93 43 81 62 d4 3d 7e 4c 48 b7 86 dd 
> 61 01  ...C.b.=~LH...a.
> [  662.659778] raw data: 0010: 1c 1e 00 20 06 40 20 01 0a 62 17 11 88 01 
> 00 00  ... .@ ..b..
> [  662.659780] raw data: 0020: 00 00 00 00 0a 38 20 01 0a 62 17 11 88 01 
> 00 00  .8 ..b..
> [  662.659783] raw data: 0030: 00 00 00 00 00 07 9a 18 00 16 c1 9a 7e ea 
> ea 44  ~..D
> [  662.659785] raw data: 0040: fb 4a 80 10 05 93 44 08 00 00 01 01 08 0a 
> 59 68  .JD...Yh
> [  662.659788] raw data: 0050: ba e2 0e bb ac ae  
>   ..
> 
> Andreas.
> 

Note that 8359 and 7ca6 are the same really (a missing ~ to invert 
csum_partial())

So the bug was that :

1) Driver programmed a wrong start offset for the csum  (7 bytes instead of 14 
bytes to skip Ethernet Header)

2) FCS was not stripped.

Basically CHECKSUM_COMPLETE support never worked, but this was hidden by the 
fact that linux stack
had to throw away this CHECKSUM_COMPLETE because the FCS had to be removed.

Thanks !

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-19 Thread Eric Dumazet




On 06/19/2018 01:10 PM, Eric Dumazet wrote:
> 
> 
> On 06/19/2018 12:10 PM, Andreas Schwab wrote:
>> On Jun 18 2018, Eric Dumazet  wrote:
>>
>>> DUMP_PREFIX_ADDRESS might give us more information (say alignment problem, 
>>> or crossing page boundaries)
>>
>> DUMP_PREFIX_ADDRESS is useless for that purpose.
>>
>> Here are some samples of broken csums:
>>
>> [  853.849225] sungem: sungem wrong csum : 9886/07be, len 94 bytes, 
>> c001fa187e02
>> [  853.849232] raw data: : 00 0d 93 43 81 62 18 d6 c7 51 b8 1c 08 00 
>> 45 10  ...C.b...QE.
>> [  853.849235] raw data: 0010: 00 4c cb a0 40 00 40 11 d9 97 c0 a8 0a 01 
>> c0 a8  .L..@.@.
>> [  853.849237] raw data: 0020: 0a 07 00 7b 00 7b 00 38 69 e1 1c 03 0c f7 
>> 00 00  ...{.{.8i...
>> [  853.849240] raw data: 0030: 08 f0 00 00 15 f0 c0 35 67 67 de d3 ca c9 
>> d9 5b  ...5gg.[
>> [  853.849242] raw data: 0040: 1f ff de d3 d2 86 8f 67 fa f2 de d3 d2 86 
>> 8f 38  ...g...8
>> [  853.849244] raw data: 0050: 2f ff de d3 d2 86 8f 3b ff ff d1 93 bc 50 
>>/..;.P
> 
> Thanks.
> 
> 4 bytes in excess.
> 
> Might be the FCS, and it does not look like provided csum has a relation with 
> it.
> 
> For some reason FCS stripping was disabled by :
> 
> commit 3e32011d4da6424b3bc65b1e1a047e30ac9882c7
> Author: Benjamin Herrenschmidt 
> Date:   Mon May 19 09:39:11 2003 -0700
> 
> [SUNGEM]: Updates from PowerPC people.
> 
> Support more chips and split out all the complex PHY
> handling into a seperate file.
> 
> 
> Since this NIC never had CHECKSUM_COMPLETE support (since we have to trim 
> each skb,
> thus were forcing ip_summed to CHECKSUM_NONE) we probably should remove it 
> and be happy.
> 
> Unless you guys find a way to let the NIC strip the FCS, and double check the 
> csum is a real csum ;)
> 
> diff --git a/drivers/net/ethernet/sun/sungem.c 
> b/drivers/net/ethernet/sun/sungem.c
> index 
> 7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..85439308375e95c3854e4a1561697d69ec85399b
>  100644
> --- a/drivers/net/ethernet/sun/sungem.c
> +++ b/drivers/net/ethernet/sun/sungem.c
> @@ -760,7 +760,6 @@ static int gem_rx(struct gem *gp, int work_to_do)
> struct net_device *dev = gp->dev;
> int entry, drops, work_done = 0;
> u32 done;
> -   __sum16 csum;
>  
> if (netif_msg_rx_status(gp))
> printk(KERN_DEBUG "%s: rx interrupt, done: %d, rx_new: %d\n",
> @@ -855,9 +854,6 @@ static int gem_rx(struct gem *gp, int work_to_do)
> skb = copy_skb;
> }
>  
> -   csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
> 0x);
> -   skb->csum = csum_unfold(csum);
> -   skb->ip_summed = CHECKSUM_COMPLETE;
> skb->protocol = eth_type_trans(skb, gp->dev);
>  
> napi_gro_receive(>napi, skb);
> 


If you have time, you also could check if changing the suspect (14 / 2) to 
ETH_HLEN would help
(and also enabling STRIP_FCS)



diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index 
7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..672d6748ab44f0890e92d5ca55d6ff6834c20dc9
 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -60,8 +60,7 @@
 #include 
 #include "sungem.h"
 
-/* Stripping FCS is causing problems, disabled for now */
-#undef STRIP_FCS
+#define STRIP_FCS
 
 #define DEFAULT_MSG(NETIF_MSG_DRV  | \
 NETIF_MSG_PROBE| \
@@ -435,7 +434,7 @@ static int gem_rxmac_reset(struct gem *gp)
writel(desc_dma & 0x, gp->regs + RXDMA_DBLOW);
writel(RX_RING_SIZE - 4, gp->regs + RXDMA_KICK);
val = (RXDMA_CFG_BASE | (RX_OFFSET << 10) |
-  ((14 / 2) << 13) | RXDMA_CFG_FTHRESH_128);
+  (ETH_HLEN << 13) | RXDMA_CFG_FTHRESH_128);
writel(val, gp->regs + RXDMA_CFG);
if (readl(gp->regs + GREG_BIFCFG) & GREG_BIFCFG_M66EN)
writel(((5 & RXDMA_BLANK_IPKTS) |
@@ -857,6 +856,14 @@ static int gem_rx(struct gem *gp, int work_to_do)
 
csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
0x);
skb->csum = csum_unfold(csum);
+   {
+   __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - 
ETH_HLEN, 0);
+   if (csum != csum_fold(rsum) && net_ratelimit())
+   pr_err("sungem wrong csum : %x/%x, len %u bytes\n",
+   csum, csu

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-19 Thread Eric Dumazet




On 06/19/2018 12:10 PM, Andreas Schwab wrote:
> On Jun 18 2018, Eric Dumazet  wrote:
> 
>> DUMP_PREFIX_ADDRESS might give us more information (say alignment problem, 
>> or crossing page boundaries)
> 
> DUMP_PREFIX_ADDRESS is useless for that purpose.
> 
> Here are some samples of broken csums:
> 
> [  853.849225] sungem: sungem wrong csum : 9886/07be, len 94 bytes, 
> c001fa187e02
> [  853.849232] raw data: : 00 0d 93 43 81 62 18 d6 c7 51 b8 1c 08 00 
> 45 10  ...C.b...QE.
> [  853.849235] raw data: 0010: 00 4c cb a0 40 00 40 11 d9 97 c0 a8 0a 01 
> c0 a8  .L..@.@.
> [  853.849237] raw data: 0020: 0a 07 00 7b 00 7b 00 38 69 e1 1c 03 0c f7 
> 00 00  ...{.{.8i...
> [  853.849240] raw data: 0030: 08 f0 00 00 15 f0 c0 35 67 67 de d3 ca c9 
> d9 5b  ...5gg.[
> [  853.849242] raw data: 0040: 1f ff de d3 d2 86 8f 67 fa f2 de d3 d2 86 
> 8f 38  ...g...8
> [  853.849244] raw data: 0050: 2f ff de d3 d2 86 8f 3b ff ff d1 93 bc 50  
>   /..;.P

Thanks.

4 bytes in excess.

Might be the FCS, and it does not look like provided csum has a relation with 
it.

For some reason FCS stripping was disabled by :

commit 3e32011d4da6424b3bc65b1e1a047e30ac9882c7
Author: Benjamin Herrenschmidt 
Date:   Mon May 19 09:39:11 2003 -0700

[SUNGEM]: Updates from PowerPC people.

Support more chips and split out all the complex PHY
handling into a seperate file.


Since this NIC never had CHECKSUM_COMPLETE support (since we have to trim each 
skb,
thus were forcing ip_summed to CHECKSUM_NONE) we probably should remove it and 
be happy.

Unless you guys find a way to let the NIC strip the FCS, and double check the 
csum is a real csum ;)

diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index 
7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..85439308375e95c3854e4a1561697d69ec85399b
 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -760,7 +760,6 @@ static int gem_rx(struct gem *gp, int work_to_do)
struct net_device *dev = gp->dev;
int entry, drops, work_done = 0;
u32 done;
-   __sum16 csum;
 
if (netif_msg_rx_status(gp))
printk(KERN_DEBUG "%s: rx interrupt, done: %d, rx_new: %d\n",
@@ -855,9 +854,6 @@ static int gem_rx(struct gem *gp, int work_to_do)
skb = copy_skb;
}
 
-   csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
0x);
-   skb->csum = csum_unfold(csum);
-   skb->ip_summed = CHECKSUM_COMPLETE;
skb->protocol = eth_type_trans(skb, gp->dev);
 
napi_gro_receive(>napi, skb);

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-18 Thread Eric Dumazet




On 06/18/2018 04:29 PM, Eric Dumazet wrote:
> 
> 
> On 06/18/2018 11:45 AM, Mathieu Malaterre wrote:
> 
>>
>> Here is what I get on my side
>>
>> [   53.628847] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
>> [   53.667063] sungem: sungem wrong csum : eea8/6eec, len 149 bytes
>> [   58.648952] sungem: sungem wrong csum : 2095/3d06, len 64 bytes
>> [   58.669414] sungem: sungem wrong csum : 5245/b50, len 149 bytes
>> [   63.674451] sungem: sungem wrong csum : 2d8/5abd, len 149 bytes
>> [   68.678233] sungem: sungem wrong csum : b8fc/a498, len 149 bytes
>> [   73.685771] sungem: sungem wrong csum : 374/5a21, len 149 bytes
>> [   78.689089] sungem: sungem wrong csum : d81/5014, len 149 bytes
>> [   83.683261] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
>> [   83.690193] sungem: sungem wrong csum : c2f7/9a9d, len 149 bytes
>> [   88.692511] sungem: sungem wrong csum : f4d8/68bc, len 149 bytes
>> [   93.699915] sungem: sungem wrong csum : 1370/4a25, len 149 bytes
>> [   98.703136] sungem: sungem wrong csum : e1b5/7bdf, len 149 bytes
>> [  103.704230] sungem: sungem wrong csum : 5321/a74, len 149 bytes
>> [  108.688912] sungem: sungem wrong csum : 2095/3d06, len 64 bytes
>> [  108.706559] sungem: sungem wrong csum : ddbc/7fd8, len 149 bytes
>> [  113.713189] sungem: sungem wrong csum : 5a65/330, len 149 bytes
>> [  113.891697] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
>> [  118.717151] sungem: sungem wrong csum : f7c8/65cc, len 149 bytes
>> [  123.722680] sungem: sungem wrong csum : 3d7a/201b, len 149 bytes
>> [  128.726524] sungem: sungem wrong csum : c8fd/9497, len 149 bytes
>> [  133.732045] sungem: sungem wrong csum : de0d/7f87, len 149 bytes
>> [  135.529163] sungem: sungem wrong csum : 3089/b6dd, len 96 bytes
>> [  135.529208] eth0: hw csum failure
>> [  135.529220] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #7
>> [  135.529226] Call Trace:
>> [  135.529243] [dffedbe0] [c069ddac]
>> __skb_checksum_complete+0xf0/0x108 (unreliable)
> 
> Thanks, then I guess next step would be to dump the content of the frames
> having a wrong checksum, hoping we find an easy way to discard the 
> CHECKSUM_COMPLETE
> in a selective way.
> 
> Otherwise, we will need to remove CHECKSUM_COMPLETE setting in this driver.
> 
> diff --git a/drivers/net/ethernet/sun/sungem.c 
> b/drivers/net/ethernet/sun/sungem.c
> index 
> 7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..77a761f95be788bb86c8d917f613c9084818f826
>  100644
> --- a/drivers/net/ethernet/sun/sungem.c
> +++ b/drivers/net/ethernet/sun/sungem.c
> @@ -857,6 +857,14 @@ static int gem_rx(struct gem *gp, int work_to_do)
>  
> csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
> 0x);
> skb->csum = csum_unfold(csum);
> +   {
> +   __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - 
> ETH_HLEN, 0);
> +   if (csum != csum_fold(rsum) && net_ratelimit())
> +   pr_err("sungem wrong csum : %04x/%04x, len %u 
> bytes\n",
> +   csum, csum_fold(rsum), len);
> +   print_hex_dump(KERN_ERR, "raw data: ", 
> DUMP_PREFIX_OFFSET,


DUMP_PREFIX_ADDRESS might give us more information (say alignment problem, or 
crossing page boundaries)

> +   16, 1, skb->data, len, true);
> +   }
> skb->ip_summed = CHECKSUM_COMPLETE;
> skb->protocol = eth_type_trans(skb, gp->dev);
>  
>

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-18 Thread Eric Dumazet




On 06/18/2018 11:45 AM, Mathieu Malaterre wrote:

> 
> Here is what I get on my side
> 
> [   53.628847] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
> [   53.667063] sungem: sungem wrong csum : eea8/6eec, len 149 bytes
> [   58.648952] sungem: sungem wrong csum : 2095/3d06, len 64 bytes
> [   58.669414] sungem: sungem wrong csum : 5245/b50, len 149 bytes
> [   63.674451] sungem: sungem wrong csum : 2d8/5abd, len 149 bytes
> [   68.678233] sungem: sungem wrong csum : b8fc/a498, len 149 bytes
> [   73.685771] sungem: sungem wrong csum : 374/5a21, len 149 bytes
> [   78.689089] sungem: sungem wrong csum : d81/5014, len 149 bytes
> [   83.683261] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
> [   83.690193] sungem: sungem wrong csum : c2f7/9a9d, len 149 bytes
> [   88.692511] sungem: sungem wrong csum : f4d8/68bc, len 149 bytes
> [   93.699915] sungem: sungem wrong csum : 1370/4a25, len 149 bytes
> [   98.703136] sungem: sungem wrong csum : e1b5/7bdf, len 149 bytes
> [  103.704230] sungem: sungem wrong csum : 5321/a74, len 149 bytes
> [  108.688912] sungem: sungem wrong csum : 2095/3d06, len 64 bytes
> [  108.706559] sungem: sungem wrong csum : ddbc/7fd8, len 149 bytes
> [  113.713189] sungem: sungem wrong csum : 5a65/330, len 149 bytes
> [  113.891697] sungem: sungem wrong csum : 4e04/f97, len 64 bytes
> [  118.717151] sungem: sungem wrong csum : f7c8/65cc, len 149 bytes
> [  123.722680] sungem: sungem wrong csum : 3d7a/201b, len 149 bytes
> [  128.726524] sungem: sungem wrong csum : c8fd/9497, len 149 bytes
> [  133.732045] sungem: sungem wrong csum : de0d/7f87, len 149 bytes
> [  135.529163] sungem: sungem wrong csum : 3089/b6dd, len 96 bytes
> [  135.529208] eth0: hw csum failure
> [  135.529220] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #7
> [  135.529226] Call Trace:
> [  135.529243] [dffedbe0] [c069ddac]
> __skb_checksum_complete+0xf0/0x108 (unreliable)

Thanks, then I guess next step would be to dump the content of the frames
having a wrong checksum, hoping we find an easy way to discard the 
CHECKSUM_COMPLETE
in a selective way.

Otherwise, we will need to remove CHECKSUM_COMPLETE setting in this driver.

diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index 
7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..77a761f95be788bb86c8d917f613c9084818f826
 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -857,6 +857,14 @@ static int gem_rx(struct gem *gp, int work_to_do)
 
csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
0x);
skb->csum = csum_unfold(csum);
+   {
+   __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - 
ETH_HLEN, 0);
+   if (csum != csum_fold(rsum) && net_ratelimit())
+   pr_err("sungem wrong csum : %04x/%04x, len %u bytes\n",
+   csum, csum_fold(rsum), len);
+   print_hex_dump(KERN_ERR, "raw data: ", 
DUMP_PREFIX_OFFSET,
+   16, 1, skb->data, len, true);
+   }
skb->ip_summed = CHECKSUM_COMPLETE;
skb->protocol = eth_type_trans(skb, gp->dev);

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-18 Thread Eric Dumazet




On 06/18/2018 10:54 AM, Andreas Schwab wrote:
> On Jun 17 2018, Eric Dumazet  wrote:
> 
>> Oh this is silly, please try :
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index 
>> c642304f178ce0a4e1358d59e45032a39f76fb3f..54dd9c18ecad817812898d6f335e1794a07dabbe
>>  100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -1845,10 +1845,9 @@ EXPORT_SYMBOL(___pskb_trim);
>>  int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
>>  {
>> if (skb->ip_summed == CHECKSUM_COMPLETE) {
>> -   int delta = skb->len - len;
>> +   __wsum csumdiff = skb_checksum(skb, len, skb->len - len, 0);
>>  
>> -   skb->csum = csum_sub(skb->csum,
>> -skb_checksum(skb, len, delta, 0));
>> +   skb->csum = csum_block_sub(skb->csum, csumdiff, len);
>> }
>> return __pskb_trim(skb, len);
>>  }
> 
> That doesn't help either.
> 
> Andreas.
> 

Then maybe NIC provided csum is not correct.

It does not compute a checksum on all the frame, but part of it.

You could use this patch to double check.

diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index 
7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..277859ea73e35271a10b02011120fca248ec8e71
 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -857,6 +857,11 @@ static int gem_rx(struct gem *gp, int work_to_do)
 
csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
0x);
skb->csum = csum_unfold(csum);
+   {
+   __wsum rsum = csum_partial(skb->data + ETH_HLEN, len - 
ETH_HLEN, 0);
+   if (csum != csum_fold(rsum))
+   pr_err_ratelimited("sungem wrong csum : %x/%x, len %u 
bytes\n", csum, csum_fold(rsum), len);
+   }
skb->ip_summed = CHECKSUM_COMPLETE;
skb->protocol = eth_type_trans(skb, gp->dev);

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-17 Thread Eric Dumazet




On 06/17/2018 03:27 AM, Andreas Schwab wrote:

> 
> That doesn't change anything.

OK, thanks !

Oh this is silly, please try :

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 
c642304f178ce0a4e1358d59e45032a39f76fb3f..54dd9c18ecad817812898d6f335e1794a07dabbe
 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1845,10 +1845,9 @@ EXPORT_SYMBOL(___pskb_trim);
 int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
 {
if (skb->ip_summed == CHECKSUM_COMPLETE) {
-   int delta = skb->len - len;
+   __wsum csumdiff = skb_checksum(skb, len, skb->len - len, 0);
 
-   skb->csum = csum_sub(skb->csum,
-skb_checksum(skb, len, delta, 0));
+   skb->csum = csum_block_sub(skb->csum, csumdiff, len);
}
return __pskb_trim(skb, len);
 }

Re: [PATCH] Revert "net: pskb_trim_rcsum() and CHECKSUM_COMPLETE are friends"

2018-06-16 Thread Eric Dumazet




On 06/16/2018 12:14 AM, Mathieu Malaterre wrote:
> Hi Eric,
> 
> On Fri, Jun 15, 2018 at 9:14 PM Eric Dumazet  wrote:
>>
>>
>>
>> On 06/15/2018 11:56 AM, Mathieu Malaterre wrote:
>>> This reverts commit 88078d98d1bb085d72af8437707279e203524fa5.
>>>
>>> It causes regressions for people using chips driven by the sungem
>>> driver. Suspicion is that the skb->csum value isn't being adjusted
>>> properly.
>>>
>>> Symptoms as seen on G4+sungem are:
>>>
>>> [   34.023281] eth0: hw csum failure
>>> [   34.023438] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #2
>>> [   34.023618] Call Trace:
>>> [   34.023707] [dffedbd0] [c069ddac] __skb_checksum_complete+0xf0/0x108 
>>> (unreliable)
>>> [   34.023948] [dffedbf0] [c0777a70] tcp_v4_rcv+0x604/0xe00
>>> [   34.024118] [dffedc70] [c0731624] ip_local_deliver_finish+0xa8/0x3c4
>>> [   34.024315] [dffedcb0] [c0732430] ip_local_deliver+0xf0/0x154
>>> [   34.024493] [dffedcf0] [c07328dc] ip_rcv+0x448/0x774
>>> [   34.024653] [dffedd50] [c06aeae0] __netif_receive_skb_core+0x5e8/0x1184
>>> [   34.024857] [dffedde0] [c06bba20] napi_gro_receive+0x160/0x22c
>>> [   34.025044] [dffede10] [e14b2590] gem_poll+0x7fc/0x1ac0 [sungem]
>>> [   34.025228] [dffedee0] [c06bacf0] net_rx_action+0x34c/0x618
>>> [   34.025402] [dffedf60] [c07fd27c] __do_softirq+0x16c/0x5f0
>>> [   34.025575] [dffedfd0] [c0064c7c] irq_exit+0x110/0x1a8
>>> [   34.025738] [dffedff0] [c0016170] call_do_irq+0x24/0x3c
>>> [   34.025903] [c0cf7e80] [c0009a84] do_IRQ+0x98/0x1a0
>>> [   34.026055] [c0cf7eb0] [c001b474] ret_from_except+0x0/0x14
>>> [   34.026225] --- interrupt: 501 at arch_cpu_idle+0x30/0x78
>>>LR = arch_cpu_idle+0x30/0x78
>>> [   34.026510] [c0cf7f70] [c0cf6000] 0xc0cf6000 (unreliable)
>>> [   34.026682] [c0cf7f80] [c00a3868] do_idle+0xc4/0x158
>>> [   34.026835] [c0cf7fb0] [c00a3ab0] cpu_startup_entry+0x20/0x28
>>> [   34.027013] [c0cf7fc0] [c0998820] start_kernel+0x47c/0x490
>>> [   34.027181] [c0cf7ff0] [3444] 0x3444
>>>
>>> See commit 7ce5a27f2ef8 ("Revert "net: Handle CHECKSUM_COMPLETE more
>>> adequately in pskb_trim_rcsum()."") for previous reference.
>>
>> This fix seems to hide a bug in csum functions on this architecture.
> 
> That's odd since it seems to only affect g4+sungem user. None of the
> ppc64 seems to be having it. And some ppc32 users are not even seeing
> it.
> 
>> Or a bug on this NIC when receiving a small packet (less than 60 bytes).
>> Maybe the padding bytes are not included in NIC provided csum, and not 0.
> 
> Ok in that case the bug is located in
> ./drivers/net/ethernet/sun/sungem.c that seems more likely. I'll try
> to understand that code, then.


I would try something like :

Basically do not bother using CHECKSUM_COMPLETE for small frames that might 
have been padded.

Since we need to bring one cache line in eth_type_trans() and further header 
processing,
computing the checksum in software will be almost free anyway.

diff --git a/drivers/net/ethernet/sun/sungem.c 
b/drivers/net/ethernet/sun/sungem.c
index 
7a16d40a72d13cf1d522e8a3a396c826fe76f9b9..071039f211a8a33153e888bd4014314ba5e686a4
 100644
--- a/drivers/net/ethernet/sun/sungem.c
+++ b/drivers/net/ethernet/sun/sungem.c
@@ -855,9 +855,11 @@ static int gem_rx(struct gem *gp, int work_to_do)
skb = copy_skb;
}
 
-   csum = (__force __sum16)htons((status & RXDCTRL_TCPCSUM) ^ 
0x);
-   skb->csum = csum_unfold(csum);
-   skb->ip_summed = CHECKSUM_COMPLETE;
+   if (len > ETH_ZLEN) {
+   csum = (__force __sum16)htons((status & 
RXDCTRL_TCPCSUM) ^ 0x);
+   skb->csum = csum_unfold(csum);
+   skb->ip_summed = CHECKSUM_COMPLETE;
+   }
skb->protocol = eth_type_trans(skb, gp->dev);
 
napi_gro_receive(>napi, skb);

[RFC] implement QUEUED spinlocks on powerpc

2017-02-01 Thread Eric Dumazet

Hi all

Is anybody working on adding QUEUED spinlocks to powerpc 64bit ?

I've seen past attempts with ticket spinlocks
( https://patchwork.ozlabs.org/patch/449381/ and other related links )

But it looks ticket spinlocks are a thing of the past.

Thanks.

Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type

2016-10-27 Thread Eric Dumazet

On Thu, 2016-10-27 at 12:54 -0500, Thomas Falcon wrote:
> On 10/27/2016 10:26 AM, Eric Dumazet wrote:
> > On Wed, 2016-10-26 at 11:09 +1100, Jon Maxwell wrote:
> >> We recently encountered a bug where a few customers using ibmveth on the 
> >> same LPAR hit an issue where a TCP session hung when large receive was
> >> enabled. Closer analysis revealed that the session was stuck because the 
> >> one side was advertising a zero window repeatedly.
> >>
> >> We narrowed this down to the fact the ibmveth driver did not set gso_size 
> >> which is translated by TCP into the MSS later up the stack. The MSS is 
> >> used to calculate the TCP window size and as that was abnormally large, 
> >> it was calculating a zero window, even although the sockets receive buffer 
> >> was completely empty. 
> >>
> >> We were able to reproduce this and worked with IBM to fix this. Thanks Tom 
> >> and Marcelo for all your help and review on this.
> >>
> >> The patch fixes both our internal reproduction tests and our customers 
> >> tests.
> >>
> >> Signed-off-by: Jon Maxwell <jmaxwel...@gmail.com>
> >> ---
> >>  drivers/net/ethernet/ibm/ibmveth.c | 20 
> >>  1 file changed, 20 insertions(+)
> >>
> >> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
> >> b/drivers/net/ethernet/ibm/ibmveth.c
> >> index 29c05d0..c51717e 100644
> >> --- a/drivers/net/ethernet/ibm/ibmveth.c
> >> +++ b/drivers/net/ethernet/ibm/ibmveth.c
> >> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, 
> >> int budget)
> >>int frames_processed = 0;
> >>unsigned long lpar_rc;
> >>struct iphdr *iph;
> >> +  bool large_packet = 0;
> >> +  u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);
> >>  
> >>  restart_poll:
> >>while (frames_processed < budget) {
> >> @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, 
> >> int budget)
> >>iph->check = 0;
> >>iph->check = 
> >> ip_fast_csum((unsigned char *)iph, iph->ihl);
> >>adapter->rx_large_packets++;
> >> +  large_packet = 1;
> >>}
> >>}
> >>}
> >>  
> >> +  if (skb->len > netdev->mtu) {
> >> +  iph = (struct iphdr *)skb->data;
> >> +  if (be16_to_cpu(skb->protocol) == ETH_P_IP &&
> >> +  iph->protocol == IPPROTO_TCP) {
> >> +  hdr_len += sizeof(struct iphdr);
> >> +  skb_shinfo(skb)->gso_type = 
> >> SKB_GSO_TCPV4;
> >> +  skb_shinfo(skb)->gso_size = netdev->mtu 
> >> - hdr_len;
> >> +  } else if (be16_to_cpu(skb->protocol) == 
> >> ETH_P_IPV6 &&
> >> + iph->protocol == IPPROTO_TCP) {
> >> +  hdr_len += sizeof(struct ipv6hdr);
> >> +  skb_shinfo(skb)->gso_type = 
> >> SKB_GSO_TCPV6;
> >> +  skb_shinfo(skb)->gso_size = netdev->mtu 
> >> - hdr_len;
> >> +  }
> >> +  if (!large_packet)
> >> +  adapter->rx_large_packets++;
> >> +  }
> >> +
> >>  
> > This might break forwarding and PMTU discovery.
> >
> > You force gso_size to device mtu, regardless of real MSS used by the TCP
> > sender.
> >
> > Don't you have the MSS provided in RX descriptor, instead of guessing
> > the value ?
> >
> >
> >
> The MSS is not always available unfortunately, so this is the best solution 
> there is at the moment. 

Hmm... then what about skb_shinfo(skb)->gso_segs ?

ip_rcv() for example has :

__IP_ADD_STATS(net,
   IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
   max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));



Also prefer : (skb->protocol == htons(ETH_P_IP)) tests

And the ipv6 test is wrong :

} else if (be16_to_cpu(skb->protocol) == ETH_P_IPV6 &&
   iph->protocol == IPPROTO_TCP) {


Since iph is a pointer to ipv4 iphdr .

Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type

2016-10-27 Thread Eric Dumazet

On Wed, 2016-10-26 at 11:09 +1100, Jon Maxwell wrote:
> We recently encountered a bug where a few customers using ibmveth on the 
> same LPAR hit an issue where a TCP session hung when large receive was
> enabled. Closer analysis revealed that the session was stuck because the 
> one side was advertising a zero window repeatedly.
> 
> We narrowed this down to the fact the ibmveth driver did not set gso_size 
> which is translated by TCP into the MSS later up the stack. The MSS is 
> used to calculate the TCP window size and as that was abnormally large, 
> it was calculating a zero window, even although the sockets receive buffer 
> was completely empty. 
> 
> We were able to reproduce this and worked with IBM to fix this. Thanks Tom 
> and Marcelo for all your help and review on this.
> 
> The patch fixes both our internal reproduction tests and our customers tests.
> 
> Signed-off-by: Jon Maxwell 
> ---
>  drivers/net/ethernet/ibm/ibmveth.c | 20 
>  1 file changed, 20 insertions(+)
> 
> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
> b/drivers/net/ethernet/ibm/ibmveth.c
> index 29c05d0..c51717e 100644
> --- a/drivers/net/ethernet/ibm/ibmveth.c
> +++ b/drivers/net/ethernet/ibm/ibmveth.c
> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, int 
> budget)
>   int frames_processed = 0;
>   unsigned long lpar_rc;
>   struct iphdr *iph;
> + bool large_packet = 0;
> + u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr);
>  
>  restart_poll:
>   while (frames_processed < budget) {
> @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, int 
> budget)
>   iph->check = 0;
>   iph->check = 
> ip_fast_csum((unsigned char *)iph, iph->ihl);
>   adapter->rx_large_packets++;
> + large_packet = 1;
>   }
>   }
>   }
>  
> + if (skb->len > netdev->mtu) {
> + iph = (struct iphdr *)skb->data;
> + if (be16_to_cpu(skb->protocol) == ETH_P_IP &&
> + iph->protocol == IPPROTO_TCP) {
> + hdr_len += sizeof(struct iphdr);
> + skb_shinfo(skb)->gso_type = 
> SKB_GSO_TCPV4;
> + skb_shinfo(skb)->gso_size = netdev->mtu 
> - hdr_len;
> + } else if (be16_to_cpu(skb->protocol) == 
> ETH_P_IPV6 &&
> +iph->protocol == IPPROTO_TCP) {
> + hdr_len += sizeof(struct ipv6hdr);
> + skb_shinfo(skb)->gso_type = 
> SKB_GSO_TCPV6;
> + skb_shinfo(skb)->gso_size = netdev->mtu 
> - hdr_len;
> + }
> + if (!large_packet)
> + adapter->rx_large_packets++;
> + }
> +
>  

This might break forwarding and PMTU discovery.

You force gso_size to device mtu, regardless of real MSS used by the TCP
sender.

Don't you have the MSS provided in RX descriptor, instead of guessing
the value ?

Re: [PATCH 1/3] net: fs_enet: merge NAPI RX and NAPI TX

2016-08-24 Thread Eric Dumazet

On Wed, 2016-08-24 at 06:07 -0700, Eric Dumazet wrote:

> I am afraid you could live lock here on SMP.
> 
> You should make sure you do not loop forever, not assuming cpu is faster
> than NIC.

You are protected by the tx_lock spinlock, but this is fragile as you
could very well remove this spinlock in the future to get lockless
transmit, like many other drivers.

Re: [PATCH 1/3] net: fs_enet: merge NAPI RX and NAPI TX

2016-08-24 Thread Eric Dumazet

On Wed, 2016-08-24 at 12:36 +0200, Christophe Leroy wrote:
> Initially, a NAPI TX routine has been implemented separately from
> NAPI RX, as done on the freescale/gianfar driver.
> 
> By merging NAPI RX and NAPI TX, we reduce the amount of TX completion
> interrupts.
> 
> Handling of the budget in association with TX interrupts is based on
> indications provided at https://wiki.linuxfoundation.org/networking/napi
> 
> At the same time, we fix an issue in the handling of fep->tx_free:
> 
> It is only when fep->tx_free goes up to MAX_SKB_FRAGS that
> we need to wake up the queue. There is no need to call
> netif_wake_queue() at every packet successfully transmitted.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  .../net/ethernet/freescale/fs_enet/fs_enet-main.c  | 259 
> +
>  drivers/net/ethernet/freescale/fs_enet/fs_enet.h   |  16 +-
>  drivers/net/ethernet/freescale/fs_enet/mac-fcc.c   |  57 ++---
>  drivers/net/ethernet/freescale/fs_enet/mac-fec.c   |  57 ++---
>  drivers/net/ethernet/freescale/fs_enet/mac-scc.c   |  57 ++---
>  5 files changed, 153 insertions(+), 293 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c 
> b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
> index 61fd486..7cd3ef9 100644
> --- a/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
> +++ b/drivers/net/ethernet/freescale/fs_enet/fs_enet-main.c
> @@ -79,8 +79,8 @@ static void skb_align(struct sk_buff *skb, int align)
>   skb_reserve(skb, align - off);
>  }
>  
> -/* NAPI receive function */
> -static int fs_enet_rx_napi(struct napi_struct *napi, int budget)
> +/* NAPI function */
> +static int fs_enet_napi(struct napi_struct *napi, int budget)
>  {
>   struct fs_enet_private *fep = container_of(napi, struct 
> fs_enet_private, napi);
>   struct net_device *dev = fep->ndev;
> @@ -90,9 +90,100 @@ static int fs_enet_rx_napi(struct napi_struct *napi, int 
> budget)
>   int received = 0;
>   u16 pkt_len, sc;
>   int curidx;
> + int dirtyidx, do_wake, do_restart;
>  
> - if (budget <= 0)
> - return received;
> + spin_lock(>tx_lock);
> + bdp = fep->dirty_tx;
> +
> + /* clear status bits for napi*/
> + (*fep->ops->napi_clear_event)(dev);
> +
> + do_wake = do_restart = 0;
> + while (((sc = CBDR_SC(bdp)) & BD_ENET_TX_READY) == 0) {

I am afraid you could live lock here on SMP.

You should make sure you do not loop forever, not assuming cpu is faster
than NIC.



> + dirtyidx = bdp - fep->tx_bd_base;
> +
> + if (fep->tx_free == fep->tx_ring)
> + break;
> +
> + skb = fep->tx_skbuff[dirtyidx];
> +
> + /*
> +  * Check for errors.
> +  */
> + if (sc & (BD_ENET_TX_HB | BD_ENET_TX_LC |
> +   BD_ENET_TX_RL | BD_ENET_TX_UN | BD_ENET_TX_CSL)) {
> +
> + if (sc & BD_ENET_TX_HB) /* No heartbeat */
> + fep->stats.tx_heartbeat_errors++;
> + if (sc & BD_ENET_TX_LC) /* Late collision */
> + fep->stats.tx_window_errors++;
> + if (sc & BD_ENET_TX_RL) /* Retrans limit */
> + fep->stats.tx_aborted_errors++;
> + if (sc & BD_ENET_TX_UN) /* Underrun */
> + fep->stats.tx_fifo_errors++;
> + if (sc & BD_ENET_TX_CSL)/* Carrier lost */
> + fep->stats.tx_carrier_errors++;
> +
> + if (sc & (BD_ENET_TX_LC | BD_ENET_TX_RL | 
> BD_ENET_TX_UN)) {
> + fep->stats.tx_errors++;
> + do_restart = 1;
> + }
> + } else
> + fep->stats.tx_packets++;
> +
> + if (sc & BD_ENET_TX_READY) {
> + dev_warn(fep->dev,
> +  "HEY! Enet xmit interrupt and TX_READY.\n");
> + }
> +
> + /*
> +  * Deferred means some collisions occurred during transmit,
> +  * but we eventually sent the packet OK.
> +  */
> + if (sc & BD_ENET_TX_DEF)
> + fep->stats.collisions++;
> +
> + /* unmap */
> + if (fep->mapped_as_page[dirtyidx])
> + dma_unmap_page(fep->dev, CBDR_BUFADDR(bdp),
> +CBDR_DATLEN(bdp), DMA_TO_DEVICE);
> + else
> + dma_unmap_single(fep->dev, CBDR_BUFADDR(bdp),
> +  CBDR_DATLEN(bdp), DMA_TO_DEVICE);
> +
> + /*
> +  * Free the sk buffer associated with this last transmit.
> +  */
> + if (skb) {
> + dev_kfree_skb(skb);
> + fep->tx_skbuff[dirtyidx] = NULL;
> +

Re: [PATCH net-next] ibmvnic: Defer tx completion processing using a wait queue

2016-04-12 Thread Eric Dumazet

On Tue, 2016-04-12 at 14:38 -0500, John Allen wrote:
> Moves tx completion processing out of interrupt context, deferring work
> using a wait queue. With this work now deferred, we must account for the
> possibility that skbs can be sent faster than we can process completion
> requests in which case the tx buffer will overflow. If the tx buffer is
> full, ibmvnic_xmit will return NETDEV_TX_BUSY and stop the current tx
> queue. Subsequently, the queue will be restarted in ibmvnic_complete_tx
> when all pending tx completion requests have been cleared.

1) Why is this needed ?

2) If it is needed, why is this not done in a generic way, so that other
drivers can use this ?

Thanks.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: filter: make JITs zero A for SKF_AD_ALU_XOR_X

2016-01-05 Thread Eric Dumazet

On Tue, 2016-01-05 at 16:23 +0100, Rabin Vincent wrote:
> The SKF_AD_ALU_XOR_X ancillary is not like the other ancillary data
> instructions since it XORs A with X while all the others replace A with
> some loaded value.  All the BPF JITs fail to clear A if this is used as
> the first instruction in a filter.

Is x86_64 part of this 'All' subset ? ;)

>   This was found using american fuzzy
> lop.
> 
> Add a helper to determine if A needs to be cleared given the first
> instruction in a filter, and use this in the JITs.  Except for ARM, the
> rest have only been compile-tested.
> 
> Fixes: 3480593131e0 ("net: filter: get rid of BPF_S_* enum")
> Signed-off-by: Rabin Vincent 
> ---


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2 3/3] sched/preempt: fix cond_resched_lock() and cond_resched_softirq()

2015-07-15 Thread Eric Dumazet

On Wed, 2015-07-15 at 12:52 +0300, Konstantin Khlebnikov wrote:
 These functions check should_resched() before unlocking spinlock/bh-enable:
 preempt_count always non-zero = should_resched() always returns false.
 cond_resched_lock() worked iff spin_needbreak is set.

Interesting, this definitely used to work (linux-3.11)

Any idea of which commit broke things ?



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC,v3 02/10] dpaa_eth: add support for DPAA Ethernet

2015-06-15 Thread Eric Dumazet

On Mon, 2015-06-15 at 13:40 +, Madalin-Cristian Bucur wrote:
  -Original Message-
  From: Eric Dumazet [mailto:eric.duma...@gmail.com]
  
  On Wed, 2015-04-29 at 17:56 +0300, Madalin Bucur wrote:
   This introduces the Freescale Data Path Acceleration Architecture
   (DPAA) Ethernet driver (dpaa_eth) that builds upon the DPAA QMan,
   BMan, PAMU and FMan drivers to deliver Ethernet connectivity on
   the Freescale DPAA QorIQ platforms.
  ...
  
   + /* We're going to store the skb backpointer at the beginning
   +  * of the data buffer, so we need a privately owned skb
   +  */
   +
   + /* Code borrowed from skb_unshare(). */
   + if (skb_cloned(skb)) {
   + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
   +
   + /* Finally, create a contig FD from this skb */
   + skb_to_contig_fd(priv, skb, fd, countptr, offset);
   +
   + kfree_skb(skb);
   + skb = nskb;
   + /* skb_copy() has now linearized the skbuff. */
   + }
   +
  
  You know that TCP packets are clones, right ?
  This code is killing performance.
  
  TCP allows the headers part being modified by a driver if needed.
  
  You should use skb_header_cloned() instead.
 
 Thank you, I'll address this. I plan to do something like this:
 
 +   if (!nonlinear) {
 +   /* We're going to store the skb backpointer at the beginning
 +* of the data buffer, so we need a privately owned skb
 +*/
 +
 +   /* make sure skb is not shared, skb_cow_head() assumes it's 
 not */
 +   skb = skb_share_check(skb, GFP_ATOMIC);
 +   if (!skb)
 +   goto enomem;
 +
 +   /* verify the skb head is not cloned */
 +   if (skb_cow_head(skb, priv-tx_headroom))
 +   goto enomem;
 +
 +   nonlinear = skb_is_nonlinear(skb);
 +   }
 
 but I'm not sure the skb_share_check() is required on tx.
 I'm also a bit puzzled by the aliasing between shared and cloned terms
 (i.e. skb_unshare() could be named something like skb_unclone();
  the skb_share_check() not only checks but also unshares an skb so
  the already taken skb_unshare() name would probably fit too).

A driver can ask tx skbs to be not shared, even if pktgen is used with
clone=X on it.

dev-priv_flags = ~IFF_TX_SKB_SHARING;

This way, you can avoid skb_share_check() completely. Only pktgen will
care.

Most skb_share_check() calls in drivers/net are related to input paths
on virtual drivers.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC,v3 02/10] dpaa_eth: add support for DPAA Ethernet

2015-06-10 Thread Eric Dumazet

On Wed, 2015-06-10 at 07:38 +, Madalin-Cristian Bucur wrote:
 Hi Eric,
 
 Can you please tell us if this change would be for the better?
 
 I was about to say yes to this request but checked and no other Ethernet 
 driver seems to use the queue trans_start.
 I was able to find your patch net: tx scalability works : trans_start [ 
 http://patchwork.ozlabs.org/patch/27104/ ] but did not find more about this 
 topic.
  

Drivers no longer have to worry about updating trans_start themselves.
Core networking stack handles this.

Check txq_trans_update() done from netdev_start_xmit()


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC,v3 02/10] dpaa_eth: add support for DPAA Ethernet

2015-06-10 Thread Eric Dumazet

On Wed, 2015-04-29 at 17:56 +0300, Madalin Bucur wrote:
 This introduces the Freescale Data Path Acceleration Architecture
 (DPAA) Ethernet driver (dpaa_eth) that builds upon the DPAA QMan,
 BMan, PAMU and FMan drivers to deliver Ethernet connectivity on
 the Freescale DPAA QorIQ platforms.
...

 + /* We're going to store the skb backpointer at the beginning
 +  * of the data buffer, so we need a privately owned skb
 +  */
 +
 + /* Code borrowed from skb_unshare(). */
 + if (skb_cloned(skb)) {
 + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 +
 + /* Finally, create a contig FD from this skb */
 + skb_to_contig_fd(priv, skb, fd, countptr, offset);
 +
 + kfree_skb(skb);
 + skb = nskb;
 + /* skb_copy() has now linearized the skbuff. */
 + }
 +

You know that TCP packets are clones, right ?
This code is killing performance.

TCP allows the headers part being modified by a driver if needed.

You should use skb_header_cloned() instead.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH net-next v2 4/4] ibmveth: Add support for Large Receive Offload

2015-04-27 Thread Eric Dumazet

On Mon, 2015-04-27 at 17:10 -0500, Thomas Falcon wrote:




 + if (be16_to_cpu(skb-protocol) == ETH_P_IP) {
 + skb_set_network_header(skb, 0);
 + skb_set_transport_header(skb, 
 sizeof(struct iphdr));
 + iph = ip_hdr(skb);
 +

All this should really be :

if (skb-protocol == htons(ETH_P_IP)) {
iph = (struct iphdr *)skb-data;

Because setting network header and a (potentially wrong) transport
header is confusing at this stage.

(It should be done later in core networking stack)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 4/5] ibmveth: Add support for Large Receive Offload

2015-04-14 Thread Eric Dumazet

On Tue, 2015-04-14 at 15:35 -0500, Thomas Falcon wrote:
 Enables receiving large packets from other LPARs. These packets
 have a -1 IP header checksum, so we must recalculate to have
 a valid checksum.
 
 Signed-off-by: Brian King brk...@linux.vnet.ibm.com
 Signed-off-by: Thomas Falcon tlfal...@linux.vnet.ibm.com
 ---
  drivers/net/ethernet/ibm/ibmveth.c | 15 ++-
  1 file changed, 14 insertions(+), 1 deletion(-)
 
 diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
 b/drivers/net/ethernet/ibm/ibmveth.c
 index 08970c7..05eaca6a 100644
 --- a/drivers/net/ethernet/ibm/ibmveth.c
 +++ b/drivers/net/ethernet/ibm/ibmveth.c
 @@ -1092,6 +1092,7 @@ static int ibmveth_poll(struct napi_struct *napi, int 
 budget)
   struct net_device *netdev = adapter-netdev;
   int frames_processed = 0;
   unsigned long lpar_rc;
 + struct iphdr *iph;
  
  restart_poll:
   while (frames_processed  budget) {
 @@ -1134,8 +1135,20 @@ restart_poll:
   skb_put(skb, length);
   skb-protocol = eth_type_trans(skb, netdev);
  
 - if (csum_good)
 + if (csum_good) {
   skb-ip_summed = CHECKSUM_UNNECESSARY;
 + if (be16_to_cpu(skb-protocol) == ETH_P_IP) {
 + skb_set_network_header(skb, 0);
 + skb_set_transport_header(skb, 
 sizeof(struct iphdr));
 + iph = ip_hdr(skb);
 +
 + /* If the IP checksum is not offloaded 
 and if the packet
 +  *  is large send, the checksum must be 
 rebuilt.
 +  */
 + if (iph-check == 0x)
 + iph-check = 
 ip_fast_csum((unsigned char *)iph, iph-ihl);


How can this possibly work ?

Normally you would have to set iph-check to 0 before calling
ip_fast_csum(), as done in ip_send_check()


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v4 net-next] fix unsafe set_memory_rw from softirq

2013-10-06 Thread Eric Dumazet

On Fri, 2013-10-04 at 00:14 -0700, Alexei Starovoitov wrote:
 on x86 system with net.core.bpf_jit_enable = 1

 cannot reuse jited filter memory, since it's readonly,
 so use original bpf insns memory to hold work_struct
 
 defer kfree of sk_filter until jit completed freeing
 
 tested on x86_64 and i386
 
 Signed-off-by: Alexei Starovoitov a...@plumgrid.com

Acked-by: Eric Dumazet eduma...@google.com


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 net-next] fix unsafe set_memory_rw from softirq

2013-10-03 Thread Eric Dumazet

On Thu, 2013-10-03 at 21:11 -0700, Alexei Starovoitov wrote:

 diff --git a/include/linux/filter.h b/include/linux/filter.h
 index a6ac848..5d66cd9 100644
 --- a/include/linux/filter.h
 +++ b/include/linux/filter.h
 @@ -25,15 +25,20 @@ struct sk_filter
  {
   atomic_trefcnt;
   unsigned intlen;/* Number of filter blocks */
 + struct rcu_head rcu;
   unsigned int(*bpf_func)(const struct sk_buff *skb,
   const struct sock_filter *filter);
 - struct rcu_head rcu;
 + /* insns start right after bpf_func, so that sk_run_filter() fetches
 +  * first insn from the same cache line that was used to call into
 +  * sk_run_filter()
 +  */
   struct sock_filter  insns[0];
  };
  
  static inline unsigned int sk_filter_len(const struct sk_filter *fp)
  {
 - return fp-len * sizeof(struct sock_filter) + sizeof(*fp);
 + return max(fp-len * sizeof(struct sock_filter),
 +sizeof(struct work_struct)) + sizeof(*fp);
  }

I would use for include/linux/filter.h this (untested) diff :

(Note the include linux/workqueue.h)

I also remove your comment about cache lines, since there is nothing
to align stuff on a cache line boundary.

diff --git a/include/linux/filter.h b/include/linux/filter.h
index a6ac848..281b05c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -6,6 +6,7 @@
 
 #include linux/atomic.h
 #include linux/compat.h
+#include linux/workqueue.h
 #include uapi/linux/filter.h
 
 #ifdef CONFIG_COMPAT
@@ -25,15 +26,20 @@ struct sk_filter
 {
atomic_trefcnt;
unsigned intlen;/* Number of filter blocks */
+   struct rcu_head rcu;
unsigned int(*bpf_func)(const struct sk_buff *skb,
const struct sock_filter *filter);
-   struct rcu_head rcu;
-   struct sock_filter  insns[0];
+   union {
+   struct work_struct  work;
+   struct sock_filter  insns[0];
+   };
 };
 
-static inline unsigned int sk_filter_len(const struct sk_filter *fp)
+static inline unsigned int sk_filter_size(const struct sk_filter *fp,
+ unsigned int proglen)
 {
-   return fp-len * sizeof(struct sock_filter) + sizeof(*fp);
+   return max(sizeof(*fp),
+  offsetof(struct sk_filter, insns[proglen]));
 }
 
 extern int sk_filter(struct sock *sk, struct sk_buff *skb);



This way, you can use sk_filter_size(fp, fprog-len)
instead of doing the max() games in sk_attach_filter() and
sk_unattached_filter_create()

Other than that, I think your patch is fine.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: Unbreak compat_sys_{send,recv}msg

2013-06-06 Thread Eric Dumazet

On Thu, 2013-06-06 at 00:26 -0700, David Miller wrote:
 From: Andy Lutomirski l...@amacapital.net
 Date: Wed,  5 Jun 2013 22:38:26 -0700
 
  I broke them in this commit:
  
  commit 1be374a0518a288147c6a7398792583200a67261
  Author: Andy Lutomirski l...@amacapital.net
  Date:   Wed May 22 14:07:44 2013 -0700
  
  net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg
  
  This patch adds __sys_sendmsg and __sys_sendmsg as common helpers that 
  accept
  MSG_CMSG_COMPAT and blocks MSG_CMSG_COMPAT at the syscall entrypoints.  It
  also reverts some unnecessary checks in sys_socketcall.
  
  Apparently I was suffering from underscore blindness the first time around.
  
  Signed-off-by: Andy Lutomirski l...@amacapital.net
 
 Eric, can you test this patch too?

Yes, this fixes the problem as well on x86_64

Tested-by: Eric Dumazet eduma...@google.com

Thanks !

PS: I had following fuzz while applying on Linus tree :

patching file include/linux/socket.h
Hunk #1 succeeded at 320 (offset -1 lines).
patching file net/compat.c
patching file net/socket.c
Hunk #1 succeeded at 1956 (offset -22 lines).
Hunk #2 succeeded at 2071 (offset -22 lines).
Hunk #3 succeeded at 2125 (offset -22 lines).
Hunk #4 succeeded at 2163 (offset -22 lines).
Hunk #5 succeeded at 2255 (offset -22 lines).
Hunk #6 succeeded at 2317 (offset -22 lines).
Hunk #7 succeeded at 2515 (offset -20 lines).


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 5/5] net: Block MSG_CMSG_COMPAT in send(m)msg and recv(m)msg

2013-06-05 Thread Eric Dumazet

On Thu, 2013-06-06 at 12:56 +1000, Michael Neuling wrote:
 On Thu, May 23, 2013 at 7:07 AM, Andy Lutomirski l...@amacapital.net wrote:
  MSG_CMSG_COMPAT is (AFAIK) not intended to be part of the API --
  it's a hack that steals a bit to indicate to other networking code
  that a compat entry was used.  So don't allow it from a non-compat
  syscall.
 
 Dave  Linus
 
 This is causing a regression on 64bit powerpc with 32bit usermode.
 When I hit userspace, udev is broken and I suspect all networking is
 broken as well.
 
 Can we please revert 1be374a0518a288147c6a7398792583200a67261 upstream?
 

It seems to also break x86_64, if using 32bit usermode.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH net-next] af_unix: fix a fatal race with bit fields

2013-05-03 Thread Eric Dumazet

On Fri, 2013-05-03 at 11:01 +0930, Alan Modra wrote:
 On Tue, Apr 30, 2013 at 10:04:32PM -0700, Eric Dumazet wrote:
  These kind of errors are pretty hard to find, its a pity to spend time
  on them.
 
 Well, yes.  From the first comment in gcc PR52080.  For the following
 testcase we generate a 8 byte RMW cycle on IA64 which causes locking
 problems in the linux kernel btrfs filesystem.
 
 Did someone fix btrfs, but not check other kernel locks?  Having now
 hit the same problem again, have you checked that other kernel locks
 don't have adjacent bit fields in the same 64-bit word?  And comment
 the struct to ensure someone doesn't optimize those unsigned chars
 back to bit fields.

Not only spinlock, but atomic_t followed by bit fields.

BTW, if a spinlock is followed by bit fields, but bit fields
only changed when this spinlock is held, there is no problem, unless
spinlock is a ticket spinlock.

In af_unix, bug happens because the bit fields were changed without
spinlock being held (another global spinlock is used instead)

(ppc64 doesnt use ticket spinlocks yet)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH net-next] af_unix: fix a fatal race with bit fields

2013-05-03 Thread Eric Dumazet

On Fri, 2013-05-03 at 15:29 +0100, David Laight wrote:
   Could ppc64 experts confirm using byte is safe, or should we really add
   a 32bit hole after the spinlock ? If so, I wonder how many other places
   need a change...
 ...
  Also I'd be surprised if ppc64 is the only one with that problem... what
  about sparc64 and arm64 ?
 
 Even x86 could be affected.
 The width of the memory cycles used by the 'bit set and bit clear'
 instructions isn't documented. They are certainly allowed to do
 RMW on adjacent bytes.
 I don't remember whether they are constrained to only do
 32bit accesses, but nothing used to say that they wouldn't
 do 32bit misaligned ones! (although I suspect they never have).

x86 is not affected (or else we would have found the bug much earlier)

Setting 1-bit field to one/zero uses OR/AND instructions.

orb  $4,724(%reg)

doesn't load/store 64bits but 8bits.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v2 net-next] af_unix: fix a fatal race with bit fields

2013-05-01 Thread Eric Dumazet

On Wed, 2013-05-01 at 03:36 -0400, David Miller wrote:
 From: Benjamin Herrenschmidt b...@kernel.crashing.org
 Date: Wed, 01 May 2013 11:39:53 +1000
 
  I'm not even completely certain bytes are safe to be honest, though
  probably more than bitfields. I'll poke our compiler people.
 
 Older Alpha only has 32-bit and 64-bit loads and stores, so byte sized
 accesses are not atomic, and therefore use racey read-modify-write
 sequences.

Right, so what about the following more general fix ?

Thanks !

[PATCH v2] af_unix: fix a fatal race with bit fields

Using bit fields is dangerous on ppc64/sparc64, as the compiler [1]
uses 64bit instructions to manipulate them.
If the 64bit word includes any atomic_t or spinlock_t, we can lose
critical concurrent changes.

This is happening in af_unix, where unix_sk(sk)-gc_candidate/
gc_maybe_cycle/lock share the same 64bit word.

This leads to fatal deadlock, as one/several cpus spin forever
on a spinlock that will never be available again.

A safer way would be to use a long to store flags.
This way we are sure compiler/arch wont do bad things.

As we own unix_gc_lock spinlock when clearing or setting bits,
we can use the non atomic __set_bit()/__clear_bit().

recursion_level can share the same 64bit location with the spinlock,
as it is set only with this spinlock held.

[1] bug fixed in gcc-4.8.0 :
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=52080

Reported-by: Ambrose Feinstein ambr...@google.com
Signed-off-by: Eric Dumazet eduma...@google.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
---
 include/net/af_unix.h |5 +++--
 net/unix/garbage.c|   12 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index a8836e8..dbdfd2b 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -57,9 +57,10 @@ struct unix_sock {
struct list_headlink;
atomic_long_t   inflight;
spinlock_t  lock;
-   unsigned intgc_candidate : 1;
-   unsigned intgc_maybe_cycle : 1;
unsigned char   recursion_level;
+   unsigned long   gc_flags;
+#define UNIX_GC_CANDIDATE  0
+#define UNIX_GC_MAYBE_CYCLE1
struct socket_wqpeer_wq;
 };
 #define unix_sk(__sk) ((struct unix_sock *)__sk)
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index d0f6545..9c6cc08 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -185,7 +185,7 @@ static void scan_inflight(struct sock *x, void 
(*func)(struct unix_sock *),
 * have been added to the queues after
 * starting the garbage collection
 */
-   if (u-gc_candidate) {
+   if (test_bit(UNIX_GC_CANDIDATE, 
u-gc_flags)) {
hit = true;
func(u);
}
@@ -254,7 +254,7 @@ static void inc_inflight_move_tail(struct unix_sock *u)
 * of the list, so that it's checked even if it was already
 * passed over
 */
-   if (u-gc_maybe_cycle)
+   if (test_bit(UNIX_GC_MAYBE_CYCLE, u-gc_flags))
list_move_tail(u-link, gc_candidates);
 }
 
@@ -315,8 +315,8 @@ void unix_gc(void)
BUG_ON(total_refs  inflight_refs);
if (total_refs == inflight_refs) {
list_move_tail(u-link, gc_candidates);
-   u-gc_candidate = 1;
-   u-gc_maybe_cycle = 1;
+   __set_bit(UNIX_GC_CANDIDATE, u-gc_flags);
+   __set_bit(UNIX_GC_MAYBE_CYCLE, u-gc_flags);
}
}
 
@@ -344,7 +344,7 @@ void unix_gc(void)
 
if (atomic_long_read(u-inflight)  0) {
list_move_tail(u-link, not_cycle_list);
-   u-gc_maybe_cycle = 0;
+   __clear_bit(UNIX_GC_MAYBE_CYCLE, u-gc_flags);
scan_children(u-sk, inc_inflight_move_tail, NULL);
}
}
@@ -356,7 +356,7 @@ void unix_gc(void)
 */
while (!list_empty(not_cycle_list)) {
u = list_entry(not_cycle_list.next, struct unix_sock, link);
-   u-gc_candidate = 0;
+   __clear_bit(UNIX_GC_CANDIDATE, u-gc_flags);
list_move_tail(u-link, gc_inflight_list);
}
 


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH v2 net-next] af_unix: fix a fatal race with bit fields

2013-05-01 Thread Eric Dumazet

On Wed, 2013-05-01 at 16:53 +0100, David Laight wrote:
 Why not just change gc_candidate and gc_maybe_cycle to
 unsigned char?
 It might reduce the number of pad bytes somewhat.

You didn't quite follow the discussion.

I used bytes on V1, and we are not 100% sure its safe on all arches.

unsigned long is guaranteed to be safe. We absolutely rely on this.

Better use more bytes on a socket (with no impact on real memory use),
than spending hours to debug some strange issues.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH net-next] af_unix: fix a fatal race with bit fields

2013-04-30 Thread Eric Dumazet

From: Eric Dumazet eduma...@google.com

Using bit fields is dangerous on ppc64, as the compiler uses 64bit
instructions to manipulate them. If the 64bit word includes any
atomic_t or spinlock_t, we can lose critical concurrent changes.

This is happening in af_unix, where unix_sk(sk)-gc_candidate/
gc_maybe_cycle/lock share the same 64bit word.

This leads to fatal deadlock, as one/several cpus spin forever
on a spinlock that will never be available again.

Reported-by: Ambrose Feinstein ambr...@google.com
Signed-off-by: Eric Dumazet eduma...@google.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Paul Mackerras pau...@samba.org
---

Could ppc64 experts confirm using byte is safe, or should we really add
a 32bit hole after the spinlock ? If so, I wonder how many other places
need a change...

 include/net/af_unix.h |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index a8836e8..4520a23f 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -57,8 +57,8 @@ struct unix_sock {
struct list_headlink;
atomic_long_t   inflight;
spinlock_t  lock;
-   unsigned intgc_candidate : 1;
-   unsigned intgc_maybe_cycle : 1;
+   unsigned char   gc_candidate;
+   unsigned char   gc_maybe_cycle;
unsigned char   recursion_level;
struct socket_wqpeer_wq;
 };


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH net-next] af_unix: fix a fatal race with bit fields

2013-04-30 Thread Eric Dumazet

On Wed, 2013-05-01 at 11:51 +1000, Anton Blanchard wrote:
 Hi Eric,
 
  From: Eric Dumazet eduma...@google.com
  
  Using bit fields is dangerous on ppc64, as the compiler uses 64bit
  instructions to manipulate them. If the 64bit word includes any
  atomic_t or spinlock_t, we can lose critical concurrent changes.
  
  This is happening in af_unix, where unix_sk(sk)-gc_candidate/
  gc_maybe_cycle/lock share the same 64bit word.
  
  This leads to fatal deadlock, as one/several cpus spin forever
  on a spinlock that will never be available again.
 
 I just spoke to Alan Modra and he suspects this is a compiler
 bug. Can you give us your compiler version info?

$ gcc-4.6.3-nolibc/powerpc64-linux/bin/powerpc64-linux-gcc -v
Using built-in specs.
COLLECT_GCC=gcc-4.6.3-nolibc/powerpc64-linux/bin/powerpc64-linux-gcc
COLLECT_LTO_WRAPPER=/usr/local/google/home/edumazet/cross/gcc-4.6.3-nolibc/powerpc64-linux/bin/../libexec/gcc/powerpc64-linux/4.6.3/lto-wrapper
Target: powerpc64-linux
Configured with: /home/tony/buildall/src/gcc/configure
--target=powerpc64-linux --host=x86_64-linux-gnu
--build=x86_64-linux-gnu --enable-targets=all
--prefix=/opt/cross/gcc-4.6.3-nolibc/powerpc64-linux/
--enable-languages=c --with-newlib --without-headers
--enable-sjlj-exceptions --with-system-libunwind --disable-nls
--disable-threads --disable-shared --disable-libmudflap --disable-libssp
--disable-libgomp --disable-decimal-float --enable-checking=release
--with-mpfr=/home/tony/buildall/src/sys-x86_64
--with-gmp=/home/tony/buildall/src/sys-x86_64 --disable-bootstrap
--disable-libquadmath
Thread model: single
gcc version 4.6.3 (GCC) 


$ cat try.c ; gcc-4.6.3-nolibc/powerpc64-linux/bin/powerpc64-linux-gcc
-O2 -S try.c ; cat try.s
struct s {
unsigned int lock;
unsigned int f1 : 1;
unsigned int f2 : 1;
void *ptr;
} *p ;

showbug()
{
p-lock++;
p-f1 = 1;
}
.file   try.c
.section.toc,aw
.section.text
.section.toc,aw
.LC0:
.tc p[TC],p
.section.text
.align 2
.globl showbug
.section.opd,aw
.align 3
showbug:
.quad   .L.showbug,.TOC.@tocbase,0
.previous
.type   showbug, @function
.L.showbug:
addis 9,2,.LC0@toc@ha
ld 9,.LC0@toc@l(9)
ld 9,0(9)
lwz 11,0(9)
addi 0,11,1
stw 0,0(9)
li 11,1
ld 0,0(9)
rldimi 0,11,31,32
std 0,0(9)
blr
.long 0
.byte 0,0,0,0,0,0,0,0
.size   showbug,.-.L.showbug
.comm   p,8,8
.ident  GCC: (GNU) 4.6.3

You can see ld 0,0(9) is used : its a 64 bit load.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH net-next] af_unix: fix a fatal race with bit fields

2013-04-30 Thread Eric Dumazet

On Wed, 2013-05-01 at 13:24 +0930, Alan Modra wrote:
 On Tue, Apr 30, 2013 at 07:24:20PM -0700, Eric Dumazet wrote:
  li 11,1
  ld 0,0(9)
  rldimi 0,11,31,32
  std 0,0(9)
  blr
  .ident  GCC: (GNU) 4.6.3
  
  You can see ld 0,0(9) is used : its a 64 bit load.
 
 Yup.  This is not a powerpc64 specific problem.  See
 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=52080
 Fixed in 4.8.0 and 4.7.3.

Ah thanks.

This seems a pretty serious issue, is it documented somewhere that
ppc64, sparc64 and others need such compiler version ?

These kind of errors are pretty hard to find, its a pity to spend time
on them.




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2] net: mv643xx_eth: remove deprecated inet_lro support

2013-04-12 Thread Eric Dumazet

On Fri, 2013-04-12 at 11:20 +0200, Sebastian Hesselbarth wrote:
 With recent support for GRO, there is no need to keep both LRO and
 GRO. This patch therefore removes the deprecated inet_lro support
 from mv643xx_eth. This is work is based on an experimental patch
 provided by Eric Dumazet and Willy Tarreau.
 
 Signed-off-by: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
 Based-on-patch-by: Eric Dumazet eric.duma...@gmail.com
 Based-on-patch-by: Willy Tarreau w...@1wt.eu
 ---
 Changes from v1:
 - also remove INET_LRO from Kconfig (Reported by Eric Dumazet)

Thanks for finishing this patch

Signed-off-by: Eric Dumazet eduma...@google.com


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: Add GRO support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 17:32 +0200, Willy Tarreau wrote:
 On Thu, Apr 11, 2013 at 05:27:03PM +0200, Sebastian Hesselbarth wrote:
  I don't have a strong opinion on whether Soeren's or your proposal should
  be submitted. But I insist on having one of them in, as GRO significantly
  improves the common use case, is enabled by default, and not as
  constrained as LRO.
 
 I agree, use yours first, but we should keep an eye on this. Since you have
 everything to run a test, please try to see if you can get netperf to run
 over IPv6, I'm sure the NIC doesn't handle it.

Willy, testing the checksum in the NIC driver itself prevents the stack
doing GRO even if the NIC could not checksum the packet, as in GRE
tunneling for example.

So Sebastien patch is better IMHO : Just call the napi gro handler and
let core stack handles the details ;)





___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: Add GRO support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 14:40 +0200, Sebastian Hesselbarth wrote:
 This patch adds GRO support to mv643xx_eth by making it invoke
 napi_gro_receive instead of netif_receive_skb.
 
 Signed-off-by: Soeren Moch sm...@web.de
 Signed-off-by: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
 ---
 Cc: David S. Miller da...@davemloft.net
 Cc: Lennert Buytenhek buyt...@wantstofly.org
 Cc: Andrew Lunn and...@lunn.ch
 Cc: Jason Cooper ja...@lakedaemon.net
 Cc: Florian Fainelli flor...@openwrt.org
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: Paul Mackerras pau...@samba.org
 Cc: Dale Farnsworth d...@farnsworth.org
 Cc: net...@vger.kernel.org
 Cc: linux-arm-ker...@lists.infradead.org
 Cc: linuxppc-dev@lists.ozlabs.org
 Cc: linux-ker...@vger.kernel.org
 ---
  drivers/net/ethernet/marvell/mv643xx_eth.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c 
 b/drivers/net/ethernet/marvell/mv643xx_eth.c
 index 305038f..c850d04 100644
 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c
 +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
 @@ -604,7 +604,7 @@ static int rxq_process(struct rx_queue *rxq, int budget)
   lro_receive_skb(rxq-lro_mgr, skb, (void *)cmd_sts);
   lro_flush_needed = 1;
   } else
 - netif_receive_skb(skb);
 + napi_gro_receive(mp-napi, skb);
  
   continue;
  

Acked-by: Eric Dumazet eduma...@google.com


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: Add GRO support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 18:02 +0200, Willy Tarreau wrote:

 OK, that makes sense indeed, I didn't think about this case. All
 I remember was that the old call achieved a higher packet rate
 than napi_gro_receive, but it was on an older kernel and I can't
 be more specifics after several months :-/

Its probably true that the GRO handler consumes more cpu for packets
that cant be aggregated in the end.

Thats a trade off, and maybe we could add in the core stack a device
feature to instruct gro handler to do a short cut for packets with no
checksum. Or better a sysctl so that a static_branch can be used.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: Add GRO support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 13:31 -0400, David Miller wrote:

 I think, as per other drivers, LRO should be eliminated completely from
 all drivers, including this one, and GRO used exclusively instead.

This would be awesome.

AFAIK current LRO users are :

drivers/infiniband/hw/nes/nes_hw.c
drivers/net/ethernet/marvell/mv643xx_eth.c
drivers/net/ethernet/pasemi/pasemi_mac.c



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: Add GRO support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 19:51 +0200, Willy Tarreau wrote:

 Eric provided me with one such experimental patch in the past for this
 driver. It worked for me but we never tried to clean it up to propose
 it for inclusion.
 
 If anyone is interested, I might still have it in experimental shape.

We are interested a lot ;)

Thanks !


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: mv643xx_eth: remove deprecated inet_lro support

2013-04-11 Thread Eric Dumazet

On Thu, 2013-04-11 at 21:11 +0200, Sebastian Hesselbarth wrote:
 With recent support for GRO, there is no need to keep both LRO and
 GRO. This patch therefore removes the deprecated inet_lro support
 from mv643xx_eth. This is work is based on an experimental patch
 provided by Eric Dumazet and Willy Tarreau.
 
 Signed-off-by: Sebastian Hesselbarth sebastian.hesselba...@gmail.com
 Based-on-patch-by: Eric Dumazet eric.duma...@gmail.com
 Based-on-patch-by: Willy Tarreau w...@1wt.eu
 ---
 Note: This patch is based upon recent cleanup patches and GRO support
 patch for mv643xx_eth.
 
 Cc: David S. Miller da...@davemloft.net
 Cc: Lennert Buytenhek buyt...@wantstofly.org
 Cc: Andrew Lunn and...@lunn.ch
 Cc: Jason Cooper ja...@lakedaemon.net
 Cc: Florian Fainelli flor...@openwrt.org
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: Paul Mackerras pau...@samba.org
 Cc: Dale Farnsworth d...@farnsworth.org
 Cc: Ben Hutchings bhutchi...@solarflare.com
 Cc: Soeren Moch sm...@web.de
 Cc: Eric Dumazet eric.duma...@gmail.com
 Cc: Willy Tarreau w...@1wt.eu
 Cc: net...@vger.kernel.org
 Cc: linux-arm-ker...@lists.infradead.org
 Cc: linuxppc-dev@lists.ozlabs.org
 Cc: linux-ker...@vger.kernel.org
 ---
  drivers/net/ethernet/marvell/mv643xx_eth.c |   97 
 +---
  1 file changed, 3 insertions(+), 94 deletions(-)

Seems fine to me, but you also could remove select INET_LRO
from drivers/net/ethernet/marvell/Kconfig


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH net-next 00/21] treewide: Use consistent api style for address testing

2012-10-19 Thread Eric Dumazet

On Thu, 2012-10-18 at 20:55 -0700, Joe Perches wrote:
 ethernet, ipv4, and ipv6 address testing uses 3 different api naming styles.
 
 ethernet uses:is_foo_ether_addr
 ipv4 uses:ipv4_is_foo
 ipv6 uses:ipv6_addr_foo
 
 Standardize on the ipv6 style of prefix_addr_type to reduce
 the number of styles to remember.
 
 The new consistent styles are:
 
 eth_addr_foo(const u8 *)
 ipv4_addr_foo(__be32)
 ipv6_addr_foo(const struct in6_addr *)
 
 Add temporary backward compatibility #defines for the old names too.
 
 Joe Perches (21):
   etherdevice: Rename is_foo_ether_addr tests to eth_addr_foo
   net: Convert is_foo_ether_addr uses to eth_addr_foo
   arch: Convert is_foo_ether_addr uses to eth_addr_foo
   wireless: Convert is_foo_ether_addr uses to eth_addr_foo
   drivers: net: Convert is_foo_ether_addr uses to eth_addr_foo
   staging: Convert is_foo_ether_addr uses to eth_addr_foo
   infiniband: Convert is_foo_ether_addr uses to eth_addr_foo
   scsi: Convert is_foo_ether_addr uses to eth_addr_foo
   of: Convert is_foo_ether_addr uses to eth_addr_foo
   s390: Convert is_foo_ether_addr uses to eth_addr_foo
   usb: Convert is_foo_ether_addr uses to eth_addr_foo
   uwb: Convert is_foo_ether_addr uses to eth_addr_foo
   Documentation: networking: Convert is_foo_ether_addr uses to 
 eth_addr_foo
   llc_if.h: Convert is_foo_ether_addr uses to eth_addr_foo
   in.h: Rename ipv4_is_foo functions to ipv4_addr_foo
   net: Convert ipv4_is_foo uses to ipv4_addr_foo
   infiniband: Convert ipv4_is_foo uses to ipv4_addr_foo
   ath6kl: Convert ipv4_is_foo uses to ipv4_addr_foo
   parisc: Convert ipv4_is_foo uses to ipv4_addr_foo
   lockd: Convert ipv4_is_foo uses to ipv4_addr_foo
   sctp: Convert ipv4_is_foo uses to ipv4_addr_foo

Yes they are some names discrepancies, thats a big deal.

And we have alloc_skb() / kfree_skb() / skb_clone() 

Why not skb_alloc() / skb_free() / skb_clone() ?

Some people actually know current code by name of functions, they dont
want to change their mind and having to grep include files and git log
to learn the new names of an old function, especially when traveling and
using a laptop.

Sure, when we want to use eth_random_addr(), a grep into include files
to check if its still the right name (old one was random_ether_addr())
is OK because we dont use this one often.

If you think about it, eth_random_addr() was not the perfect name.

Think about all the documentation you can find outside of kernel tree,
RFC and things like that, copy/pasting some linux kernel code.

This kind of changes make our life more difficult, when we have to
backport patches or rebase code, or even perform some searches to find
prior issues/discussions.

Life of a kernel developer is not only dealing with latest Linus (or
-next) tree, and using automatic 'tools'.

Thats a real pain for me at least.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: net: fix typo in freescale/ucc_geth.c

2012-10-09 Thread Eric Dumazet

On Tue, 2012-10-09 at 10:52 +1100, Michael Neuling wrote:
 The following patch: 
   acb600d net: remove skb recycling
 added dev_free_skb() to drivers/net/ethernet/freescale/ucc_geth.c
 
 This is a typo and should be dev_kfree_skb().  This fixes this.
 
 Signed-off-by: Michael Neuling mi...@neuling.org
 ---
 This hit as a compile error in next-20121008 with mpc85xx_defconfig.
 
 diff --git a/drivers/net/ethernet/freescale/ucc_geth.c 
 b/drivers/net/ethernet/freescale/ucc_geth.c
 index dfa0aaa..0a70bb5 100644
 --- a/drivers/net/ethernet/freescale/ucc_geth.c
 +++ b/drivers/net/ethernet/freescale/ucc_geth.c
 @@ -3268,7 +3268,7 @@ static int ucc_geth_rx(struct ucc_geth_private *ugeth, 
 u8 rxQ, int rx_work_limit
   if (netif_msg_rx_err(ugeth))
   ugeth_err(%s, %d: ERROR!!! skb - 0x%08x,
  __func__, __LINE__, (u32) skb);
 - dev_free_skb(skb);
 + dev_kfree_skb(skb);
  
   ugeth-rx_skbuff[rxQ][ugeth-skb_currx[rxQ]] = NULL;
  

Oops, thanks Michael !


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH RFC] Simplify the Linux kernel by reducing its state space

2012-03-31 Thread Eric Dumazet

On Sun, 2012-04-01 at 00:33 +0800, Paul E. McKenney wrote:
 Although there have been numerous complaints about the complexity of
 parallel programming (especially over the past 5-10 years), the plain
 truth is that the incremental complexity of parallel programming over
 that of sequential programming is not as large as is commonly believed.
 Despite that you might have heard, the mind-numbing complexity of modern
 computer systems is not due so much to there being multiple CPUs, but
 rather to there being any CPUs at all.  In short, for the ultimate in
 computer-system simplicity, the optimal choice is NR_CPUS=0.
 
 This commit therefore limits kernel builds to zero CPUs.  This change
 has the beneficial side effect of rendering all kernel bugs harmless.
 Furthermore, this commit enables additional beneficial changes, for
 example, the removal of those parts of the kernel that are not needed
 when there are zero CPUs.
 
 Signed-off-by: Paul E. McKenney paul...@linux.vnet.ibm.com
 Reviewed-by: Thomas Gleixner t...@linutronix.de
 ---

Hmm... I believe you could go one step forward and allow negative values
as well. Antimatter was proven to exist after all.

Hint : nr_cpu_ids is an int, not an unsigned int

Bonus: Existing bugs become must have features.

Of course there is no hurry and this can wait 365 days.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3] net: filter: BPF 'JIT' compiler for PPC64

2011-07-20 Thread Eric Dumazet

Le jeudi 21 juillet 2011 à 11:51 +1000, Matt Evans a écrit :
 An implementation of a code generator for BPF programs to speed up packet
 filtering on PPC64, inspired by Eric Dumazet's x86-64 version.
 
 Filter code is generated as an ABI-compliant function in module_alloc()'d mem
 with stackframe  prologue/epilogue generated if required (simple filters 
 don't
 need anything more than an li/blr).  The filter's local variables, M[], live 
 in
 registers.  Supports all BPF opcodes, although complicated loads from 
 negative
 packet offsets (e.g. SKF_LL_OFF) are not yet supported.
 
 There are a couple of further optimisations left for future work; many-pass
 assembly with branch-reach reduction and a register allocator to push M[]
 variables into volatile registers would improve the code quality further.
 
 This currently supports big-endian 64-bit PowerPC only (but is fairly simple
 to port to PPC32 or LE!).
 
 Enabled in the same way as x86-64:
 
   echo 1  /proc/sys/net/core/bpf_jit_enable
 
 Or, enabled with extra debug output:
 
   echo 2  /proc/sys/net/core/bpf_jit_enable
 
 Signed-off-by: Matt Evans m...@ozlabs.org
 ---
 
 V3: Added BUILD_BUG_ON to assert PACA CPU ID is 16bits, made a comment (in
 LD_MSH) a bit clearer, ratelimited Unknown opcode error and moved
 bpf_jit.S to bpf_jit_64.S (it doesn't make sense to rename bpf_jit_comp.c 
 as
 small portions will eventually get split out into _32/_64.c files when we 
 do
 32bit support).
 
  arch/powerpc/Kconfig  |1 +
  arch/powerpc/Makefile |3 +-
  arch/powerpc/include/asm/ppc-opcode.h |   40 ++
  arch/powerpc/net/Makefile |4 +
  arch/powerpc/net/bpf_jit.h|  227 +++
  arch/powerpc/net/bpf_jit_64.S |  138 +++
  arch/powerpc/net/bpf_jit_comp.c   |  694 
 +
  7 files changed, 1106 insertions(+), 1 deletions(-)

Nice work Matt ;)

Acked-by: Eric Dumazet eric.duma...@gmail.com

Thanks


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2] net: filter: BPF 'JIT' compiler for PPC64

2011-07-19 Thread Eric Dumazet

Le mardi 19 juillet 2011 à 12:13 +1000, Matt Evans a écrit :
 An implementation of a code generator for BPF programs to speed up packet
 filtering on PPC64, inspired by Eric Dumazet's x86-64 version.
 
 Filter code is generated as an ABI-compliant function in module_alloc()'d mem
 with stackframe  prologue/epilogue generated if required (simple filters 
 don't
 need anything more than an li/blr).  The filter's local variables, M[], live 
 in
 registers.  Supports all BPF opcodes, although complicated loads from 
 negative
 packet offsets (e.g. SKF_LL_OFF) are not yet supported.
 
 There are a couple of further optimisations left for future work; many-pass
 assembly with branch-reach reduction and a register allocator to push M[]
 variables into volatile registers would improve the code quality further.
 
 This currently supports big-endian 64-bit PowerPC only (but is fairly simple
 to port to PPC32 or LE!).
 
 Enabled in the same way as x86-64:
 
   echo 1  /proc/sys/net/core/bpf_jit_enable
 
 Or, enabled with extra debug output:
 
   echo 2  /proc/sys/net/core/bpf_jit_enable
 
 Signed-off-by: Matt Evans m...@ozlabs.org
 ---
 
 V2: Removed some cut/paste woe in setting SEEN_X even on writes.
 Merci for le review, Eric!
 
  arch/powerpc/Kconfig  |1 +
  arch/powerpc/Makefile |3 +-
  arch/powerpc/include/asm/ppc-opcode.h |   40 ++
  arch/powerpc/net/Makefile |4 +
  arch/powerpc/net/bpf_jit.S|  138 +++
  arch/powerpc/net/bpf_jit.h|  227 +++
  arch/powerpc/net/bpf_jit_comp.c   |  690 
 +
  7 files changed, 1102 insertions(+), 1 deletions(-)
 

 + case BPF_S_ANC_CPU:
 +#ifdef CONFIG_SMP
 + /*
 +  * PACA ptr is r13:
 +  * raw_smp_processor_id() = local_paca-paca_index
 +  */

This could break if one day linux supports more than 65536 cpus :)

 + PPC_LHZ_OFFS(r_A, 13,
 +  offsetof(struct paca_struct, paca_index));
 +#else
 + PPC_LI(r_A, 0);
 +#endif
 + break;
 +
 +
 + case BPF_S_LDX_B_MSH:
 + /*
 +  * x86 version drops packet (RET 0) when K0, whereas
 +  * interpreter does allow K0 (__load_pointer, special
 +  * ancillary data).
 +  */

Hmm, thanks I'll take a look at this.

 + func = sk_load_byte_msh;
 + goto common_load;
 + break;
 +
 + /*** Jump and branches ***/

 + default:
 + /* The filter contains something cruel  unusual.
 +  * We don't handle it, but also there shouldn't be
 +  * anything missing from our list.
 +  */
 + pr_err(BPF filter opcode %04x (@%d) unsupported\n,
 +filter[i].code, i);

You should at least ratelimit this message ?

On x86_64 I chose to silently fall back to interpretor for a complex
filter or unsupported opcode.

 + return -ENOTSUPP;
 + }
 +
 + }
 + /* Set end-of-body-code address for exit. */
 + addrs[i] = ctx-idx * 4;
 +
 + return 0;
 +}
 +


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v2] net: filter: BPF 'JIT' compiler for PPC64

2011-07-19 Thread Eric Dumazet

Le mardi 19 juillet 2011 à 17:55 +1000, Benjamin Herrenschmidt a écrit :
 On Tue, 2011-07-19 at 08:51 +0200, Eric Dumazet wrote:
 
   + case BPF_S_ANC_CPU:
   +#ifdef CONFIG_SMP
   + /*
   +  * PACA ptr is r13:
   +  * raw_smp_processor_id() = local_paca-paca_index
   +  */
  
  This could break if one day linux supports more than 65536 cpus :)
  
   + PPC_LHZ_OFFS(r_A, 13,
   +  offsetof(struct paca_struct, paca_index));
   +#else
   + PPC_LI(r_A, 0);
   +#endif
   + break;
 
 As would our implementation of raw_smp_processor_id() and our
 spinlocks :-) I don't think we need to fix that -now- but you are
 welcome to add something like a
 BUILD_BUG_ON(sizeof(local_paca-paca_index) != 2); as a reminder :-)

Please Matt add to your next version this check. I dont think I have to
submit a one line patch later...

On x86_64, cpu_number field is already 32bit, we have some time before
it becomes 64bit ;)

We probably should add some extra check to make sure segment doesnt
change (%gs on x86_64, r13 on ppc64) on a future linux version.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: filter: BPF 'JIT' compiler for PPC64

2011-07-18 Thread Eric Dumazet

Le lundi 18 juillet 2011 à 17:50 +1000, Matt Evans a écrit :
 An implementation of a code generator for BPF programs to speed up packet
 filtering on PPC64, inspired by Eric Dumazet's x86-64 version.
 
 Filter code is generated as an ABI-compliant function in module_alloc()'d mem
 with stackframe  prologue/epilogue generated if required (simple filters 
 don't
 need anything more than an li/blr).  The filter's local variables, M[], live 
 in
 registers.  Supports all BPF opcodes, although complicated loads from 
 negative
 packet offsets (e.g. SKF_LL_OFF) are not yet supported.
 
 There are a couple of further optimisations left for future work; many-pass
 assembly with branch-reach reduction and a register allocator to push M[]
 variables into volatile registers would improve the code quality further.
 
 This currently supports big-endian 64-bit PowerPC only (but is fairly simple
 to port to PPC32 or LE!).
 
 Enabled in the same way as x86-64:
 
   echo 1  /proc/sys/net/core/bpf_jit_enable
 
 Or, enabled with extra debug output:
 
   echo 2  /proc/sys/net/core/bpf_jit_enable
 
 Signed-off-by: Matt Evans m...@ozlabs.org
 ---
 
 Since the RFC post, this has incorporated the bugfixes/tidies from review 
 plus a
 couple more found in further testing, plus some general/comment tidies.

Hi Matt

A small note about SEEN_XREG usage in PPC against x86_64 :

In x86_64, XREG is stored in EBX : I had to save/restore it in function
prologue epilogue. And set it to zero in prologue to avoid leak of
kernel information.

In PPC, you chose a scratch register, so you only have to zero it in
function prologue, if X is ever read.

So in PPC SEEN_XREG only is to be set of X is read, not if written.

So you dont have to set SEEN_XREG bit in this part :

 + case BPF_S_MISC_TAX: /* X = A */
 + ctx-seen |= SEEN_XREG;
 + PPC_MR(r_X, r_A);
 + break;

and in this part :

 + case BPF_S_LDX_IMM: /* X = K */
 + ctx-seen |= SEEN_XREG;
 + PPC_LI32(r_X, K);
 + break;

and :

 + case BPF_S_LDX_MEM: /* X = mem[K] */
 + PPC_MR(r_X, r_M + (K  0xf));
 + ctx-seen |= SEEN_XREG | SEEN_MEM | (1(K  0xf));
 + break;

and :

 + case BPF_S_LDX_W_LEN: /* X = skb-len; */
 + ctx-seen |= SEEN_XREG;
 + PPC_LWZ_OFFS(r_X, r_skb, offsetof(struct sk_buff, len));
 + break;
 +



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] net: filter: BPF 'JIT' compiler for PPC64

2011-07-18 Thread Eric Dumazet

Le lundi 18 juillet 2011 à 12:42 -0700, David Miller a écrit :
 From: Eric Dumazet eric.duma...@gmail.com
 Date: Mon, 18 Jul 2011 10:39:35 +0200

  So in PPC SEEN_XREG only is to be set of X is read, not if written.

  So you dont have to set SEEN_XREG bit in this part :

 Matt, do you want to integrate changes based upon Eric's feedback
 here or do you want me to apply your patch as-is for now?

This was a really minor point, so Matt feel free to ask an immediate
inclusion ;)

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: softirqs are invoked while bottom halves are masked

2011-07-13 Thread Eric Dumazet

Le mercredi 13 juillet 2011 à 09:20 +0200, Thomas De Schampheleire a
écrit :

 I just want to mention that, although you seem to think that we
 shouldn't have created this thread in the first place, your and Eric's
 remarks have actually helped us in identifying the problem and
 increasing our understanding.
 

Thread was fine IMHO, but should have mentioned the context of non
standard linux-2.6 or net-next-2.6 trees

I would like to mention NETIF_F_LLTX existence.

While being marked deprecated, it allows a lockless tx path :

The device transmit handler can then perform custom actions, including
its own locking if needed.

If the user wants a specific Qdisc setup (instead of a no qdisc one), it
still can.

veth, loopback, dummy, macvlan  bond devices use NETIF_F_LLTX for
example.

But ndo_start_xmit() must always be called with BH disabled.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: softirqs are invoked while bottom halves are masked (was: Re: [PATCH] [PATCH] Fix deadlock in af_packet while stressing raw ethernet socket interface)

2011-07-12 Thread Eric Dumazet

Le mardi 12 juillet 2011 à 11:23 +0200, Thomas De Schampheleire a
écrit :
 Hi,
 
 I'm adding the linuxppc-dev mailing list since this may be pointing to
 an irq/softirq problem in the powerpc architecture-specific code...

 
 Note that the reason we are seeing this problem, may be because the
 kernel we are using contains some patches from Freescale.
 Specifically, in dev_queue_xmit(), support is added for hardware queue
 handling, just before entering the rcu_read_lock_bh():
 

Oh well, what a mess.

 if (dev-features  NETIF_F_HW_QDISC) {
 txq = dev_pick_tx(dev, skb);



 return dev_hard_start_xmit(skb, dev, txq);
This need to be :
local_bh_disable();
rc = dev_hard_start_xmit(skb, dev, txq);
local_bh_enable();
return rc;


 }
 
 /* Disable soft irqs for various locks below. Also
  * stops preemption for RCU.
  */
 rcu_read_lock_bh();
 
 We just tried moving the escaping to dev_hard_start_xmit() after
 taking the lock, but this gives a large number of other problems, e.g.
 
 [   78.662428] BUG: sleeping function called from invalid context at
 mm/slab.c:3101
 [   78.751004] in_atomic(): 1, irqs_disabled(): 0, pid: 1908, name:
 send_eth_socket
 [   78.839582] Call Trace:
 [   78.868784] [ec537b70] [c000789c] show_stack+0x78/0x18c (unreliable)
 [   78.944905] [ec537bb0] [c0022900] __might_sleep+0x100/0x118
 [   79.011636] [ec537bc0] [c00facc4] kmem_cache_alloc+0x48/0x118
 [   79.080446] [ec537be0] [c02cd0e8] __alloc_skb+0x50/0x130
 [   79.144047] [ec537c00] [c02cdf5c] skb_copy+0x44/0xc8
 [   79.203478] [ec537c20] [c029f904] dpa_tx+0x154/0x758

doing GFP_KERNEL allocations in dpa_tx() is wrong, for sure.


 [   79.262907] [ec537c80] [c02d78ec] dev_hard_start_xmit+0x424/0x588
 [   79.335878] [ec537cc0] [c02d7aac] dev_queue_xmit+0x5c/0x3a4
 [   79.402602] [ec537cf0] [c0338d4c] packet_sendmsg+0x8c4/0x988
 [   79.470363] [ec537d70] [c02c3838] sock_sendmsg+0x90/0xb4
 [   79.533960] [ec537e40] [c02c4420] sys_sendto+0xdc/0x120
 [   79.596514] [ec537f10] [c02c57d0] sys_socketcall+0x148/0x210
 [   79.664287] [ec537f40] [c001084c] ret_from_syscall+0x0/0x3c
 [   79.731015] --- Exception: c01 at 0x48051f00
 [   79.731019] LR = 0x4808e030
 
 
 Note that this may just be the cause for us seeing this problem. If
 indeed the main problem is irq_exit() invoking softirqs in a locked
 context, then this patch adding hardware queue support is not really
 relevant.

irq_exit() is fine. This is because BH are not masked because of the
Freescale patches.

Really, suggesting an af_packet patch to solve a problem introduced in
an out of tree patch is insane.

You guys hould have clearly stated you were using an alien kernel.



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: softirqs are invoked while bottom halves are masked (was: Re: [PATCH] [PATCH] Fix deadlock in af_packet while stressing raw ethernet socket interface)

2011-07-12 Thread Eric Dumazet

Le mardi 12 juillet 2011 à 14:03 +0200, Ronny Meeus a écrit :

 Sorry for not mentioning we were using a patched kernel.
 I was not aware that the code involved was patched by the FreeScale
 patches we applied. The code found in the stack dumps is not
 implemented in FSL specific files.
 
 While reading the code of af_packet I saw that the spin_lock_bh is
 used in several places while this is not the case in the tpacket_rcv
 function. Since we had a locking issue in that code, I thought that my
 patch would be OK.
 I was not aware that for that specific function (tpacket_rcv) a
 different lock primitive must be used. A suggestion for improvement:
 it would be better to document this pre-condition in the code.
 
 After doing the change you proposed our code now looks like:
 
 ---if (dev-features  NETIF_F_HW_QDISC) {
 --txq = dev_pick_tx(dev, skb);
 --local_bh_disable();
 --rc = dev_hard_start_xmit(skb, dev, txq);
 --local_bh_enable();
 --return rc;
 ---}
 
 ---/* Disable soft irqs for various locks below. Also
 --- * stops preemption for RCU.
 --- */
 ---rcu_read_lock_bh();
 
 but we still see the issue BUG: sleeping function called from invalid 
 context:

Of course you are if this is the only change you did.

 
 [   91.015989] BUG: sleeping function called from invalid context at
 include/linux/skbuff.h:786
 [   91.117096] in_atomic(): 1, irqs_disabled(): 0, pid: 1865, name: NMTX_T1842
 [   91.200461] Call Trace:
 [   91.229672] [ec58bbd0] [c000789c] show_stack+0x78/0x18c (unreliable)
 [   91.305791] [ec58bc10] [c0022900] __might_sleep+0x100/0x118
 [   91.372524] [ec58bc20] [c029f8d8] dpa_tx+0x128/0x758


Please read again my mail : 

I said : doing GFP_KERNEL allocations in dpa_tx() is wrong, for sure.

I dont have this code, but I suspect it's using : skb_copy(skb,
GFP_KERNEL)

Just say no, use GFP_ATOMIC instead.

Real question is : why skb_copy() is done, since its slow as hell.

 [   91.431957] [ec58bc80] [c02d78ec] dev_hard_start_xmit+0x424/0x588
 [   91.504952] [ec58bcc0] [c02d7ab0] dev_queue_xmit+0x60/0x3ac
 [   91.571692] [ec58bcf0] [c0338d54] packet_sendmsg+0x8c4/0x988
 [   91.639457] [ec58bd70] [c02c3838] sock_sendmsg+0x90/0xb4
 [   91.703066] [ec58be40] [c02c4420] sys_sendto+0xdc/0x120
 [   91.765646] [ec58bf10] [c02c57d0] sys_socketcall+0x148/0x210
 [   91.833420] [ec58bf40] [c001084c] ret_from_syscall+0x0/0x3c
 [   91.900153] --- Exception: c01 at 0x4824df00
 [   91.900157] LR = 0x4828a030
 


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 1/1] BPF JIT for PPC64

2011-06-25 Thread Eric Dumazet

Le samedi 25 juin 2011 à 09:33 +0200, Andreas Schwab a écrit :
 Matt Evans m...@ozlabs.org writes:
 
  +   stdur1, -128(r1);   \
 
  +   addir5, r1, 128+BPF_PPC_STACK_BASIC+(2*8);  \
 
  +   addir1, r1, 128;\
 
  +   PPC_STD(r_M + i, 1, -128 + (8*i));
 
  +   PPC_LD(r_M + i, 1, -128 + (8*i));
 
 s/128/BPF_PPC_STACK_SAVE/?
 

I am not sure using registers to hold MEM[] is a win if MEM[idx] is used
once in the filter

# tcpdump tcp[20]+tcp[21]=0 -d
(000) ldh  [12]
(001) jeq  #0x800   jt 2jf 15
(002) ldb  [23]
(003) jeq  #0x6 jt 4jf 15
(004) ldh  [20]
(005) jset #0x1fff  jt 15   jf 6
(006) ldxb 4*([14]0xf)
(007) ldb  [x + 34]
(008) st   M[1]
(009) ldb  [x + 35]
(010) tax  
(011) ld   M[1]
(012) add  x
(013) jeq  #0x0 jt 14   jf 15
(014) ret  #65535
(015) ret  #0

In this sample, we use M[1] once ( one store, one load)

So saving previous register content on stack in prologue, and restoring
it in epilogue actually slow down the code, and adds two instructions in filter 
asm code.

This also makes epilogue code not easy (not possible as a matter of fact)
to unwind in helper function

In x86_64 implementation, I chose bpf_error be able to force
an exception, not returning to JIT code but directly to bpf_func() caller

bpf_error:
# force a return 0 from jit handler
xor %eax,%eax
mov -8(%rbp),%rbx
leaveq
ret


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] seqlock: don't smp_rmb in seqlock reader spin loop

2011-05-12 Thread Eric Dumazet

Le jeudi 12 mai 2011 à 04:13 -0500, Milton Miller a écrit :
 Move the smp_rmb after cpu_relax loop in read_seqlock and add
 ACCESS_ONCE to make sure the test and return are consistent.
 
 A multi-threaded core in the lab didn't like the update
 from 2.6.35 to 2.6.36, to the point it would hang during
 boot when multiple threads were active.  Bisection showed
 af5ab277ded04bd9bc6b048c5a2f0e7d70ef0867 (clockevents:
 Remove the per cpu tick skew) as the culprit and it is
 supported with stack traces showing xtime_lock waits including
 tick_do_update_jiffies64 and/or update_vsyscall.
 
 Experimentation showed the combination of cpu_relax and smp_rmb
 was significantly slowing the progress of other threads sharing
 the core, and this patch is effective in avoiding the hang.
 
 A theory is the rmb is affecting the whole core while the
 cpu_relax is causing a resource rebalance flush, together they
 cause an interfernce cadance that is unbroken when the seqlock
 reader has interrupts disabled.  
 
 At first I was confused why the refactor in
 3c22cd5709e8143444a6d08682a87f4c57902df3 (kernel: optimise
 seqlock) didn't affect this patch application, but after some
 study that affected seqcount not seqlock. The new seqcount was
 not factored back into the seqlock.  I defer that the future.
 
 While the removal of the timer interrupt offset created
 contention for the xtime lock while a cpu does the
 additonal work to update the system clock, the seqlock
 implementation with the tight rmb spin loop goes back much
 further, and is just waiting for the right trigger.
 
 Cc: sta...@vger.kernel.org
 Signed-off-by: Milton Miller milt...@bga.com
 ---
 
 To the readers of [RFC] time: xtime_lock is held too long:
 
 I initially thought x86 would not see this because rmb would
 be a nop, but upon closer inspection X86_PPRO_FENCE will add
 a lfence for rmb.
 
 milton
 
 Index: common/include/linux/seqlock.h
 ===
 --- common.orig/include/linux/seqlock.h   2011-04-06 03:27:02.0 
 -0500
 +++ common/include/linux/seqlock.h2011-04-06 03:35:02.0 -0500
 @@ -88,12 +88,12 @@ static __always_inline unsigned read_seq
   unsigned ret;
  
  repeat:
 - ret = sl-sequence;
 - smp_rmb();
 + ret = ACCESS_ONCE(sl-sequence);
   if (unlikely(ret  1)) {
   cpu_relax();
   goto repeat;
   }
 + smp_rmb();
  
   return ret;
  }

I fully agree with your analysis. This is a call to make the change I
suggested earlier [1]. (Use a seqcount object in seqlock_t)

typedef struct {
seqcount_t seq
spinlock_t lock;
} seqlock_t;

I'll submit a patch for 2.6.40

Acked-by: Eric Dumazet eric.duma...@gmail.com

Thanks

[1] Ref: https://lkml.org/lkml/2011/5/6/351



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Badness with the kernel version 2.6.35-rc1-git1 running on P6 box

2010-07-16 Thread Eric Dumazet

-rc5

Anyway ehea should not use GFP_ATOMIC in its ehea_get_stats() method,
called in process context, but GFP_KERNEL.

Another patch is needed for ehea_refill_rq_def() as well.



[PATCH] ehea: ehea_get_stats() should use GFP_KERNEL

ehea_get_stats() is called in process context and should use GFP_KERNEL
allocation instead of GFP_ATOMIC.

Clearing stats at beginning of ehea_get_stats() is racy in case of
concurrent stat readers.

get_stats() can also use netdev net_device_stats, instead of a private
copy.

Reported-by: divya dipra...@linux.vnet.ibm.com
Signed-off-by: Eric Dumazet eric.duma...@gmail.com
---
 drivers/net/ehea/ehea.h  |1 -
 drivers/net/ehea/ehea_main.c |6 ++
 2 files changed, 2 insertions(+), 5 deletions(-)



___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: Badness with the kernel version 2.6.35-rc1-git1 running on P6 box

2010-07-16 Thread Eric Dumazet

Le vendredi 16 juillet 2010 à 11:56 +0200, Eric Dumazet a écrit :

 [PATCH] ehea: ehea_get_stats() should use GFP_KERNEL
 
 ehea_get_stats() is called in process context and should use GFP_KERNEL
 allocation instead of GFP_ATOMIC.
 
 Clearing stats at beginning of ehea_get_stats() is racy in case of
 concurrent stat readers.
 
 get_stats() can also use netdev net_device_stats, instead of a private
 copy.
 
 Reported-by: divya dipra...@linux.vnet.ibm.com
 Signed-off-by: Eric Dumazet eric.duma...@gmail.com
 ---
  drivers/net/ehea/ehea.h  |1 -
  drivers/net/ehea/ehea_main.c |6 ++
  2 files changed, 2 insertions(+), 5 deletions(-)
 
 

Hmm, net-next-2.6 contains following patch :

commit 3d8009c780ee90fccb5c171caf30aff839f13547
Author: Brian King brk...@linux.vnet.ibm.com
Date:   Wed Jun 30 11:59:12 2010 +

ehea: Allocate stats buffer with GFP_KERNEL

Since ehea_get_stats calls ehea_h_query_ehea_port, which
can sleep, we can also sleep when allocating a page in
this function. This fixes some memory allocation failure
warnings seen under low memory conditions.

Signed-off-by: Brian King brk...@linux.vnet.ibm.com
Signed-off-by: David S. Miller da...@davemloft.net

diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 8b92acb..3beba70 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -335,7 +335,7 @@ static struct net_device_stats
*ehea_get_stats(struct net_device *dev)
 
memset(stats, 0, sizeof(*stats));
 
-   cb2 = (void *)get_zeroed_page(GFP_ATOMIC);
+   cb2 = (void *)get_zeroed_page(GFP_KERNEL);
if (!cb2) {
ehea_error(no mem for cb2);
goto out;


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

2010-04-07 Thread Eric Dumazet

Le mercredi 07 avril 2010 à 07:25 -0600, John Linn a écrit :
  -Original Message-
  From: David Miller [mailto:da...@davemloft.net]
  Sent: Tuesday, April 06, 2010 8:52 PM
  To: eric.duma...@gmail.com
  Cc: John Linn; net...@vger.kernel.org; linuxppc-...@ozlabs.org;
 grant.lik...@secretlab.ca;
  jwbo...@linux.vnet.ibm.com; john.willi...@petalogix.com;
 michal.si...@petalogix.com; jty...@cs.ucr.edu
  Subject: Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver
  
  From: Eric Dumazet eric.duma...@gmail.com
  Date: Mon, 05 Apr 2010 23:29:53 +0200
  
   So, I ask, cant you use netdev_alloc_skb_ip_align() in this driver ?
  
  Thanks to everyone for getting this patch into shape.
  
  I applied version 4 of the patch to net-next-2.6, thanks!
 
 Thanks David, appreciate everyone's help and patience.
 

You're welcome ;)

Thanks


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

2010-04-06 Thread Eric Dumazet

Le mardi 06 avril 2010 à 10:12 -0600, John Linn a écrit :
  -Original Message-
  From: Eric Dumazet [mailto:eric.duma...@gmail.com]
  Sent: Monday, April 05, 2010 3:30 PM
  To: John Linn
  Cc: net...@vger.kernel.org; linuxppc-...@ozlabs.org; 
  grant.lik...@secretlab.ca;
  jwbo...@linux.vnet.ibm.com; john.willi...@petalogix.com; 
  michal.si...@petalogix.com; John Tyner
  Subject: Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

  Le lundi 05 avril 2010 à 15:11 -0600, John Linn a écrit :
   This patch adds support for using the LL TEMAC Ethernet driver on
   non-Virtex 5 platforms by adding support for accessing the Soft DMA
   registers as if they were memory mapped instead of solely through the
   DCR's (available on the Virtex 5).

   The patch also updates the driver so that it runs on the MicroBlaze.
   The changes were tested on the PowerPC 440, PowerPC 405, and the
   MicroBlaze platforms.

   Signed-off-by: John Tyner jty...@cs.ucr.edu
   Signed-off-by: John Linn john.l...@xilinx.com

   ---

   +/* Align the IP data in the packet on word boundaries as MicroBlaze
   + * needs it.
   + */
   +
#define XTE_ALIGN   32
   -#define BUFFER_ALIGN(adr) ((XTE_ALIGN - ((u32) adr)) % XTE_ALIGN)
   +#define BUFFER_ALIGN(adr) ((34 - ((u32) adr)) % XTE_ALIGN)

  Very interesting way of doing this, but why such convoluted thing ?

 This is trying to align for a cache line (32 bytes) before my change.

 My change was then also making it align the IP data on a word boundary. 

  Because of the % 32, this is equivalent to :

  #define BUFFER_ALIGN(adr) ((2 - ((u32) adr)) % XTE_ALIGN)

 Yes, but I'm not sure that's clearer IMHO.

  But wait, dont we recognise the magic constant NET_IP_ALIGN ?

 Yes it could be used.  I'm struggling with how to make this all be clearer.

I am not saying its clearer, I am saying we have a standard way to
handle this exact problem (aligning rcvs buffer so that IP header is
aligned)

There is no need to invent new ones, this makes reviewing of this driver
more difficult.

 How about this?
 #define BUFFER_ALIGN(adr) (((XTE_ALIGN + NET_IP_ALIGN) - ((u32) adr)) % 
 XTE_ALIGN)

Sorry, I still dont understand why you need XTE_ALIGN + ...

((A + B) - C) % A   is equal to (B - C) % A

Which one is more readable ?

Please take a look at existing and clean code, no magic macro, and we
can understand the intention.

find drivers/net | xargs grep -n netdev_alloc_skb_ip_align

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

RE: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

2010-04-06 Thread Eric Dumazet


 
 Yes I see how it's used, but it only allows you to reserve 2 bytes in the skb 
 with no options.


Really ? This would be a bug !

__alloc_skb() uses kmalloc(), this gives you the guarantee you want,
or maybe comment you wrote is not what is _really_ necessary ?

/* Align the IP data in the packet on word boundaries as MicroBlaze
 * needs it.
 */


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

2010-04-06 Thread Eric Dumazet

Le mardi 06 avril 2010 à 12:53 -0600, Grant Likely a écrit :

 Hold on BUFFER_ALIGN is being used to align the DMA buffer on a
 cache line boundary.  I don't think netdev_alloc_skb() makes any
 guarantees about how the start of the IP header lines up against cache
 line boundaries.  The amount of padding needed is not known until an
 skbuff is obtained from netdev_alloc_skb(), and
 netdev_alloc_skb_ip_align() can only handle a fixed size padding,
 
 It doesn't look like netdev_alloc_skb_ip_align() is the right thing in
 this regard.
 

OK, time to have a long explanation :

netdev_alloc_skb_ip_align() is doing the right thing, but it seems you
guys insist to invent a new private stuff.

I am only wondering if you really know why you do this. 

Many drivers do have same requirements, so every driver should reinvent
the wheel ? Really this is beyond me.

Original code was aligning the buffer on a 32 bytes boundary (because of
a hardware requirement on NIC, I only trust original code on this).

Then you want to change this to align buffer on this 32 bytes boundary
PLUS 2. What is this kind of new requirement ? 

1) Hardware requirement really changed that much. (firmware changed on
NIC). If not using this new alignement, NIC doesnt work at all.

2) or Microblaze arch requires that IP header is aligned on a word
boundary to avoid unaligned traps in IP stack ? (like many arches)

If this is the latest requirement, then use standard mechanism.
skb data is naturally aligned on L1_CACHE_SIZE + SKB_PAD boundaries.
(32 bytes alignment)

netdev_alloc_skb_ip_align()() then skips 2 bytes to make skb data
aligned so that 2 + 14 (sizeof eth header) = 16 : IP header is aligned
(modulo 16)

It just works. If not, we should correct it, please fill a bug report.

Thanks


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH] [V3] Add non-Virtex5 support for LL TEMAC driver

2010-04-05 Thread Eric Dumazet

Le lundi 05 avril 2010 à 15:11 -0600, John Linn a écrit :
 This patch adds support for using the LL TEMAC Ethernet driver on
 non-Virtex 5 platforms by adding support for accessing the Soft DMA
 registers as if they were memory mapped instead of solely through the
 DCR's (available on the Virtex 5).
 
 The patch also updates the driver so that it runs on the MicroBlaze.
 The changes were tested on the PowerPC 440, PowerPC 405, and the
 MicroBlaze platforms.
 
 Signed-off-by: John Tyner jty...@cs.ucr.edu
 Signed-off-by: John Linn john.l...@xilinx.com
 
 ---

 +/* Align the IP data in the packet on word boundaries as MicroBlaze
 + * needs it.
 + */
 +
  #define XTE_ALIGN   32
 -#define BUFFER_ALIGN(adr) ((XTE_ALIGN - ((u32) adr)) % XTE_ALIGN)
 +#define BUFFER_ALIGN(adr) ((34 - ((u32) adr)) % XTE_ALIGN)
  

Very interesting way of doing this, but why such convoluted thing ?

Because of the % 32, this is equivalent to :

#define BUFFER_ALIGN(adr) ((2 - ((u32) adr)) % XTE_ALIGN)

But wait, dont we recognise the magic constant NET_IP_ALIGN ?

So, I ask, cant you use netdev_alloc_skb_ip_align() in this driver ?


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [net-next-2.6 PATCH v2 3/3] fs_enet: add FEC TX buffer alignment workaround for MPC5121

2010-02-17 Thread Eric Dumazet

Le mercredi 17 février 2010 à 15:55 +0100, Anatolij Gustschin a écrit :
 MPC5121 FEC requeries 4-byte alignmnent for TX data buffers.
 This patch is a work around that copies misaligned tx packets
 to an aligned skb before sending.
 
 Signed-off-by: John Rigby jcri...@gmail.com
 Signed-off-by: Piotr Ziecik ko...@semihalf.com
 Signed-off-by: Wolfgang Denk w...@denx.de
 Signed-off-by: Anatolij Gustschin ag...@denx.de
 ---
  drivers/net/fs_enet/fs_enet-main.c |   44 
 
  1 files changed, 44 insertions(+), 0 deletions(-)
 
 diff --git a/drivers/net/fs_enet/fs_enet-main.c 
 b/drivers/net/fs_enet/fs_enet-main.c
 index 4297021..166a89d 100644
 --- a/drivers/net/fs_enet/fs_enet-main.c
 +++ b/drivers/net/fs_enet/fs_enet-main.c
 @@ -580,6 +580,37 @@ void fs_cleanup_bds(struct net_device *dev)
  
  
 /**/
  
 +#ifdef CONFIG_FS_ENET_MPC5121_FEC
 +/*
 + * MPC5121 FEC requeries 4-byte alignment for TX data buffer!
 + */
 +static struct sk_buff *tx_skb_align_workaround(struct net_device *dev,
 +struct sk_buff *skb)
 +{
 + struct sk_buff *new_skb;
 + struct fs_enet_private *fep = netdev_priv(dev);
 +
 + /* Alloc new skb */
 + new_skb = dev_alloc_skb(ENET_RX_FRSIZE + 4);


ENET_RX_FRSIZE looks strange in TX path

Why not using skb-len + 4 instead of ENET_RX_FRSIZE + 4 ?


 + if (!new_skb) {
 + dev_warn(fep-dev, Memory squeeze, dropping tx packet.\n);

I am just wondering if this is ratelimited ?

 + return NULL;
 + }
 +
 + /* Make sure new skb is properly aligned */
 + skb_align(new_skb, 4);

 +
 + /* Copy data to new skb ... */
 + skb_copy_from_linear_data(skb, new_skb-data, skb-len);
 + skb_put(new_skb, skb-len);
 +
 + /* ... and free an old one */
 + dev_kfree_skb_any(skb);
 +
 + return new_skb;
 +}
 +#endif
 +
  static int fs_enet_start_xmit(struct sk_buff *skb, struct net_device *dev)
  {
   struct fs_enet_private *fep = netdev_priv(dev);
 @@ -588,6 +619,19 @@ static int fs_enet_start_xmit(struct sk_buff *skb, 
 struct net_device *dev)
   u16 sc;
   unsigned long flags;
  
 +#ifdef CONFIG_FS_ENET_MPC5121_FEC
 + if (((unsigned long)skb-data)  0x3) {
 + skb = tx_skb_align_workaround(dev, skb);
 + if (!skb) {
 + /*
 +  * We have lost packet due to memory allocation error
 +  * in tx_skb_align_workaround(). Hopefully original
 +  * skb is still valid, so try transmit it later.
 +  */

Could you define 'try to transmit later' ?
Who is responsible to trigger this event ?


 + return NETDEV_TX_BUSY;
 + }
 + }
 +#endif
   spin_lock_irqsave(fep-tx_lock, flags);
  
   /*


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: question about softirqs

2009-05-13 Thread Eric Dumazet

Andi Kleen a écrit :
 Thomas Gleixner t...@linutronix.de writes:
 
 
 Err, no. Chris is completely correct:

 if (!in_interrupt())
  wakeup_softirqd();
 
 Yes you have to wake it up just in case, but it doesn't normally
 process the data because a normal softirq comes in faster. It's
 just a safety policy. 
 
 You can check this by checking the accumulated CPU time on your
 ksoftirqs.  Mine are all 0 even on long running systems.
 

Then its a bug Andi. Its quite easy to trigger ksoftirqd with a Gb ethernet 
link.

commit f5f293a4e3d0a0c52cec31de6762c95050156516 corrected something
(making mpstat and top correctly display softirq on cpu stats),
but apparently we still have a problem to report correct time on processes,
particularly on ksoftirq/x

I have one machine SMP flooded by network frames, CPU0 handling all
the work, inside ksoftirq/0 (napi processing : almost no more hard interrupts 
delivered)

Still, top or ps reports no more than 30% of cpu time used by
ksoftirqd, while this cpu only runs ksoftirqd/0 (100% in sirq), and has no idle 
time.

$ps -fp 4 ; mpstat -P 0 1 10 ; ps -fp 4
UIDPID  PPID  C STIME TTY  TIME CMD
root 4 2  1 15:35 ?00:00:46 [ksoftirqd/0]
Linux 2.6.30-rc5-tip-01595-g6f75dad-dirty (svivoipvnx001)   05/13/2009  
_i686_

04:45:01 PM  CPU%usr   %nice%sys %iowait%irq   %soft  %steal  
%guest   %idle
04:45:02 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:03 PM00.000.000.000.000.00   99.010.00
0.000.99
04:45:04 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:05 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:06 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:07 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:08 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:09 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:10 PM00.000.000.000.000.00  100.000.00
0.000.00
04:45:11 PM00.000.000.000.000.00  100.000.00
0.000.00
Average:   00.000.000.000.000.00   99.900.00
0.000.10
UIDPID  PPID  C STIME TTY  TIME CMD
root 4 2  1 15:35 ?00:00:49 [ksoftirqd/0]

You can see here time consumed by ksoftirqd/0 suring this 10 seconds time frame 
is *only* 3 seconds.

Therefore, we cannot trust ps, not with current kernel.

# cat /proc/4/stat ; sleep 10 ; cat /proc/4/stat
4 (ksoftirqd/0) R 2 0 0 0 -1 2216730688 0 0 0 0 0 15347 0 0 15 -5 1 0 6 0 0 
4294967295 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 0 0 0 0 0 0
4 (ksoftirqd/0) R 2 0 0 0 -1 2216730688 0 0 0 0 0 15670 0 0 15 -5 1 0 6 0 0 
4294967295 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 0 0 0 0 0 0


 The reason Andrea originally added the softirqds was just that
 if you have very softirq intensive workloads they would tie
 up too much CPU time or not make enough process with the default
 don't loop too often heuristics. 
 
 We can not rely on irqs coming in when the softirq is raised from
 
 You can't rely on it, but it happens in near all cases.
 
 -Andi


___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev

Re: [PATCH 1/2] ucc_geth: Move freeing of TX packets to NAPI context.

2009-03-26 Thread Eric Dumazet

Joakim Tjernlund a écrit :
 Also set NAPI weight to 64 as this is a common value.
 This will make the system alot more responsive while
 ping flooding the ucc_geth ethernet interaface.
 
 Signed-off-by: Joakim Tjernlund joakim.tjernl...@transmode.se
 ---
  drivers/net/ucc_geth.c |   32 
  drivers/net/ucc_geth.h |1 -
  2 files changed, 12 insertions(+), 21 deletions(-)
 
 diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
 index 097aed8..7fc91aa 100644
 --- a/drivers/net/ucc_geth.c
 +++ b/drivers/net/ucc_geth.c
 @@ -3214,7 +3214,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
   dev-stats.tx_packets++;
  
   /* Free the sk buffer associated with this TxBD */
 - dev_kfree_skb_irq(ugeth-
 + dev_kfree_skb(ugeth-
 tx_skbuff[txQ][ugeth-skb_dirtytx[txQ]]);
   ugeth-tx_skbuff[txQ][ugeth-skb_dirtytx[txQ]] = NULL;
   ugeth-skb_dirtytx[txQ] =
 @@ -3248,9 +3248,16 @@ static int ucc_geth_poll(struct napi_struct *napi, int 
 budget)

   for (i = 0; i  ug_info-numQueuesRx; i++)
   howmany += ucc_geth_rx(ugeth, i, budget - howmany);

Not related to your patch, but seeing above code, I can understand
you might have a problem in flood situation : Only first queue(s) are
depleted, and last one cannot be because howmany = budget.

This driver might have to remember last queue it handled at previous
call ucc_geth_poll(), so that all rxqueues have same probability to be scanned.

j = ug-lastslot;
for (i = 0; ug_info-numQueuesRx; i++) {
if (++j = ug_info-numQueuesRx)
j = 0;
howmany += ucc_geth_rx(ugeth, j, budget - howmany);
if (howmany = budget)
break;
}
ug-lastslot = j;


  
 + /* Tx event processing */
 + spin_lock(ugeth-lock);
 + for (i = 0; i  ug_info-numQueuesTx; i++) {
 + ucc_geth_tx(ugeth-dev, i);
 + }
 + spin_unlock(ugeth-lock);
 +
   if (howmany  budget) {
   netif_rx_complete(napi);
 - setbits32(ugeth-uccf-p_uccm, UCCE_RX_EVENTS);
 + setbits32(ugeth-uccf-p_uccm, UCCE_RX_EVENTS | UCCE_TX_EVENTS);
   }
  
   return howmany;
 @@ -3264,8 +3271,6 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void 
 *info)
   struct ucc_geth_info *ug_info;
   register u32 ucce;
   register u32 uccm;
 - register u32 tx_mask;
 - u8 i;
  
   ugeth_vdbg(%s: IN, __func__);
  
 @@ -3279,27 +3284,14 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void 
 *info)
   out_be32(uccf-p_ucce, ucce);
  
   /* check for receive events that require processing */
 - if (ucce  UCCE_RX_EVENTS) {
 + if (ucce  (UCCE_RX_EVENTS | UCCE_TX_EVENTS)) {
   if (netif_rx_schedule_prep(ugeth-napi)) {
 - uccm = ~UCCE_RX_EVENTS;
 + uccm = ~(UCCE_RX_EVENTS | UCCE_TX_EVENTS);
   out_be32(uccf-p_uccm, uccm);
   __netif_rx_schedule(ugeth-napi);
   }
   }
  
 - /* Tx event processing */
 - if (ucce  UCCE_TX_EVENTS) {
 - spin_lock(ugeth-lock);
 - tx_mask = UCC_GETH_UCCE_TXB0;
 - for (i = 0; i  ug_info-numQueuesTx; i++) {
 - if (ucce  tx_mask)
 - ucc_geth_tx(dev, i);
 - ucce = ~tx_mask;
 - tx_mask = 1;
 - }
 - spin_unlock(ugeth-lock);
 - }
 -
   /* Errors and other events */
   if (ucce  UCCE_OTHER) {
   if (ucce  UCC_GETH_UCCE_BSY)
 @@ -3733,7 +3725,7 @@ static int ucc_geth_probe(struct of_device* ofdev, 
 const struct of_device_id *ma
   dev-netdev_ops = ucc_geth_netdev_ops;
   dev-watchdog_timeo = TX_TIMEOUT;
   INIT_WORK(ugeth-timeout_work, ucc_geth_timeout_work);
 - netif_napi_add(dev, ugeth-napi, ucc_geth_poll, UCC_GETH_DEV_WEIGHT);
 + netif_napi_add(dev, ugeth-napi, ucc_geth_poll, 64);
   dev-mtu = 1500;
  
   ugeth-msg_enable = netif_msg_init(debug.msg_enable, UGETH_MSG_DEFAULT);
 diff --git a/drivers/net/ucc_geth.h b/drivers/net/ucc_geth.h
 index 44218b8..50bad53 100644
 --- a/drivers/net/ucc_geth.h
 +++ b/drivers/net/ucc_geth.h
 @@ -843,7 +843,6 @@ struct ucc_geth_hardware_statistics {
  /* Driver definitions */
  #define TX_BD_RING_LEN  0x10
  #define RX_BD_RING_LEN  0x10
 -#define UCC_GETH_DEV_WEIGHT TX_BD_RING_LEN
  
  #define TX_RING_MOD_MASK(size)  (size-1)
  #define RX_RING_MOD_MASK(size)  (size-1)


___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev

Re: [PATCH] ucc_geth: Move freeing of TX packets to NAPI context.

2009-03-25 Thread Eric Dumazet

Joakim Tjernlund a écrit :
From 1c2f23b1f37f4818c0fd0217b93eb38ab6564840 Mon Sep 17 00:00:00 2001
 From: Joakim Tjernlund joakim.tjernl...@transmode.se
 Date: Tue, 24 Mar 2009 10:19:27 +0100
 Subject: [PATCH] ucc_geth: Move freeing of TX packets to NAPI context.
  Also increase NAPI weight somewhat.
  This will make the system alot more responsive while
  ping flooding the ucc_geth ethernet interaface.
 
 
 Signed-off-by: Joakim Tjernlund joakim.tjernl...@transmode.se
 ---
  drivers/net/ucc_geth.c |   30 +++---
  drivers/net/ucc_geth.h |2 +-
  2 files changed, 12 insertions(+), 20 deletions(-)
 
 diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c
 index 097aed8..7d5d110 100644
 --- a/drivers/net/ucc_geth.c
 +++ b/drivers/net/ucc_geth.c
 @@ -3214,7 +3214,7 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
   dev-stats.tx_packets++;
  
   /* Free the sk buffer associated with this TxBD */
 - dev_kfree_skb_irq(ugeth-
 + dev_kfree_skb(ugeth-
 tx_skbuff[txQ][ugeth-skb_dirtytx[txQ]]);
   ugeth-tx_skbuff[txQ][ugeth-skb_dirtytx[txQ]] = NULL;
   ugeth-skb_dirtytx[txQ] =
 @@ -3248,9 +3248,16 @@ static int ucc_geth_poll(struct napi_struct *napi, int 
 budget)
   for (i = 0; i  ug_info-numQueuesRx; i++)
   howmany += ucc_geth_rx(ugeth, i, budget - howmany);
  

Cant you test (ucce  UCCE_TX_EVENTS) or something here to avoid
taking lock and checking queues if not necessary ?

 + /* Tx event processing */
 + spin_lock(ugeth-lock);
 + for (i = 0; i  ug_info-numQueuesTx; i++) {
 + ucc_geth_tx(ugeth-dev, i);
 + }
 + spin_unlock(ugeth-lock);
 +

Why tx completions dont change howmany ?
It seems strange you changed UCC_GETH_DEV_WEIGHT if not taking into account tx 
event above...


   if (howmany  budget) {
   netif_rx_complete(napi);
 - setbits32(ugeth-uccf-p_uccm, UCCE_RX_EVENTS);
 + setbits32(ugeth-uccf-p_uccm, UCCE_RX_EVENTS | UCCE_TX_EVENTS);
   }
  
   return howmany;
 @@ -3264,8 +3271,6 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void 
 *info)
   struct ucc_geth_info *ug_info;
   register u32 ucce;
   register u32 uccm;
 - register u32 tx_mask;
 - u8 i;
  
   ugeth_vdbg(%s: IN, __func__);
  
 @@ -3279,27 +3284,14 @@ static irqreturn_t ucc_geth_irq_handler(int irq, void 
 *info)
   out_be32(uccf-p_ucce, ucce);
  
   /* check for receive events that require processing */
 - if (ucce  UCCE_RX_EVENTS) {
 + if (ucce  (UCCE_RX_EVENTS | UCCE_TX_EVENTS)) {
   if (netif_rx_schedule_prep(ugeth-napi)) {
 - uccm = ~UCCE_RX_EVENTS;
 + uccm = ~(UCCE_RX_EVENTS | UCCE_TX_EVENTS);
   out_be32(uccf-p_uccm, uccm);
   __netif_rx_schedule(ugeth-napi);
   }
   }
  
 - /* Tx event processing */
 - if (ucce  UCCE_TX_EVENTS) {
 - spin_lock(ugeth-lock);
 - tx_mask = UCC_GETH_UCCE_TXB0;
 - for (i = 0; i  ug_info-numQueuesTx; i++) {
 - if (ucce  tx_mask)
 - ucc_geth_tx(dev, i);
 - ucce = ~tx_mask;
 - tx_mask = 1;
 - }
 - spin_unlock(ugeth-lock);
 - }
 -
   /* Errors and other events */
   if (ucce  UCCE_OTHER) {
   if (ucce  UCC_GETH_UCCE_BSY)
 diff --git a/drivers/net/ucc_geth.h b/drivers/net/ucc_geth.h
 index 44218b8..ea30aa7 100644
 --- a/drivers/net/ucc_geth.h
 +++ b/drivers/net/ucc_geth.h
 @@ -843,7 +843,7 @@ struct ucc_geth_hardware_statistics {
  /* Driver definitions */
  #define TX_BD_RING_LEN  0x10
  #define RX_BD_RING_LEN  0x10
 -#define UCC_GETH_DEV_WEIGHT TX_BD_RING_LEN
 +#define UCC_GETH_DEV_WEIGHT 
 (RX_BD_RING_LEN+TX_BD_RING_LEN/2)
  
  #define TX_RING_MOD_MASK(size)  (size-1)
  #define RX_RING_MOD_MASK(size)  (size-1)


___
Linuxppc-dev mailing list
Linuxppc-dev@ozlabs.org
https://ozlabs.org/mailman/listinfo/linuxppc-dev

80 matches

Mail list logo