date:20150803

[PATCH 1/4] net: switchdev: change fdb addr for a byte array

2015-08-03 Thread Vivien Didelot

The address in the switchdev_obj_fdb structure is currently represented
as a pointer. Replacing it with a 6-byte array allows switchdev to carry
addresses directly read from hardware registers, not stored by the
switch chip driver (as in Rocker).

Signed-off-by: Vivien Didelot 
---
 drivers/net/ethernet/rocker/rocker.c | 2 +-
 include/net/switchdev.h  | 2 +-
 net/bridge/br_fdb.c  | 2 +-
 net/switchdev/switchdev.c| 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c 
b/drivers/net/ethernet/rocker/rocker.c
index 4cd5a71..faa5db0 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -4543,7 +4543,7 @@ static int rocker_port_fdb_dump(const struct rocker_port 
*rocker_port,
hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) {
if (found->key.pport != rocker_port->pport)
continue;
-   fdb->addr = found->key.addr;
+   memcpy(fdb->addr, found->key.addr, ETH_ALEN);
fdb->vid = rocker_port_vlan_to_vid(rocker_port,
   found->key.vlan_id);
err = obj->cb(rocker_port->dev, obj);
diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index 89da893..e90e1a0 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -70,7 +70,7 @@ struct switchdev_obj {
u32 tb_id;
} ipv4_fib;
struct switchdev_obj_fdb {  /* PORT_FDB */
-   const unsigned char *addr;
+   u8 addr[ETH_ALEN];
u16 vid;
} fdb;
} u;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 9e9875d..2c64b6a 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -136,11 +136,11 @@ static void fdb_del_external_learn(struct 
net_bridge_fdb_entry *f)
struct switchdev_obj obj = {
.id = SWITCHDEV_OBJ_PORT_FDB,
.u.fdb = {
-   .addr = f->addr.addr,
.vid = f->vlan_id,
},
};
 
+   memcpy(obj.u.fdb.addr, f->addr.addr, ETH_ALEN);
switchdev_port_obj_del(f->dst->dev, &obj);
 }
 
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 33bafa2..28786e8 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -742,11 +742,11 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct 
nlattr *tb[],
struct switchdev_obj obj = {
.id = SWITCHDEV_OBJ_PORT_FDB,
.u.fdb = {
-   .addr = addr,
.vid = vid,
},
};
 
+   memcpy(obj.u.fdb.addr, addr, ETH_ALEN);
return switchdev_port_obj_add(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_add);
@@ -769,11 +769,11 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct 
nlattr *tb[],
struct switchdev_obj obj = {
.id = SWITCHDEV_OBJ_PORT_FDB,
.u.fdb = {
-   .addr = addr,
.vid = vid,
},
};
 
+   memcpy(obj.u.fdb.addr, addr, ETH_ALEN);
return switchdev_port_obj_del(dev, &obj);
 }
 EXPORT_SYMBOL_GPL(switchdev_port_fdb_del);
-- 
2.4.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/4] net: dsa: add support for switchdev FDB objects

2015-08-03 Thread Vivien Didelot

Remove the fdb_{add,del,getnext} function pointer in favor of new
port_fdb_{add,del,getnext}.

Implement the switchdev_port_obj_{add,del,dump} functions in DSA to
support the SWITCHDEV_OBJ_PORT_FDB objects.

These functions are called from switchdev_port_bridge_{get,set,del}link.

Signed-off-by: Vivien Didelot 
---
 drivers/net/dsa/mv88e6171.c |   3 -
 drivers/net/dsa/mv88e6352.c |   3 -
 include/net/dsa.h   |  16 ++--
 net/dsa/slave.c | 221 
 4 files changed, 129 insertions(+), 114 deletions(-)

diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c
index 1c78084..cfa21ed 100644
--- a/drivers/net/dsa/mv88e6171.c
+++ b/drivers/net/dsa/mv88e6171.c
@@ -116,9 +116,6 @@ struct dsa_switch_driver mv88e6171_switch_driver = {
.port_join_bridge   = mv88e6xxx_join_bridge,
.port_leave_bridge  = mv88e6xxx_leave_bridge,
.port_stp_update= mv88e6xxx_port_stp_update,
-   .fdb_add= mv88e6xxx_port_fdb_add,
-   .fdb_del= mv88e6xxx_port_fdb_del,
-   .fdb_getnext= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6171");
diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c
index af210ef..eb4630f 100644
--- a/drivers/net/dsa/mv88e6352.c
+++ b/drivers/net/dsa/mv88e6352.c
@@ -341,9 +341,6 @@ struct dsa_switch_driver mv88e6352_switch_driver = {
.port_join_bridge   = mv88e6xxx_join_bridge,
.port_leave_bridge  = mv88e6xxx_leave_bridge,
.port_stp_update= mv88e6xxx_port_stp_update,
-   .fdb_add= mv88e6xxx_port_fdb_add,
-   .fdb_del= mv88e6xxx_port_fdb_del,
-   .fdb_getnext= mv88e6xxx_port_fdb_getnext,
 };
 
 MODULE_ALIAS("platform:mv88e6172");
diff --git a/include/net/dsa.h b/include/net/dsa.h
index fbca63b..a090c8a 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -296,12 +296,16 @@ struct dsa_switch_driver {
 u32 br_port_mask);
int (*port_stp_update)(struct dsa_switch *ds, int port,
   u8 state);
-   int (*fdb_add)(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid);
-   int (*fdb_del)(struct dsa_switch *ds, int port,
-  const unsigned char *addr, u16 vid);
-   int (*fdb_getnext)(struct dsa_switch *ds, int port,
-  unsigned char *addr, bool *is_static);
+
+   /*
+* Forwarding database
+*/
+   int (*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid,
+   u8 addr[ETH_ALEN]);
+   int (*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid,
+   u8 addr[ETH_ALEN]);
+   int (*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid,
+   u8 addr[ETH_ALEN], bool *is_static);
 };
 
 void register_switch_driver(struct dsa_switch_driver *type);
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 0010c69..0f99a17 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "dsa_priv.h"
 
 /* slave mii_bus handling ***/
@@ -200,105 +201,6 @@ out:
return 0;
 }
 
-static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
-struct net_device *dev,
-const unsigned char *addr, u16 vid, u16 nlm_flags)
-{
-   struct dsa_slave_priv *p = netdev_priv(dev);
-   struct dsa_switch *ds = p->parent;
-   int ret = -EOPNOTSUPP;
-
-   if (ds->drv->fdb_add)
-   ret = ds->drv->fdb_add(ds, p->port, addr, vid);
-
-   return ret;
-}
-
-static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
-struct net_device *dev,
-const unsigned char *addr, u16 vid)
-{
-   struct dsa_slave_priv *p = netdev_priv(dev);
-   struct dsa_switch *ds = p->parent;
-   int ret = -EOPNOTSUPP;
-
-   if (ds->drv->fdb_del)
-   ret = ds->drv->fdb_del(ds, p->port, addr, vid);
-
-   return ret;
-}
-
-static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb,
-  const unsigned char *addr, u16 vid,
-  bool is_static,
-  u32 portid, u32 seq, int type,
-  unsigned int flags)
-{
-   struct nlmsghdr *nlh;
-   struct ndmsg *ndm;
-
-   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
-   if (!nlh)
-   return -EMSGSIZE;
-
-   ndm = nlmsg_data(nlh);
-   ndm->ndm_family  = AF_BRIDGE;
-   ndm->ndm_pad1= 0;
-   ndm->ndm_pad2= 0;
-   ndm->ndm_flags   =

[PATCH 2/4] net: switchdev: support static FDB addresses

2015-08-03 Thread Vivien Didelot

This patch adds an is_static boolean to the switchdev_obj_fdb structure,
in order to set the ndm_state to either NUD_NOARP or NUD_REACHABLE.

Signed-off-by: Vivien Didelot 
---
 include/net/switchdev.h   | 1 +
 net/switchdev/switchdev.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/net/switchdev.h b/include/net/switchdev.h
index e90e1a0..0e296b8 100644
--- a/include/net/switchdev.h
+++ b/include/net/switchdev.h
@@ -72,6 +72,7 @@ struct switchdev_obj {
struct switchdev_obj_fdb {  /* PORT_FDB */
u8 addr[ETH_ALEN];
u16 vid;
+   bool is_static;
} fdb;
} u;
 };
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 28786e8..b75897c 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -810,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device 
*dev,
ndm->ndm_flags   = NTF_SELF;
ndm->ndm_type= 0;
ndm->ndm_ifindex = dev->ifindex;
-   ndm->ndm_state   = NUD_REACHABLE;
+   ndm->ndm_state   = obj->u.fdb.is_static ? NUD_NOARP : NUD_REACHABLE;
 
if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr))
goto nla_put_failure;
-- 
2.4.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

net: dsa: support switchdev FDB objects

2015-08-03 Thread Vivien Didelot

This patchset refactors the DSA and mv88e6xxx code to use the switchdev FDB
objects.

The first two patches add minor but necessary changes to switchdev, the third
one implements the switchdev glue in DSA for FDB routines, and the fourth one
refactors the FDB access functions in the mv88e6xxx code.

Below is an example (ports 0-2 belong to br0, ports 3-4 belong to br1):

# bridge fdb add 3c:97:0e:11:30:6e dev swp2
# bridge fdb add 3c:97:0e:11:40:78 dev swp3
# bridge fdb add 3c:97:0e:11:50:86 dev swp4
# bridge fdb del 3c:97:0e:11:40:78 dev swp3
# bridge fdb
01:00:5e:00:00:01 dev eth0 self permanent
01:00:5e:00:00:01 dev eth1 self permanent
00:50:d2:10:78:15 dev swp0 master br0 permanent
3c:97:0e:11:30:6e dev swp2 self static
00:50:d2:10:78:15 dev swp3 master br1 permanent
3c:97:0e:11:50:86 dev swp4 self static
# cat /sys/kernel/debug/dsa0/atu
# DB   T/P  Vec State Addr
# 001  Port 004   e   3c:97:0e:11:30:6e
# 004  Port 010   e   3c:97:0e:11:50:86

For the 88E6xxx switches, FIDs 1 to num_ports will be reserved for non-bridged
ports and bridge groups, and the remaining will be later used by VLANs.

This change is necessary to welcome the support for hardware VLANs (which will
follow soon).

Cheers,
-v

Vivien Didelot (4):
  net: switchdev: change fdb addr for a byte array
  net: switchdev: support static FDB addresses
  net: dsa: add support for switchdev FDB objects
  net: dsa: mv88e6xxx: refactor FDB routines

 drivers/net/dsa/mv88e6171.c  |   6 +-
 drivers/net/dsa/mv88e6352.c  |   6 +-
 drivers/net/dsa/mv88e6xxx.c  | 205 ++--
 drivers/net/dsa/mv88e6xxx.h  |  31 +++--
 drivers/net/ethernet/rocker/rocker.c |   2 +-
 include/net/dsa.h|  16 ++-
 include/net/switchdev.h  |   3 +-
 net/bridge/br_fdb.c  |   2 +-
 net/dsa/slave.c  | 221 +++
 net/switchdev/switchdev.c|   6 +-
 10 files changed, 308 insertions(+), 190 deletions(-)

-- 
2.4.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] perf: Clear MSRs on kexec

2015-08-03 Thread Jiri Olsa

On Mon, Aug 03, 2015 at 11:54:17PM +0200, Peter Zijlstra wrote:
> On Mon, Aug 03, 2015 at 11:32:28PM +0200, Jiri Olsa wrote:
> > hi,
> > I'm getting following message on the kdump kernel start
> > 
> >   Broken BIOS detected, complain to your hardware vendor.\
> >   [Firmware Bug]: the BIOS has corrupted hw-PMU resources (MSR 38d is b0)
> > 
> > it seems to be caused by NMI watchdog being configured
> > and fixed counter values stays in MSRs, which triggers
> > warning in check_hw_exists and disables perf support
> > in kdump kernel.. which probably does not hurt ;-)
> > 
> > zeroing MSRs during kdump shutdown seems to work (attached)
> > but I'm not sure thats correct place for kdump perf callback
> 
> Right, but why bother? All that kernel needs to do is write a memory
> dump to someplace and reboot, right? The less you do, the less can go
> wrong.

well, I was hunting that 'Broken BIOS..' message which is wrong

I wouldn't think anyone wants to use perf under kdump kernel,
but you never know ;-)

jirka
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 3/3] cpuidle/coupled: Add sanity check for safe_state_index

2015-08-03 Thread Xunlei Pang

From: Xunlei Pang 

Since we're using cpuidle_driver::safe_state_index directly as the
target state index, it's better to add the sanity check at the point
of registering the driver.

Signed-off-by: Xunlei Pang 
---
 drivers/cpuidle/driver.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 5db1478..def299e 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -223,10 +223,23 @@ static void poll_idle_init(struct cpuidle_driver *drv) {}
 static int __cpuidle_register_driver(struct cpuidle_driver *drv)
 {
int ret;
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+   int i;
+#endif
 
if (!drv || !drv->state_count)
return -EINVAL;
 
+#ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
+   for (i = drv->state_count - 1; i >= 0; i--) {
+   if (cpuidle_state_is_coupled(drv, i) &&
+   (drv->safe_state_index == i ||
+drv->safe_state_index < 0 ||
+drv->safe_state_index >= drv->state_count))
+   return -EINVAL;
+   }
+#endif
+
if (cpuidle_disabled())
return -ENODEV;
 
-- 
1.9.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 1/3] cpuidle/coupled: Remove cpuidle_device::safe_state_index

2015-08-03 Thread Xunlei Pang

From: Xunlei Pang 

cpuidle_device::safe_state_index needs to be initialized before
use, it should be the same as cpuidle_driver::safe_state_index.

We tackled this issue by removing the safe_state_index from the
cpuidle_device structure and using the one in the cpuidle_driver
structure instead.

Suggested-by: Daniel Lezcano 
Signed-off-by: Xunlei Pang 
---
 drivers/cpuidle/coupled.c | 4 ++--
 include/linux/cpuidle.h   | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 7936dce..6493e40 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -473,7 +473,7 @@ int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
return entered_state;
}
entered_state = cpuidle_enter_state(dev, drv,
-   dev->safe_state_index);
+   drv->safe_state_index);
local_irq_disable();
}
 
@@ -521,7 +521,7 @@ retry:
}
 
entered_state = cpuidle_enter_state(dev, drv,
-   dev->safe_state_index);
+   drv->safe_state_index);
local_irq_disable();
}
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index d075d34..786ad32 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -84,7 +84,6 @@ struct cpuidle_device {
struct list_headdevice_list;
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
-   int safe_state_index;
cpumask_t   coupled_cpus;
struct cpuidle_coupled  *coupled;
 #endif
-- 
1.9.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/3] cpuidle/coupled: Remove redundant 'dev' argument of cpuidle_state_is_coupled()

2015-08-03 Thread Xunlei Pang

From: Xunlei Pang 

For cpuidle_state_is_coupled(), 'dev' is not used, so remove it.

Signed-off-by: Xunlei Pang 
---
 drivers/cpuidle/coupled.c | 4 +---
 drivers/cpuidle/cpuidle.c | 4 ++--
 drivers/cpuidle/cpuidle.h | 7 +++
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c
index 6493e40..1523e2d 100644
--- a/drivers/cpuidle/coupled.c
+++ b/drivers/cpuidle/coupled.c
@@ -176,14 +176,12 @@ void cpuidle_coupled_parallel_barrier(struct 
cpuidle_device *dev, atomic_t *a)
 
 /**
  * cpuidle_state_is_coupled - check if a state is part of a coupled set
- * @dev: struct cpuidle_device for the current cpu
  * @drv: struct cpuidle_driver for the platform
  * @state: index of the target state in drv->states
  *
  * Returns true if the target state is coupled with cpus besides this one
  */
-bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
-   struct cpuidle_driver *drv, int state)
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
 {
return drv->states[state].flags & CPUIDLE_FLAG_COUPLED;
 }
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 3325393..17a6dc0 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -214,7 +214,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct 
cpuidle_driver *drv,
tick_broadcast_exit();
}
 
-   if (!cpuidle_state_is_coupled(dev, drv, entered_state))
+   if (!cpuidle_state_is_coupled(drv, entered_state))
local_irq_enable();
 
diff = ktime_to_us(ktime_sub(time_end, time_start));
@@ -263,7 +263,7 @@ int cpuidle_select(struct cpuidle_driver *drv, struct 
cpuidle_device *dev)
 int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
  int index)
 {
-   if (cpuidle_state_is_coupled(dev, drv, index))
+   if (cpuidle_state_is_coupled(drv, index))
return cpuidle_enter_state_coupled(dev, drv, index);
return cpuidle_enter_state(dev, drv, index);
 }
diff --git a/drivers/cpuidle/cpuidle.h b/drivers/cpuidle/cpuidle.h
index ee97e96..178c5ad 100644
--- a/drivers/cpuidle/cpuidle.h
+++ b/drivers/cpuidle/cpuidle.h
@@ -34,15 +34,14 @@ extern int cpuidle_add_sysfs(struct cpuidle_device *dev);
 extern void cpuidle_remove_sysfs(struct cpuidle_device *dev);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
-bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
-   struct cpuidle_driver *drv, int state);
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state);
 int cpuidle_enter_state_coupled(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int next_state);
 int cpuidle_coupled_register_device(struct cpuidle_device *dev);
 void cpuidle_coupled_unregister_device(struct cpuidle_device *dev);
 #else
-static inline bool cpuidle_state_is_coupled(struct cpuidle_device *dev,
-   struct cpuidle_driver *drv, int state)
+static inline
+bool cpuidle_state_is_coupled(struct cpuidle_driver *drv, int state)
 {
return false;
 }
-- 
1.9.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC 0/2] VFIO: Add virtual MSI doorbell support.

2015-08-03 Thread Pranavkumar Sawargaonkar

Hi Bharat,

On 28 July 2015 at 23:28, Alex Williamson  wrote:
> On Tue, 2015-07-28 at 17:23 +, Bhushan Bharat wrote:
>> Hi Alex,
>>
>> > -Original Message-
>> > From: Alex Williamson [mailto:alex.william...@redhat.com]
>> > Sent: Tuesday, July 28, 2015 9:52 PM
>> > To: Pranavkumar Sawargaonkar
>> > Cc: k...@vger.kernel.org; kvm...@lists.cs.columbia.edu; linux-arm-
>> > ker...@lists.infradead.org; linux-kernel@vger.kernel.org;
>> > christoffer.d...@linaro.org; marc.zyng...@arm.com; will.dea...@arm.com;
>> > bhelg...@google.com; a...@arndb.de; rob.herr...@linaro.org;
>> > eric.au...@linaro.org; patc...@apm.com; Bhushan Bharat-R65777; Yoder
>> > Stuart-B08248
>> > Subject: Re: [RFC 0/2] VFIO: Add virtual MSI doorbell support.
>> >
>> > On Fri, 2015-07-24 at 14:33 +0530, Pranavkumar Sawargaonkar wrote:
>> > > In current VFIO MSI/MSI-X implementation, linux host kernel allocates
>> > > MSI/MSI-X vectors when userspace requests through vfio ioctls.
>> > > Vfio creates irqfd mappings to notify MSI/MSI-X interrupts to the
>> > > userspace when raised.
>> > > Guest OS will see emulated MSI/MSI-X controller and receives an
>> > > interrupt when kernel notifies the same via irqfd.
>> > >
>> > > Host kernel allocates MSI/MSI-X using standard linux routines like
>> > > pci_enable_msix_range() and pci_enable_msi_range().
>> > > These routines along with request_irq() in host kernel sets up
>> > > MSI/MSI-X vectors with Physical MSI/MSI-X addresses provided by
>> > > interrupt controller driver in host kernel.
>> > >
>> > > This means when a device is assigned with the guest OS, MSI/MSI-X
>> > > addresses present in PCIe EP are the PAs programmed by the host linux
>> > kernel.
>> > >
>> > > In x86 MSI/MSI-X physical address range is reserved and iommu is aware
>> > > about these addreses and transalation is bypassed for these address 
>> > > range.
>> > >
>> > > Unlike x86, ARM/ARM64 does not reserve MSI/MSI-X Physical address
>> > > range and all the transactions including MSI go through iommu/smmu
>> > without bypass.
>> > > This requires extending current vfio MSI layer with additional
>> > > functionality for ARM/ARM64 by 1. Programing IOVA (referred as a MSI
>> > > virtual doorbell address)
>> > >in device's MSI vector as a MSI address.
>> > >This IOVA will be provided by the userspace based on the
>> > >MSI/MSI-X addresses reserved for the guest.
>> > > 2. Create an IOMMU mapping between this IOVA and
>> > >Physical address (PA) assigned to the MSI vector.
>> > >
>> > > This RFC is proposing a solution for MSI/MSI-X passthrough for
>> > ARM/ARM64.
>> >
>> >
>> > Hi Pranavkumar,
>> >
>> > Freescale has the same, or very similar, need, so any solution in this 
>> > space
>> > will need to work for both ARM and powerpc.  I'm not a big fan of this
>> > approach as it seems to require the user to configure MSI/X via ioctl and 
>> > then
>> > call a separate ioctl mapping the doorbells.  That's more code for the 
>> > user,
>> > more code to get wrong and potentially a gap between configuring MSI/X
>> > and enabling mappings where we could see IOMMU faults.
>> >
>> > If we know that doorbell mappings are required, why can't we set aside a
>> > bank of IOVA space and have them mapped automatically as MSI/X is being
>> > configured?  Then the user's need for special knowledge and handling of 
>> > this
>> > case is limited to setup.  The IOVA space will be mapped and used as 
>> > needed,
>> > we only need the user to specify the IOVA space reserved for this.  Thanks,
>>
>> We probably need a mix of both to support Freescale PowerPC and ARM
>> based machines.
>> In this mix mode kernel vfio driver will reserve some IOVA for mapping
>> MSI page/s.
>
> If vfio is reserving pages independently from the user, this becomes
> what Marc called "shaping" the VM and what x86 effectively does.  An
> interface extension should expose these implicit regions so the user can
> avoid them for DMA memory mapping.
>
>>  If any other iova mapping will overlap with this then it will return
>> error and user-space. Ideally this should be chosen in such a way
>> that it never overlap, which is easy on some systems but can be tricky
>> on some other system like Freescale PowerPC. This is not sufficient
>> for at-least Freescale PowerPC based SOC. This is because of hardware
>> limitation, where we need to fit this reserved iova address within
>> aperture decided by user-space. So if we allow user-space to change
>> this reserved iova address to a value decided by user-space itself
>> then we can support both ARM/PowerPC based solutions.
>
> Yes, that's my intention, to allow userspace to specify the reserved
> region.  I believe you have some additional restrictions on the number
> of MSI banks available and whether MSI banks can be shared, but I would
> hope that doesn't preclude a shared interface with ARM.
>
>> I have some implementation ready/tested with this approach and if this
>> approach looks good then I

Re: [RFC][PATCH] ecryptfs: Allow only one instance per lower path

2015-08-03 Thread Richard Weinberger

Tyler,

Am 04.08.2015 um 01:07 schrieb Tyler Hicks:
>> Okay, then I'd argument to give my patch a try although it is not the 
>> solution
>> to the problem I've reported. :-)
>> If you don't mind I'll resend with a proper changelog.
> 
> That patch isn't correct since it assumes that all eCryptfs super blocks
> are equal if the lower paths (and, ultimately, the lower inode) are
> equal. However, the lower path is only one of many properties of an
> eCryptfs superblock. For example, the second mount may have been
> configured to use a different file encryption key.

How would this work if I mount /foo using AES to /mnt_a
and /foo again using 3DES to /mnt_b?
Wouldn't both ecryptfs instances kill each other's files?

Thanks,
//richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3] iio: adc: xilinx-xadc: Push interrupts into threaded context

2015-08-03 Thread Shubhrajyoti Datta

On Fri, Jul 24, 2015 at 6:08 PM, Lars-Peter Clausen  wrote:
> Hi,
>
> Sorry, but I don't think this patch has been sufficiently tested against a
> mainline kernel. The driver wont even probe the way it is right now.
>
> On 07/21/2015 01:14 AM, Xander Huff wrote:
>>
>> The driver currently registers a pair of irq handlers using
>> request_threaded_irq(), however the synchronization mechanism between the
>> hardirq and the threadedirq handler is a regular spinlock.
>
>
> If everything runs in threaded context we don't really need the spinlock
> anymore and can use the mutex throughout.

that should be better from the performance point of view.

>
>>
>> Unfortunately, this breaks PREEMPT_RT builds, where a spinlock can sleep,
>> and is thus not able to be acquired from a hardirq handler. This patch
>> gets
>> rid of the hardirq handler and pushes all interrupt handling into the
>> threaded context.
>
>
> We actually might as well run everything in the hardirq handler (which will
> be threaded in PREEMPT_RT). The reason why we have the threaded handler is
> because xadc_handle_event() used to sleep, but it doesn't do this anymore.

The point is why have the hard irq. If we use hardirq then no mutex
can be used and spinlock will
be busy.

is there something i may be missing?
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: Tree for Aug 4

2015-08-03 Thread Stephen Rothwell

Hi all,

Changes since 20150803:

The security tree gained a conflict against Linus' tree.

Non-merge commits (relative to Linus' tree): 5232
 5240 files changed, 257463 insertions(+), 119966 deletions(-)



I have created today's linux-next tree at
git://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
(patches at http://www.kernel.org/pub/linux/kernel/next/ ).  If you
are tracking the linux-next tree using git, you should not use "git pull"
to do so as that will try to merge the new linux-next release with the
old one.  You should use "git fetch" and checkout or reset to the new
master.

You can see which trees have been included by looking in the Next/Trees
file in the source.  There are also quilt-import.log and merge.log
files in the Next directory.  Between each merge, the tree was built
with a ppc64_defconfig for powerpc and an allmodconfig for x86_64,
a multi_v7_defconfig for arm and a native build of tools/perf. After
the final fixups (if any), it is also built with powerpc allnoconfig
(32 and 64 bit), ppc44x_defconfig and allyesconfig (this fails its final
link) and i386, sparc, sparc64 and arm defconfig.

Below is a summary of the state of the merge.

I am currently merging 224 trees (counting Linus' and 32 trees of patches
pending for Linus' tree).

Stats about the size of the tree over time can be seen at
http://neuling.org/linux-next-size.html .

Status of my local build tests will be at
http://kisskb.ellerman.id.au/linux-next .  If maintainers want to give
advice about cross compilers/configs that work, we are always open to add
more builds.

Thanks to Randy Dunlap for doing many randconfig builds.  And to Paul
Gortmaker for triage and bug fixes.

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au

$ git checkout master
$ git reset --hard stable
Merging origin/master (7e884479bf50 Merge branch 'for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client)
Merging fixes/master (c7e9ad7da219 Merge branch 'perf-urgent-for-linus' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip)
Merging kbuild-current/rc-fixes (3d1450d54a4f Makefile: Force gzip and xz on 
module install)
Merging arc-current/for-curr (e4140819dadc ARC: signal handling robustify)
Merging arm-current/fixes (3473f26592c1 ARM: 8405/1: VDSO: fix regression with 
toolchains lacking ld.bfd executable)
Merging m68k-current/for-linus (1214c525484c m68k: Use for_each_sg())
Merging metag-fixes/fixes (0164a711c97b metag: Fix ioremap_wc/ioremap_cached 
build errors)
Merging mips-fixes/mips-fixes (1795cd9b3a91 Linux 3.16-rc5)
Merging powerpc-fixes/fixes (b8d65e9662b1 powerpc/eeh-powernv: Fix unbalanced 
IRQ warning)
Merging powerpc-merge-mpe/fixes (bc0195aad0da Linux 4.2-rc2)
Merging sparc/master (4a10a91756ef Merge branch 'upstream' of 
git://git.infradead.org/users/pcmoore/audit)
Merging net/master (636dba8e12d7 act_mirred: avoid calling tcf_hash_release() 
when binding)
Merging ipsec/master (158cd4af8ded packet: missing dev_put() in 
packet_do_bind())
Merging sound-current/for-linus (8ec7cfce3762 ALSA: oxygen: Fix 
logical-not-parentheses warning)
Merging pci-current/for-linus (c9ddbac9c891 PCI: Restore PCI_MSIX_FLAGS_BIRMASK 
definition)
Merging wireless-drivers/master (741e3b9902d1 rtlwifi: rtl8723be: Add module 
parameter for MSI interrupts)
Merging driver-core.current/driver-core-linus (cbfe8fa6cd67 Linux 4.2-rc4)
Merging tty.current/tty-linus (cbfe8fa6cd67 Linux 4.2-rc4)
Merging usb.current/usb-linus (0f79fd807a24 Merge tag 'fixes-for-v4.2-rc6' of 
git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb into usb-linus)
Merging usb-gadget-fixes/fixes (c93e64e91248 usb: udc: core: add device_del() 
call to error pathway)
Merging usb-serial-fixes/usb-linus (74472233233f USB: sierra: add 1199:68AB 
device ID)
Merging staging.current/staging-linus (40c3ef9d2f14 staging: comedi: das1800: 
add missing break in switch)
Merging char-misc.current/char-misc-linus (eaf7e98d43c1 Merge tag 
'extcon-fixes-for-4.2-rc5' of 
git://git.kernel.org/pub/scm/linux/kernel/git/chanwoo/extcon into 
char-misc-linus)
Merging input-current/for-linus (073e570d7c2c Input: alps - only Dell laptops 
have separate button bits for v2 dualpoint sticks)
Merging crypto-current/master (17fb874dee09 hwrng: core - correct error check 
of kthread_run call)
Merging ide/master (d681f1166919 ide: remove deprecated use of pci api)
Merging devicetree-current/devicetree/merge (f76502aa9140 of/dynamic: Fix test 
for PPC_PSERIES)
Merging rr-fixes/fixes (fe0d34d242fa module: weaken locking assertion for oops 
path.)
Merging vfio-fixes/for-linus (4bc94d5dc95d vfio: Fix lockdep issue)
Merging kselftest-fixes/fixes (fee50f3c8427 selftests/futex: Fix 
futex_cmp_requeue_pi() error handling)
Merging backlight-fixes/for-backlight-fixes (68feaca0b13e backlight: pwm: 
Handle EPROBE_DEFER while requesting the PWM)
Merging ftrace-fixes/for-next-

[PATCH v2 net-next 2/2] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-08-03 Thread Sowmini Varadhan

Register pernet subsys init/stop functions that will set up
and tear down per-net RDS-TCP listen endpoints. Unregister
pernet subsys functions on 'modprobe -r' to clean up these
end points.

Enable keepalive on both accept and connect socket endpoints.
The keepalive timer expiration will ensure that client socket
endpoints will be removed as appropriate from the netns when
an interface is removed from a namespace.

Register a device notifier callback that will clean up all
sockets (and thus avoid the need to wait for keepalive timeout)
when the loopback device is unregistered from the netns indicating
that the netns is getting deleted.

Signed-off-by: Sowmini Varadhan 
---
v2: net_device notifier for synchronous cleanup of sockets.

 net/rds/tcp.c |  163 -
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |6 +-
 net/rds/tcp_listen.c  |   38 +++-
 4 files changed, 164 insertions(+), 50 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 98f5de3..339392b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "rds.h"
 #include "tcp.h"
@@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
 }
 
-static void rds_tcp_exit(void)
-{
-   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-   rds_tcp_listen_stop();
-   rds_tcp_destroy_conns();
-   rds_trans_unregister(_tcp_transport);
-   rds_tcp_recv_exit();
-   kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
 
 struct rds_transport rds_tcp_transport = {
.laddr_check= rds_tcp_laddr_check,
@@ -281,6 +275,138 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback  = 1,
 };
 
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+   struct socket *rds_tcp_listen_sock;
+   struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+   struct rds_tcp_net *rtn = container_of(work,
+  struct rds_tcp_net,
+  rds_tcp_accept_w);
+
+   while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+   cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+   struct net *net = sock_net(sk);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   queue_work(rds_wq, >rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+   if (!rtn->rds_tcp_listen_sock) {
+   pr_warn("could not set up listen sock\n");
+   return -EAFNOSUPPORT;
+   }
+   INIT_WORK(>rds_tcp_accept_w, rds_tcp_accept_worker);
+   return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   /* If rds_tcp_exit_net() is called as a result of netns deletion,
+* the rds_tcp_kill_sock() device notifier would already have cleaned
+* up the listen socket, thus there is no work to do in this function.
+*
+* If rds_tcp_exit_net() is called as a result of module unload,
+* i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
+* we do need to clean up the listen socket here.
+*/
+   if (rtn->rds_tcp_listen_sock) {
+   rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+   rtn->rds_tcp_listen_sock = NULL;
+   flush_work(>rds_tcp_accept_w);
+   }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+   .init = rds_tcp_init_net,
+   .exit = rds_tcp_exit_net,
+   .id = _tcp_netid,
+   .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+   struct rds_tcp_connection *tc, *_tc;
+   struct sock *sk;
+   struct list_head tmp_list;
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+   rtn->rds_tcp_listen_sock = NULL;
+   flush_work(>rds_tcp_accept_w);
+   INIT_LIST_HEAD(_list);
+   spin_lock_irq(_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, _tcp_conn_list, t_tcp_node) {
+   struct net *c_net = read_pnet(>conn->c_net);
+
+   if (net != c_net || !tc->t_sock)
+   continue;
+   list_del(>t_tcp_node);
+   list_add_tail(>t_tcp_node, _list);
+   }
+   spin_unlock_irq(_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, _list, t_tcp_node) {
+   sk = tc->t_sock->sk;
+   sk->sk_prot->disconnect(sk, 0);
+

[PATCH v2 net-next 1/2] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-08-03 Thread Sowmini Varadhan

Open the sockets calling sock_create_kern() with the correct struct net
pointer, and use that struct net pointer when verifying the
address passed to rds_bind().

Signed-off-by: Sowmini Varadhan 
---
v2: David Ahern comments.

 net/rds/bind.c|3 ++-
 net/rds/connection.c  |   16 ++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +++--
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +++--
 net/rds/rds.h |   23 +++
 net/rds/send.c|3 ++-
 net/rds/tcp.c |4 ++--
 net/rds/tcp_connect.c |3 ++-
 net/rds/tcp_listen.c  |   16 
 net/rds/transport.c   |4 ++--
 12 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock->sk),
+   sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
conn->c_faddr = faddr;
spin_lock_init(>c_lock);
conn->c_next_tx_seq = 1;
+   rds_conn_net_set(conn, net);
 
init_waitqueue_head(>c_waitq);
INIT_LIST_HEAD(>c_send_queue);
@@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy "routing".
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, _ib_transport,
-  GFP_KERNEL);
+   /* RDS/IB is not currently netns aware, thus init_net */
+   conn = rds_conn_create(_net, dp->dp_daddr, dp->dp_saddr,
+  _ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create

[PATCH v2 net-next 0/2] RDS-TCP: Network namespace support

2015-08-03 Thread Sowmini Varadhan

This patch series contains the set of changes to correctly set up 
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) net
namespace.  The changes in this patch set ensure that the execution of 
'modprobe [-r] rds_tcp' sets up the kernel TCP sockets 
relative to the current netns, so that RDS applications can send/recv
packets from that netns, and the netns can later be deleted cleanly.

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allow dynamic
creation/tear-down of RDS-TCP client and server sockets  across all
current and future namespaces. 

v2 changes from RFC sent out earlier:
David Ahern comments in patch 1, net_device notifier in patch 2, 
patch 3 broken off and submitted separately.

Sowmini Varadhan (2):
  Make RDS-TCP work correctly when it is set up in a netns other than
init_net
  Support multiple RDS-TCP listen endpoints, one per netns.

 net/rds/bind.c|3 +-
 net/rds/connection.c  |   16 +++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +-
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +-
 net/rds/rds.h |   23 ++-
 net/rds/send.c|3 +-
 net/rds/tcp.c |  167 +++-
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |9 ++-
 net/rds/tcp_listen.c  |   40 
 net/rds/transport.c   |4 +-
 13 files changed, 216 insertions(+), 70 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: perf eBPF patch ordering. was: Re: perf test LLVM was: Re: [GIT PULL 00/39] perf tools: filtering events using eBPF programs

2015-08-03 Thread Wangnan (F)


Hi Arnaldo,

The following changes since commit 922cc21746202956acb41c89a6190bb50805fa31:

  perf tools: Introduce llvm config options (2015-07-31 12:17:50 -0300)

are available in the git repository at:

  https://github.com/WangNan0/linux.git ebpf

for you to fetch changes up to d85bf4b6470b8d860bbae25418e5ae3ccd9711e8:

  perf tools: Support attach BPF program on uprobe events (2015-08-04 
04:59:20 +)




The new cset has following improvements:

1. Improve error message: now don't dump LLVM environment setting messages
   if clang is found. Also, describe how to pre-compile .c file into .o.

   See: perf tools: Call clang to compile C source to object code
https://github.com/WangNan0/linux/commit/264676a5b922aaf1e9be3800fe06d5b67b06cd12

2. Reorder patches, so when 'perf record' is able to accept '--event 
file.c', the BPF filter
   should work. Also, an example BPF script file is provided, and the 
compilation

   method is described in commit message.

   See:
perf tools: Infrastructure for compiling scriptlets when 
passing '.c' to --event

https://github.com/WangNan0/linux/commit/eca622f4a88e1a791fc2405c398256ad572eba54

3. Introduce 'perf test BPF', which uses the previously introduced scriptlet, 
fork a

   'perf record' to utilise it and uses 'perf report' to check the result.

   See: perf tests: Enforce LLVM test for BPF test
https://github.com/WangNan0/linux/commit/a7cdab453863c580446dc2c3a3f3a86f21b770ce

perf test: Enable 'perf test' run as test targets
https://github.com/WangNan0/linux/commit/b14f2627e95d348be5ec19bd24a5117e8c2ffe46
and
perf test: Add 'perf test BPF'
https://github.com/WangNan0/linux/commit/8414217dbfa57df4dbb55642dc26205e1c7cbdf1

4. Fix a bug that if the filename doesn't contain '/' it is recognised 
as event name then
   failed to be applied by order adjusting in parse-events.l: bring 
{bpf_object} and

   {bpf_source} ahead.

You need to pop 9 patches from your perf/ebpf tree and rebase my tree. 
However, until
patch "perf tools: Enable passing bpf object file to --event" the 
changes are tiny. Please

check.

Thank you.

On 2015/8/4 3:49, Arnaldo Carvalho de Melo wrote:

Em Mon, Aug 03, 2015 at 01:11:16PM -0300, Arnaldo Carvalho de Melo escreveu:

  ERROR:unable to compile ./foo.c
  Hint:Check error message shown above.
 LLVM 3.7 or newer is required. Which can be found from http://llvm.org
 You may want to try git trunk:
 git clone http://llvm.org/git/llvm.git
  and



  or: perf record [] --  []
  -e, --eventevent selector. use 'perf list' to list available 
events
  [root@felicio ~]#
Now to find a hello.c BPF scriptlet...

So, we do not need to provide all this LLVM environment installation
hints when we get to any error, i.e. the one above was just because
"./foo.c" doesn't exist, clang ran successfully, so no need for telling
the user how to install it.

The following error also shouldn't emit those hints:

   [root@felicio ~]# perf record -e ./lock_page.bpf.c sleep 1
   /root/./lock_page.bpf.c:1:5: error: expected parameter declarator
   SEC("lock_page=__lock_page page->flags")
   ^
   /root/./lock_page.bpf.c:1:5: error: expected ')'
   /root/./lock_page.bpf.c:1:4: note: to match this '('
   SEC("lock_page=__lock_page page->flags")
  ^
   /root/./lock_page.bpf.c:1:1: warning: type specifier missing, defaults to 
'int' [-Wimplicit-int]
   SEC("lock_page=__lock_page page->flags")
   ^
   /root/./lock_page.bpf.c:1:41: error: expected ';' after top level declarator
   SEC("lock_page=__lock_page page->flags")
   ^
   ;
   /root/./lock_page.bpf.c:2:22: warning: declaration of 'struct pt_regs' will 
not be visible outside of this function [-Wvisibility]
   int lock_page(struct pt_regs *ctx, int err, unsigned long flags)
  ^
   2 warnings and 3 errors generated.
   ERROR:   unable to compile ./lock_page.bpf.c
   Hint:Check error message shown above.
LLVM 3.7 or newer is required. Which can be found from http://llvm.org
You may want to try git trunk:
git clone http://llvm.org/git/llvm.git
 and
git clone http://llvm.org/git/clang.git

Or fetch the latest clang/llvm 3.7 from pre-built llvm packages for
debian/ubuntu:
http://llvm.org/apt

If you are using old version of clang, change 'clang-bpf-cmd-template'
option in [llvm] section of ~/.perfconfig to:

  "$CLANG_EXEC $CLANG_OPTIONS $KERNEL_INC_OPTIONS \
 -working-directory $WORKING_DIR -c $CLANG_SOURCE \
 -emit-llvm -o - | /path/to/llc -march=bpf -filetype=obj -o -"
(Replace /path/to/llc with path to your llc)

   Hint:You can also pre-compile it into .o
   invalid or unsupported event:

Re: [PATCH] net: dsa: fix EDSA frame from hwaccel frame

2015-08-03 Thread David Miller

From: Vivien Didelot 
Date: Sun,  2 Aug 2015 21:46:02 -0400

> If the underlying network device features NETIF_F_HW_VLAN_CTAG_TX,
> an EDSA frame is prepended with a 802.1q header once queued.
> 
> To fix this, push the VLAN tag to the payload if present, before
> checking the frame protocol.
> 
> [note: we may prefer to access directly VLAN TCI from hwaccel frames,
> but this approach is simpler.]
> 
> Signed-off-by: Vivien Didelot 

This is a bug fix so should target 'net', but you generated the patch
against 'net-next'.

In any event, you should be explicit about the tree you are targeting
in order to not waste my time like this, by simply specifying the
tree in your "[PATCH xxx]" text in your subject line.   Either
"[PATCH net]" or "[PATCH net-next]".

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] cpuidle/coupled: Init cpuidle_device::safe_state_index

2015-08-03 Thread pang . xunlei

Hi Daniel,

Daniel Lezcano  wrote 2015-08-04 AM 12:22:54:
> Re: [PATCH] cpuidle/coupled: Init cpuidle_device::safe_state_index
> 
> On 07/23/2015 02:31 PM, Xunlei Pang wrote:
> > From: Xunlei Pang 
> >
> > cpuidle_device::safe_state_index needs to be initialized before use,
> > so assign the driver's safe_state_index to it.
> >
> > Signed-off-by: Xunlei Pang 
> > ---
> >   drivers/cpuidle/cpuidle.c | 2 ++
> >   1 file changed, 2 insertions(+)
> >
> > diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> > index e8e2775..ed5c8efe 100644
> > --- a/drivers/cpuidle/cpuidle.c
> > +++ b/drivers/cpuidle/cpuidle.c
> > @@ -585,6 +585,8 @@ int cpuidle_register(struct cpuidle_driver *drv,
> >  */
> > if (coupled_cpus)
> >device->coupled_cpus = *coupled_cpus;
> > +
> > +  device->safe_state_index = drv->safe_state_index;
> 
> Hey, good catch. We are lucky the safe_state_index is always zero.
> 
> I think we can simplify the code by removing the safe_state_index from 
> the cpuidle_device structure and use the one in the cpuidle_driver 
> structure in coupled.c

Will do, thanks!

Regards,
-Xunlei


ZTE Information Security Notice: The information contained in this mail (and 
any attachment transmitted herewith) is privileged and confidential and is 
intended for the exclusive use of the addressee(s).  If you are not an intended 
recipient, any disclosure, reproduction, distribution or other dissemination or 
use of the information contained is strictly prohibited.  If you have received 
this mail in error, please delete it and notify us immediately.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] smaps: fill missing fields for vma(VM_HUGETLB)

2015-08-03 Thread Naoya Horiguchi

On Tue, Aug 04, 2015 at 02:55:30AM +, Naoya Horiguchi wrote:
> On Wed, Jul 29, 2015 at 04:20:59PM -0700, Mike Kravetz wrote:
> > On 07/29/2015 12:08 PM, David Rientjes wrote:
> > >On Tue, 28 Jul 2015, Jörn Engel wrote:
> > >
> > >>Well, we definitely need something.  Having a 100GB process show 3GB of
> > >>rss is not very useful.  How would we notice a memory leak if it only
> > >>affects hugepages, for example?
> > >>
> > >
> > >Since the hugetlb pool is a global resource, it would also be helpful to
> > >determine if a process is mapping more than expected.  You can't do that
> > >just by adding a huge rss metric, however: if you have 2MB and 1GB
> > >hugepages configured you wouldn't know if a process was mapping 512 2MB
> > >hugepages or 1 1GB hugepage.
> > >
> > >That's the purpose of hugetlb_cgroup, after all, and it supports usage
> > >counters for all hstates.  The test could be converted to use that to
> > >measure usage if configured in the kernel.
> > >
> > >Beyond that, I'm not sure how a per-hstate rss metric would be exported to
> > >userspace in a clean way and other ways of obtaining the same data are
> > >possible with hugetlb_cgroup.  I'm not sure how successful you'd be in
> > >arguing that we need separate rss counters for it.
> >
> > If I want to track hugetlb usage on a per-task basis, do I then need to
> > create one cgroup per task?
> >
> > For example, suppose I have many tasks using hugetlb and the global pool
> > is getting low on free pages.  It might be useful to know which tasks are
> > using hugetlb pages, and how many they are using.
> >
> > I don't actually have this need (I think), but it appears to be what
> > Jörn is asking for.
> 
> One possible way to get a hugetlb metric on a per-task basis is to walk page
> table via /proc/pid/pagemap, and counting page flags for each mapped page
> (we can easily do this with tools/vm/page-types.c like "page-types -p 
> -b huge"). This is obviously slower than just storing the counter as
> in-kernel data and just exporting it, but might be useful in some situation.

BTW, currently smaps doesn't report any meaningful info for vma(VM_HUGETLB).
I wrote the following patch, which hopefully is helpful for your purpose.

Thanks,
Naoya Horiguchi

---
From: Naoya Horiguchi 
Subject: [PATCH] smaps: fill missing fields for vma(VM_HUGETLB)

Currently smaps reports many zero fields for vma(VM_HUGETLB), which is
inconvenient when we want to know per-task or per-vma base hugetlb usage.
This patch enables these fields by introducing smaps_hugetlb_range().

before patch:

  Size:  20480 kB
  Rss:   0 kB
  Pss:   0 kB
  Shared_Clean:  0 kB
  Shared_Dirty:  0 kB
  Private_Clean: 0 kB
  Private_Dirty: 0 kB
  Referenced:0 kB
  Anonymous: 0 kB
  AnonHugePages: 0 kB
  Swap:  0 kB
  KernelPageSize: 2048 kB
  MMUPageSize:2048 kB
  Locked:0 kB
  VmFlags: rd wr mr mw me de ht

after patch:

  Size:  20480 kB
  Rss:   18432 kB
  Pss:   18432 kB
  Shared_Clean:  0 kB
  Shared_Dirty:  0 kB
  Private_Clean: 0 kB
  Private_Dirty: 18432 kB
  Referenced:18432 kB
  Anonymous: 18432 kB
  AnonHugePages: 0 kB
  Swap:  0 kB
  KernelPageSize: 2048 kB
  MMUPageSize:2048 kB
  Locked:0 kB
  VmFlags: rd wr mr mw me de ht

Signed-off-by: Naoya Horiguchi 
---
 fs/proc/task_mmu.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..c7218603306d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -610,12 +610,39 @@ static void show_smap_vma_flags(struct seq_file *m, 
struct vm_area_struct *vma)
seq_putc(m, '\n');
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+unsigned long addr, unsigned long end,
+struct mm_walk *walk)
+{
+   struct mem_size_stats *mss = walk->private;
+   struct vm_area_struct *vma = walk->vma;
+   struct page *page = NULL;
+
+   if (pte_present(*pte)) {
+   page = vm_normal_page(vma, addr, *pte);
+   } else if (is_swap_pte(*pte)) {
+   swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+   if (is_migration_entry(swpent))
+   page = migration_entry_to_page(swpent);
+   }
+   if (page)
+   smaps_account(mss, page, huge_page_size(hstate_vma(vma)),
+ pte_young(*pte), pte_dirty(*pte));
+   return 0;
+}
+#endif /* HUGETLB_PAGE */
+
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry =

Re: [PATCH] net_dbg_ratelimited: turn into no-op when !DEBUG

2015-08-03 Thread David Miller

From: Joe Perches 
Date: Mon, 03 Aug 2015 21:02:21 -0700

> On Mon, 2015-08-03 at 20:57 -0700, Joe Perches wrote:
>> On Tue, 2015-08-04 at 05:26 +0200, Jason A. Donenfeld wrote:
>> > This patch replaces calls to net_dbg_ratelimited when !DEBUG with
>> > no_printk, keeping with the idiom of all the other debug print helpers.
>> 
>> Makes sense, thanks Jason.
> 
> Perhaps better still would be to use if (0) no_printk so that
> the call and whatever argument calls the net_dbg_ratelimited
> makes are completely eliminated.

Agreed. Jason please respin your patch to work this way.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 4.2-rc5 rcu stalls.

2015-08-03 Thread Sasha Levin

On 08/03/2015 06:03 PM, Paul E. McKenney wrote:
>> > Ugh, that doesn't revert cleanly.  Got something handy ?
> I do not, but perhaps either Sasha or Frederic do.

I've attached a revert courtesy of Peter.


Thanks,
Sasha

 include/linux/preempt.h | 12 
 kernel/sched/core.c | 34 +++---
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 84991f185173..3a93d4cdcce9 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -137,18 +137,6 @@ extern void preempt_count_sub(int val);
 #define preempt_count_inc() preempt_count_add(1)
 #define preempt_count_dec() preempt_count_sub(1)
 
-#define preempt_active_enter() \
-do { \
-	preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-	barrier(); \
-} while (0)
-
-#define preempt_active_exit() \
-do { \
-	barrier(); \
-	preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET); \
-} while (0)
-
 #ifdef CONFIG_PREEMPT_COUNT
 
 #define preempt_disable() \
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..bd378bd21a0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2983,7 +2983,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
  *  - return from syscall or exception to user-space
  *  - return from interrupt-handler to user-space
  *
- * WARNING: must be called with preemption disabled!
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2992,6 +2994,7 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch();
@@ -3058,6 +3061,8 @@ static void __sched __schedule(void)
 	}
 
 	balance_callback(rq);
+
+	sched_preempt_enable_no_resched();
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -3078,9 +3083,7 @@ asmlinkage __visible void __sched schedule(void)
 
 	sched_submit_work(tsk);
 	do {
-		preempt_disable();
 		__schedule();
-		sched_preempt_enable_no_resched();
 	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
@@ -3119,14 +3122,15 @@ void __sched schedule_preempt_disabled(void)
 static void __sched notrace preempt_schedule_common(void)
 {
 	do {
-		preempt_active_enter();
+		__preempt_count_add(PREEMPT_ACTIVE);
 		__schedule();
-		preempt_active_exit();
+		__preempt_count_sub(PREEMPT_ACTIVE);
 
 		/*
 		 * Check again in case we missed a preemption opportunity
 		 * between schedule and now.
 		 */
+		barrier();
 	} while (need_resched());
 }
 
@@ -3172,13 +3176,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		return;
 
 	do {
-		/*
-		 * Use raw __prempt_count() ops that don't call function.
-		 * We can't call functions before disabling preemption which
-		 * disarm preemption tracing recursions.
-		 */
-		__preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
-		barrier();
+		__preempt_count_add(PREEMPT_ACTIVE);
 		/*
 		 * Needs preempt disabled in case user_exit() is traced
 		 * and the tracer calls preempt_enable_notrace() causing
@@ -3188,8 +3186,8 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 		__schedule();
 		exception_exit(prev_ctx);
 
+		__preempt_count_sub(PREEMPT_ACTIVE);
 		barrier();
-		__preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
 	} while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
@@ -3212,11 +3210,17 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 	prev_state = exception_enter();
 
 	do {
-		preempt_active_enter();
+		__preempt_count_add(PREEMPT_ACTIVE);
 		local_irq_enable();
 		__schedule();
 		local_irq_disable();
-		preempt_active_exit();
+		__preempt_count_sub(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
 	} while (need_resched());
 
 	exception_exit(prev_state);

[PATCH 1/2] x86/lguest: clean up lguest_setup_irq.

2015-08-03 Thread Rusty Russell

We make it static and hoist it higher in the file for the next patch.
We also give a nice panic if it fails during boot.

Signed-off-by: Rusty Russell 
---
 arch/x86/lguest/boot.c | 43 ++-
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 433e5a7dd37f..f38b7e8a88d2 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -835,6 +835,26 @@ static struct irq_chip lguest_irq_controller = {
.irq_unmask = enable_lguest_irq,
 };
 
+/*
+ * Interrupt descriptors are allocated as-needed, but low-numbered ones are
+ * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
+ * tells us the irq is already used: other errors (ie. ENOMEM) we take
+ * seriously.
+ */
+static int lguest_setup_irq(unsigned int irq)
+{
+   int err;
+
+   /* Returns -ve error or vector number. */
+   err = irq_alloc_desc_at(irq, 0);
+   if (err < 0 && err != -EEXIST)
+   return err;
+
+   irq_set_chip_and_handler_name(irq, _irq_controller,
+ handle_level_irq, "level");
+   return 0;
+}
+
 static int lguest_enable_irq(struct pci_dev *dev)
 {
u8 line = 0;
@@ -879,26 +899,6 @@ static void __init lguest_init_IRQ(void)
 }
 
 /*
- * Interrupt descriptors are allocated as-needed, but low-numbered ones are
- * reserved by the generic x86 code.  So we ignore irq_alloc_desc_at if it
- * tells us the irq is already used: other errors (ie. ENOMEM) we take
- * seriously.
- */
-int lguest_setup_irq(unsigned int irq)
-{
-   int err;
-
-   /* Returns -ve error or vector number. */
-   err = irq_alloc_desc_at(irq, 0);
-   if (err < 0 && err != -EEXIST)
-   return err;
-
-   irq_set_chip_and_handler_name(irq, _irq_controller,
- handle_level_irq, "level");
-   return 0;
-}
-
-/*
  * Time.
  *
  * It would be far better for everyone if the Guest had its own clock, but
@@ -1028,7 +1028,8 @@ static void lguest_time_irq(unsigned int irq, struct 
irq_desc *desc)
 static void lguest_time_init(void)
 {
/* Set up the timer interrupt (0) to go to our simple timer routine */
-   lguest_setup_irq(0);
+   if (lguest_setup_irq(0) != 0)
+   panic("Could not set up timer irq");
irq_set_handler(0, lguest_time_irq);
 
clocksource_register_hz(_clock, NSEC_PER_SEC);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] x86/lguest: Do not setup unused irq vectors

2015-08-03 Thread Rusty Russell

From: Thomas Gleixner 

No point in assigning the interrupt vectors if there is no interrupt
chip installed. Move it to lguest_setup_irq().

(And call it from lguest_enable_irq).

Signed-off-by: Thomas Gleixner 
Signed-off-by: Rusty Russell  (fixed typo)
Signed-off-by: Rusty Russell 
---
 arch/x86/lguest/boot.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f38b7e8a88d2..2566c97c01c8 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -850,21 +850,29 @@ static int lguest_setup_irq(unsigned int irq)
if (err < 0 && err != -EEXIST)
return err;
 
+   /*
+* Tell the Linux infrastructure that the interrupt is
+* controlled by our level-based lguest interrupt controller.
+*/
irq_set_chip_and_handler_name(irq, _irq_controller,
  handle_level_irq, "level");
+
+   /* Some systems map "vectors" to interrupts weirdly.  Not us! */
+   __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq], irq);
return 0;
 }
 
 static int lguest_enable_irq(struct pci_dev *dev)
 {
+   int err;
u8 line = 0;
 
/* We literally use the PCI interrupt line as the irq number. */
pci_read_config_byte(dev, PCI_INTERRUPT_LINE, );
-   irq_set_chip_and_handler_name(line, _irq_controller,
- handle_level_irq, "level");
-   dev->irq = line;
-   return 0;
+   err = lguest_setup_irq(line);
+   if (!err)
+   dev->irq = line;
+   return err;
 }
 
 /* We don't do hotplug PCI, so this shouldn't be called. */
@@ -875,17 +883,13 @@ static void lguest_disable_irq(struct pci_dev *dev)
 
 /*
  * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware
- * interrupt (except 128, which is used for system calls), and then tells the
- * Linux infrastructure that each interrupt is controlled by our level-based
- * lguest interrupt controller.
+ * interrupt (except 128, which is used for system calls).
  */
 static void __init lguest_init_IRQ(void)
 {
unsigned int i;
 
for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
-   /* Some systems map "vectors" to interrupts weirdly.  Not us! */
-   __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != IA32_SYSCALL_VECTOR)
set_intr_gate(i, irq_entries_start +
8 * (i - FIRST_EXTERNAL_VECTOR));
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 2/7] x86/lguest: Do not setup unused irq vectors

2015-08-03 Thread Rusty Russell

Thomas Gleixner  writes:
> On Mon, 3 Aug 2015, Rusty Russell wrote:
>> Thomas Gleixner  writes:
>> > +
>> > +  /* Some systems map "vectors" to interrupts weirdly.  Not us! */
>> > +  __this_cpu_write(vector_irq[FIRST_EXTERNAL_VECTOR + irq, irq);
>> 
>> Missing ].
>
> Doh.
>
>> [   17.751889] do_IRQ: 0.33 No irq handler for vector (irq -1)
>> 
>> You broke interrupts :(
>
> Right, because I missed the other place which fiddles with
> interrupts. Does the patch below fix the issue?

Yep.  I added error handling.

I reworked it into two patches: one which staticizes lguest_setup_irq()
and moves it up, the other of which applies your changes.

Will post, you can take them...

Thanks,
Rusty.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V9 0/5] map GHES memory region according to EFI memory map

2015-08-03 Thread Borislav Petkov

On Mon, Aug 03, 2015 at 05:23:54PM +0100, Matt Fleming wrote:
> Rafael, Boris?

The ghes.c change looks fine I guess. The whole patchset makes sense
now, with the arch bits extracted. So

Acked-by: Borislav Petkov 

However, we probably should work towards adhering to EFI memory
attributes on x86, long term, as we talked. But that's a future thing.

Thanks.

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH -v2 6/8] jump_label: Add a new static_key interface

2015-08-03 Thread Borislav Petkov

On Mon, Aug 03, 2015 at 09:07:53PM -0700, Andy Lutomirski wrote:
> Except that, with the new interface, static_key_likely is the other
> way around, right?  If the key is true (i.e. enabled), then it doesn't
> branch.
> 
> I think of the key as a boolean thing that happens to work by code
> patching under the hood.  The fancy patching affects the performance
> but doesn't really make it functionally different from a regular
> variable.  How about making it extra explicit:
> 
> static_key_set(&key, value);
> 
> where value is a bool or maybe even an unsigned int?

Let's have an actual example:

+   if (static_branch_likely(&__use_tsc)) {
+   u64 tsc_now = rdtsc();
+
+   /* return the value in ns */
+   return cycles_2_ns(tsc_now);
+   }

Well, I can see how the likely/unlikely things can confuse. They
actually don't have anything to do with where we will branch to but how
the code will be laid out, AFAICT. So I'm reading this as:

if (use_tsc) {
RDTSC;
return;
}

and then it is straightforward.

So in this case, the jump will be disabled and we won't branch anywhere.
It actually becomes:

RDTSC;
return;

which can't get any more optimal than it is.

Hmm, yeah, I see how that can be confusing... But the asm is finally
fine. Hey, at least one thing...

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH -v2 6/8] jump_label: Add a new static_key interface

2015-08-03 Thread Andy Lutomirski

On Mon, Aug 3, 2015 at 8:37 PM, Borislav Petkov  wrote:
> On Mon, Aug 03, 2015 at 05:57:57PM -0400, Steven Rostedt wrote:
>> That's implementation details, not a general concept that users will
>> need to know about.
>
> Why?
>
> It is a branch, regardless of which insn is used on which arch - it is
> either active and you *branch* to that code or *inactive* and you don't.
> So now it is actually what it should've been from the beginning...

Except that, with the new interface, static_key_likely is the other
way around, right?  If the key is true (i.e. enabled), then it doesn't
branch.

I think of the key as a boolean thing that happens to work by code
patching under the hood.  The fancy patching affects the performance
but doesn't really make it functionally different from a regular
variable.  How about making it extra explicit:

static_key_set(&key, value);

where value is a bool or maybe even an unsigned int?

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [TRIVIAL PATCH] block: Correct misuses of 0x%

2015-08-03 Thread yalin wang


> On Aug 4, 2015, at 10:27, Joe Perches  wrote:
> 
> On Tue, 2015-08-04 at 10:19 +0800, yalin wang wrote:
>> Ping ? 
>>> On Aug 3, 2015, at 16:56, yalin wang  wrote:
>>> 
>>> 
 On Aug 3, 2015, at 16:03, Joe Perches  wrote:
 
 On Mon, 2015-08-03 at 15:25 +0800, yalin wang wrote:
>> On Aug 3, 2015, at 04:25, Joe Perches  wrote:
>> 
>> Correct misuse of 0x%d in logging messages.
>> 
 []
> Why not use it like this: dev_dbg(&h->pdev->dev, "   Max outstanding 
> commands = %#x\n" ?
> %#x will add the 0x prefix automatically.
 
 It's generally a consistency thing.
 A 0 value would be emitted as 0 and not 0x0.
 
>>> i try on my ubuntu , 
>>> 
>>> static int __init throtl_init(void) 
>>>   
>>> {   
>>>  
>>>printk("module init test: %#x %p\n", 0, (void *)0x123);  
>>> 
>>> return 0;   
>>> 
>>> 
>>> }   
>>> 
>>> 
>>> module_init(throtl_init); 
>>> 
>>> #uname -a
>>> Linux ubuntu 3.16.0-38-generic #52~14.04.1-Ubuntu SMP Fri May 8 09:43:57 
>>> UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
>>> 
>>> #dmesg
>>> [259356.375586] module init test: 0x0 0123
>>> 
>>> it seems don’t need 0x%x for 0, just need %#x for all numbers.
>>> there are lots of use like this, i can change them if needed:
>>> 
>>> # egrep -r  -i '0x%\d*x'  .  | wc -l 
>>> 11776
> 
> I suggest not, it's not a standard usage and the 0
> may be unexpected.
> 
OK, printk for %x is indeed not compatible with glibc's printf behavior.
Another difference is %p: printk prints %p as hex without a 0x prefix,
while printf prints %p with a 0x prefix. Does this need to change,
so that we don't need lots of 0x%p in printk calls?









--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] net_dbg_ratelimited: turn into no-op when !DEBUG

2015-08-03 Thread Joe Perches

On Mon, 2015-08-03 at 20:57 -0700, Joe Perches wrote:
> On Tue, 2015-08-04 at 05:26 +0200, Jason A. Donenfeld wrote:
> > This patch replaces calls to net_dbg_ratelimited when !DEBUG with
> > no_printk, keeping with the idiom of all the other debug print helpers.
> 
> Makes sense, thanks Jason.

Perhaps better still would be to use if (0) no_printk so that
the call and whatever argument calls the net_dbg_ratelimited
makes are completely eliminated.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] net_dbg_ratelimited: turn into no-op when !DEBUG

2015-08-03 Thread Joe Perches

On Tue, 2015-08-04 at 05:26 +0200, Jason A. Donenfeld wrote:
> The pr_debug family of functions turns into a no-op when -DDEBUG is not
> specified, opting instead to call "no_printk", which gets compiled to a
> no-op (but retains gcc's nice warnings about printf-style arguments).
> 
> The problem with net_dbg_ratelimited is that it is defined to be a
> variant of net_ratelimited_function, which expands to essentially:
> 
> if (net_ratelimit())
> pr_debug(fmt, ...);
> 
> When DEBUG is not defined, then this becomes,
> 
> if (net_ratelimit())
> ;
> 
> This seems benign, except it isn't. Firstly, there's the obvious
> overhead of calling net_ratelimit needlessly, which does quite some book
> keeping for the rate limiting. Given that the pr_debug and
> net_dbg_ratelimited family of functions are sprinkled liberally through
> performance critical code, with developers assuming they'll be compiled
> out to a no-op most of the time, we certainly do not want this needless
> book keeping. Secondly, and most visibly, even though no debug message
> is printed when DEBUG is not defined, if there is a flood of
> invocations, dmesg winds up peppered with messages such as
> "net_ratelimit: 320 callbacks suppressed". This is because our
> aforementioned net_ratelimit() function actually prints this text in
> some circumstances. It's especially odd to see this when there isn't any
> other accompanying debug message.
> 
> So, in sum, it doesn't make sense to have this function's current
> behavior, and instead it should match what every other debug family of
> functions in the kernel does with !DEBUG -- nothing.
> 
> This patch replaces calls to net_dbg_ratelimited when !DEBUG with
> no_printk, keeping with the idiom of all the other debug print helpers.

Makes sense, thanks Jason.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] Reverted "selftests: add hugetlbfstest"

2015-08-03 Thread Mike Kravetz


Rebased as suggested by Naoya Horiguchi

This manually reverts 7e50533d4b84289e4f01de56d6f98e9c64e2229e

The hugetlbfstest test depends on hugetlb pages being counted
in a task's rss.  This functionality is not in the kernel, so
the test will always fail.  Remove test to avoid confusion.

Signed-off-by: Mike Kravetz 
---
 tools/testing/selftests/vm/Makefile|  1 -
 tools/testing/selftests/vm/hugetlbfstest.c | 86 
--

 tools/testing/selftests/vm/run_vmtests | 11 
 3 files changed, 98 deletions(-)
 delete mode 100644 tools/testing/selftests/vm/hugetlbfstest.c

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile

index 2da6608..bb888c6 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -4,7 +4,6 @@ CFLAGS = -Wall
 BINARIES = compaction_test
 BINARIES += hugepage-mmap
 BINARIES += hugepage-shm
-BINARIES += hugetlbfstest
 BINARIES += map_hugetlb
 BINARIES += mlock2-tests
 BINARIES += on-fault-limit
diff --git a/tools/testing/selftests/vm/hugetlbfstest.c 
b/tools/testing/selftests/vm/hugetlbfstest.c

deleted file mode 100644
index 02e1072..000
--- a/tools/testing/selftests/vm/hugetlbfstest.c
+++ /dev/null
@@ -1,86 +0,0 @@
-#define _GNU_SOURCE
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-typedef unsigned long long u64;
-
-static size_t length = 1 << 24;
-
-static u64 read_rss(void)
-{
-   char buf[4096], *s = buf;
-   int i, fd;
-   u64 rss;
-
-   fd = open("/proc/self/statm", O_RDONLY);
-   assert(fd > 2);
-   memset(buf, 0, sizeof(buf));
-   read(fd, buf, sizeof(buf) - 1);
-   for (i = 0; i < 1; i++)
-   s = strchr(s, ' ') + 1;
-   rss = strtoull(s, NULL, 10);
-   return rss << 12; /* assumes 4k pagesize */
-}
-
-static void do_mmap(int fd, int extra_flags, int unmap)
-{
-   int *p;
-   int flags = MAP_PRIVATE | MAP_POPULATE | extra_flags;
-   u64 before, after;
-   int ret;
-
-   before = read_rss();
-   p = mmap(NULL, length, PROT_READ | PROT_WRITE, flags, fd, 0);
-   assert(p != MAP_FAILED ||
-   !"mmap returned an unexpected error");
-   after = read_rss();
-   assert(llabs(after - before - length) < 0x4 ||
-   !"rss didn't grow as expected");
-   if (!unmap)
-   return;
-   ret = munmap(p, length);
-   assert(!ret || !"munmap returned an unexpected error");
-   after = read_rss();
-   assert(llabs(after - before) < 0x4 ||
-   !"rss didn't shrink as expected");
-}
-
-static int open_file(const char *path)
-{
-   int fd, err;
-
-   unlink(path);
-   fd = open(path, O_CREAT | O_RDWR | O_TRUNC | O_EXCL
-   | O_LARGEFILE | O_CLOEXEC, 0600);
-   assert(fd > 2);
-   unlink(path);
-   err = ftruncate(fd, length);
-   assert(!err);
-   return fd;
-}
-
-int main(void)
-{
-   int hugefd, fd;
-
-   fd = open_file("/dev/shm/hugetlbhog");
-   hugefd = open_file("/hugepages/hugetlbhog");
-
-   system("echo 100 > /proc/sys/vm/nr_hugepages");
-   do_mmap(-1, MAP_ANONYMOUS, 1);
-   do_mmap(fd, 0, 1);
-   do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 1);
-   do_mmap(hugefd, 0, 1);
-   do_mmap(hugefd, MAP_HUGETLB, 1);
-   /* Leak the last one to test do_exit() */
-   do_mmap(-1, MAP_ANONYMOUS | MAP_HUGETLB, 0);
-   printf("oll korrekt.\n");
-   return 0;
-}
diff --git a/tools/testing/selftests/vm/run_vmtests 
b/tools/testing/selftests/vm/run_vmtests

index 231174a..b7ae2b6 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -76,17 +76,6 @@ else
 fi

 echo ""
-echo "running hugetlbfstest"
-echo ""
-./hugetlbfstest
-if [ $? -ne 0 ]; then
-   echo "[FAIL]"
-   exitcode=1
-else
-   echo "[PASS]"
-fi
-
-echo ""
 echo "running userfaultfd"
 echo ""
 ./userfaultfd 128 32
--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] x86/ldt: allow to disable modify_ldt at runtime

2015-08-03 Thread Borislav Petkov

On Mon, Aug 03, 2015 at 11:45:24AM -0700, Andy Lutomirski wrote:
> P.P.P.S.  Who thought that IRET faults unmasking NMIs made any sense
> whatsoever when NMIs run on an IST stack?  Seriously, people?

What happened with asking Intel for a sane IRET-NG?

Should be relatively easy - take the current IRET microcode, get rid
of the nasty crap, allocate a new opcode and done. Validation should
actually have *less* to do and can reuse all current test cases.

:-)

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/5] x86, gfp: Cache best near node for memory allocation.

2015-08-03 Thread Tang Chen


Hi TJ,

Sorry for the late reply.

On 07/16/2015 05:48 AM, Tejun Heo wrote:

..
so in initialization phase makes no sense any more. The best near online
node for each cpu should be cached somewhere.
I'm not really following.  Is this because the now offline node can
later come online and we'd have to break the constant mapping
invariant if we update the mapping later?  If so, it'd be nice to
spell that out.


Yes. Will document this in the next version.


..
  
+int get_near_online_node(int node)

+{
+   return per_cpu(x86_cpu_to_near_online_node,
+  cpumask_first(&node_to_cpuid_mask_map[node]));
+}
+EXPORT_SYMBOL(get_near_online_node);

Umm... this function is sitting on a fairly hot path and scanning a
cpumask each time.  Why not just build a numa node -> numa node array?


Indeed. Will avoid to scan a cpumask.


..

  
  static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask,

unsigned int order)
  {
-   VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid));
+   VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
+
+#if IS_ENABLED(CONFIG_X86) && IS_ENABLED(CONFIG_NUMA)
+   if (!node_online(nid))
+   nid = get_near_online_node(nid);
+#endif
  
  	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));

  }

Ditto.  Also, what's the synchronization rules for NUMA node
on/offlining.  If you end up updating the mapping later, how would
that be synchronized against the above usages?


I think the near online node map should be updated when node online/offline
happens. But about this, I think the current numa code has a little problem.

As you know, firmware info binds a set of CPUs and memory to a node. But
at boot time, if the node has no memory (a memory-less node) , it won't 
be online.

But the CPUs on that node is available, and bound to the near online node.
(Here, I mean numa_set_node(cpu, node).)

Why does the kernel do this ? I think it is used to ensure that we can 
allocate memory
successfully by calling functions like alloc_pages_node() and 
alloc_pages_exact_node().
By these two functions, any CPU should be bound to a node that has memory 
so that

memory allocation can be successful.

That means, for a memory-less node at boot time, CPUs on the node is 
online,

but the node is not online.

That also means, "the node is online" equals to "the node has memory". 
Actually, there

is a lot of code in the kernel that relies on this rule.


But,
1) in cpu_up(), it will try to online a node, and it doesn't check if 
the node has memory.

2) in try_offline_node(), it offlines CPUs first, and then the memory.

This behavior looks a little weird, or let's say it is ambiguous. It 
seems that a NUMA node
consists of CPUs and memory. So if the CPUs are online, the node should 
be online.


And also,
The main purpose of this patch-set is to make the cpuid <-> nodeid 
mapping persistent.
After this patch-set, alloc_pages_node() and alloc_pages_exact_node() 
won't depend on
cpuid <-> nodeid mapping any more. So the node should be online if the 
CPUs on it are

online. Otherwise, we cannot setup interfaces of CPUs under /sys.


Unfortunately, since I don't have a machine with a memory-less node, I 
cannot reproduce

the problem right now.

How do you think the node online behavior should be changed ?

Thanks.





































--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC] kcore:change kcore_read to make sure the kernel read is safe

2015-08-03 Thread yalin wang

Change read_kcore() to use __copy_from_user_inatomic() to copy data
from kernel addresses, because kern_addr_valid() only ensures that the
page table is valid at the time it is called. After it returns, the page
table may change (for example, set_fixmap() modifies the kernel page
table), which may trigger a kernel crash if the read hits that window.

Signed-off-by: yalin wang 
---
 fs/proc/kcore.c | 30 --
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 92e6726..b085fde 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -86,8 +86,8 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen)
size = try;
*nphdr = *nphdr + 1;
}
-   *elf_buflen =   sizeof(struct elfhdr) + 
-   (*nphdr + 2)*sizeof(struct elf_phdr) + 
+   *elf_buflen =   sizeof(struct elfhdr) +
+   (*nphdr + 2)*sizeof(struct elf_phdr) +
3 * ((sizeof(struct elf_note)) +
 roundup(sizeof(CORE_STR), 4)) +
roundup(sizeof(struct elf_prstatus), 4) +
@@ -435,6 +435,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
size_t elf_buflen;
int nphdr;
unsigned long start;
+   unsigned long page = 0;
 
read_lock(&kclist_lock);
size = get_kcore_size(&nphdr, &elf_buflen);
@@ -485,7 +486,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
start = kc_offset_to_vaddr(*fpos - elf_buflen);
if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
tsz = buflen;
-   
+
while (buflen) {
struct kcore_list *m;
 
@@ -515,15 +516,32 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
} else {
if (kern_addr_valid(start)) {
unsigned long n;
+   mm_segment_t old_fs = get_fs();
+
+   if (page == 0) {
+   page = __get_free_page(GFP_KERNEL);
+   if (page == 0)
+   return -ENOMEM;
 
-   n = copy_to_user(buffer, (char *)start, tsz);
+   }
+   set_fs(KERNEL_DS);
+   pagefault_disable();
+   n = __copy_from_user_inatomic((void *)page,
+   (__force const void __user *)start,
+   tsz);
+   pagefault_enable();
+   set_fs(old_fs);
+   if (n)
+   memset((void *)page + tsz - n, 0, n);
+
+   n = copy_to_user(buffer, (char *)page, tsz);
/*
 * We cannot distinguish between fault on source
 * and fault on destination. When this happens
 * we clear too and hope it will trigger the
 * EFAULT again.
 */
-   if (n) { 
+   if (n) {
if (clear_user(buffer + tsz - n,
n))
return -EFAULT;
@@ -540,7 +558,7 @@ read_kcore(struct file *file, char __user *buffer, size_t 
buflen, loff_t *fpos)
start += tsz;
tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
}
-
+   free_page(page);
return acc;
 }
 
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH -v2 6/8] jump_label: Add a new static_key interface

2015-08-03 Thread Borislav Petkov

On Mon, Aug 03, 2015 at 05:57:57PM -0400, Steven Rostedt wrote:
> That's implementation details, not a general concept that users will
> need to know about.

Why?

It is a branch, regardless of which insn is used on which arch - it is
either active and you *branch* to that code or *inactive* and you don't.
So now it is actually what it should've been from the beginning...

I realize simplifying the terminology around those jump labels/static
branches things comes kinda unnatural now.

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the target-updates tree with the libata tree

2015-08-03 Thread Stephen Rothwell

Hi Nicholas,

Today's linux-next merge of the target-updates tree got a conflict in:

  drivers/ata/libata-scsi.c

between commit:

  fe16d4f202c5 ("Revert "libata-eh: Set 'information' field for autosense"")

from the libata tree and commit:

  f5a8b3a796db ("scsi: Protect against buffer possible overflow in 
scsi_set_sense_information")

from the target-updates tree.

I fixed it up (the former removed some code that was updated by the
latter, so I just removed the code) and can carry the fix as necessary
(no action is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the target-updates tree with the libata tree

2015-08-03 Thread Stephen Rothwell

Hi Nicholas,

Today's linux-next merge of the target-updates tree got conflicts in:

  drivers/scsi/scsi_error.c
  include/scsi/scsi_eh.h

between commit:

  fe16d4f202c5 ("Revert "libata-eh: Set 'information' field for autosense"")

from the libata tree and commit:

  7708c1656552 ("scsi: Move sense handling routines to scsi_common")

from the target-updates tree.

I fixed it up (I left scsi_set_sense_information in its new place) and
can carry the fix as necessary (no action is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 1/6] mmc: sdhci-esdhc-imx: add imx7d support and support HS400

2015-08-03 Thread Dong Aisheng

On Wed, Jul 29, 2015 at 05:03:52PM +0800, Haibo Chen wrote:
> The imx7d usdhc is derived from imx6sx, the difference is that
> imx7d support HS400.
> 
> So introduce a new compatible string for imx7d and add HS400
> support for imx7d usdhc.
> 
> Signed-off-by: Haibo Chen 
> ---
>  drivers/mmc/host/sdhci-esdhc-imx.c | 66 
> ++
>  1 file changed, 66 insertions(+)
> 
> diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c 
> b/drivers/mmc/host/sdhci-esdhc-imx.c
> index c6b9f64..b441eed 100644
> --- a/drivers/mmc/host/sdhci-esdhc-imx.c
> +++ b/drivers/mmc/host/sdhci-esdhc-imx.c
> @@ -44,6 +44,7 @@
>  #define  ESDHC_MIX_CTRL_EXE_TUNE (1 << 22)
>  #define  ESDHC_MIX_CTRL_SMPCLK_SEL   (1 << 23)
>  #define  ESDHC_MIX_CTRL_FBCLK_SEL(1 << 25)
> +#define  ESDHC_MIX_CTRL_HS400_EN (1 << 26)
>  /* Bits 3 and 6 are not SDHCI standard definitions */
>  #define  ESDHC_MIX_CTRL_SDHCI_MASK   0xb7
>  /* Tuning bits */
> @@ -60,6 +61,16 @@
>  #define  ESDHC_TUNE_CTRL_MIN 0
>  #define  ESDHC_TUNE_CTRL_MAX ((1 << 7) - 1)
>  
> +/* strobe dll register */
> +#define ESDHC_STROBE_DLL_CTRL0x70
> +#define ESDHC_STROBE_DLL_CTRL_ENABLE (1 << 0)
> +#define ESDHC_STROBE_DLL_CTRL_RESET  (1 << 1)
> +#define ESDHC_STROBE_DLL_CTRL_SLV_DLY_TARGET_SHIFT   3
> +
> +#define ESDHC_STROBE_DLL_STATUS  0x74
> +#define ESDHC_STROBE_DLL_STS_REF_LOCK(1 << 1)
> +#define ESDHC_STROBE_DLL_STS_SLV_LOCK0x1
> +
>  #define ESDHC_TUNING_CTRL0xcc
>  #define ESDHC_STD_TUNING_EN  (1 << 24)
>  /* NOTE: the minimum valid tuning start tap for mx6sl is 1 */
> @@ -120,6 +131,8 @@
>  #define ESDHC_FLAG_ERR004536 BIT(7)
>  /* The IP supports HS200 mode */
>  #define ESDHC_FLAG_HS200 BIT(8)
> +/* The IP supports HS400 mode */
> +#define ESDHC_FLAG_SUP_HS400 BIT(9)
>  
>  struct esdhc_soc_data {
>   u32 flags;
> @@ -156,6 +169,12 @@ static struct esdhc_soc_data usdhc_imx6sx_data = {
>   | ESDHC_FLAG_HAVE_CAP1 | ESDHC_FLAG_HS200,
>  };
>  
> +static struct esdhc_soc_data usdhc_imx7d_data = {
> + .flags = ESDHC_FLAG_USDHC | ESDHC_FLAG_STD_TUNING
> + | ESDHC_FLAG_HAVE_CAP1 | ESDHC_FLAG_HS200
> + | ESDHC_FLAG_SUP_HS400,

Better to use ESDHC_FLAG_HS400 to stay aligned with the existing ESDHC_FLAG_HS200.

Regards
Dong Aisheng
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] net_dbg_ratelimited: turn into no-op when !DEBUG

2015-08-03 Thread Jason A. Donenfeld

The pr_debug family of functions turns into a no-op when -DDEBUG is not
specified, opting instead to call "no_printk", which gets compiled to a
no-op (but retains gcc's nice warnings about printf-style arguments).

The problem with net_dbg_ratelimited is that it is defined to be a
variant of net_ratelimited_function, which expands to essentially:

if (net_ratelimit())
pr_debug(fmt, ...);

When DEBUG is not defined, then this becomes,

if (net_ratelimit())
;

This seems benign, except it isn't. Firstly, there's the obvious
overhead of calling net_ratelimit needlessly, which does quite some book
keeping for the rate limiting. Given that the pr_debug and
net_dbg_ratelimited family of functions are sprinkled liberally through
performance critical code, with developers assuming they'll be compiled
out to a no-op most of the time, we certainly do not want this needless
book keeping. Secondly, and most visibly, even though no debug message
is printed when DEBUG is not defined, if there is a flood of
invocations, dmesg winds up peppered with messages such as
"net_ratelimit: 320 callbacks suppressed". This is because our
aforementioned net_ratelimit() function actually prints this text in
some circumstances. It's especially odd to see this when there isn't any
other accompanying debug message.

So, in sum, it doesn't make sense to have this function's current
behavior, and instead it should match what every other debug family of
functions in the kernel does with !DEBUG -- nothing.

This patch replaces calls to net_dbg_ratelimited when !DEBUG with
no_printk, keeping with the idiom of all the other debug print helpers.

Signed-off-by: Jason A. Donenfeld 
---
 include/linux/net.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/net.h b/include/linux/net.h
index 04aa068..500fdfe 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -239,8 +239,13 @@ do {   
\
net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__)
 #define net_info_ratelimited(fmt, ...) \
net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
+#if defined(DEBUG)
 #define net_dbg_ratelimited(fmt, ...)  \
net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
+#else
+#define net_dbg_ratelimited(fmt, ...)  \
+   no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
+#endif
 
 bool __net_get_random_once(void *buf, int nbytes, bool *done,
   struct static_key *done_key);
-- 
2.4.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 1/7] locking/pvqspinlock: Unconditional PV kick with _Q_SLOW_VAL

2015-08-03 Thread Waiman Long


On 08/01/2015 06:29 PM, Peter Zijlstra wrote:

On Fri, Jul 31, 2015 at 10:21:58PM -0400, Waiman Long wrote:

The smp_store_release() is not a full barrier. In order to avoid missed
wakeup, we may need to add memory barrier around locked and cpu state
variables adding to complexity. As the chance of spurious wakeup is very
low, it is easier and safer to just do an unconditional kick at unlock
time.

Signed-off-by: Waiman Long
---
  kernel/locking/qspinlock_paravirt.h |   11 ---
  1 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/locking/qspinlock_paravirt.h 
b/kernel/locking/qspinlock_paravirt.h
index 15d3733..2dd4b39 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -240,7 +240,6 @@ static void pv_wait_head(struct qspinlock *lock, struct 
mcs_spinlock *node)
cpu_relax();
}

-   WRITE_ONCE(pn->state, vcpu_halted);
if (!lp) { /* ONCE */
lp = pv_hash(lock, pn);
/*
@@ -320,9 +319,15 @@ __visible void __pv_queued_spin_unlock(struct qspinlock 
*lock)
/*
 * At this point the memory pointed at by lock can be freed/reused,
 * however we can still use the pv_node to kick the CPU.
+*
+* As smp_store_release() is not a full barrier, adding a check to
+* the node->state doesn't guarantee the checking is really done
+* after clearing the lock byte

This is true, but _WHY_ is that a problem ?

  since they are in 2 separate

+* cachelines and so hardware can reorder them.

That's just gibberish, even in the same cacheline stuff can get
reordered.

 So either we insert

+* memory barrier here and in the corresponding pv_wait_head()
+* function or we do an unconditional kick which is what is done here.

why, why why ? You've added words, but you've not actually described
what the problem is you're trying to fix.

AFAICT the only thing we really care about here is that the load in
question happens _after_ we observe SLOW, and that is still true.

The order against the unlock is irrelevant.

So we set ->state before we hash and before we set SLOW. Given that
we've seen SLOW, we must therefore also see ->state.

If ->state == halted, this means the CPU in question is blocked and the
pv_node will not get re-used -- if it does get re-used, it wasn't
blocked and we don't care either.

Therefore, ->cpu is stable and we'll kick it into action.

How do you end up not waking a waiting cpu? Explain that.



Yes, it is safe in the current code. In some versions of my pvqspinlock 
patch, I was resetting the state back to running in pv_wait_head(). This 
caused a race problem.


The current code, however, will not reset the state back to running and 
so the check is redundant. I will clarify that in the next patch.



*/
-   if (READ_ONCE(node->state) == vcpu_halted)
-   pv_kick(node->cpu);
+   pv_kick(node->cpu);
  }

Also, this patch clearly isn't against my tree.



Yes, I was basing my patch on the latest tip tree. As some of the files in 
the patch were modified in the latest tip tree, I will rebase my patch 
and update it.


Please let me know if I should be using your tree instead.

Cheers,
Longman
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH] iio: adc: vf610: Add IIO buffer support for Vybrid ADC

2015-08-03 Thread Duan Andy

From: Sanchayan Maity  Sent: Monday, August 03, 2015 
11:10 PM
> To: ji...@kernel.org; linux-...@vger.kernel.org
> Cc: ste...@agner.ch; Duan Fugang-B38611; hof...@osadl.org;
> sanjeev_sha...@mentor.com; Estevam Fabio-R49496; knaac...@gmx.de;
> l...@metafoo.de; pme...@pmeerw.net; antoine.ten...@free-electrons.com;
> linux-kernel@vger.kernel.org; linux-arm-ker...@lists.infradead.org;
> Sanchayan Maity
> Subject: [PATCH] iio: adc: vf610: Add IIO buffer support for Vybrid ADC
> 
> This patch adds support for IIO buffer to the Vybrid ADC driver.
> IIO triggered buffer infrastructure along with iio sysfs trigger is used
> to leverage continuous sampling support provided by the ADC block.
> 
> Signed-off-by: Sanchayan Maity 
> ---
>  drivers/iio/adc/Kconfig |   4 ++
>  drivers/iio/adc/vf610_adc.c | 122
> +---
>  2 files changed, 120 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig index
> 7c55658..4661241 100644
> --- a/drivers/iio/adc/Kconfig
> +++ b/drivers/iio/adc/Kconfig
> @@ -337,6 +337,10 @@ config TWL6030_GPADC  config VF610_ADC
>   tristate "Freescale vf610 ADC driver"
>   depends on OF
> + select IIO_BUFFER
> + select IIO_TRIGGER
> + select IIO_SYSFS_TRIGGER
> + select IIO_TRIGGERED_BUFFER
>   help
> Say yes here to support for Vybrid board analog-to-digital
> converter.
> Since the IP is used for i.MX6SLX, the driver also support
> i.MX6SLX.
> diff --git a/drivers/iio/adc/vf610_adc.c b/drivers/iio/adc/vf610_adc.c
> index 23b8fb9..af72b9a 100644
> --- a/drivers/iio/adc/vf610_adc.c
> +++ b/drivers/iio/adc/vf610_adc.c
> @@ -34,8 +34,11 @@
>  #include 
> 
>  #include 
> +#include 
>  #include 
> -#include 
> +#include 
> +#include  #include
> +
> 
>  /* This will be the driver name the kernel reports */  #define
> DRIVER_NAME "vf610-adc"
> @@ -170,6 +173,7 @@ struct vf610_adc {
>   u32 sample_freq_avail[5];
> 
>   struct completion completion;
> + u16 *buffer;
>  };
> 
>  static const u32 vf610_hw_avgs[] = { 1, 4, 8, 16, 32 }; @@ -505,12
> +509,22 @@ static const struct iio_chan_spec_ext_info vf610_ext_info[] =
> {
>   .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE) |  \
>   BIT(IIO_CHAN_INFO_SAMP_FREQ),   \
>   .ext_info = vf610_ext_info, \
> + .address = (_idx),  \
> + .scan_index = (_idx),   \
> + .scan_type.sign = 'u',  \
> + .scan_type.realbits = 12,   \
> + .scan_type.storagebits = 16,\
>  }
> 
>  #define VF610_ADC_TEMPERATURE_CHAN(_idx, _chan_type) {   \
>   .type = (_chan_type),   \
>   .channel = (_idx),  \
>   .info_mask_separate = BIT(IIO_CHAN_INFO_PROCESSED), \
> + .address = (_idx),  \
> + .scan_index = (_idx),   \
> + .scan_type.sign = 'u',  \
> + .scan_type.realbits = 12,   \
> + .scan_type.storagebits = 16,\
>  }
> 
>  static const struct iio_chan_spec vf610_adc_iio_channels[] = { @@ -531,6
> +545,7 @@ static const struct iio_chan_spec vf610_adc_iio_channels[] = {
>   VF610_ADC_CHAN(14, IIO_VOLTAGE),
>   VF610_ADC_CHAN(15, IIO_VOLTAGE),
>   VF610_ADC_TEMPERATURE_CHAN(26, IIO_TEMP),
> + IIO_CHAN_SOFT_TIMESTAMP(32),
>   /* sentinel */
>  };
> 
> @@ -559,13 +574,21 @@ static int vf610_adc_read_data(struct vf610_adc
> *info)
> 
>  static irqreturn_t vf610_adc_isr(int irq, void *dev_id)  {
> - struct vf610_adc *info = (struct vf610_adc *)dev_id;
> + struct iio_dev *indio_dev = (struct iio_dev *)dev_id;
> + struct vf610_adc *info = iio_priv(indio_dev);
>   int coco;
> 
>   coco = readl(info->regs + VF610_REG_ADC_HS);
>   if (coco & VF610_ADC_HS_COCO0) {
>   info->value = vf610_adc_read_data(info);
> - complete(>completion);
> + if (iio_buffer_enabled(indio_dev)) {
> + info->buffer[0] = info->value;
> + writel(0, info->regs + VF610_REG_ADC_HS);
The register is read only. After ADC_Rn is read, the coco bit is cleared.

> + iio_push_to_buffers_with_timestamp(indio_dev,
> + info->buffer, iio_get_time_ns());
> + iio_trigger_notify_done(indio_dev->trig);
> + } else
> + complete(>completion);
>   }
> 
>   return IRQ_HANDLED;
> @@ -612,6 +635,9 @@ static int vf610_read_raw(struct iio_dev *indio_dev,
>   switch (mask) {
>   case IIO_CHAN_INFO_RAW:
>   case IIO_CHAN_INFO_PROCESSED:
> + if (iio_buffer_enabled(indio_dev))
> + return -EBUSY;
> +
>   mutex_lock(_dev->mlock);
>

Re: [PATCH 0/3] vm hugetlb selftest cleanup

2015-08-03 Thread Naoya Horiguchi

On Thu, Jul 30, 2015 at 05:59:50PM -0700, Mike Kravetz wrote:
> As a followup to discussions of hugetlbfs fallocate, this provides
> cleanup of the vm hugetlb selftests.  Remove hugetlbfstest as it tests
> functionality not present in the kernel.  Emphasize that libhugetlbfs
> test suite should be used for hugetlb regression testing.
> 
> Mike Kravetz (3):
>   Reverted "selftests: add hugetlbfstest"
>   selftests:vm: Point to libhugetlbfs for regression testing
>   Documentation: update libhugetlbfs location and use for testing

It seems that patch 1 conflicts with commit bd67d5c15cc1 ("Test compaction
of mlocked memory"), but the resolution is trivial, so for the series ...

Acked-by: Naoya Horiguchi 

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: hugetlb pages not accounted for in rss

2015-08-03 Thread Naoya Horiguchi

On Wed, Jul 29, 2015 at 04:20:59PM -0700, Mike Kravetz wrote:
> On 07/29/2015 12:08 PM, David Rientjes wrote:
> >On Tue, 28 Jul 2015, Jörn Engel wrote:
> >
> >>Well, we definitely need something.  Having a 100GB process show 3GB of
> >>rss is not very useful.  How would we notice a memory leak if it only
> >>affects hugepages, for example?
> >>
> >
> >Since the hugetlb pool is a global resource, it would also be helpful to
> >determine if a process is mapping more than expected.  You can't do that
> >just by adding a huge rss metric, however: if you have 2MB and 1GB
> >hugepages configured you wouldn't know if a process was mapping 512 2MB
> >hugepages or 1 1GB hugepage.
> >
> >That's the purpose of hugetlb_cgroup, after all, and it supports usage
> >counters for all hstates.  The test could be converted to use that to
> >measure usage if configured in the kernel.
> >
> >Beyond that, I'm not sure how a per-hstate rss metric would be exported to
> >userspace in a clean way and other ways of obtaining the same data are
> >possible with hugetlb_cgroup.  I'm not sure how successful you'd be in
> >arguing that we need separate rss counters for it.
>
> If I want to track hugetlb usage on a per-task basis, do I then need to
> create one cgroup per task?
>
> For example, suppose I have many tasks using hugetlb and the global pool
> is getting low on free pages.  It might be useful to know which tasks are
> using hugetlb pages, and how many they are using.
>
> I don't actually have this need (I think), but it appears to be what
> Jörn is asking for.

One possible way to get a hugetlb metric on a per-task basis is to walk the
page table via /proc/pid/pagemap and count the page flags for each mapped page
(we can easily do this with tools/vm/page-types.c, e.g. "page-types -p <pid>
-b huge"). This is obviously slower than just storing the counter as
in-kernel data and exporting it, but it might be useful in some situations.

Thanks,
Naoya Horiguchi

[PATCH] ARM64: dts: mt6795: enable basic SMP bringup for MT6795

2015-08-03 Thread Scott Shu

This patch adds SMP support for the MediaTek MT6795 Cortex-A53 octa-core SoC.

The patch is based on v4.2-rc1 and following patch series:
(1) Mars Cheng's "Add mt6795 basic chip support" [1]

[1] https://lkml.org/lkml/2015/7/14/63

Signed-off-by: Scott Shu 
---
 arch/arm64/boot/dts/mediatek/mt6795.dtsi |   13 +
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/boot/dts/mediatek/mt6795.dtsi 
b/arch/arm64/boot/dts/mediatek/mt6795.dtsi
index da200e7..c85659d 100644
--- a/arch/arm64/boot/dts/mediatek/mt6795.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt6795.dtsi
@@ -20,6 +20,11 @@
#address-cells = <2>;
#size-cells = <2>;
 
+   psci {
+   compatible = "arm,psci-0.2";
+   method = "smc";
+   };
+
cpus {
#address-cells = <1>;
#size-cells = <0>;
@@ -27,48 +32,56 @@
cpu0: cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x000>;
};
 
cpu1: cpu@1 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x001>;
};
 
cpu2: cpu@2 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x002>;
};
 
cpu3: cpu@3 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x003>;
};
 
cpu4: cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x100>;
};
 
cpu5: cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x101>;
};
 
cpu6: cpu@102 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x102>;
};
 
cpu7: cpu@103 {
device_type = "cpu";
compatible = "arm,cortex-a53";
+   enable-method = "psci";
reg = <0x103>;
};
};
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v5] powerpc/rcpm: add RCPM driver

2015-08-03 Thread Chenhui Zhao




On Tue, Aug 4, 2015 at 4:23 AM, Scott Wood  
wrote:

On Mon, 2015-08-03 at 19:14 +0800, Chenhui Zhao wrote:

 On Sat, Aug 1, 2015 at 8:45 AM, Scott Wood 
 wrote:
 > On Fri, 2015-06-26 at 15:44 +0800,  
Yuantian.Tang@freescale.comwrote:

 > >  +static void rcpm_v1_set_ip_power(bool enable, u32 *mask)
 > >  +{
 > >  + if (enable)
 > >  + setbits32(_v1_regs->ippdexpcr, *mask);
 > >  + else
 > >  + clrbits32(_v1_regs->ippdexpcr, *mask);
 > >  +}
 > >  +
 > >  +static void rcpm_v2_set_ip_power(bool enable, u32 *mask)
 > >  +{
 > >  + if (enable)
 > >  + setbits32(_v2_regs->ippdexpcr[0], *mask);
 > >  + else
 > >  + clrbits32(_v2_regs->ippdexpcr[0], *mask);
 > >  +}
 >
 > Why do these take "u32 *mask" instead of "u32 mask"?
 >
 > -Scott

 I think it can be used in the case where there are several mask 
values.


When would that be?

-Scott


So far, only one register is used, even though the register name is 
"IPPDEXPCRn" (with an "n" suffix) in the T4 RM.


OK. Just change the parameter to "u32 mask".

-Chenhui

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [TRIVIAL PATCH] block: Correct misuses of 0x%

2015-08-03 Thread Joe Perches

On Tue, 2015-08-04 at 10:19 +0800, yalin wang wrote:
> Ping ? 
> > On Aug 3, 2015, at 16:56, yalin wang  wrote:
> > 
> > 
> >> On Aug 3, 2015, at 16:03, Joe Perches  wrote:
> >> 
> >> On Mon, 2015-08-03 at 15:25 +0800, yalin wang wrote:
>  On Aug 3, 2015, at 04:25, Joe Perches  wrote:
>  
>  Correct misuse of 0x%d in logging messages.
>  
> >> []
> >>> why not use like this : dev_dbg(>pdev->dev, "   Max outstanding 
> >>> commands = %#x\n”  ? 
> >>> %#x will add 0x prefix automatically .
> >> 
> >> It's generally a consistency thing.
> >> A 0 value would be emitted as 0 and not 0x0.
> >> 
> > i try on my ubuntu , 
> > 
> > static int __init throtl_init(void) 
> >   
> >  {  
> >   
> > printk("module init test: %#x %p\n", 0, (void *)0x123); 
> >  
> >  return 0;  
> >  
> > 
> >  }  
> >  
> > 
> >  module_init(throtl_init); 
> > 
> > #uname -a
> > Linux ubuntu 3.16.0-38-generic #52~14.04.1-Ubuntu SMP Fri May 8 09:43:57 
> > UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
> > 
> > #dmesg
> > [259356.375586] module init test: 0x0 0123
> > 
> > it seems don’t need 0x%x for 0, just need %#x for all numbers.
> > there are lots of use like this, i can change them if needed:
> > 
> > # egrep -r  -i '0x%\d*x'  .  | wc -l 
> > 11776

I suggest not, it's not a standard usage and the 0
may be unexpected.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] usb: gadget: f_printer: fix the bug of deadlock caused by nested spinlock

2015-08-03 Thread fupan


On 08/03/2015 10:47 PM, Felipe Balbi wrote:

Hi,

On Mon, Aug 03, 2015 at 07:19:43PM +0800, fupan...@windriver.com wrote:

From: fli 

Function printer_func_disable() has called spinlock on printer_dev->lock,
and it'll call function chain of

 printer_reset_interface()
 |
+---dwc3_gadget_ep_disable()
|
 +---__dwc3_gadget_ep_disable()
 |
 +---dwc3_remove_requests()
 |
 +---dwc3_gadget_giveback()
 |
 +---rx_complete()

in the protected block.

However, rx_complete() in f_printer.c calls spinlock on printer_dev->lock again,
which will cause system hang.

The following steps can reproduce this hang:

1. Build the test program from Documentation/usb/gadget_printer.txt as g_printer
2. Plug in the USB device to a host(such as Ubuntu).
3. on the USB device system run:
#modprobe g_printer.ko
#./g_printer -read_data

4. Unplug the USB device from the host

The system will hang later.

In order to avoid this deadlock, move the spinlock from 
printer_func_disable() into
printer_reset_interface() and exclude the block that calls 
dwc3_gadget_ep_disable(),
in which the critical resource will be protected by its spinlock in 
rx_complete().

This commit will fix the system hang with the following calltrace:

INFO: rcu_preempt detected stalls on CPUs/tasks: { 3} (detected by 0, t=21006 
jiffies, g=524, c=523, q=2)
sending NMI to all CPUs:
NMI backtrace for cpu 3
CPU: 3 PID: 718 Comm: irq/22-dwc3 Not tainted 3.10.38-ltsi-WR6.0.0.11_standard 
#2
Hardware name: Intel Corp. VALLEYVIEW B3 PLATFORM/NOTEBOOK, BIOS 
BYTICRB1.86C.0092.R32.1410021707 10/02/2014
task: f44f4c20 ti: f40f6000 task.ti: f40f6000
EIP: 0060:[] EFLAGS: 0097 CPU: 3
EIP is at _raw_spin_lock_irqsave+0x35/0x40
EAX: 0076 EBX: f80fad00 ECX: 0076 EDX: 0075
ESI: 0096 EDI: ff94 EBP: f40f7e20 ESP: f40f7e18
  DS: 007b ES: 007b FS: 00d8 GS:  SS: 0068
CR0: 8005003b CR2: b77ac000 CR3: 01c3 CR4: 001007f0
DR0:  DR1:  DR2:  DR3: 
DR6: 0ff0 DR7: 0400
Stack:
  f474a720 f80fad00 f40f7e3c f80f93cc c135d486  f474a720 f468fb00
  f4bea894 f40f7e54 f7e35f19 ff00 f468fb00 f468fb24 0086 f40f7e64
  f7e36577 f468fb00 f4bea810 f40f7e74 f7e365a8 f468fb00 f4bea894 f40f7e9c
Call Trace:
  [] rx_complete+0x1c/0xb0 [g_printer]
  [] ? vsnprintf+0x166/0x390
  [] dwc3_gadget_giveback+0xc9/0xf0 [dwc3]
  [] dwc3_remove_requests+0x57/0x70 [dwc3]
  [] __dwc3_gadget_ep_disable+0x18/0x60 [dwc3]
  [] dwc3_gadget_ep_disable+0x89/0xf0 [dwc3]
  [] printer_reset_interface+0x31/0x50 [g_printer]
  [] printer_func_disable+0x20/0x30 [g_printer]
  [] composite_disconnect+0x4b/0x90 [libcomposite]
  [] dwc3_disconnect_gadget+0x38/0x43 [dwc3]
  [] dwc3_gadget_disconnect_interrupt+0x3e/0x5a [dwc3]
  [] dwc3_thread_interrupt+0x5c8/0x610 [dwc3]
  [] irq_thread_fn+0x18/0x30
  [] irq_thread+0x100/0x130
  [] ? irq_finalize_oneshot.part.29+0xb0/0xb0
  [] ? wake_threads_waitq+0x40/0x40
  [] ? irq_thread_dtor+0xb0/0xb0
  [] kthread+0x94/0xa0
  [] ret_from_kernel_thread+0x1b/0x28
  [] ? kthread_create_on_node+0xc0/0xc0

Signed-off-by: fupan li 

Thanks, out of curiosity, do you plan on sending a glue layer for
Windriver's DWC3 ?

No, just this fix patch.

Fupan


cheers



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [TRIVIAL PATCH] block: Correct misuses of 0x%

2015-08-03 Thread yalin wang

Ping ? 
> On Aug 3, 2015, at 16:56, yalin wang  wrote:
> 
> 
>> On Aug 3, 2015, at 16:03, Joe Perches  wrote:
>> 
>> On Mon, 2015-08-03 at 15:25 +0800, yalin wang wrote:
 On Aug 3, 2015, at 04:25, Joe Perches  wrote:
 
 Correct misuse of 0x%d in logging messages.
 
>> []
>>> why not use like this : dev_dbg(>pdev->dev, "   Max outstanding 
>>> commands = %#x\n”  ? 
>>> %#x will add 0x prefix automatically .
>> 
>> It's generally a consistency thing.
>> A 0 value would be emitted as 0 and not 0x0.
>> 
> i try on my ubuntu , 
> 
> static int __init throtl_init(void)   
> 
>  {
> 
> printk("module init test: %#x %p\n", 0, (void *)0x123);   
>
>  return 0;
>
> 
>  }
>
> 
>  module_init(throtl_init); 
> 
> #uname -a
> Linux ubuntu 3.16.0-38-generic #52~14.04.1-Ubuntu SMP Fri May 8 09:43:57 UTC 
> 2015 x86_64 x86_64 x86_64 GNU/Linux
> 
> #dmesg
> [259356.375586] module init test: 0x0 0123
> 
> it seems don’t need 0x%x for 0, just need %#x for all numbers.
> there are lots of use like this, i can change them if needed:
> 
> # egrep -r  -i '0x%\d*x'  .  | wc -l 
> 11776
> 
> 
> Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 00/15] trivial: Drop unlikely before IS_ERR(_OR_NULL)

2015-08-03 Thread Viresh Kumar

On 03-08-15, 17:38, Steven Rostedt wrote:
> On Fri, 31 Jul 2015 13:23:10 +0300
> "Kirill A. Shutemov"  wrote:
> > We have two cases in code:
> > 
> > drivers/rtc/rtc-gemini.c:   if (likely(IS_ERR(rtc->rtc_dev)))
> > drivers/staging/lustre/lustre/obdclass/lu_object.c: if 
> > (likely(IS_ERR(shadow) && PTR_ERR(shadow) == -ENOENT)) {
> > 
> > The first one is mistake, I think. Or do we expect rtc_device_register()
> > to fail?
> > 
> > The second is redundant. "if (PTR_ERR(shadow) == -ENOENT)" should do the
> > job.
> > 
> 
> Yep, those look like bugs to me.

Yeah, I have fixed both of them :)

-- 
viresh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/8] watchdog: watchdog_dev: Use single variable name for struct watchdog_device

2015-08-03 Thread Guenter Roeck

The current code uses 'wdd', 'wddev', and 'watchdog' as variable names
for struct watchdog_device. This is confusing and makes it difficult
to enhance the code. Replace it all with 'wdd'.

Cc: Timo Kokkonen 
Cc: Uwe Kleine-König 
Signed-off-by: Guenter Roeck 
---
 drivers/watchdog/watchdog_dev.c | 151 
 1 file changed, 75 insertions(+), 76 deletions(-)

diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 6aaefbad303e..06171c73daf5 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -51,7 +51,7 @@ static struct watchdog_device *old_wdd;
 
 /*
  * watchdog_ping: ping the watchdog.
- * @wddev: the watchdog device to ping
+ * @wdd: the watchdog device to ping
  *
  * If the watchdog has no own ping operation then it needs to be
  * restarted via the start operation. This wrapper function does
@@ -59,65 +59,65 @@ static struct watchdog_device *old_wdd;
  * We only ping when the watchdog device is running.
  */
 
-static int watchdog_ping(struct watchdog_device *wddev)
+static int watchdog_ping(struct watchdog_device *wdd)
 {
int err = 0;
 
-   mutex_lock(>lock);
+   mutex_lock(>lock);
 
-   if (test_bit(WDOG_UNREGISTERED, >status)) {
+   if (test_bit(WDOG_UNREGISTERED, >status)) {
err = -ENODEV;
goto out_ping;
}
 
-   if (!watchdog_active(wddev))
+   if (!watchdog_active(wdd))
goto out_ping;
 
-   if (wddev->ops->ping)
-   err = wddev->ops->ping(wddev);  /* ping the watchdog */
+   if (wdd->ops->ping)
+   err = wdd->ops->ping(wdd);  /* ping the watchdog */
else
-   err = wddev->ops->start(wddev); /* restart watchdog */
+   err = wdd->ops->start(wdd); /* restart watchdog */
 
 out_ping:
-   mutex_unlock(>lock);
+   mutex_unlock(>lock);
return err;
 }
 
 /*
  * watchdog_start: wrapper to start the watchdog.
- * @wddev: the watchdog device to start
+ * @wdd: the watchdog device to start
  *
  * Start the watchdog if it is not active and mark it active.
  * This function returns zero on success or a negative errno code for
  * failure.
  */
 
-static int watchdog_start(struct watchdog_device *wddev)
+static int watchdog_start(struct watchdog_device *wdd)
 {
int err = 0;
 
-   mutex_lock(>lock);
+   mutex_lock(>lock);
 
-   if (test_bit(WDOG_UNREGISTERED, >status)) {
+   if (test_bit(WDOG_UNREGISTERED, >status)) {
err = -ENODEV;
goto out_start;
}
 
-   if (watchdog_active(wddev))
+   if (watchdog_active(wdd))
goto out_start;
 
-   err = wddev->ops->start(wddev);
+   err = wdd->ops->start(wdd);
if (err == 0)
-   set_bit(WDOG_ACTIVE, >status);
+   set_bit(WDOG_ACTIVE, >status);
 
 out_start:
-   mutex_unlock(>lock);
+   mutex_unlock(>lock);
return err;
 }
 
 /*
  * watchdog_stop: wrapper to stop the watchdog.
- * @wddev: the watchdog device to stop
+ * @wdd: the watchdog device to stop
  *
  * Stop the watchdog if it is still active and unmark it active.
  * This function returns zero on success or a negative errno code for
@@ -125,155 +125,154 @@ out_start:
  * If the 'nowayout' feature was set, the watchdog cannot be stopped.
  */
 
-static int watchdog_stop(struct watchdog_device *wddev)
+static int watchdog_stop(struct watchdog_device *wdd)
 {
int err = 0;
 
-   mutex_lock(>lock);
+   mutex_lock(>lock);
 
-   if (test_bit(WDOG_UNREGISTERED, >status)) {
+   if (test_bit(WDOG_UNREGISTERED, >status)) {
err = -ENODEV;
goto out_stop;
}
 
-   if (!watchdog_active(wddev))
+   if (!watchdog_active(wdd))
goto out_stop;
 
-   if (test_bit(WDOG_NO_WAY_OUT, >status)) {
-   dev_info(wddev->dev, "nowayout prevents watchdog being 
stopped!\n");
+   if (test_bit(WDOG_NO_WAY_OUT, >status)) {
+   dev_info(wdd->dev, "nowayout prevents watchdog being 
stopped!\n");
err = -EBUSY;
goto out_stop;
}
 
-   err = wddev->ops->stop(wddev);
+   err = wdd->ops->stop(wdd);
if (err == 0)
-   clear_bit(WDOG_ACTIVE, >status);
+   clear_bit(WDOG_ACTIVE, >status);
 
 out_stop:
-   mutex_unlock(>lock);
+   mutex_unlock(>lock);
return err;
 }
 
 /*
  * watchdog_get_status: wrapper to get the watchdog status
- * @wddev: the watchdog device to get the status from
+ * @wdd: the watchdog device to get the status from
  * @status: the status of the watchdog device
  *
  * Get the watchdog's status flags.
  */
 
-static int watchdog_get_status(struct watchdog_device *wddev,
+static int watchdog_get_status(struct watchdog_device *wdd,

[PATCH 3/8] watchdog: Introduce WDOG_RUNNING flag

2015-08-03 Thread Guenter Roeck

The WDOG_RUNNING flag is expected to be set by watchdog drivers if
the hardware watchdog is running. If the flag is set, the watchdog
subsystem will ping the watchdog even if the watchdog device is closed.

The watchdog driver stop function is now optional and may be omitted
if the watchdog can not be stopped. If stopping the watchdog is not
possible but the driver implements a stop function, it is responsible
to set the WDOG_RUNNING flag in its stop function.

Cc: Timo Kokkonen 
Cc: Uwe Kleine-König 
Signed-off-by: Guenter Roeck 
---
 Documentation/watchdog/watchdog-kernel-api.txt | 19 -
 drivers/watchdog/watchdog_core.c   |  2 +-
 drivers/watchdog/watchdog_dev.c| 39 --
 include/linux/watchdog.h   |  7 +
 4 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/Documentation/watchdog/watchdog-kernel-api.txt 
b/Documentation/watchdog/watchdog-kernel-api.txt
index 5fa085276874..7fda3c86cf46 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -144,17 +144,18 @@ are:
   device.
   The routine needs a pointer to the watchdog timer device structure as a
   parameter. It returns zero on success or a negative errno code for failure.
-* stop: with this routine the watchdog timer device is being stopped.
-  The routine needs a pointer to the watchdog timer device structure as a
-  parameter. It returns zero on success or a negative errno code for failure.
-  Some watchdog timer hardware can only be started and not be stopped. The
-  driver supporting this hardware needs to make sure that a start and stop
-  routine is being provided. This can be done by using a timer in the driver
-  that regularly sends a keepalive ping to the watchdog timer hardware.
 
 Not all watchdog timer hardware supports the same functionality. That's why
 all other routines/operations are optional. They only need to be provided if
 they are supported. These optional routines/operations are:
+* stop: with this routine the watchdog timer device is being stopped.
+  The routine needs a pointer to the watchdog timer device structure as a
+  parameter. It returns zero on success or a negative errno code for failure.
+  Some watchdog timer hardware can only be started and not be stopped. A
+  driver supporting such hardware does not have to implement the stop routine.
+  If a driver has no stop function, the watchdog core will set WDOG_RUNNING and
+  start calling the driver's keepalive ping function after the watchdog device
+  is closed.
 * ping: this is the routine that sends a keepalive ping to the watchdog timer
   hardware.
   The routine needs a pointer to the watchdog timer device structure as a
@@ -206,6 +207,10 @@ bit-operations. The status bits that are defined are:
   any watchdog_ops, so that you can be sure that no operations (other then
   unref) will get called after unregister, even if userspace still holds a
   reference to /dev/watchdog
+* WDOG_RUNNING: Set by the watchdog driver if the hardware watchdog is running.
+  The bit must be set if the watchdog timer hardware can not be stopped;
+  otherwise it is optional. If set, the watchdog driver core will send
+  keepalive pings to the watchdog hardware while the watchdog device is closed.
 
   To set the WDOG_NO_WAY_OUT status bit (before registering your watchdog
   timer device) you can either:
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index 1a8059455413..b38d1b7ae10e 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -145,7 +145,7 @@ static int __watchdog_register_device(struct 
watchdog_device *wdd)
return -EINVAL;
 
/* Mandatory operations need to be supported */
-   if (wdd->ops->start == NULL || wdd->ops->stop == NULL)
+   if (!wdd->ops->start)
return -EINVAL;
 
watchdog_check_min_max_timeout(wdd);
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 25849c1d6dc1..e0fbc4ac9bb7 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -58,8 +58,9 @@ static inline bool watchdog_need_worker(struct 
watchdog_device *wdd)
unsigned int hm = wdd->max_hw_timeout_ms;
unsigned int m = wdd->max_timeout * 1000;
 
-   return watchdog_active(wdd) && hm && hm != m &&
-   wdd->timeout * 500 > hm;
+   return (watchdog_active(wdd) && hm && hm != m &&
+   wdd->timeout * 500 > hm) ||
+  (!watchdog_active(wdd) && watchdog_running(wdd));
 }
 
 static inline void watchdog_update_worker(struct watchdog_device *wdd,
@@ -87,7 +88,7 @@ static int _watchdog_ping(struct watchdog_device *wdd)
if (test_bit(WDOG_UNREGISTERED, >status))
return -ENODEV;
 
-   if (!watchdog_active(wdd))
+   if (!watchdog_active(wdd) && !watchdog_running(wdd))
return

[PATCH 8/8] watchdog: at91sam9: Convert to use infrastructure triggered keepalives

2015-08-03 Thread Guenter Roeck

The watchdog infrastructure now supports handling watchdog keepalive
if the watchdog is running while the watchdog device is closed.
The infrastructure now also supports generating additional heartbeats
if the maximum hardware timeout is smaller than or close to the
configured timeout. Convert the driver to use this
infrastructure.

Signed-off-by: Guenter Roeck 
---
 drivers/watchdog/at91sam9_wdt.c | 102 +---
 1 file changed, 11 insertions(+), 91 deletions(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index e4698f7c5f93..0de39b52962c 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -29,7 +29,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -48,8 +47,8 @@
  * use this to convert a watchdog
  * value from/to milliseconds.
  */
-#define ticks_to_hz_rounddown(t)   t) + 1) * HZ) >> 8)
-#define ticks_to_hz_roundup(t) (t) + 1) * HZ) + 255) >> 8)
+#define ticks_to_ms_rounddown(t)   t) + 1) * 1000) >> 8)
+#define ticks_to_ms_roundup(t) (t) + 1) * 1000) + 255) >> 8)
 #define ticks_to_secs(t)   (((t) + 1) >> 8)
 #define secs_to_ticks(s)   ((s) ? (((s) << 8) - 1) : 0)
 
@@ -64,9 +63,6 @@
 /* Hardware timeout in seconds */
 #define WDT_HW_TIMEOUT 2
 
-/* Timer heartbeat (500ms) */
-#define WDT_TIMEOUT(HZ/2)
-
 /* User land timeout */
 #define WDT_HEARTBEAT 15
 static int heartbeat;
@@ -83,11 +79,8 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once 
started "
 struct at91wdt {
struct watchdog_device wdd;
void __iomem *base;
-   unsigned long next_heartbeat;   /* the next_heartbeat for the timer */
-   struct timer_list timer;/* The timer that pings the watchdog */
u32 mr;
u32 mr_mask;
-   unsigned long heartbeat;/* WDT heartbeat in jiffies */
bool nowayout;
unsigned int irq;
 };
@@ -107,47 +100,13 @@ static irqreturn_t wdt_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
 }
 
-/*
- * Reload the watchdog timer.  (ie, pat the watchdog)
- */
-static inline void at91_wdt_reset(struct at91wdt *wdt)
-{
-   wdt_write(wdt, AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
-}
-
-/*
- * Timer tick
- */
-static void at91_ping(unsigned long data)
-{
-   struct at91wdt *wdt = (struct at91wdt *)data;
-   if (time_before(jiffies, wdt->next_heartbeat) ||
-   !watchdog_active(>wdd)) {
-   at91_wdt_reset(wdt);
-   mod_timer(>timer, jiffies + wdt->heartbeat);
-   } else {
-   pr_crit("I will reset your machine !\n");
-   }
-}
-
 static int at91_wdt_start(struct watchdog_device *wdd)
 {
struct at91wdt *wdt = to_wdt(wdd);
-   /* calculate when the next userspace timeout will be */
-   wdt->next_heartbeat = jiffies + wdd->timeout * HZ;
-   return 0;
-}
 
-static int at91_wdt_stop(struct watchdog_device *wdd)
-{
-   /* The watchdog timer hardware can not be stopped... */
-   return 0;
-}
+   wdt_write(wdt, AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
 
-static int at91_wdt_set_timeout(struct watchdog_device *wdd, unsigned int 
new_timeout)
-{
-   wdd->timeout = new_timeout;
-   return at91_wdt_start(wdd);
+   return 0;
 }
 
 static int at91_wdt_init(struct platform_device *pdev, struct at91wdt *wdt)
@@ -157,8 +116,8 @@ static int at91_wdt_init(struct platform_device *pdev, 
struct at91wdt *wdt)
u32 value;
int err;
u32 mask = wdt->mr_mask;
-   unsigned long min_heartbeat = 1;
-   unsigned long max_heartbeat;
+   unsigned int min_timeout = jiffies_to_msecs(1);
+   unsigned int hw_timeout;
struct device *dev = >dev;
 
tmp = wdt_read(wdt, AT91_WDT_MR);
@@ -180,31 +139,15 @@ static int at91_wdt_init(struct platform_device *pdev, 
struct at91wdt *wdt)
delta = (tmp & AT91_WDT_WDD) >> 16;
 
if (delta < value)
-   min_heartbeat = ticks_to_hz_roundup(value - delta);
+   min_timeout = ticks_to_ms_roundup(value - delta);
 
-   max_heartbeat = ticks_to_hz_rounddown(value);
-   if (!max_heartbeat) {
+   hw_timeout = ticks_to_ms_rounddown(value);
+   if (hw_timeout < min_timeout * 2) {
dev_err(dev,
"heartbeat is too small for the system to handle it 
correctly\n");
return -EINVAL;
}
-
-   /*
-* Try to reset the watchdog counter 4 or 2 times more often than
-* actually requested, to avoid spurious watchdog reset.
-* If this is not possible because of the min_heartbeat value, reset
-* it at the min_heartbeat period.
-*/
-   if ((max_heartbeat / 4) >= min_heartbeat)
-   wdt->heartbeat = max_heartbeat / 4;
-   else if ((max_heartbeat / 2) >= min_heartbeat)
-   wdt->heartbeat = max_heartbeat / 2;
-

[PATCH 4/8] watchdog: Make set_timeout function optional

2015-08-03 Thread Guenter Roeck

For some watchdogs, the hardware timeout is fixed, and the
watchdog driver depends on the watchdog core to handle the
actual timeout. In this situation, the watchdog driver might
only set the 'timeout' variable but do nothing else.
This can as well be handled by the infrastructure, so make
the set_timeout callback optional. If WDIOF_SETTIMEOUT is
configured but the .set_timeout callback is not available,
update the timeout variable in the infrastructure code.

Signed-off-by: Guenter Roeck 
---
 Documentation/watchdog/watchdog-kernel-api.txt | 4 
 drivers/watchdog/watchdog_dev.c| 9 ++---
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/Documentation/watchdog/watchdog-kernel-api.txt 
b/Documentation/watchdog/watchdog-kernel-api.txt
index 7fda3c86cf46..2f1a4ad7e565 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -178,6 +178,10 @@ they are supported. These optional routines/operations are:
   because the watchdog does not necessarily has a 1 second resolution).
   (Note: the WDIOF_SETTIMEOUT needs to be set in the options field of the
   watchdog's info structure).
+  If the watchdog driver does not have to perform any action but setting the
+  timeout value of the watchdog_device, this callback can be omitted.
+  If set_timeout is not provided but WDIOF_SETTIMEOUT is set, the watchdog
+  infrastructure updates the timeout value of the watchdog_device internally.
 * get_timeleft: this routines returns the time that's left before a reset.
 * ref: the operation that calls kref_get on the kref of a dynamically
   allocated watchdog_device struct.
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index e0fbc4ac9bb7..73bae196a081 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -260,9 +260,9 @@ out_status:
 static int watchdog_set_timeout(struct watchdog_device *wdd,
unsigned int timeout)
 {
-   int err;
+   int err = 0;
 
-   if (!wdd->ops->set_timeout || !(wdd->info->options & WDIOF_SETTIMEOUT))
+   if (!(wdd->info->options & WDIOF_SETTIMEOUT))
return -EOPNOTSUPP;
 
if (watchdog_timeout_invalid(wdd, timeout))
@@ -275,7 +275,10 @@ static int watchdog_set_timeout(struct watchdog_device 
*wdd,
goto out_timeout;
}
 
-   err = wdd->ops->set_timeout(wdd, timeout);
+   if (wdd->ops->set_timeout)
+   err = wdd->ops->set_timeout(wdd, timeout);
+   else
+   wdd->timeout = timeout;
 
watchdog_update_worker(wdd, true, false);
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/8] watchdog: imx2: Convert to use infrastructure triggered keepalives

2015-08-03 Thread Guenter Roeck

The watchdog infrastructure now supports handling watchdog keepalive
if the watchdog is running while the watchdog device is closed.
Convert the driver to use this infrastructure.

Signed-off-by: Guenter Roeck 
---
 drivers/watchdog/imx2_wdt.c | 72 -
 1 file changed, 12 insertions(+), 60 deletions(-)

diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index 0bb1a1d1b170..66feef254661 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -25,7 +25,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -34,7 +33,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #define DRIVER_NAME "imx2-wdt"
@@ -62,7 +60,6 @@
 struct imx2_wdt_device {
struct clk *clk;
struct regmap *regmap;
-   struct timer_list timer;/* Pings the watchdog when closed */
struct watchdog_device wdog;
struct notifier_block restart_handler;
 };
@@ -151,16 +148,6 @@ static int imx2_wdt_ping(struct watchdog_device *wdog)
return 0;
 }
 
-static void imx2_wdt_timer_ping(unsigned long arg)
-{
-   struct watchdog_device *wdog = (struct watchdog_device *)arg;
-   struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
-
-   /* ping it every wdog->timeout / 2 seconds to prevent reboot */
-   imx2_wdt_ping(wdog);
-   mod_timer(>timer, jiffies + wdog->timeout * HZ / 2);
-}
-
 static int imx2_wdt_set_timeout(struct watchdog_device *wdog,
unsigned int new_timeout)
 {
@@ -177,40 +164,19 @@ static int imx2_wdt_start(struct watchdog_device *wdog)
 {
struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
 
-   if (imx2_wdt_is_running(wdev)) {
-   /* delete the timer that pings the watchdog after close */
-   del_timer_sync(>timer);
+   if (imx2_wdt_is_running(wdev))
imx2_wdt_set_timeout(wdog, wdog->timeout);
-   } else
+   else
imx2_wdt_setup(wdog);
 
-   return imx2_wdt_ping(wdog);
-}
-
-static int imx2_wdt_stop(struct watchdog_device *wdog)
-{
-   /*
-* We don't need a clk_disable, it cannot be disabled once started.
-* We use a timer to ping the watchdog while /dev/watchdog is closed
-*/
-   imx2_wdt_timer_ping((unsigned long)wdog);
-   return 0;
-}
-
-static inline void imx2_wdt_ping_if_active(struct watchdog_device *wdog)
-{
-   struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
+   set_bit(WDOG_RUNNING, >status);
 
-   if (imx2_wdt_is_running(wdev)) {
-   imx2_wdt_set_timeout(wdog, wdog->timeout);
-   imx2_wdt_timer_ping((unsigned long)wdog);
-   }
+   return imx2_wdt_ping(wdog);
 }
 
 static const struct watchdog_ops imx2_wdt_ops = {
.owner = THIS_MODULE,
.start = imx2_wdt_start,
-   .stop = imx2_wdt_stop,
.ping = imx2_wdt_ping,
.set_timeout = imx2_wdt_set_timeout,
 };
@@ -277,9 +243,10 @@ static int __init imx2_wdt_probe(struct platform_device 
*pdev)
watchdog_set_nowayout(wdog, nowayout);
watchdog_init_timeout(wdog, timeout, >dev);
 
-   setup_timer(>timer, imx2_wdt_timer_ping, (unsigned long)wdog);
-
-   imx2_wdt_ping_if_active(wdog);
+   if (imx2_wdt_is_running(wdev)) {
+   imx2_wdt_set_timeout(wdog, wdog->timeout);
+   set_bit(WDOG_RUNNING, >status);
+   }
 
/*
 * Disable the watchdog power down counter at boot. Otherwise the power
@@ -320,7 +287,6 @@ static int __exit imx2_wdt_remove(struct platform_device 
*pdev)
watchdog_unregister_device(wdog);
 
if (imx2_wdt_is_running(wdev)) {
-   del_timer_sync(>timer);
imx2_wdt_ping(wdog);
dev_crit(>dev, "Device removed: Expect reboot!\n");
}
@@ -334,10 +300,9 @@ static void imx2_wdt_shutdown(struct platform_device *pdev)
 
if (imx2_wdt_is_running(wdev)) {
/*
-* We are running, we need to delete the timer but will
-* give max timeout before reboot will take place
+* We are running, configure max timeout before reboot
+* will take place.
 */
-   del_timer_sync(>timer);
imx2_wdt_set_timeout(wdog, IMX2_WDT_MAX_TIME);
imx2_wdt_ping(wdog);
dev_crit(>dev, "Device shutdown: Expect reboot!\n");
@@ -355,10 +320,6 @@ static int imx2_wdt_suspend(struct device *dev)
if (imx2_wdt_is_running(wdev)) {
imx2_wdt_set_timeout(wdog, IMX2_WDT_MAX_TIME);
imx2_wdt_ping(wdog);
-
-   /* The watchdog is not active */
-   if (!watchdog_active(wdog))
-   del_timer_sync(>timer);
}
 
clk_disable_unprepare(wdev->clk);
@@ -384,19 +345,10 @@ static int imx2_wdt_resume(struct device *dev)

[PATCH 7/8] watchdog: gpio_wdt: Convert to use infrastructure triggered keepalives

2015-08-03 Thread Guenter Roeck

The watchdog infrastructure now supports handling watchdog keepalive
if the watchdog is running while the watchdog device is closed.
The infrastructure now also supports generating additional heartbeats
if the maximum hardware timeout is smaller than or close to the
configured timeout. Convert the driver to use this infrastructure.

Signed-off-by: Guenter Roeck 
---
 drivers/watchdog/gpio_wdt.c | 65 -
 1 file changed, 11 insertions(+), 54 deletions(-)

diff --git a/drivers/watchdog/gpio_wdt.c b/drivers/watchdog/gpio_wdt.c
index 1687cc2d7122..cbbdae440bfa 100644
--- a/drivers/watchdog/gpio_wdt.c
+++ b/drivers/watchdog/gpio_wdt.c
@@ -32,12 +32,8 @@ struct gpio_wdt_priv {
boolactive_low;
boolstate;
boolalways_running;
-   boolarmed;
unsigned inthw_algo;
-   unsigned inthw_margin;
-   unsigned long   last_jiffies;
struct notifier_block   notifier;
-   struct timer_list   timer;
struct watchdog_device  wdd;
 };
 
@@ -50,20 +46,12 @@ static void gpio_wdt_disable(struct gpio_wdt_priv *priv)
gpio_direction_input(priv->gpio);
 }
 
-static void gpio_wdt_start_impl(struct gpio_wdt_priv *priv)
-{
-   priv->state = priv->active_low;
-   gpio_direction_output(priv->gpio, priv->state);
-   priv->last_jiffies = jiffies;
-   mod_timer(>timer, priv->last_jiffies + priv->hw_margin);
-}
-
 static int gpio_wdt_start(struct watchdog_device *wdd)
 {
struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
 
-   gpio_wdt_start_impl(priv);
-   priv->armed = true;
+   priv->state = priv->active_low;
+   gpio_direction_output(priv->gpio, priv->state);
 
return 0;
 }
@@ -72,10 +60,9 @@ static int gpio_wdt_stop(struct watchdog_device *wdd)
 {
struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
 
-   priv->armed = false;
if (!priv->always_running) {
-   mod_timer(>timer, 0);
gpio_wdt_disable(priv);
+   clear_bit(WDOG_RUNNING, >wdd.status);
}
 
return 0;
@@ -85,32 +72,6 @@ static int gpio_wdt_ping(struct watchdog_device *wdd)
 {
struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
 
-   priv->last_jiffies = jiffies;
-
-   return 0;
-}
-
-static int gpio_wdt_set_timeout(struct watchdog_device *wdd, unsigned int t)
-{
-   wdd->timeout = t;
-
-   return gpio_wdt_ping(wdd);
-}
-
-static void gpio_wdt_hwping(unsigned long data)
-{
-   struct watchdog_device *wdd = (struct watchdog_device *)data;
-   struct gpio_wdt_priv *priv = watchdog_get_drvdata(wdd);
-
-   if (priv->armed && time_after(jiffies, priv->last_jiffies +
- msecs_to_jiffies(wdd->timeout * 1000))) {
-   dev_crit(wdd->dev, "Timer expired. System will reboot soon!\n");
-   return;
-   }
-
-   /* Restart timer */
-   mod_timer(>timer, jiffies + priv->hw_margin);
-
switch (priv->hw_algo) {
case HW_ALGO_TOGGLE:
/* Toggle output pin */
@@ -124,6 +85,8 @@ static void gpio_wdt_hwping(unsigned long data)
gpio_set_value_cansleep(priv->gpio, priv->active_low);
break;
}
+
+   return 0;
 }
 
 static int gpio_wdt_notify_sys(struct notifier_block *nb, unsigned long code,
@@ -132,12 +95,10 @@ static int gpio_wdt_notify_sys(struct notifier_block *nb, 
unsigned long code,
struct gpio_wdt_priv *priv = container_of(nb, struct gpio_wdt_priv,
  notifier);
 
-   mod_timer(>timer, 0);
-
switch (code) {
case SYS_HALT:
case SYS_POWER_OFF:
-   gpio_wdt_disable(priv);
+   gpio_wdt_stop(>wdd);
break;
default:
break;
@@ -157,7 +118,6 @@ static const struct watchdog_ops gpio_wdt_ops = {
.start  = gpio_wdt_start,
.stop   = gpio_wdt_stop,
.ping   = gpio_wdt_ping,
-   .set_timeout= gpio_wdt_set_timeout,
 };
 
 static int gpio_wdt_probe(struct platform_device *pdev)
@@ -205,9 +165,6 @@ static int gpio_wdt_probe(struct platform_device *pdev)
if (hw_margin < 2 || hw_margin > 65535)
return -EINVAL;
 
-   /* Use safe value (1/2 of real timeout) */
-   priv->hw_margin = msecs_to_jiffies(hw_margin / 2);
-
priv->always_running = of_property_read_bool(pdev->dev.of_node,
 "always-running");
 
@@ -217,11 +174,15 @@ static int gpio_wdt_probe(struct platform_device *pdev)
priv->wdd.ops   = _wdt_ops;
priv->wdd.min_timeout   = SOFT_TIMEOUT_MIN;
priv->wdd.max_timeout   = SOFT_TIMEOUT_MAX;
+   priv->wdd.max_hw_timeout_ms = hw_margin;
 
if

[PATCH 6/8] watchdog: retu: Convert to use infrastructure triggered keepalives

2015-08-03 Thread Guenter Roeck

The watchdog infrastructure now supports handling watchdog keepalive
if the watchdog is running while the watchdog device is closed.
Convert the driver to use this infrastructure.

Signed-off-by: Guenter Roeck 
---
 drivers/watchdog/retu_wdt.c | 78 -
 1 file changed, 7 insertions(+), 71 deletions(-)

diff --git a/drivers/watchdog/retu_wdt.c b/drivers/watchdog/retu_wdt.c
index b7c68e275aeb..ce2982a7670c 100644
--- a/drivers/watchdog/retu_wdt.c
+++ b/drivers/watchdog/retu_wdt.c
@@ -28,69 +28,22 @@
 /* Watchdog timer values in seconds */
 #define RETU_WDT_MAX_TIMER 63
 
-struct retu_wdt_dev {
-   struct retu_dev *rdev;
-   struct device   *dev;
-   struct delayed_work ping_work;
-};
-
-/*
- * Since Retu watchdog cannot be disabled in hardware, we must kick it
- * with a timer until userspace watchdog software takes over. If
- * CONFIG_WATCHDOG_NOWAYOUT is set, we never start the feeding.
- */
-static void retu_wdt_ping_enable(struct retu_wdt_dev *wdev)
-{
-   retu_write(wdev->rdev, RETU_REG_WATCHDOG, RETU_WDT_MAX_TIMER);
-   schedule_delayed_work(>ping_work,
-   round_jiffies_relative(RETU_WDT_MAX_TIMER * HZ / 2));
-}
-
-static void retu_wdt_ping_disable(struct retu_wdt_dev *wdev)
-{
-   retu_write(wdev->rdev, RETU_REG_WATCHDOG, RETU_WDT_MAX_TIMER);
-   cancel_delayed_work_sync(>ping_work);
-}
-
-static void retu_wdt_ping_work(struct work_struct *work)
-{
-   struct retu_wdt_dev *wdev = container_of(to_delayed_work(work),
-   struct retu_wdt_dev, ping_work);
-   retu_wdt_ping_enable(wdev);
-}
-
 static int retu_wdt_start(struct watchdog_device *wdog)
 {
-   struct retu_wdt_dev *wdev = watchdog_get_drvdata(wdog);
+   struct retu_dev *rdev = watchdog_get_drvdata(wdog);
 
-   retu_wdt_ping_disable(wdev);
+   set_bit(WDOG_RUNNING, >status);
 
-   return retu_write(wdev->rdev, RETU_REG_WATCHDOG, wdog->timeout);
-}
-
-static int retu_wdt_stop(struct watchdog_device *wdog)
-{
-   struct retu_wdt_dev *wdev = watchdog_get_drvdata(wdog);
-
-   retu_wdt_ping_enable(wdev);
-
-   return 0;
-}
-
-static int retu_wdt_ping(struct watchdog_device *wdog)
-{
-   struct retu_wdt_dev *wdev = watchdog_get_drvdata(wdog);
-
-   return retu_write(wdev->rdev, RETU_REG_WATCHDOG, wdog->timeout);
+   return retu_write(rdev, RETU_REG_WATCHDOG, wdog->timeout);
 }
 
 static int retu_wdt_set_timeout(struct watchdog_device *wdog,
unsigned int timeout)
 {
-   struct retu_wdt_dev *wdev = watchdog_get_drvdata(wdog);
+   struct retu_dev *rdev = watchdog_get_drvdata(wdog);
 
wdog->timeout = timeout;
-   return retu_write(wdev->rdev, RETU_REG_WATCHDOG, wdog->timeout);
+   return retu_write(rdev, RETU_REG_WATCHDOG, wdog->timeout);
 }
 
 static const struct watchdog_info retu_wdt_info = {
@@ -101,8 +54,6 @@ static const struct watchdog_info retu_wdt_info = {
 static const struct watchdog_ops retu_wdt_ops = {
.owner  = THIS_MODULE,
.start  = retu_wdt_start,
-   .stop   = retu_wdt_stop,
-   .ping   = retu_wdt_ping,
.set_timeout= retu_wdt_set_timeout,
 };
 
@@ -111,39 +62,26 @@ static int retu_wdt_probe(struct platform_device *pdev)
struct retu_dev *rdev = dev_get_drvdata(pdev->dev.parent);
bool nowayout = WATCHDOG_NOWAYOUT;
struct watchdog_device *retu_wdt;
-   struct retu_wdt_dev *wdev;
int ret;
 
retu_wdt = devm_kzalloc(>dev, sizeof(*retu_wdt), GFP_KERNEL);
if (!retu_wdt)
return -ENOMEM;
 
-   wdev = devm_kzalloc(>dev, sizeof(*wdev), GFP_KERNEL);
-   if (!wdev)
-   return -ENOMEM;
-
retu_wdt->info  = _wdt_info;
retu_wdt->ops   = _wdt_ops;
retu_wdt->timeout   = RETU_WDT_MAX_TIMER;
retu_wdt->min_timeout   = 0;
retu_wdt->max_timeout   = RETU_WDT_MAX_TIMER;
 
-   watchdog_set_drvdata(retu_wdt, wdev);
+   watchdog_set_drvdata(retu_wdt, rdev);
watchdog_set_nowayout(retu_wdt, nowayout);
 
-   wdev->rdev  = rdev;
-   wdev->dev   = >dev;
-
-   INIT_DELAYED_WORK(>ping_work, retu_wdt_ping_work);
-
ret = watchdog_register_device(retu_wdt);
if (ret < 0)
return ret;
 
-   if (nowayout)
-   retu_wdt_ping(retu_wdt);
-   else
-   retu_wdt_ping_enable(wdev);
+   retu_wdt_start(retu_wdt);
 
platform_set_drvdata(pdev, retu_wdt);
 
@@ -153,10 +91,8 @@ static int retu_wdt_probe(struct platform_device *pdev)
 static int retu_wdt_remove(struct platform_device *pdev)
 {
struct watchdog_device *wdog = platform_get_drvdata(pdev);
-   struct retu_wdt_dev *wdev = watchdog_get_drvdata(wdog);
 
watchdog_unregister_device(wdog);
-

[PATCH 2/8] watchdog: Introduce hardware maximum timeout in watchdog core

2015-08-03 Thread Guenter Roeck

Introduce an optional hardware maximum timeout in the watchdog core.
The hardware maximum timeout can be lower than the maximum timeout.

Drivers can set the maximum hardware timeout value in the watchdog data
structure. If the configured timeout exceeds half the value of the
maximum hardware timeout, the watchdog core enables a timer function
to assist sending keepalive requests to the watchdog driver.

Cc: Timo Kokkonen 
Cc: Uwe Kleine-König 
Signed-off-by: Guenter Roeck 
---
 Documentation/watchdog/watchdog-kernel-api.txt |  14 +++
 drivers/watchdog/watchdog_dev.c| 121 +
 include/linux/watchdog.h   |  21 -
 3 files changed, 135 insertions(+), 21 deletions(-)

diff --git a/Documentation/watchdog/watchdog-kernel-api.txt 
b/Documentation/watchdog/watchdog-kernel-api.txt
index d8b0d3367706..5fa085276874 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -53,9 +53,12 @@ struct watchdog_device {
unsigned int timeout;
unsigned int min_timeout;
unsigned int max_timeout;
+   unsigned int max_hw_timeout_ms;
+   unsigned long last_keepalive;
void *driver_data;
struct mutex lock;
unsigned long status;
+   struct delayed_work work;
struct list_head deferred;
 };
 
@@ -73,8 +76,18 @@ It contains following fields:
   additional information about the watchdog timer itself. (Like it's unique 
name)
 * ops: a pointer to the list of watchdog operations that the watchdog supports.
 * timeout: the watchdog timer's timeout value (in seconds).
+  This is the time after which the system will reboot if user space does
+  not send a heartbeat request if the watchdog device is opened.
+  This may or may not be the hardware watchdog timeout. See max_hw_timeout_ms
+  for more details.
 * min_timeout: the watchdog timer's minimum timeout value (in seconds).
 * max_timeout: the watchdog timer's maximum timeout value (in seconds).
+* max_hw_timeout_ms: Maximum hardware timeout, in milli-seconds. May differ
+  from max_timeout. If set, the infrastructure will send a heartbeat to the
+  watchdog driver if 'timeout' is larger than 'max_hw_timeout / 2',
+  unless user space failed to ping the watchdog for 'timeout' seconds.
+* last_keepalive: Time of most recent keepalive triggered from user space,
+  in jiffies.
 * bootstatus: status of the device after booting (reported with watchdog
   WDIOF_* status bits).
 * driver_data: a pointer to the drivers private data of a watchdog device.
@@ -85,6 +98,7 @@ It contains following fields:
   information about the status of the device (Like: is the watchdog timer
   running/active, is the nowayout bit set, is the device opened via
   the /dev/watchdog interface or not, ...).
+* work: Worker data structure for WatchDog Timer Driver Core internal use only.
 * deferred: entry in wtd_deferred_reg_list which is used to
   register early initialized watchdogs.
 
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 06171c73daf5..25849c1d6dc1 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -37,7 +37,9 @@
 #include/* For the -ENODEV/... values */
 #include   /* For printk/panic/... */
 #include   /* For file operations */
+#include  /* For timeout functions */
 #include /* For watchdog specific items */
+#include/* For workqueue */
 #include   /* For handling misc devices */
 #include /* For __init/__exit/... */
 #include  /* For copy_to_user/put_user/... */
@@ -49,6 +51,53 @@ static dev_t watchdog_devt;
 /* the watchdog device behind /dev/watchdog */
 static struct watchdog_device *old_wdd;
 
+static struct workqueue_struct *watchdog_wq;
+
+static inline bool watchdog_need_worker(struct watchdog_device *wdd)
+{
+   unsigned int hm = wdd->max_hw_timeout_ms;
+   unsigned int m = wdd->max_timeout * 1000;
+
+   return watchdog_active(wdd) && hm && hm != m &&
+   wdd->timeout * 500 > hm;
+}
+
+static inline void watchdog_update_worker(struct watchdog_device *wdd,
+ bool cancel, bool sync)
+{
+   if (watchdog_need_worker(wdd)) {
+   unsigned int t = wdd->timeout * 1000;
+
+   if (wdd->max_hw_timeout_ms && t > wdd->max_hw_timeout_ms)
+   t = wdd->max_hw_timeout_ms;
+   queue_delayed_work(watchdog_wq, >work,
+  msecs_to_jiffies(t / 2));
+   } else if (cancel) {
+   if (sync)
+   cancel_delayed_work_sync(>work);
+   else
+   cancel_delayed_work(>work);
+   }
+}
+
+static int _watchdog_ping(struct watchdog_device *wdd)
+{
+   int err;
+
+   if (test_bit(WDOG_UNREGISTERED, >status))
+   return -ENODEV;
+
+   if (!watchdog_active(wdd))
+

[PATCH 0/8] watchdog: Add support for keepalives triggered by infrastructure

2015-08-03 Thread Guenter Roeck

The watchdog infrastructure is currently purely passive, meaning
it only passes information from user space to drivers and vice versa.

Since watchdog hardware tends to have its own quirks, this can result
in quite complex watchdog drivers. A number of scenarios are especially common.

- A watchdog is always active and can not be disabled, or can not be disabled
  once enabled. To support such hardware, watchdog drivers have to implement
  their own timers and use those timers to trigger watchdog keepalives while
  the watchdog device is not or not yet opened.
- A variant of this is the desire to enable a watchdog as soon as its driver
  has been instantiated, to protect the system while it is still booting up,
  but the watchdog daemon is not yet running.
- Some watchdogs have a very short maximum timeout, in the range of just a few
  seconds. Such low timeouts are difficult if not impossible to support from
  user space. Drivers supporting such watchdog hardware need to implement
  a timer function to augment heartbeats from user space.

This patch set solves the above problems while keeping changes to the
watchdog core minimal.

- A new status flag, WDOG_RUNNING, informs the watchdog subsystem that a
  watchdog is running, and that the watchdog subsystem needs to generate
  heartbeat requests while the associated watchdog device is closed.
- A new parameter in the watchdog data structure, max_hw_timeout_ms, informs
  the watchdog subsystem about a maximum hardware timeout. The watchdog
  subsystem uses this information together with the configured timeout
  and the maximum permitted timeout to determine if it needs to generate
  additional heartbeat requests.

Patch #1 is a preparatory patch.

Patch #2 adds timer functionality to the watchdog core. It solves the problem
of short maximum hardware timeouts by augmenting heartbeats triggered from
user space with internally triggered heartbeats.

Patch #3 adds functionality to generate heartbeats while the watchdog device is
closed. It handles situations where the watchdog is running after
the driver has been instantiated, but the device is not yet opened,
and post-close situations necessary if a watchdog can not be stopped.

Patch #4 makes the set_timeout function optional. This is now possible since
timeout changes can now be completely handled in the watchdog core, for
example if the hardware watchdog timeout is fixed.

Patch #5 to #8 are example conversions of some watchdog drivers.
Those patches will require testing.

This patch set does not solve all limitations of the watchdog subsystem.
Specifically, it does not add support for the following features.

- It is desirable to be able to specify a maximum early timeout,
  from booting the system to opening the watchdog device.
- Some watchdogs may require a minimum period of time between
  heartbeats. Examples are DA9062 and possibly AT91SAM9x.

This and other features will be addressed with subsequent patches.

The patch set is inspired by an earlier patch set from Timo Kokkonen.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] staging/lustre: Properly reference kthread_run instead of cfs_daemonize

2015-08-03 Thread green

From: Oleg Drokin 

cfs_daemonize is long gone and replaced by a proper call to kthread_run,
so update the comment to reflect that fact.

Signed-off-by: Oleg Drokin 
---
 drivers/staging/lustre/lustre/include/lustre_net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h 
b/drivers/staging/lustre/lustre/include/lustre_net.h
index 77a7de9..3bb2f8b 100644
--- a/drivers/staging/lustre/lustre/include/lustre_net.h
+++ b/drivers/staging/lustre/lustre/include/lustre_net.h
@@ -2183,7 +2183,7 @@ struct ptlrpcd_ctl {
 */
struct ptlrpc_request_set  *pc_set;
/**
-* Thread name used in cfs_daemonize()
+* Thread name used in kthread_run()
 */
charpc_name[16];
/**
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] staging/lustre/ptlrpc: Remove stray cfs_daemonize comment

2015-08-03 Thread green

From: Oleg Drokin 

Ever since daemonize was removed in 3.18, there are no longer
any flags passed to kthread_run.
Most of the comments were deleted, but this one lingered on
until now.

Signed-off-by: Oleg Drokin 
---
 drivers/staging/lustre/lustre/ptlrpc/pinger.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c 
b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
index d05c37c..f8edb79 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/pinger.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/pinger.c
@@ -318,8 +318,6 @@ int ptlrpc_start_pinger(void)
 
strcpy(pinger_thread.t_name, "ll_ping");
 
-   /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we
-* just drop the VM and FILES in cfs_daemonize_ctxt() right away. */
rc = PTR_ERR(kthread_run(ptlrpc_pinger_main, _thread,
 "%s", pinger_thread.t_name));
if (IS_ERR_VALUE(rc)) {
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] Lustre: remove cfs_daemonize from comments

2015-08-03 Thread green

From: Oleg Drokin 

cfs_daemonize was removed long ago, but I just stumbled upon
a couple of instances where it was still referenced in the comments,
so here are the patches to clean it up and not cause any unnecessary
confusion.

Oleg Drokin (2):
  staging/lustre/ptlrpc: Remove stray daemonize comment
  staging/lustre: Properly reference kthread_run instead of
cfs_daemonize

 drivers/staging/lustre/lustre/include/lustre_net.h | 2 +-
 drivers/staging/lustre/lustre/ptlrpc/pinger.c  | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/2] clk: rockchip: add pclk_pd_pmu to the list of rk3288 critical clocks

2015-08-03 Thread hl


Hi Stephen,

On 04/08/15 09:14, Stephen Boyd wrote:

On 08/03/2015 06:03 PM, Lin Huang wrote:

From: huang lin 

pclk_pd_pmu needs to keep running and with the upcoming gpio clock
handling this is not always the case anymore. So add it to the list
of critical clocks for now.

Signed-off-by: Heiko Stuebner 
Signed-off-by: Lin Huang 


From: says huang lin, first signed-off-by is Heiko Stuebner, and final 
signed-off-by is Lin Huang... who actually authored this patch? Is Lin 
Huang the same person as huang lin ?


This patch was written by Heiko and uploaded by me. I am sorry for confusing
you with my name; yes, Lin Huang and huang lin are the same person, it is all me.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH V3 request from stable 3.10 to 3.14] efi: fix 32bit kernel boot failed problem using efi

2015-08-03 Thread fupan.li

From: Fupan Li 

Commit 35d5134b7d5a
("x86/efi: Correct EFI boot stub use of code32_start")
introduced a bug that causes 32-bit kernels to fail to boot
via EFI. The code should use the label's address instead
of the value stored at the label to calculate the address of
code32_start.

Signed-off-by: Fupan Li 
---
 arch/x86/boot/compressed/head_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/compressed/head_32.S 
b/arch/x86/boot/compressed/head_32.S
index abb988a..3b28eff 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -54,7 +54,7 @@ ENTRY(efi_pe_entry)
call reloc
 reloc:
popl %ecx
-   subl reloc, %ecx
+   subl $reloc, %ecx
movl %ecx, BP_code32_start(%eax)
 
sub $0x4, %esp
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] serial: don't announce CIR serial ports

2015-08-03 Thread Peter Hurley

Hi Maciej,

On 08/02/2015 05:09 PM, Maciej S. Szmigiero wrote:
> CIR type serial ports aren't real serial ports.
> This is just a way to prevent legacy serial driver
> from probing and eventually binding some resources
> so don't announce them like normal serial ports.

I'd like to keep some form of reporting so that we know the
port was properly probed; what about extending uart_report_port()
to include CIR + disabled status?

Secondly, good catch! Because we should not be trying to
register a console on this port, nor driving modem signals.

So maybe an early exit after uart_report_port?

Regards,
Peter Hurley

> Signed-off-by: Maciej Szmigiero 
> ---
>  drivers/tty/serial/serial_core.c |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/tty/serial/serial_core.c 
> b/drivers/tty/serial/serial_core.c
> index f368520..99f944d 100644
> --- a/drivers/tty/serial/serial_core.c
> +++ b/drivers/tty/serial/serial_core.c
> @@ -2237,7 +2237,7 @@ uart_configure_port(struct uart_driver *drv, struct 
> uart_state *state,
>   port->ops->config_port(port, flags);
>   }
>  
> - if (port->type != PORT_UNKNOWN) {
> + if (port->type != PORT_UNKNOWN && port->type != PORT_8250_CIR) {
>   unsigned long flags;
>  
>   uart_report_port(drv, port);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 01/11] user_ns: 3 new LSM hooks for user namespace operations

2015-08-03 Thread Kees Cook

On Mon, Aug 3, 2015 at 4:34 AM, Lukasz Pawelczyk
 wrote:
> On pią, 2015-07-31 at 22:48 -0500, Serge E. Hallyn wrote:
>> On Fri, Jul 31, 2015 at 11:28:56AM +0200, Lukasz Pawelczyk wrote:
>> > On czw, 2015-07-30 at 16:30 -0500, Serge E. Hallyn wrote:
>> > > On Fri, Jul 24, 2015 at 12:04:35PM +0200, Lukasz Pawelczyk wrote:
>> > > > @@ -969,6 +982,7 @@ static int userns_install(struct nsproxy
>> > > > *nsproxy, struct ns_common *ns)
>> > > >  {
>> > > > struct user_namespace *user_ns = to_user_ns(ns);
>> > > > struct cred *cred;
>> > > > +   int err;
>> > > >
>> > > > /* Don't allow gaining capabilities by reentering
>> > > >  * the same user namespace.
>> > > > @@ -986,6 +1000,10 @@ static int userns_install(struct nsproxy
>> > > > *nsproxy, struct ns_common *ns)
>> > > > if (!ns_capable(user_ns, CAP_SYS_ADMIN))
>> > > > return -EPERM;
>> > > >
>> > > > +   err = security_userns_setns(nsproxy, user_ns);
>> > > > +   if (err)
>> > > > +   return err;
>> > >
>> > > So at this point the LSM thinks current is in the new ns.  If
>> > > prepare_creds() fails below, should it be informed of that?
>> > > (Or am I over-thinking this?)
>> > >
>> > > > +
>> > > > cred = prepare_creds();
>> > > > if (!cred)
>> > > > return -ENOMEM;
>> >
>> > Hmm, the use case for this hook I had in mind was just to allow or
>> > disallow the operation based on the information passed in
>> > arguments.
>> > Not to register the current in any way so LSM can think it is or
>> > isn't
>> > in the new namespace.
>> >
>> > I think that any other LSM check that would like to know in what
>> > namespace the current is, would just check that from current's
>> > creds.
>> > Not use some stale and duplicated information the above hook could
>> > have
>> > registered.
>> >
>> > I see no reason for this hook to change the LSM state, only to
>> > answer
>> > the question: allowed/disallowed (eventually return an error cause
>> > it
>> > is unable to give an answer which falls into the disallow
>> > category).
>>
>> How about renaming it "security_userns_may_setns()" for clarity?
>
> I personally have nothing against it. However looking at already
> existing hooks only one of them has "may" in the name (unix_may_send)
> while a lot clearly have exactly this purpose (e.g. most of inode_*
> family, some from file_* and task_*). So it seems the trend is against
> it.
>
> What do you think? Anyone else has an opinion?

Personally, I prefer that hooks be named as closely to their caller,
or calling context, as possible. In this case, it seems like "may" is
implied. It's an LSM like all the others, so it can fail, which would
cause the caller to fail too, so "may" tends to be implicit. I would
leave it as-is, but I could be convinced otherwise.

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/2] x86/ldt: allow to disable modify_ldt at runtime

2015-08-03 Thread Kees Cook

On Mon, Aug 3, 2015 at 4:19 PM, Willy Tarreau  wrote:
> On Mon, Aug 03, 2015 at 03:35:15PM -0700, Kees Cook wrote:
>> Yay for perm disable! Thank you! :)
>
> Andy would like to see this evolve towards something possibly
> more complete and/or generic. I think this needs more thoughts
> and that we should possibly stick to 0/1 for now and decide how
> we want to make this evolve later to cover permanent disable,
> various ABIs, etc...
>
> What do you think ?

That's probably the best way forward. I still think a generic syscall
disabling feature would be nice. :) I won't have time to work on it
for a little while, though.

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 2/7] cpufreq: opp: fix handling of turbo modes

2015-08-03 Thread Krzysztof Kozlowski

On 30.07.2015 23:37, Kukjin Kim wrote:
> On 07/27/15 20:47, Bartlomiej Zolnierkiewicz wrote:
>> On Monday, July 27, 2015 05:06:41 PM Viresh Kumar wrote:
>>> On 27-07-15, 13:14, Bartlomiej Zolnierkiewicz wrote:
>>>> Sorry but you don't seem to understand the issue.
>>>
>>> :)
>>>
>>> No, I did. I understand that if someone uses opp bindings today with
>>> some entries as turbo OPPs, cpufreq will use them as normal
>>> frequencies. And that may harm the board.
>>>
>>> BUT, opp-v2 code isn't ready to be used yet. And platforms should see
>>> what all is implemented before trying to use them.
>>
>> OK.
>>
>>> All I was saying is, this isn't a FIX as we haven't introduced the
>>> feature yet. Otherwise I had no issues with the patch.
>>
>> I will update the description for the next patchset revision.
>>
> Hi Bart,
> 
> When will you re-post v3? Because I have a plan to send a pull-request
> to arm-soc until this weekend...

Dear Kukjin,

We are already at 4.2-rc5 and you did not send the pull request before
the weekend as you said. It is really late and there is no special
reason for delaying the request. What happened?

Best regards,
Krzysztof

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

linux-next: manual merge of the security tree with Linus' tree

2015-08-03 Thread Stephen Rothwell

Hi James,

Today's linux-next merge of the security tree got a conflict in:

  security/yama/yama_lsm.c

between commit:

  5413fcdbe9e7 ("Adding YAMA hooks also when YAMA is not stacked.")

from Linus' tree and commit:

  730daa164e7c ("Yama: remove needless CONFIG_SECURITY_YAMA_STACKED")

from the security tree.

I fixed it up (the latter removed the code updated by the former, so I
just did that) and can carry the fix as necessary (no action is required).

-- 
Cheers,
Stephen Rothwell                    s...@canb.auug.org.au
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v2 1/2] clk: rockchip: add pclk_pd_pmu to the list of rk3288 critical clocks

2015-08-03 Thread Stephen Boyd


On 08/03/2015 06:03 PM, Lin Huang wrote:

From: huang lin 

pclk_pd_pmu needs to keep running and with the upcoming gpio clock
handling this is not always the case anymore. So add it to the list
of critical clocks for now.

Signed-off-by: Heiko Stuebner 
Signed-off-by: Lin Huang 


From: says huang lin, first signed-off-by is Heiko Stuebner, and final 
signed-off-by is Lin Huang... who actually authored this patch? Is Lin 
Huang the same person as huang lin ?


--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: add the block to the tail of the list in expand()

2015-08-03 Thread Xishi Qiu

On 2015/8/3 12:10, Dave Hansen wrote:

> On 08/02/2015 07:05 PM, Xishi Qiu wrote:
>>>> Also, this might not do very much good in practice.  If you are
>>>> splitting a high-order page, you are doing the split because the
>>>> lower-order lists are empty.  So won't that list_add() be to an empty
>>
>> I made a mistake, you are right, all the lower-order lists are empty,
>> so it makes no sense to add to the tail.
> 
> I actually tested this experimentally and the lists are not always
> empty.  It's probably __rmqueue_smallest() vs. __rmqueue_fallback() logic.
> 
> In any case, you might want to double-check.
> 

Hi Dave,

How did you do the experiment?

Thanks,
Xishi Qiu

> .
> 



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: cgroup/loop Bad page state oops in Linux v4.2-rc3-136-g45b4b782e848

2015-08-03 Thread Josh Boyer

On Mon, Aug 3, 2015 at 12:56 PM, Josh Boyer  wrote:
> On Mon, Aug 3, 2015 at 10:28 AM, Mike Snitzer  wrote:
>> On Sun, Aug 02 2015 at 10:01P -0400,
>> Josh Boyer  wrote:
>>
>>> On Fri, Jul 31, 2015 at 2:58 PM, Josh Boyer  
>>> wrote:
>>> > On Thu, Jul 30, 2015 at 8:19 PM, Mike Snitzer  wrote:
>>> >>
>>> >> The only commit that looks even remotely related (given 32bit concerns)
>>> >> would be 1c220c69ce0dcc0f234a9f263ad9c0864f971852
>>> >
>>> > Confirmed.  I built kernels for our tester that started with the
>>> > working snapshot and applied the patches above one at a time.  The
>>> > failing patch was the commit you suspected.
>>> >
>>> > I can try and build a 4.2-rc4 kernel with that reverted, but it would
>>> > be good if someone could start thinking about how that could cause
>>> > this issue.
>>>
>>> A revert on top of 4.2-rc4 booted.  So this is currently causing
>>> issues with upstream as well.
>>
>> Hi Josh,
>>
>> I've staged the following fix in linux-next (for 4.2-rc6 inclusion):
>> https://git.kernel.org/cgit/linux/kernel/git/device-mapper/linux-dm.git/commit/?h=for-next=76270d574acc897178a5c8be0bd2a743a77e4bac
>>
>> Can you please verify that it works for your 32bit testcase against
>> 4.2-rc4 (or rc5)?
>
> Sure, I'll get a kernel with this included spun up and ask Adam to test.

Adam tested this with success.  If you're still collecting patch
metadata, adding:

Tested-by: Adam Williamson 

would be appreciated.

josh

>> From: Mike Snitzer 
>> Date: Mon, 3 Aug 2015 09:54:58 -0400
>> Subject: [PATCH] dm: fix dm_merge_bvec regression on 32 bit systems
>>
>> A DM regression on 32 bit systems was reported against v4.2-rc3 here:
>> https://lkml.org/lkml/2015/7/29/401
>>
>> Fix this by reverting both commit 1c220c69 ("dm: fix casting bug in
>> dm_merge_bvec()") and 148e51ba ("dm: improve documentation and code
>> clarity in dm_merge_bvec").  This combined revert is done to eliminate
>> the possibility of a partial revert in stable@ kernels.
>>
>> In hindsight the correct fix, at the time 1c220c69 was applied to fix
>> the regression that 148e51ba introduced, should've been to simply revert
>> 148e51ba.
>>
>> Reported-by: Josh Boyer 
>> Acked-by: Joe Thornber 
>> Signed-off-by: Mike Snitzer 
>> Cc: sta...@vger.kernel.org # 3.19+
>> ---
>>  drivers/md/dm.c | 27 ++-
>>  1 file changed, 10 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/md/dm.c b/drivers/md/dm.c
>> index ab37ae1..0d7ab20 100644
>> --- a/drivers/md/dm.c
>> +++ b/drivers/md/dm.c
>> @@ -1729,7 +1729,8 @@ static int dm_merge_bvec(struct request_queue *q,
>> struct mapped_device *md = q->queuedata;
>> struct dm_table *map = dm_get_live_table_fast(md);
>> struct dm_target *ti;
>> -   sector_t max_sectors, max_size = 0;
>> +   sector_t max_sectors;
>> +   int max_size = 0;
>>
>> if (unlikely(!map))
>> goto out;
>> @@ -1742,18 +1743,10 @@ static int dm_merge_bvec(struct request_queue *q,
>>  * Find maximum amount of I/O that won't need splitting
>>  */
>> max_sectors = min(max_io_len(bvm->bi_sector, ti),
>> - (sector_t) queue_max_sectors(q));
>> + (sector_t) BIO_MAX_SECTORS);
>> max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
>> -
>> -   /*
>> -* FIXME: this stop-gap fix _must_ be cleaned up (by passing a 
>> sector_t
>> -* to the targets' merge function since it holds sectors not bytes).
>> -* Just doing this as an interim fix for stable@ because the more
>> -* comprehensive cleanup of switching to sector_t will impact every
>> -* DM target that implements a ->merge hook.
>> -*/
>> -   if (max_size > INT_MAX)
>> -   max_size = INT_MAX;
>> +   if (max_size < 0)
>> +   max_size = 0;
>>
>> /*
>>  * merge_bvec_fn() returns number of bytes
>> @@ -1761,13 +1754,13 @@ static int dm_merge_bvec(struct request_queue *q,
>>  * max is precomputed maximal io size
>>  */
>> if (max_size && ti->type->merge)
>> -   max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
>> +   max_size = ti->type->merge(ti, bvm, biovec, max_size);
>> /*
>>  * If the target doesn't support merge method and some of the devices
>> -* provided their merge_bvec method (we know this by looking for the
>> -* max_hw_sectors that dm_set_device_limits may set), then we can't
>> -* allow bios with multiple vector entries.  So always set max_size
>> -* to 0, and the code below allows just one page.
>> +* provided their merge_bvec method (we know this by looking at
>> +* queue_max_hw_sectors), then we can't allow bios with multiple 
>> vector
>> +* entries.  So always set max_size to 0, and the code below allows
>> +* just one page.
>>  */
>> else if

Re: [PATCH] x86: Clean up files of Intel Processor Trace

2015-08-03 Thread Takao Indoh

On 2015/08/03 20:03, Borislav Petkov wrote:
> On Mon, Aug 03, 2015 at 11:08:07AM +0200, Peter Zijlstra wrote:
>> For those of us suffering OCDs and all, its a good change though. The
>> alfabet song does go: A, B, C, D etc.. after all. Not: A, C, D, B ...
> 
> ... except that x86 encoding orders regs like it was originally: AX,
> CX, DX, BX, ... Don't ask me why - looks like someone thought that the
> C (count) and D (double precision - AX extension) registers were more
> important than B (base).
> 
> Or someone was simply illiterate.
> 

I thought this was typo. If it is intentional, I'll keep it intact.

Thanks,
Takao Indoh

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/2] pinctrl: rockchip: only enable gpio clock when it setting

2015-08-03 Thread Lin Huang

From: huang lin 

The gpio banks keep their state even when the clock is disabled, so to save
power, only enable the gpio clock while the gpio registers are being accessed.

Signed-off-by: Heiko Stuebner 
Signed-off-by: Lin Huang 
---
Changes in v2:
Advices by Douglas Anderson
-use readl_relaxed() instead readl()
-fix commit message format error

 drivers/pinctrl/pinctrl-rockchip.c | 57 +++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/pinctrl/pinctrl-rockchip.c 
b/drivers/pinctrl/pinctrl-rockchip.c
index cc2843a..70a4539 100644
--- a/drivers/pinctrl/pinctrl-rockchip.c
+++ b/drivers/pinctrl/pinctrl-rockchip.c
@@ -945,6 +945,7 @@ static int _rockchip_pmx_gpio_set_direction(struct 
gpio_chip *chip,
if (ret < 0)
return ret;
 
+   clk_enable(bank->clk);
spin_lock_irqsave(&bank->slock, flags);
 
data = readl_relaxed(bank->reg_base + GPIO_SWPORT_DDR);
@@ -953,9 +954,11 @@ static int _rockchip_pmx_gpio_set_direction(struct 
gpio_chip *chip,
data |= BIT(pin);
else
data &= ~BIT(pin);
+
writel_relaxed(data, bank->reg_base + GPIO_SWPORT_DDR);
 
spin_unlock_irqrestore(&bank->slock, flags);
+   clk_disable(bank->clk);
 
return 0;
 }
@@ -1389,6 +1392,7 @@ static void rockchip_gpio_set(struct gpio_chip *gc, 
unsigned offset, int value)
unsigned long flags;
u32 data;
 
+   clk_enable(bank->clk);
spin_lock_irqsave(&bank->slock, flags);
 
data = readl(reg);
@@ -1398,6 +1402,7 @@ static void rockchip_gpio_set(struct gpio_chip *gc, 
unsigned offset, int value)
writel(data, reg);
 
spin_unlock_irqrestore(&bank->slock, flags);
+   clk_disable(bank->clk);
 }
 
 /*
@@ -1409,7 +1414,9 @@ static int rockchip_gpio_get(struct gpio_chip *gc, 
unsigned offset)
struct rockchip_pin_bank *bank = gc_to_pin_bank(gc);
u32 data;
 
+   clk_enable(bank->clk);
data = readl(bank->reg_base + GPIO_EXT_PORT);
+   clk_disable(bank->clk);
data >>= offset;
data &= 1;
return data;
@@ -1546,6 +1553,7 @@ static int rockchip_irq_set_type(struct irq_data *d, 
unsigned int type)
if (ret < 0)
return ret;
 
+   clk_enable(bank->clk);
spin_lock_irqsave(&bank->slock, flags);
 
data = readl_relaxed(bank->reg_base + GPIO_SWPORT_DDR);
@@ -1603,6 +1611,7 @@ static int rockchip_irq_set_type(struct irq_data *d, 
unsigned int type)
default:
irq_gc_unlock(gc);
spin_unlock_irqrestore(&bank->slock, flags);
+   clk_disable(bank->clk);
return -EINVAL;
}
 
@@ -1611,6 +1620,7 @@ static int rockchip_irq_set_type(struct irq_data *d, 
unsigned int type)
 
irq_gc_unlock(gc);
spin_unlock_irqrestore(&bank->slock, flags);
+   clk_disable(bank->clk);
 
return 0;
 }
@@ -1620,8 +1630,10 @@ static void rockchip_irq_suspend(struct irq_data *d)
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct rockchip_pin_bank *bank = gc->private;
 
+   clk_enable(bank->clk);
bank->saved_masks = irq_reg_readl(gc, GPIO_INTMASK);
irq_reg_writel(gc, ~gc->wake_active, GPIO_INTMASK);
+   clk_disable(bank->clk);
 }
 
 static void rockchip_irq_resume(struct irq_data *d)
@@ -1629,7 +1641,27 @@ static void rockchip_irq_resume(struct irq_data *d)
struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
struct rockchip_pin_bank *bank = gc->private;
 
+   clk_enable(bank->clk);
irq_reg_writel(gc, bank->saved_masks, GPIO_INTMASK);
+   clk_disable(bank->clk);
+}
+
+static void rockchip_irq_gc_mask_clr_bit(struct irq_data *d)
+{
+   struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+   struct rockchip_pin_bank *bank = gc->private;
+
+   clk_enable(bank->clk);
+   irq_gc_mask_clr_bit(d);
+}
+
+void rockchip_irq_gc_mask_set_bit(struct irq_data *d)
+{
+   struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+   struct rockchip_pin_bank *bank = gc->private;
+
+   irq_gc_mask_set_bit(d);
+   clk_disable(bank->clk);
 }
 
 static int rockchip_interrupts_register(struct platform_device *pdev,
@@ -1640,7 +1672,7 @@ static int rockchip_interrupts_register(struct 
platform_device *pdev,
unsigned int clr = IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN;
struct irq_chip_generic *gc;
int ret;
-   int i;
+   int i, j;
 
for (i = 0; i < ctrl->nr_banks; ++i, ++bank) {
if (!bank->valid) {
@@ -1649,11 +1681,19 @@ static int rockchip_interrupts_register(struct 
platform_device *pdev,
continue;
}
 
+   ret = clk_enable(bank->clk);
+   if (ret) {
+   dev_err(&pdev->dev, "failed to enable clock for bank %s\n",
+   bank->name);
+   continue;
+   }
+

[PATCH v2 1/2] clk: rockchip: add pclk_pd_pmu to the list of rk3288 critical clocks

2015-08-03 Thread Lin Huang

From: huang lin 

pclk_pd_pmu needs to keep running and with the upcoming gpio clock
handling this is not always the case anymore. So add it to the list
of critical clocks for now.

Signed-off-by: Heiko Stuebner 
Signed-off-by: Lin Huang 
---
Changes in v2:
Advices by Douglas Anderson
-fix commit message format error

 drivers/clk/rockchip/clk-rk3288.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/clk/rockchip/clk-rk3288.c 
b/drivers/clk/rockchip/clk-rk3288.c
index 0df5bae..9040878 100644
--- a/drivers/clk/rockchip/clk-rk3288.c
+++ b/drivers/clk/rockchip/clk-rk3288.c
@@ -780,6 +780,7 @@ static const char *const rk3288_critical_clocks[] 
__initconst = {
"aclk_cpu",
"aclk_peri",
"hclk_peri",
+   "pclk_pd_pmu",
 };
 
 #ifdef CONFIG_PM_SLEEP
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] x86: Clean up files of Intel Processor Trace

2015-08-03 Thread Takao Indoh

On 2015/08/03 18:44, Alexander Shishkin wrote:
> On 3 August 2015 at 12:08, Peter Zijlstra  wrote:
>> On Mon, Aug 03, 2015 at 12:03:13PM +0300, Alexander Shishkin wrote:
>>> Takao Indoh  writes:
>>
>>> Even though TOPA_SHIFT happens to be the same as PAGE_SHIFT, it is a
>>> property of a separate hardware block, not mmu. PAGE_SHIFT is 12, but
>>> 12 is not always PAGE_SHIFT.
>>
>> PAGE_SHIFT is _always_ 12 on x86. Changing that will require changing
>> the page table format, a rather unlikely thing to go happen.
> 
> Of course. Yet that doesn't justify turning every 12 into PAGE_SHIFT
> is what I'm saying.
> 
> Oh, look, it's PAGE_SHIFT o'clock on x86, time for lunch. :)

I thought the base address of output region is page aligned. I took a
look at Intel SDM again, it just says the base address is 4K-aligned
physical address, does not mention page size. So, logically TOPA_SHIFT
and PAGE_SHIFT are different things and I'll remove this change in next
version.

Thanks,
Takao Indoh

> 
> Regards,
> --
> Alex
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

perf, tools: Refactor and support interval and CSV metrics v2

2015-08-03 Thread Andi Kleen

[v2: Addressed (near) all review feedback. No manpage updates so far.
 Add support for --per-core metrics. Various cleanups.]

Currently perf stat does not support printing computed metrics for interval (-I xxx)
or CSV (-x,) mode. For example IPC or TSX metrics over time are quite useful to know.

This patch implements them. The main obstacle was that the
metrics printing was all open coded all over the metrics computation code.
The second patch refactors the metrics printing to work through call backs that
can be more easily changed. This also cleans up the metrics printing significantly.
The indentation is now handled through printf, no more need to manually count spaces.

Then based on that it implements metrics printing for CSV and interval mode.

Example output:

% perf stat  -I1000 -a sleep 1
#  time  counts unit eventsmetric   
   multiplex
 1.001301370   12020.049593  task-clock (msec)  
   (100.00%)
 1.001301370  3,952  context-switches  #0.329 
K/sec(100.00%)
 1.001301370 69  cpu-migrations#0.006 
K/sec(100.00%)
 1.001301370 76  page-faults   #0.006 
K/sec  
 1.001301370386,582,789  cycles#0.032 
GHz  (100.00%)
 1.001301370716,441,544  stalled-cycles-frontend   #  185.33% 
frontend cycles idle (100.00%)
 1.001301370  stalled-cycles-backend   
 1.001301370101,751,678  instructions  #0.26  
insn per cycle 
 1.001301370   #7.04  
stalled cycles per insn  (100.00%)
 1.001301370 20,914,692  branches  #1.740 
M/sec(100.00%)
 1.001301370  1,943,630  branch-misses #9.29% 
of all branches

CSV mode

% perf stat  -x, -I1000 -a sleep 1
 1.000852081,12016.143006,,task-clock
 1.000852081,4457,,context-switches,12015168277,100.00,0.371,K/sec
 1.000852081,50,,cpu-migrations,12014024424,100.00,0.004,K/sec
 1.000852081,76,,page-faults,12013076716,100.00,0.006,K/sec
 1.000852081,515854373,,cycles,12011235336,100.00,0.043,GHz
 
1.000852081,1030742150,,stalled-cycles-frontend,12010984057,100.00,199.81,frontend
 cycles idle
 1.000852081,,,stalled-cycles-backend,0,100.00
 1.000852081,116782495,,instructions,12011130729,100.00,0.23,insn per cycle
 1.00085208112011130729,100.00,8.83,stalled cycles per insn
 1.000852081,23748237,,branches,12010745125,100.00,1.976,M/sec
 1.000852081,1976560,,branch-misses,12010501884,100.00,8.32,of all branches


Available in
git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-misc perf/stat-metrics-2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 7/8] perf, tools, stat: Implement CSV metrics output

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

Now support CSV output for metrics. With the new output callbacks
this is relatively straight forward by creating new callbacks.

The new line callback needs to know the number of fields to skip them
correctly

v2: Split out function argument changes
Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c | 62 +++
 1 file changed, 62 insertions(+)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 949b45f..40900c5 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -661,6 +661,49 @@ static void print_metric_std(void *ctx, const char *color, 
const char *fmt,
fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
 }
 
+static void new_line_csv(void *ctx)
+{
+   struct outstate *os = ctx;
+   int i;
+
+   fputc('\n', os->fh);
+   if (os->prefix)
+   fprintf(os->fh, "%s%s", os->prefix, csv_sep);
+   for (i = 0; i < os->nfields; i++)
+   fputs(csv_sep, os->fh);
+}
+
+static void print_metric_csv(void *ctx,
+const char *color __maybe_unused,
+const char *fmt, const char *unit, double val)
+{
+   struct outstate *os = ctx;
+   FILE *out = os->fh;
+   char buf[64], *vals, *ends;
+
+   if (unit == NULL) {
+   fprintf(out, "%s%s%s%s", csv_sep, csv_sep, csv_sep, csv_sep);
+   return;
+   }
+   fprintf(out, "%s%" PRIu64 "%s%.2f%s",
+   csv_sep,
+   os->run,
+   csv_sep,
+   os->ena ? 100.0 * os->run / os->ena : 100.0,
+   csv_sep);
+   snprintf(buf, sizeof(buf), fmt, val);
+   vals = buf;
+   while (isspace(*vals))
+   vals++;
+   ends = vals;
+   while (isdigit(*ends) || *ends == '.')
+   ends++;
+   *ends = 0;
+   while (isspace(*unit))
+   unit++;
+   fprintf(out, "%s%s%s", vals, csv_sep, unit);
+}
+
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
double msecs = avg / 1e6;
@@ -732,6 +775,25 @@ static void printout(int id, int nr, struct perf_evsel 
*counter, double uval,
else
nl = new_line_std;
 
+   if (csv_output) {
+   static int aggr_fields[] = {
+   [AGGR_GLOBAL] = 0,
+   [AGGR_THREAD] = 1,
+   [AGGR_NONE] = 1,
+   [AGGR_SOCKET] = 2,
+   [AGGR_CORE] = 2,
+   };
+
+   pm = print_metric_csv;
+   nl = new_line_csv;
+   os.nfields = 1;
+   os.nfields += aggr_fields[aggr_mode];
+   if (counter->cgrp)
+   os.nfields++;
+   os.run = run;
+   os.ena = ena;
+   }
+
if (nsec_counter(counter))
nsec_printout(id, nr, counter, uval);
else
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/8] perf, tools, stat: Abstract stat metrics printing

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

Abstract the printing of shadow metrics. Instead of every
metric calling fprintf directly and taking care of indentation,
use two call backs: one to print metrics and another to
start a new line.

This will allow adding metrics to CSV mode and also
using them for other purposes.

The computation of padding is now done in the central
callback, instead of every metric doing it manually.
This makes it easier to add new metrics.

v2: Refactor functions, printout now does more. Move
shadow printing.
Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c |  96 +
 tools/perf/util/stat-shadow.c | 158 ++
 tools/perf/util/stat.h|  10 ++-
 3 files changed, 158 insertions(+), 106 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index baca81d..31395c8 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -617,6 +617,47 @@ static void aggr_printout(struct perf_evsel *evsel, int 
id, int nr)
}
 }
 
+struct outstate {
+   FILE *fh;
+};
+
+#define BASE_INDENT 41
+#define AGGR_INDENT  8
+#define METRIC_LEN  35
+#define NA_INDENT   16
+
+static void new_line_no_aggr_std(void *ctx)
+{
+   struct outstate *os = ctx;
+   fprintf(os->fh, "\n%*s", BASE_INDENT + NA_INDENT, "");
+}
+
+static void new_line_std(void *ctx)
+{
+   struct outstate *os = ctx;
+   fprintf(os->fh, "\n%-*s", BASE_INDENT + AGGR_INDENT, "");
+}
+
+static void print_metric_std(void *ctx, const char *color, const char *fmt,
+const char *unit, double val)
+{
+   struct outstate *os = ctx;
+   FILE *out = os->fh;
+   int n;
+
+   if (unit == NULL) {
+   fprintf(out, "%-*s", METRIC_LEN, "");
+   return;
+   }
+
+   n = fprintf(out, " # ");
+   if (color)
+   n += color_fprintf(out, color, fmt, val);
+   else
+   n += fprintf(out, fmt, val);
+   fprintf(out, " %-*s", METRIC_LEN - n - 1, unit);
+}
+
 static void nsec_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
double msecs = avg / 1e6;
@@ -648,7 +689,6 @@ static void abs_printout(int id, int nr, struct perf_evsel 
*evsel, double avg)
 {
double sc =  evsel->scale;
const char *fmt;
-   int cpu = cpu_map__id_to_cpu(id);
 
if (csv_output) {
fmt = sc != 1.0 ?  "%.2f%s" : "%.0f%s";
@@ -661,9 +701,6 @@ static void abs_printout(int id, int nr, struct perf_evsel 
*evsel, double avg)
 
aggr_printout(evsel, id, nr);
 
-   if (aggr_mode == AGGR_GLOBAL)
-   cpu = 0;
-
fprintf(output, fmt, avg, csv_sep);
 
if (evsel->unit)
@@ -676,10 +713,30 @@ static void abs_printout(int id, int nr, struct 
perf_evsel *evsel, double avg)
if (evsel->cgrp)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 
-   if (csv_output || interval)
-   return;
+}
 
-   perf_stat__print_shadow_stats(output, evsel, avg, cpu, aggr_mode);
+static void printout(int id, int nr, struct perf_evsel *counter, double uval)
+{
+   struct outstate os = { .fh = output };
+   print_metric_t pm = print_metric_std;
+   void (*nl)(void *);
+
+   if (aggr_mode == AGGR_NONE)
+   nl = new_line_no_aggr_std;
+   else
+   nl = new_line_std;
+
+   if (nsec_counter(counter))
+   nsec_printout(id, nr, counter, uval);
+   else
+   abs_printout(id, nr, counter, uval);
+
+   perf_stat__print_shadow_stats(counter, uval,
+ aggr_mode == AGGR_GLOBAL ? 0 :
+ cpu_map__id_to_cpu(id),
+ pm,
+ nl,
+ &os);
 }
 
 static void print_aggr(char *prefix)
@@ -735,12 +792,7 @@ static void print_aggr(char *prefix)
continue;
}
uval = val * counter->scale;
-
-   if (nsec_counter(counter))
-   nsec_printout(id, nr, counter, uval);
-   else
-   abs_printout(id, nr, counter, uval);
-
+   printout(id, nr, counter, uval);
if (!csv_output)
print_noise(counter, 1.0);
 
@@ -770,11 +822,7 @@ static void print_aggr_thread(struct perf_evsel *counter, 
char *prefix)
fprintf(output, "%s", prefix);
 
uval = val * counter->scale;
-
-   if (nsec_counter(counter))
-   nsec_printout(thread, 0, counter, uval);
-   else
-   abs_printout(thread, 0, counter, uval);
+   printout(thread, 0, counter, uval);
 
if (!csv_output)

[PATCH 6/8] perf, tools, stat: Move noise/running printing into printout

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

Move the running/noise printing into printout to avoid
duplicated code in the callers.

Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c | 33 +++--
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index aea9350..949b45f 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -718,7 +718,7 @@ static void abs_printout(int id, int nr, struct perf_evsel 
*evsel, double avg)
 }
 
 static void printout(int id, int nr, struct perf_evsel *counter, double uval,
-char *prefix)
+char *prefix, u64 run, u64 ena, double noise)
 {
struct outstate os = {
.fh = output,
@@ -743,6 +743,12 @@ static void printout(int id, int nr, struct perf_evsel 
*counter, double uval,
  pm,
  nl,
  );
+
+   if (!csv_output) {
+   print_noise(counter, noise);
+   if (run != ena)
+   fprintf(output, "  (%.2f%%)", 100.0 * run / ena);
+   }
 }
 
 static void print_aggr(char *prefix)
@@ -798,11 +804,7 @@ static void print_aggr(char *prefix)
continue;
}
uval = val * counter->scale;
-   printout(id, nr, counter, uval, prefix);
-   if (!csv_output)
-   print_noise(counter, 1.0);
-
-   print_running(run, ena);
+   printout(id, nr, counter, uval, prefix, run, ena, 1.0);
fputc('\n', output);
}
}
@@ -828,12 +830,7 @@ static void print_aggr_thread(struct perf_evsel *counter, 
char *prefix)
fprintf(output, "%s", prefix);
 
uval = val * counter->scale;
-   printout(thread, 0, counter, uval, prefix);
-
-   if (!csv_output)
-   print_noise(counter, 1.0);
-
-   print_running(run, ena);
+   printout(thread, 0, counter, uval, prefix, run, ena, 1.0);
fputc('\n', output);
}
 }
@@ -877,11 +874,7 @@ static void print_counter_aggr(struct perf_evsel *counter, 
char *prefix)
}
 
uval = avg * counter->scale;
-   printout(-1, 0, counter, uval, prefix);
-
-   print_noise(counter, avg);
-
-   print_running(avg_running, avg_enabled);
+   printout(-1, 0, counter, uval, prefix, avg_running, avg_enabled, avg);
fprintf(output, "\n");
 }
 
@@ -929,11 +922,7 @@ static void print_counter(struct perf_evsel *counter, char 
*prefix)
}
 
uval = val * counter->scale;
-   printout(cpu, 0, counter, uval, prefix);
-
-   if (!csv_output)
-   print_noise(counter, 1.0);
-   print_running(run, ena);
+   printout(cpu, 0, counter, uval, prefix, run, ena, 1.0);
 
fputc('\n', output);
}
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/8] perf, tools: Remove trail argument to color vsprintf

2015-08-03 Thread Andi Kleen

From: Jiri Olsa 

Seems like it's always '\n' through color_fprintf_ln, which
is not used at all, removing.. ;-)

Signed-off-by: Andi Kleen 
---
 tools/perf/util/color.c | 16 ++--
 tools/perf/util/color.h |  1 -
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index 55355b3..ff051d2 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -68,7 +68,7 @@ static int __color_vsnprintf(char *bf, size_t size, const 
char *color,
 }
 
 static int __color_vfprintf(FILE *fp, const char *color, const char *fmt,
-   va_list args, const char *trail)
+   va_list args)
 {
int r = 0;
 
@@ -87,8 +87,6 @@ static int __color_vfprintf(FILE *fp, const char *color, 
const char *fmt,
r += vfprintf(fp, fmt, args);
if (perf_use_color_default && *color)
r += fprintf(fp, "%s", PERF_COLOR_RESET);
-   if (trail)
-   r += fprintf(fp, "%s", trail);
return r;
 }
 
@@ -100,7 +98,7 @@ int color_vsnprintf(char *bf, size_t size, const char *color,
 
 int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args)
 {
-   return __color_vfprintf(fp, color, fmt, args, NULL);
+   return __color_vfprintf(fp, color, fmt, args);
 }
 
 int color_snprintf(char *bf, size_t size, const char *color,
@@ -126,16 +124,6 @@ int color_fprintf(FILE *fp, const char *color, const char 
*fmt, ...)
return r;
 }
 
-int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
-{
-   va_list args;
-   int r;
-   va_start(args, fmt);
-   r = __color_vfprintf(fp, color, fmt, args, "\n");
-   va_end(args);
-   return r;
-}
-
 /*
  * This function splits the buffer by newlines and colors the lines 
individually.
  *
diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h
index 38146f9..a93997f 100644
--- a/tools/perf/util/color.h
+++ b/tools/perf/util/color.h
@@ -35,7 +35,6 @@ int color_vsnprintf(char *bf, size_t size, const char *color,
 int color_vfprintf(FILE *fp, const char *color, const char *fmt, va_list args);
 int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
 int color_snprintf(char *bf, size_t size, const char *color, const char *fmt, 
...);
-int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
 int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char 
*buf);
 int value_color_snprintf(char *bf, size_t size, const char *fmt, double value);
 int percent_color_snprintf(char *bf, size_t size, const char *fmt, ...);
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 8/8] perf, tools: Support metrics in --per-core/socket mode

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

Enable metrics printing in --per-core / --per-socket mode. We need
to save the shadow metrics in a unique place. Always use the first
CPU in the aggregation. Then use the same CPU to retrieve the
shadow value later.

Example output:

% perf stat --per-core -a ./BC1s

 Performance counter stats for 'system wide':

S0-C0   22966.020381  task-clock (msec) #2.004 
CPUs utilized(100.00%)
S0-C0   2 49  context-switches  #0.017 
K/sec(100.00%)
S0-C0   2  4  cpu-migrations#0.001 
K/sec(100.00%)
S0-C0   2467  page-faults   #0.157 
K/sec
S0-C0   2  4,599,061,773  cycles#1.551 
GHz  (100.00%)
S0-C0   2  9,755,886,883  instructions  #2.12  
insn per cycle   (100.00%)
S0-C0   2  1,906,272,125  branches  #  642.704 
M/sec(100.00%)
S0-C0   2 81,180,867  branch-misses #4.26% 
of all branches
S0-C1   22965.995373  task-clock (msec) #2.003 
CPUs utilized(100.00%)
S0-C1   2 62  context-switches  #0.021 
K/sec(100.00%)
S0-C1   2  8  cpu-migrations#0.003 
K/sec(100.00%)
S0-C1   2281  page-faults   #0.095 
K/sec
S0-C1   2  6,347,290  cycles#0.002 
GHz  (100.00%)
S0-C1   2  4,654,156  instructions  #0.73  
insn per cycle   (100.00%)
S0-C1   2947,121  branches  #0.319 
M/sec(100.00%)
S0-C1   2 37,322  branch-misses #3.94% 
of all branches

   1.480409747 seconds time elapsed

Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c | 45 +++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 40900c5..33b58c0 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -731,6 +731,22 @@ static void nsec_printout(int id, int nr, struct 
perf_evsel *evsel, double avg)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
 }
 
+static int first_shadow_cpu(struct perf_evsel *evsel, int id)
+{
+   int i;
+
+   if (aggr_get_id == NULL)
+   return 0;
+
+   for (i = 0; i < perf_evsel__nr_cpus(evsel); i++) {
+   int cpu2 = perf_evsel__cpus(evsel)->map[i];
+
+   if (aggr_get_id(evsel_list->cpus, cpu2) == id)
+   return cpu2;
+   }
+   return 0;
+}
+
 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
 {
double sc =  evsel->scale;
@@ -800,8 +816,7 @@ static void printout(int id, int nr, struct perf_evsel 
*counter, double uval,
abs_printout(id, nr, counter, uval);
 
perf_stat__print_shadow_stats(counter, uval,
- aggr_mode == AGGR_GLOBAL ? 0 :
- cpu_map__id_to_cpu(id),
+ first_shadow_cpu(counter, id),
  pm,
  nl,
  );
@@ -813,6 +828,30 @@ static void printout(int id, int nr, struct perf_evsel 
*counter, double uval,
}
 }
 
+static void aggr_update_shadow(void)
+{
+   int cpu, cpu2, s2, id, s;
+   u64 val;
+   struct perf_evsel *counter;
+
+   for (s = 0; s < aggr_map->nr; s++) {
+   id = aggr_map->map[s];
+   evlist__for_each(evsel_list, counter) {
+   val = 0;
+   for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); 
cpu++) {
+   cpu2 = perf_evsel__cpus(counter)->map[cpu];
+   s2 = aggr_get_id(evsel_list->cpus, cpu2);
+   if (s2 != id)
+   continue;
+   val += perf_counts(counter->counts, cpu, 
0)->val;
+   }
+   val = scale_val(counter, val);
+   perf_stat__update_shadow_stats(counter, ,
+  
first_shadow_cpu(counter, id));
+   }
+   }
+}
+
 static void print_aggr(char *prefix)
 {
struct perf_evsel *counter;
@@ -823,6 +862,8 @@ static void print_aggr(char *prefix)
if (!(aggr_map || aggr_get_id))
return;
 
+   aggr_update_shadow();
+

Re: [PATCH] powerpc/hvsi: Fix endianness issues in the HVSI driver

2015-08-03 Thread Michael Ellerman

On Fri, 2015-07-31 at 11:29 +0200, Laurent Dufour wrote:
> This patch fixes several endianness issues detected when running the HVSI
> driver in little endian mode.
> 
> These issues are raised in little endian mode because the data exchanged in
> memory between the kernel and the hypervisor has to be in big endian
> format.

Can you include the sparse output before and after?

cheers


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/8] perf, tools, stat: Move sw clock metrics printout to stat-shadow

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

The sw clock metrics printing was missed in the earlier move to
stat-shadow of all the other metric printouts. Move it too.

Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c | 9 -
 tools/perf/util/stat-shadow.c | 4 
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index d99d850..baca81d 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -642,15 +642,6 @@ static void nsec_printout(int id, int nr, struct 
perf_evsel *evsel, double avg)
 
if (evsel->cgrp)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
-
-   if (csv_output || interval)
-   return;
-
-   if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
-   fprintf(output, " # %8.3f CPUs utilized  ",
-   avg / avg_stats(_nsecs_stats));
-   else
-   fprintf(output, "   ");
 }
 
 static void abs_printout(int id, int nr, struct perf_evsel *evsel, double avg)
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index 53e8bb7..88d23d9 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -415,6 +415,10 @@ void perf_stat__print_shadow_stats(FILE *out, struct 
perf_evsel *evsel,
ratio = total / avg;
 
fprintf(out, " # %8.0f cycles / elision   ", ratio);
+   } else if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK) &&
+  (ratio = avg_stats(_nsecs_stats)) != 0) {
+   fprintf(output, " # %8.3f CPUs utilized  ",
+avg / ratio);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';
 
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/8] perf, tools, stat: Add support for metrics in interval mode

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

Now that we can modify the metrics printout functions easily,
it's straightforward to support metric printing for interval mode.
All that is needed is to print the time stamp on every new line.
Pass the prefix into the context and print it out.

Signed-off-by: Andi Kleen 
---
 tools/perf/builtin-stat.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 31395c8..aea9350 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -619,6 +619,9 @@ static void aggr_printout(struct perf_evsel *evsel, int id, 
int nr)
 
 struct outstate {
FILE *fh;
+   const char *prefix;
+   int nfields;
+   u64 run, ena;
 };
 
 #define BASE_INDENT 41
@@ -629,13 +632,13 @@ struct outstate {
 static void new_line_no_aggr_std(void *ctx)
 {
struct outstate *os = ctx;
-   fprintf(os->fh, "\n%*s", BASE_INDENT + NA_INDENT, "");
+   fprintf(os->fh, "\n%s%-*s", os->prefix, BASE_INDENT + NA_INDENT, "");
 }
 
 static void new_line_std(void *ctx)
 {
struct outstate *os = ctx;
-   fprintf(os->fh, "\n%-*s", BASE_INDENT + AGGR_INDENT, "");
+   fprintf(os->fh, "\n%s%-*s", os->prefix, BASE_INDENT + AGGR_INDENT, "");
 }
 
 static void print_metric_std(void *ctx, const char *color, const char *fmt,
@@ -712,12 +715,15 @@ static void abs_printout(int id, int nr, struct 
perf_evsel *evsel, double avg)
 
if (evsel->cgrp)
fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
-
 }
 
-static void printout(int id, int nr, struct perf_evsel *counter, double uval)
+static void printout(int id, int nr, struct perf_evsel *counter, double uval,
+char *prefix)
 {
-   struct outstate os = { .fh = output };
+   struct outstate os = {
+   .fh = output,
+   .prefix = prefix ? prefix : ""
+   };
print_metric_t pm = print_metric_std;
void (*nl)(void *);
 
@@ -792,7 +798,7 @@ static void print_aggr(char *prefix)
continue;
}
uval = val * counter->scale;
-   printout(id, nr, counter, uval);
+   printout(id, nr, counter, uval, prefix);
if (!csv_output)
print_noise(counter, 1.0);
 
@@ -822,7 +828,7 @@ static void print_aggr_thread(struct perf_evsel *counter, 
char *prefix)
fprintf(output, "%s", prefix);
 
uval = val * counter->scale;
-   printout(thread, 0, counter, uval);
+   printout(thread, 0, counter, uval, prefix);
 
if (!csv_output)
print_noise(counter, 1.0);
@@ -871,7 +877,7 @@ static void print_counter_aggr(struct perf_evsel *counter, 
char *prefix)
}
 
uval = avg * counter->scale;
-   printout(-1, 0, counter, uval);
+   printout(-1, 0, counter, uval, prefix);
 
print_noise(counter, avg);
 
@@ -923,7 +929,7 @@ static void print_counter(struct perf_evsel *counter, char 
*prefix)
}
 
uval = val * counter->scale;
-   printout(cpu, 0, counter, uval);
+   printout(cpu, 0, counter, uval, prefix);
 
if (!csv_output)
print_noise(counter, 1.0);
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/8] perf, tools: Do not include escape sequences in color_vfprintf return

2015-08-03 Thread Andi Kleen

From: Andi Kleen 

color_vfprintf was including the length of the invisible escape
sequences in its return value. Don't include them, to make
the return value usable for indentation calculations.

v2: Add comment, rebase
Signed-off-by: Andi Kleen 
---
 tools/perf/util/color.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c
index ff051d2..9b95654 100644
--- a/tools/perf/util/color.c
+++ b/tools/perf/util/color.c
@@ -67,6 +67,7 @@ static int __color_vsnprintf(char *bf, size_t size, const 
char *color,
return r;
 }
 
+/* Colors are not included in return value */
 static int __color_vfprintf(FILE *fp, const char *color, const char *fmt,
va_list args)
 {
@@ -83,10 +84,10 @@ static int __color_vfprintf(FILE *fp, const char *color, 
const char *fmt,
}
 
if (perf_use_color_default && *color)
-   r += fprintf(fp, "%s", color);
+   fprintf(fp, "%s", color);
r += vfprintf(fp, fmt, args);
if (perf_use_color_default && *color)
-   r += fprintf(fp, "%s", PERF_COLOR_RESET);
+   fprintf(fp, "%s", PERF_COLOR_RESET);
return r;
 }
 
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [REGRESSION] Re: i915 driver crashes on T540p if docking station attached

2015-08-03 Thread Rafael J. Wysocki

On Tuesday, August 04, 2015 12:05:14 AM Daniel Vetter wrote:
> On Mon, Aug 3, 2015 at 7:24 PM, Linus Torvalds
>  wrote:
> >>  However, I'm
> >> still seeing a large number of drm/i915 related warning messages and
> >> other kernel kvetching.
> >
> > I suspect I can live with that for now. The lockdep one looks like
> > it's mainly an initialization issue, so you'd never get the actual
> > deadlock in practice, but it's obviously annoying.  The intel_pm.c one
> > I'll have to defer to the i915 people for..
> 
> The lockdep splat is just acpi being inconsistent with init_mutex vs.
> backlight notifier_chain (which has its own lock) calls. init_mutex
> is new in 4.2 and has been added in
> 
> commit 87521e16a7abbf3fa337f56cb4d1e18247f15e8a
> Author: Hans de Goede 
> Date:   Tue Jun 16 16:27:48 2015 +0200
> 
> acpi-video-detect: Rewrite backlight interface selection logic
> 
> 
> Not mine ;-) But adding relevant people.

Hans, can you have a look at this, please?

Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH 1/2] KVM: x86: set TMR when the interrupt is accepted

2015-08-03 Thread Zhang, Yang Z

Paolo Bonzini wrote on 2015-08-03:
> 
> 
> On 03/08/2015 12:23, Zhang, Yang Z wrote:
>>> In any case, the TMR behavior introduced by the APICv patches is
>>> completely different from the hardware behavior, so it has to be fixed.
>> 
>> But any real problem with it?
> 
> It is a problem for split irqchip, where the EOI exit bitmap can be
> inferred from the IOAPIC routes but the TMR cannot.  The hardware
> behavior on the other hand can be implemented purely within the LAPIC.

So updating the TMR within LAPIC is the only solution to handle it?

> 
>>>  The alternative is to inject level-triggered interrupts
>>> synchronously, without using posted interrupts.
>>> 
>>> I'll write some testcases to understand the functioning of TMR in
>>> the virtual-APIC page, but the manual seems clear to me.
>> 
>> Currently, no existing hardware will use TMR and will not cause any
>> problem.(That's the reason why we leave it in Xen).But we don't know
>> whether future hardware will use it or not(SDM always keeps changing
>> :)).
> 
> But that would be covered by a different execution control (for
> backwards compatibility).  We'll get there when such a feature is introduced.

Yes, we can leave it for the future. But one concern is that it may be hard to handle 
it at that time if someone also develops a feature which relies on it (like the current 
patch to split irqchip). 

> 
>> And per 24.11.4's description, the perfect solution is don't modify
>> it. btw, IIRC, only TMR doesn't follow the rule. All other VMCS
>> accesses are issued in right VMCS context.
> 
> Yes, that's correct.  It's just the TMR.
> 
> Paolo


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v3 3/5] ARM: Exynos: switch to using generic cpufreq driver for Exynos4x12

2015-08-03 Thread Krzysztof Kozlowski

On 03.08.2015 22:55, Bartlomiej Zolnierkiewicz wrote:
> 
> Hi,
> 
> On Monday, August 03, 2015 08:15:13 PM Krzysztof Kozlowski wrote:
>> W dniu 03.08.2015 o 19:36, Bartlomiej Zolnierkiewicz pisze:
>>> On Monday, August 03, 2015 03:59:26 PM Viresh Kumar wrote:
 On 03-08-15, 12:17, Bartlomiej Zolnierkiewicz wrote:
>
> Hi,
>
> On Saturday, August 01, 2015 04:47:21 PM Viresh Kumar wrote:
>> On 31-07-15, 20:49, Bartlomiej Zolnierkiewicz wrote:
>>> diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
>>> index 659879a..bf6d596 100644
>>> --- a/drivers/cpufreq/Kconfig
>>> +++ b/drivers/cpufreq/Kconfig
>>> @@ -191,6 +191,7 @@ config CPUFREQ_DT
>>> # if CPU_THERMAL is on and THERMAL=m, CPUFREQ_DT cannot be =y:
>>> depends on !CPU_THERMAL || THERMAL
>>> select PM_OPP
>>> +   select EXYNOS_THERMAL if ARCH_EXYNOS
>>> help
>>>   This adds a generic DT based cpufreq driver for frequency 
>>> management.
>>>   It supports both uniprocessor (UP) and symmetric 
>>> multiprocessor (SMP)
>>
>> No, we shouldn't pollute generic Kconfig options with platform specific 
>> stuff.
>
> The old code depended on this.  You couldn't enable boost support
> without enabling thermal support (ARM_EXYNOS_CPU_FREQ_BOOST_SW
> config option selected EXYNOS_THERMAL).
>
>> Why don't you enable thermal in your .config?
>
> It is enabled in exynos_defconfig but without the above change it
> can be disabled manually, which is something that we don't want.

 You are not getting it. I am not asking you to not select thermal, but
 to select it from within your architecture Kconfig option if you want.
>>>
>>> OK.  Krzysztof/Kukjin do you agree with selecting EXYNOS_THERMAL
>>> from ARCH_EXYNOS in the platform code?
>>
>> I agree, with your explanation it seems good. Can you just add this
>> justification to the commit message?
> 
> Updated patch below (I'm not resending the whole series as all other
> patches remain unchanged).
> 
>>>
 Over that, thermal is really an option, not a dependency. So, if
 someone manually disables it, it's his problem, not yours :)
>>>
>>> I would really like it to be dependency not an option (+ I think
>>> that ideally it should be checked at runtime, IOW we should be
>>> checking from cpufreq-dt driver if the thermal support is enabled
>>> before enabling boost support).
>>
>> That would be the best. It is fine with me if you want to do this in
>> consecutive patches (after applying patch selecting/depending on it in
>> mach-exynos code).
> 
> -8<---
> 
>>From 2595b5e6164a2d1b76626e14302b148b7af5e050 Mon Sep 17 00:00:00 2001
> From: Bartlomiej Zolnierkiewicz 
> Date: Mon, 3 Aug 2015 15:49:06 +0200
> Subject: [PATCH] ARM: Exynos: switch to using generic cpufreq driver for
>  Exynos4x12
> 
> The new CPU clock type allows the use of generic CPUfreq driver.
> Switch Exynos4x12 to using generic cpufreq driver.
> 
> Previously (when exynos-cpufreq driver was used with boost
> functionality) ARM_EXYNOS_CPU_FREQ_BOOST_SW config option
> (which enabled boost functionality) selected EXYNOS_THERMAL
> one.  After switching Exynos4x12 platforms to use cpufreq-dt
> driver boost support is enabled in the cpufreq-dt driver
> itself (because there are turbo OPPs defined in the board's
> DTS file).  However we still would like to allow enabling
> boost support only if thermal support is also enabled for
> Exynos platforms.  To achieve this make ARCH_EXYNOS config
> option select THERMAL and EXYNOS_THERMAL ones.
> 
> Please also note that the switch to use the generic cpufreq-dt
> driver fixes the minor issue present with the old code (support
> for 'boost' mode in the exynos-cpufreq driver was enabled for
> all supported SoCs even though 'boost' frequency was provided
> only for Exynos4x12 ones).
> 
> Cc: Tomasz Figa 
> Cc: Kukjin Kim 
> Cc: Thomas Abraham 
> Cc: Javier Martinez Canillas 
> Signed-off-by: Bartlomiej Zolnierkiewicz 
> ---
>  arch/arm/mach-exynos/Kconfig  | 2 ++
>  arch/arm/mach-exynos/exynos.c | 2 ++
>  2 files changed, 4 insertions(+)
> 
> diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig
> index 81064cd..491914c 100644
> --- a/arch/arm/mach-exynos/Kconfig
> +++ b/arch/arm/mach-exynos/Kconfig
> @@ -25,6 +25,8 @@ menuconfig ARCH_EXYNOS
>   select S5P_DEV_MFC
>   select SRAM
>   select MFD_SYSCON
> + select THERMAL
> + select EXYNOS_THERMAL
>   help
> Support for SAMSUNG EXYNOS SoCs (EXYNOS4/5)
>  
> diff --git a/arch/arm/mach-exynos/exynos.c b/arch/arm/mach-exynos/exynos.c
> index 77ac021..1c47aee 100644
> --- a/arch/arm/mach-exynos/exynos.c
> +++ b/arch/arm/mach-exynos/exynos.c
> @@ -227,6 +227,8 @@ static void __init exynos_init_irq(void)
>  static const struct of_device_id exynos_cpufreq_matches[] = {
>   { .compatible =

Re: [PATCH v3 5/5] cpufreq: remove no longer needed CPU_FREQ_BOOST_SW config option

2015-08-03 Thread Rafael J. Wysocki

On Saturday, August 01, 2015 04:45:37 PM Viresh Kumar wrote:
> On 31-07-15, 20:49, Bartlomiej Zolnierkiewicz wrote:
> > Remove no longer needed CPU_FREQ_BOOST_SW config option.
> > 
> > As a result scaling_boost_freqs sysfs attribute is available
> > when cpufreq-dt driver is used and boost support is enabled.
> > 
> > Cc: Viresh Kumar 
> > Cc: Thomas Abraham 
> > Cc: Javier Martinez Canillas 
> > Cc: Krzysztof Kozlowski 
> > Signed-off-by: Bartlomiej Zolnierkiewicz 
> > ---
> >  drivers/cpufreq/Kconfig  | 4 
> >  drivers/cpufreq/freq_table.c | 2 --
> >  2 files changed, 6 deletions(-)
> > 
> > diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
> > index bf6d596..de00a52 100644
> > --- a/drivers/cpufreq/Kconfig
> > +++ b/drivers/cpufreq/Kconfig
> > @@ -21,10 +21,6 @@ if CPU_FREQ
> >  config CPU_FREQ_GOV_COMMON
> > bool
> >  
> > -config CPU_FREQ_BOOST_SW
> > -   bool
> > -   depends on THERMAL
> > -
> >  config CPU_FREQ_STAT
> > tristate "CPU frequency translation statistics"
> > default y
> > diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
> > index a8f1daf..4c5de5b 100644
> > --- a/drivers/cpufreq/freq_table.c
> > +++ b/drivers/cpufreq/freq_table.c
> > @@ -293,9 +293,7 @@ 
> > EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_boost_freqs);
> >  
> >  struct freq_attr *cpufreq_generic_attr[] = {
> > _freq_attr_scaling_available_freqs,
> > -#ifdef CONFIG_CPU_FREQ_BOOST_SW
> > _freq_attr_scaling_boost_freqs,
> > -#endif
> > NULL,
> >  };
> >  EXPORT_SYMBOL_GPL(cpufreq_generic_attr);
> 
> So, this will start appearing for all platforms that use cpufreq_generic_attr
> and that may not be the right thing. We may only want to show boost 
> frequencies
> only if the driver supports it.
> 
> @Rafael: What do you say?

I agree.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] clk: pxa: pxa3xx: fix CKEN register access

2015-08-03 Thread Stephen Boyd


On 08/03/2015 12:58 PM, Robert Jarzmik wrote:

Clocks 0 to 31 are on CKENA, and not CKENB. The clock register names
were inadequately inverted. As a consequence, all clock operations were
happening on CKENB, because almost all but 2 clocks are on CKENA.

As the clocks were activated by the bootloader in the former tests, it
escaped the testing that the wrong clock gate was manipulated. The error
was revealed by changing the pxa3xx-nand driver to a module, where upon
unloading, the wrong clock was disabled in CKENB.

Signed-off-by: Robert Jarzmik 
---


Did you want a fixes tag to send this back to stable?

--
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/4] perf, tools, stat: Abstract stat metrics printing

2015-08-03 Thread Andi Kleen

> 
> because we already need to make the print_metric callback global,
> would it be better to make this struct global, having all the
> needed callbacks defined within? something like:

It's actually not global, but static.

I skipped this change. After some other changes there is only
a single function call with these arguments left, so it's not 
an issue to pass it around.

-Andi
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] serial: don't announce CIR serial ports

2015-08-03 Thread Maciej S. Szmigiero

On 04.08.2015 01:40, Greg Kroah-Hartman wrote:
> On Sun, Aug 02, 2015 at 11:09:57PM +0200, Maciej S. Szmigiero wrote:
>> CIR type serial ports aren't real serial ports.
>> This is just a way to prevent legacy serial driver
>> from probing and eventually binding some resources
>> so don't announce them like normal serial ports.
>>
>> Signed-off-by: Maciej Szmigiero 
>> ---
>>  drivers/tty/serial/serial_core.c |2 +-
>>  1 files changed, 1 insertions(+), 1 deletions(-)
>>
>> diff --git a/drivers/tty/serial/serial_core.c 
>> b/drivers/tty/serial/serial_core.c
>> index f368520..99f944d 100644
>> --- a/drivers/tty/serial/serial_core.c
>> +++ b/drivers/tty/serial/serial_core.c
>> @@ -2237,7 +2237,7 @@ uart_configure_port(struct uart_driver *drv, struct 
>> uart_state *state,
>>  port->ops->config_port(port, flags);
>>  }
>>  
>> -if (port->type != PORT_UNKNOWN) {
>> +if (port->type != PORT_UNKNOWN && port->type != PORT_8250_CIR) {
>>  unsigned long flags;
>>  
>>  uart_report_port(drv, port);
> 
> This does not seem correct, why is this type of "port" somehow special
> that it should be skiped?

PORT_8250_CIR is not an actual serial port, it is a way to tell serial driver
to not really bind to some resources (I/O port, memory range, IRQ). 

The 8250 driver does scan a few predefined locations (I/O ports on x86) to
discover legacy serial ports there.

The problem is that some devices that shouldn't be driven by the 8250 driver
implement enough of the serial port interface to be identified as a serial port
(and bound to) by this driver during this scan.
This prevents their native driver from binding to them since their resources
are already taken.

When a serial port has PORT_8250_CIR type the relevant resources won't be
reserved by 8250 driver and trying to use this port will result in ENODEV or
EIO (drivers/tty/serial/8250/8250_core.c:serial8250_do_startup()
returns -ENODEV unconditionally for such type of port).

Marking a port as PORT_8250_CIR type is done by the 8250 PNP driver, which
will do this for some PNP IDs
(besides marking them, it also won't bind to these devices).

Currently only Winbond CIR port PNP ID is on the list,
but I have also submitted other patch to add SMSC IR port too, as it has the
same problem.

Overall, I guess the purpose of the announcement is to tell the user what serial
ports he has on the system and, by extension, which ones he can use.
Since the user can't really use this type of serial port, it would be nice not to
announce it.
That's why I've submitted this patch.

> thanks,
> 
> greg k-h
> 

Thanks and best regards,
Maciej Szmigiero

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/3] drivers: staging: wilc1000: use 'void' for no arguments functions

2015-08-03 Thread Greg KH

On Sun, Aug 02, 2015 at 09:23:51PM +0530, Chandra S Gorentla wrote:
> Added 'void' keyword in the parentheses of function definitions, when
> there are no arguments to the functions.  This fixes the checkpatch.pl
> error - "Bad function definition 'function()' should probably be
> function(void)".
> 
> Signed-off-by: Chandra S Gorentla 

This series does not apply cleanly.  Please rebase on the latest
staging-testing branch of the staging.git tree and resend.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Staging : lustre :Replace comma with a semicolon

2015-08-03 Thread Greg Kroah-Hartman

On Mon, Aug 03, 2015 at 09:22:51PM +0530, Shraddha Barke wrote:
> Should I resend the patches?
> 

You have sent a bunch of patches, and I don't know what order to apply
them in, or what ones should be applied and which should not.

So please resend all of the outstanding patches that I have not applied,
in a patch series, properly numbered, so that I know what order to apply
them in, and how many there are.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [char-misc-next 3/3 V2] mei: disconnect on connection request timeout

2015-08-03 Thread Greg Kroah-Hartman

On Wed, Jul 29, 2015 at 02:59:34PM +0300, Tomas Winkler wrote:
> From: Alexander Usyskin 
> 
> For the FW with  HBM version >= 2.0 we don't need to reset the whole
> device in case of a particular client failing to connect. It is
> sufficient to send a disconnect request to bring the device to the
> stable state.
> 
> Signed-off-by: Alexander Usyskin 
> Signed-off-by: Tomas Winkler 
> ---
> V2: 1. Remove bogus check on pm_runtime_active that prevented
>disconnection from a client

I applied the previous version, can you send a fix-up for the
differences?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [char-misc-next 0/9 RESEND] mei: support for async event notifications

2015-08-03 Thread Greg Kroah-Hartman

On Mon, Jul 27, 2015 at 06:36:18AM +, Winkler, Tomas wrote:
> 
> > 
> > On Sun, Jul 26, 2015 at 09:54:14AM +0300, Tomas Winkler wrote:
> > > FW has gained new capability where a FW client can asynchronously
> > > notify the host that an event has occurred in its process.
> > > The notification doesn't provide any data and host may need to query
> > > further the FW client in order to get details of the event.
> > > New IOCTLs are introduced for the user space to enable/disable
> > > and consume the event notifications.
> > > The asynchronous nature is provided via poll and fasync.
> > 
> > What changed to require a RESEND?
> 
> You've asked for it.

You might want to give me some context, as I have no idea why I asked
for a resend.  Obviously you must have done something to the series from
the previous one, right?

thanks,

greg "short term memory of a squirrel" k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] arm: perf: Add event descriptions

2015-08-03 Thread Drew Richardson

Add additional information about hardware events to make counters self
describing. This makes the hardware PMUs easier to use as perf list
contains the possible events instead of users having to refer to
documentation like the ARM TRMs. This could also allow tools like
oprofile to support PMUs without requiring an update.

Signed-off-by: Drew Richardson 
---
 arch/arm/kernel/perf_event.c|   1 +
 arch/arm/kernel/perf_event_v7.c | 617 
 2 files changed, 618 insertions(+)

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 54272e0be713..a7f2c84bae15 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -548,6 +548,7 @@ static void armpmu_init(struct arm_pmu *armpmu)
.stop   = armpmu_stop,
.read   = armpmu_read,
.filter_match   = armpmu_filter_match,
+   .attr_groups= armpmu->pmu.attr_groups,
};
 }
 
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index f9b37f876e20..d46bc78b5997 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -547,6 +547,616 @@ static const unsigned 
scorpion_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = 
ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 
+static ssize_t armv7_event_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct dev_ext_attribute *ea = container_of(attr,
+   struct dev_ext_attribute,
+   attr);
+
+   return snprintf(buf, PAGE_SIZE, "%s\n", (char *)ea->var);
+}
+
+#define ARMV7_EVENT_ATTR(config, name) \
+   struct dev_ext_attribute armv7_event_attr_##config##_##name = \
+   { __ATTR(name, S_IRUGO, armv7_event_show, NULL), \
+ "config=0x" #config }
+
+static ARMV7_EVENT_ATTR(00, sw_incr);
+static ARMV7_EVENT_ATTR(01, l1i_cache_refill);
+static ARMV7_EVENT_ATTR(02, l1i_tlb_refill);
+static ARMV7_EVENT_ATTR(03, l1d_cache_refill);
+static ARMV7_EVENT_ATTR(04, l1d_cache);
+static ARMV7_EVENT_ATTR(05, l1d_tlb_refill);
+static ARMV7_EVENT_ATTR(06, ld_retired);
+static ARMV7_EVENT_ATTR(07, st_retired);
+static ARMV7_EVENT_ATTR(08, inst_retired);
+static ARMV7_EVENT_ATTR(09, exc_taken);
+static ARMV7_EVENT_ATTR(0a, exc_return);
+static ARMV7_EVENT_ATTR(0b, cid_write_retired);
+static ARMV7_EVENT_ATTR(0c, pc_write_retired);
+static ARMV7_EVENT_ATTR(0d, br_immed_retired);
+static ARMV7_EVENT_ATTR(0e, br_return_retired);
+static ARMV7_EVENT_ATTR(0f, unaligned_ldst_retired);
+static ARMV7_EVENT_ATTR(10, br_mis_pred);
+static ARMV7_EVENT_ATTR(11, cpu_cycles);
+static ARMV7_EVENT_ATTR(12, br_pred);
+static ARMV7_EVENT_ATTR(13, mem_access);
+static ARMV7_EVENT_ATTR(14, l1i_cache);
+static ARMV7_EVENT_ATTR(15, l1d_cache_wb);
+static ARMV7_EVENT_ATTR(16, l2d_cache);
+static ARMV7_EVENT_ATTR(17, l2d_cache_refill);
+static ARMV7_EVENT_ATTR(18, l2d_cache_wb);
+static ARMV7_EVENT_ATTR(19, bus_access);
+static ARMV7_EVENT_ATTR(1a, memory_error);
+static ARMV7_EVENT_ATTR(1b, inst_spec);
+static ARMV7_EVENT_ATTR(1c, ttbr_write_retired);
+static ARMV7_EVENT_ATTR(1d, bus_cycles);
+static ARMV7_EVENT_ATTR(40, java_bc_exec);
+static ARMV7_EVENT_ATTR(40, l1d_cache_ld);
+static ARMV7_EVENT_ATTR(40, wb_full);
+static ARMV7_EVENT_ATTR(41, java_swbc_exec);
+static ARMV7_EVENT_ATTR(41, l1d_cache_st);
+static ARMV7_EVENT_ATTR(41, l2_store_merged);
+static ARMV7_EVENT_ATTR(42, jazelle_branch_executed);
+static ARMV7_EVENT_ATTR(42, l1d_cache_refill_ld);
+static ARMV7_EVENT_ATTR(42, l2_store_bufferable);
+static ARMV7_EVENT_ATTR(43, l1d_cache_refill_st);
+static ARMV7_EVENT_ATTR(43, l2_access);
+static ARMV7_EVENT_ATTR(44, l2_miss);
+static ARMV7_EVENT_ATTR(45, axi_read);
+static ARMV7_EVENT_ATTR(46, axi_write);
+static ARMV7_EVENT_ATTR(46, l1d_cache_wb_victim);
+static ARMV7_EVENT_ATTR(47, l1d_cache_wb_clean);
+static ARMV7_EVENT_ATTR(47, mem_replay);
+static ARMV7_EVENT_ATTR(48, l1d_cache_inval);
+static ARMV7_EVENT_ATTR(48, mem_replay_unaligned);
+static ARMV7_EVENT_ATTR(49, l1d_miss_hash);
+static ARMV7_EVENT_ATTR(4a, l1i_miss_hash);
+static ARMV7_EVENT_ATTR(4b, l1d_page_coloring);
+static ARMV7_EVENT_ATTR(4c, l1d_hit_neon);
+static ARMV7_EVENT_ATTR(4c, l1d_tlb_refill_ld);
+static ARMV7_EVENT_ATTR(4d, l1d_access_neon);
+static ARMV7_EVENT_ATTR(4d, l1d_tlb_refill_st);
+static ARMV7_EVENT_ATTR(4e, l2_access_neon);
+static ARMV7_EVENT_ATTR(4f, l2_hit_neon);
+static ARMV7_EVENT_ATTR(50, coherent_miss);
+static ARMV7_EVENT_ATTR(50, l1i_access);
+static ARMV7_EVENT_ATTR(50, l2d_cache_ld);
+static ARMV7_EVENT_ATTR(51, coherent_hit);
+static ARMV7_EVENT_ATTR(51, l2d_cache_st);
+static ARMV7_EVENT_ATTR(51, return_mispredict);
+static ARMV7_EVENT_ATTR(52, branch_mispredict);
+static ARMV7_EVENT_ATTR(52, l2d_cache_refill_ld);
+static

Re: [PATCH] x86: serialize LVTT and TSC_DEADLINE write

2015-08-03 Thread Shaohua Li

On Sun, Aug 02, 2015 at 09:41:08PM +0200, Thomas Gleixner wrote:
> On Sun, 2 Aug 2015, Shaohua Li wrote:
> 
> > On Sat, Aug 01, 2015 at 12:10:41PM +0200, Thomas Gleixner wrote:
> > > On Fri, 31 Jul 2015, Shaohua Li wrote:
> > > > @@ -336,6 +336,22 @@ static void __setup_APIC_LVTT(unsigned int clocks, 
> > > > int oneshot, int irqen)
> > > > apic_write(APIC_LVTT, lvtt_value);
> > > >  
> > > > if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
> > > > +   u64 msr;
> > > > +
> > > > +   /*
> > > > +* See Intel SDM: TSC-Deadline Mode chapter. In xAPIC 
> > > > mode,
> > > > +* writing APIC LVTT and TSC_DEADLINE MSR isn't 
> > > > serialized.
> > > > +* This uses the algorithm described in Intel SDM to 
> > > > serialize
> > > > +* the two writes
> > > > +* */
> > > > +   while (1) {
> > > > +   wrmsrl(MSR_IA32_TSC_DEADLINE, -1L);
> > > > +   rdmsrl(MSR_IA32_TSC_DEADLINE, msr);
> > > > +   if (msr)
> > > > +   break;
> > > > +   }
> > > > +   wrmsrl(MSR_IA32_TSC_DEADLINE, 0);
> > > 
> > > 
> > > I think this is exceptionally silly. A proper fence after the
> > > apic_write() should have the same effect.
> > 
> > Not sure what happens in the hardware, I could have a try of fence, but
> > I'd prefer using the algorithm Intel described. This is not a fast path,
> 
> s/algorithm/voodoo/
> 
> > the loop will exit immediately regardless of whether the issue occurs.
> 
> Well, the SDM also says:
> 
>  "To allow for efficient access to the APIC registers in x2APIC mode,
>   the serializing semantics of WRMSR are relaxed when writing to the
>   APIC registers. Thus, system software should not use “WRMSR to APIC
>   registers in x2APIC mode” as a serializing instruction. Read and write
>   accesses to the APIC registers will occur in program order. A WRMSR to
>   an APIC register may complete before all preceding stores are globally
>   visible; software can prevent this by inserting a serializing
>   instruction, an SFENCE, or an MFENCE before the WRMSR."
> 
> And that's what happens here. The write to the LVT has not yet hit the
> APIC, so the WRMSR has no effect.

What you quoted is for x2APIC, I didn't see similar description for
xAPIC.

Tested mfence here, it does work. But I'm not convinced it's the right
thing. The xAPIC access is memory-mapped IO; mfence is unrelated
to it. Anyway, cc-ed more Intel people, hope they can share some
insights.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: epoll and multiple processes - eliminate unneeded process wake-ups

2015-08-03 Thread Eric Wong

Madars Vitolins  wrote:
> Hi Folks,
> 
> I am developing a kind of open systems application, which uses
> multiple processes/executables where each of them monitors some set
> of resources (in this case POSIX queues) via the epoll interface. For
> example, when 10 processes on the same queue are in epoll_wait()
> and one message arrives, all 10 processes get woken up and all of
> them try to read the message from the queue. One succeeds, the others
> get an EAGAIN error. The problem is with those others, which generate
> extra context switches - useless CPU usage. With more processes the
> inefficiency gets higher.
> 
> I tried to use EPOLLONESHOT, but no help. Seems this is suitable for
> multi-threaded application and not for multi-process application.

Correct.  Most FDs are not shared across processes.

> Ideal mechanism for this would be:
> 1. If multiple epoll sets in kernel matches same event and one or
> more processes are in state of epoll_wait() - then send event only
> to one waiter.
> 2. If none of processes are in wait state, then send the event to
> all epoll sets (as it is currently). Then the first free process
> will grab the event.

Jason Baron was working on this (search LKML archives for
EPOLLEXCLUSIVE, EPOLLROUNDROBIN, EPOLL_ROTATE)

However, I was unconvinced about modifying epoll.

Perhaps I may be more easily convinced about your mqueue case than his
case for listen sockets, though[*]

Typical applications have few (probably only one) listen sockets or
POSIX mqueues; so I would rather use dedicated threads to issue
blocking syscalls (accept4 or mq_timedreceive).

Making blocking syscalls allows exclusive wakeups to avoid thundering
herds.

> What do you think, would it be realistic to implement this? How about
> concurrency?
> Can you please give me some hints from which points in code to start
> to implement these changes?

For now, I suggest dedicating a thread in each process to do
mq_timedreceive/mq_receive, assuming you only have a small number
of queues in your system.


[*] mq_timedreceive may copy a largish buffer which benefits from
staying on the same CPU as much as possible.
In contrast, accept4 only creates a client socket.  With a C10K+
socket server (e.g. http/memcached/DB), a typical new client
socket spends a fair amount of time idle.  Thus I don't believe
memory locality inside the kernel is much of a concern when there are
thousands of accepted client sockets.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Linux 4.1 kernel warning in unregister_blkdev

2015-08-03 Thread Vinson Lee

Hi.

I've hit this warning multiple times from running the Linux Test
Project ltp_block_dev test case.

ltp_block_dev: Test Case 7: unregister_blkdev() with major=0
[ cut here ]
WARNING: CPU: 10 PID: 33952 at block/genhd.c:352 unregister_blkdev+0x67/0x9c()
Modules linked in: ltp_block_dev(O) veth tun lp parport coretemp
tcp_diag inet_diag iTCO_wdt iTCO_vendor_support i7core_edac edac_core
i2c_i801 dcdbas kvm ipmi_si ipmi_devintf crc32c_intel ipmi_msghandler
acpi_cpufreq ghash_clmulni_intel ioatdma shpchp lpc_ich mfd_core hed
microcode dell_rbu xfs libcrc32c igb ptp pps_core i2c_algo_bit
i2c_core dca ipv6 [last unloaded: ltp_fw_load]
CPU: 10 PID: 33952 Comm: block_dev Tainted: G   O4.1.3 #1
 0009 8811fc8f3cd8 81538de8 00da
  8811fc8f3d18 81068171 8811fc8f3d38
 8127f73a   81f01550
Call Trace:
 [] dump_stack+0x45/0x57
 [] warn_slowpath_common+0xa1/0xbb
 [] ? unregister_blkdev+0x67/0x9c
 [] warn_slowpath_null+0x1a/0x1c
 [] unregister_blkdev+0x67/0x9c
 [] sys_tcase+0x59b/0x78f [ltp_block_dev]
 [] dev_attr_store+0x18/0x22
 [] sysfs_kf_write+0x3e/0x40
 [] kernfs_fop_write+0xeb/0x13b
 [] __vfs_write+0x28/0xa7
 [] ? __sb_start_write+0xb0/0xe4
 [] ? security_file_permission+0x2e/0x33
 [] vfs_write+0x8f/0xe5
 [] SyS_write+0x44/0x78
 [] system_call_fastpath+0x12/0x6a
---[ end trace ea5af5d6119d362a ]---
ltp_block_dev: Test Case Result: PASS
ltp_block_dev: device released

Cheers,
Vinson
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1752 matches

Mail list logo