Re: [RFC PATCH 1/5] spi: introduce flag for memory mapped read

2015-08-04 Thread Michal Suchanek
On 5 August 2015 at 07:35, Vignesh R  wrote:
>
>
> On 08/05/2015 10:51 AM, Michal Suchanek wrote:
>> Hello,
>>
>> On 4 August 2015 at 19:59, R, Vignesh  wrote:
>>>
>>>
>>> On 8/4/2015 9:21 PM, Mark Brown wrote:
 On Mon, Aug 03, 2015 at 10:27:19AM +0530, Vignesh R wrote:


>>>
>>> TI QSPI controller has two blocks:
>>> 1. SPI_CORE: This is generic(normal) spi mode. This can be used to
>>> communicate with any SPI devices (serial flashes as well as non-flash
>>> devices like touchscreen).
>>> 2. SFI_MM_IF(SPI memory mapped interface): The SFI_MM_IF block only
>>> allows reading and writing to an SPI flash device only. Used to speed up
>>> flash reads. It _cannot_ be used to communicate with non flash devices.
>>> Now, the spi_message that ti-qspi receives in transfer_one() callback
>>> can be from mtd device(in which case SFI_MM_IF can be used) or from any
>>> other non flash SPI device (in which case SFI_MM_IF must not be used
>>> instead SPI_CORE is to be used) but there is no way(is there?) to
>>> distinguish where spi_message is from. Therefore I introduced flag
>>> (use_mmap_mode) to struct spi_message. mtd driver will set flag to true,
>>> this helps the ti-qspi driver to determine that the user is flash device
>>> and thus can do read via SFI_MM_IF. If this flag is not set then the
>>> user is assumed to be non flash SPI driver and will use SPI_CORE block
>>> to communicate.
>>>
>>> On the whole, I just need a way to determine that the user is a flash
>>> device in order to switch to memory mapped interface.
>>>
>>
>> Maybe it can be set on the SPI slave rather than each message.
>
> You mean to add flag to spi_device struct? That's ok for me.
>

There are already mode flags so you can just add one more.

Thanks

Michal
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 net-next 1/2] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-08-04 Thread Sowmini Varadhan
Open the sockets calling sock_create_kern() with the correct struct net
pointer, and use that struct net pointer when verifying the
address passed to rds_bind().

Signed-off-by: Sowmini Varadhan 
---
v2: David Ahern comments.

 net/rds/bind.c|3 ++-
 net/rds/connection.c  |   16 ++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +++--
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +++--
 net/rds/rds.h |   23 +++
 net/rds/send.c|3 ++-
 net/rds/tcp.c |4 ++--
 net/rds/tcp_connect.c |3 ++-
 net/rds/tcp_listen.c  |   16 
 net/rds/transport.c   |4 ++--
 12 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock->sk),
+   sin->sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
conn->c_faddr = faddr;
spin_lock_init(>c_lock);
conn->c_next_tx_seq = 1;
+   rds_conn_net_set(conn, net);
 
init_waitqueue_head(>c_waitq);
INIT_LIST_HEAD(>c_send_queue);
@@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy "routing".
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, _ib_transport,
-  GFP_KERNEL);
+   /* RDS/IB is not currently netns aware, thus init_net */
+   conn = rds_conn_create(_net, dp->dp_daddr, dp->dp_saddr,
+  _ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create 

[PATCH v3 net-next 0/2] RDS-TCP: Network namespace support

2015-08-04 Thread Sowmini Varadhan
This patch series contains the set of changes to correctly set up 
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) net
namespace.  The changes in this patch set ensure that the execution of 
'modprobe [-r] rds_tcp' sets up the kernel TCP sockets 
relative to the current netns, so that RDS applications can send/recv
packets from that netns, and the netns can later be deleted cleanly.

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allows dynamic
creation/tear-down of RDS-TCP client and server sockets  across all
current and future namespaces. 

v2 changes from RFC sent out earlier:
David Ahern comments in patch 1, net_device notifier in patch 2, 
patch 3 broken off and submitted separately.
v3: Cong Wang review comments.

Sowmini Varadhan (2):
  Make RDS-TCP work correctly when it is set up in a netns other than
init_net
  Support multiple RDS-TCP listen endpoints, one per netns.

 net/rds/bind.c|3 +-
 net/rds/connection.c  |   16 +++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +-
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +-
 net/rds/rds.h |   23 ++-
 net/rds/send.c|3 +-
 net/rds/tcp.c |  165 +++-
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |9 ++-
 net/rds/tcp_listen.c  |   40 
 net/rds/transport.c   |4 +-
 13 files changed, 214 insertions(+), 70 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 net-next 2/2] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-08-04 Thread Sowmini Varadhan
Register pernet subsys init/stop functions that will set up
and tear down per-net RDS-TCP listen endpoints. Unregister
pernet subsys functions on 'modprobe -r' to clean up these
end points.

Enable keepalive on both accept and connect socket endpoints.
The keepalive timer expiration will ensure that client socket
endpoints will be removed as appropriate from the netns when
an interface is removed from a namespace.

Register a device notifier callback that will clean up all
sockets (and thus avoid the need to wait for keepalive timeout)
when the loopback device is unregistered from the netns indicating
that the netns is getting deleted.

Signed-off-by: Sowmini Varadhan 
---
v2: net_device notifier for synchronous cleanup of sockets.
v3: Cong Wang comments

 net/rds/tcp.c |  161 -
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |6 +-
 net/rds/tcp_listen.c  |   38 +++-
 4 files changed, 162 insertions(+), 50 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 98f5de3..c42b60b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "rds.h"
 #include "tcp.h"
@@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
 }
 
-static void rds_tcp_exit(void)
-{
-   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-   rds_tcp_listen_stop();
-   rds_tcp_destroy_conns();
-   rds_trans_unregister(_tcp_transport);
-   rds_tcp_recv_exit();
-   kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
 
 struct rds_transport rds_tcp_transport = {
.laddr_check= rds_tcp_laddr_check,
@@ -281,6 +275,136 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback  = 1,
 };
 
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+   struct socket *rds_tcp_listen_sock;
+   struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+   struct rds_tcp_net *rtn = container_of(work,
+  struct rds_tcp_net,
+  rds_tcp_accept_w);
+
+   while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+   cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+   struct net *net = sock_net(sk);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   queue_work(rds_wq, >rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+   if (!rtn->rds_tcp_listen_sock) {
+   pr_warn("could not set up listen sock\n");
+   return -EAFNOSUPPORT;
+   }
+   INIT_WORK(>rds_tcp_accept_w, rds_tcp_accept_worker);
+   return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   /* If rds_tcp_exit_net() is called as a result of netns deletion,
+* the rds_tcp_kill_sock() device notifier would already have cleaned
+* up the listen socket, thus there is no work to do in this function.
+*
+* If rds_tcp_exit_net() is called as a result of module unload,
+* i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then
+* we do need to clean up the listen socket here.
+*/
+   if (rtn->rds_tcp_listen_sock) {
+   rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+   rtn->rds_tcp_listen_sock = NULL;
+   flush_work(>rds_tcp_accept_w);
+   }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+   .init = rds_tcp_init_net,
+   .exit = rds_tcp_exit_net,
+   .id = _tcp_netid,
+   .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+   struct rds_tcp_connection *tc, *_tc;
+   struct sock *sk;
+   LIST_HEAD(tmp_list);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
+   rtn->rds_tcp_listen_sock = NULL;
+   flush_work(>rds_tcp_accept_w);
+   spin_lock_irq(_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, _tcp_conn_list, t_tcp_node) {
+   struct net *c_net = read_pnet(>conn->c_net);
+
+   if (net != c_net || !tc->t_sock)
+   continue;
+   list_move_tail(>t_tcp_node, _list);
+   }
+   spin_unlock_irq(_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, _list, t_tcp_node) {
+   sk = tc->t_sock->sk;
+   sk->sk_prot->disconnect(sk, 0);
+   tcp_done(sk);
+   if 

Re: [patch 1/2] dma: ipu: Prepare irq handlers for irq argument removal

2015-08-04 Thread Vinod Koul
On Sat, Aug 01, 2015 at 07:06:58AM +, Thomas Gleixner wrote:
> The irq argument of most interrupt flow handlers is unused or merily
> used instead of a local variable. The handlers which need the irq
> argument can retrieve the irq number from the irq descriptor.
> 
> Search and update was done with coccinelle and the invaluable help of
> Julia Lawall.

Applied both with fix in subsystem name

Thanks
-- 
~Vinod

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 1/5] spi: introduce flag for memory mapped read

2015-08-04 Thread Vignesh R


On 08/05/2015 10:51 AM, Michal Suchanek wrote:
> Hello,
> 
> On 4 August 2015 at 19:59, R, Vignesh  wrote:
>>
>>
>> On 8/4/2015 9:21 PM, Mark Brown wrote:
>>> On Mon, Aug 03, 2015 at 10:27:19AM +0530, Vignesh R wrote:
>>>
 @use_mmap_mode: Some SPI controller chips are optimized for interacting
 with serial flash memories. These chips have memory mapped interface,
 through which entire serial flash memory slave can be read/written as if
 though they are physical memories (like RAM). Using this interface,
 flash can be accessed using memcpy() function and the spi controller
 hardware will take care of communicating with serial flash over SPI.
 Setting this flag will indicate the SPI controller driver that the
 spi_message is from mtd layer to read from/write to flash. The SPI
 master driver can then appropriately switch the controller to memory
 mapped interface to read from/write to flash, based on this flag (See
 drivers/spi/spi-ti-qspi.c for example).
 NOTE: If the SPI controller chip lacks memory mapped interface, then the
 driver will ignore this flag and use normal SPI protocol to read
 from/write to flash. Communication with non-flash SPI devices is not
 possible using the memory mapped interface.
>>>
>>> I still can't tell from the above what this interface is supposed to do.
>>> It sounds like the use of memory mapped mode is supposed to be
>>> transparent to users, it should just affect how the controller interacts
>>> with the hardware, but if that's the case why do we need to expose it to
>>> users at all?  Shouldn't the driver just use memory mapped mode if it's
>>> faster?
>>>
>>
>> TI QSPI controller has two blocks:
>> 1. SPI_CORE: This is generic(normal) spi mode. This can be used to
>> communicate with any SPI devices (serial flashes as well as non-flash
>> devices like touchscreen).
>> 2. SFI_MM_IF(SPI memory mapped interface): The SFI_MM_IF block only
>> allows reading and writing to an SPI flash device only. Used to speed up
>> flash reads. It _cannot_ be used to communicate with non flash devices.
>> Now, the spi_message that ti-qspi receives in transfer_one() callback
>> can be from mtd device(in which case SFI_MM_IF can be used) or from any
>> other non flash SPI device (in which case SFI_MM_IF must not be used
>> instead SPI_CORE is to be used) but there is no way(is there?) to
>> distinguish where spi_message is from. Therefore I introduced flag
>> (use_mmap_mode) to struct spi_message. mtd driver will set flag to true,
>> this helps the ti-qspi driver to determine that the user is flash device
>> and thus can do read via SFI_MM_IF. If this flag is not set then the
>> user is assumed to be non flash SPI driver and will use SPI_CORE block
>> to communicate.
>>
>> On the whole, I just need a way to determine that the user is a flash
>> device in order to switch to memory mapped interface.
>>
> 
> Maybe it can be set on the SPI slave rather than each message.

You mean to add flag to spi_device struct? That's ok for me.

-- 
Regards
Vignesh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] wilc1000: wilc_wfi_cfgoperations.c: fixed brace coding style issues

2015-08-04 Thread Sudip Mukherjee
On Wed, Aug 05, 2015 at 12:08:16AM +0200, Daniel Machon wrote:
> Fixed brace coding styles issues
> 
> Signed-off-by: Daniel Machon 
> ---
>  drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c 
> b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
> index 92064db..23097ee 100644
> --- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
> +++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
> @@ -1216,8 +1216,8 @@ static int WILC_WFI_add_key(struct wiphy *wiphy, struct 
> net_device *netdev, u8 k
>  
>  
>  
> - if (!pairwise)
> - {
> + if (!pairwise) {
> +
>   if (params->cipher == WLAN_CIPHER_SUITE_TKIP)
>   u8gmode = ENCRYPT_ENABLED | WPA | TKIP;
>   else
> @@ -1315,8 +1315,8 @@ static int WILC_WFI_add_key(struct wiphy *wiphy, struct 
> net_device *netdev, u8 k
>  
>   {
>   u8mode = 0;
> - if (!pairwise)
> - {
> + if (!pairwise) {
> +
This blank line and the one above it are not required.

regards
sudip
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 11/28] drivers:hv: Modify hv_vmbus to search for all MMIO ranges available.

2015-08-04 Thread Greg KH
On Sat, Aug 01, 2015 at 04:08:15PM -0700, K. Y. Srinivasan wrote:
> From: ja...@microsoft.com 

That's not his name :(

I've stopped here, please fix up and resend the rest of the series.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/2] dmaengine: Add scatter-gathered memset support

2015-08-04 Thread Vinod Koul
On Mon, Jul 06, 2015 at 12:19:22PM +0200, Maxime Ripard wrote:
> Hi Vinod,
> 
> Here is a patch serie that adds a new dmaengine operation for
> scatter-gathered memset.
> 
> Indeed, doing a memset over a discontiguous buffer is quite
> inefficient at the moment, since you have to create and submit each
> chunk separately, which might result in a huge list of transfers,
> while some controllers can handle that just fine.

Applied, thanks

-- 
~Vinod

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 2/5] Staging: most: mostcore/core.c. Fix "Using plain integer as NULL pointer" warnings

2015-08-04 Thread Sudip Mukherjee
On Tue, Aug 04, 2015 at 08:44:52PM +0200, Adrian Remonda wrote:
> This patch fixes the warning generated by sparse: "Using plain integer
> as NULL pointer" by replacing the offending 0 with NULL.
> 
> Signed-off-by: Adrian Remonda 
> ---
>  drivers/staging/most/mostcore/core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/staging/most/mostcore/core.c 
> b/drivers/staging/most/mostcore/core.c
> index b8871364169c..383e06968b41 100644
> --- a/drivers/staging/most/mostcore/core.c
> +++ b/drivers/staging/most/mostcore/core.c
> @@ -982,7 +982,7 @@ static ssize_t store_add_link(struct most_aim_obj 
> *aim_obj,
>   if (ret)
>   return ret;
>  
> - if (mdev_devnod == 0 || *mdev_devnod == 0) {
> + if (mdev_devnod == NULL || *mdev_devnod == 0) {
Usually we write the NULL test as:
if (!mdev_devnod || *mdev_devnod == 0)

regards
sudip
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 2/2] dmaengine: hdmac: Add memset capabilities

2015-08-04 Thread Vinod Koul
On Mon, Jul 20, 2015 at 10:42:58AM +0200, Maxime Ripard wrote:
> Just like for the XDMAC, the SoCs that embed the HDMAC don't have any kind
> of GPU, and need to accelerate a few framebuffer-related operations through
> their DMA controller.
> 
> However, unlike the XDMAC, the HDMAC doesn't have the memset capability
> built-in. That can be easily emulated though, by doing a transfer with a
> fixed adress on the variable that holds the value we want to set.
typo  

Applied, with typo fixed

-- 
~Vinod

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv5 1/5] Staging: most: mostcore/core.c. Fix "missing static keyword" warnings

2015-08-04 Thread Sudip Mukherjee
On Tue, Aug 04, 2015 at 08:44:51PM +0200, Adrian Remonda wrote:
> This is a patch to the mostcore/core.c file. It makes
> several local functions and structures static to prevent global
> visibility.
> 
> Signed-off-by: Adrian Remonda 
> ---

> @@ -1255,7 +1255,7 @@ static void arm_mbo(struct mbo *mbo)
>   *
>   * Returns the number of allocated and enqueued MBOs.
>   */
> -int arm_mbo_chain(struct most_c_obj *c, int dir, void (*compl)(struct mbo *))
> +static int arm_mbo_chain(struct most_c_obj *c, int dir, void (*compl)(struct 
> mbo *))
This introduced a new checkpatch warning about "line over 80
characters".

regards
sudip
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/2] dmaengine: Add an enum for the dmaengine alignment constraints

2015-08-04 Thread Vinod Koul
On Mon, Jul 20, 2015 at 10:41:32AM +0200, Maxime Ripard wrote:
> Most drivers need to set constraints on the buffer alignment for async tx
> operations. However, even though it is documented, some drivers either use
> a defined constant that is not matching what the alignment variable expects
> (like DMA_BUSWIDTH_* constants) or fill the alignment in bytes instead of
> power of two.
> 
> Add a new enum for these alignments that matches what the framework
> expects, and convert the drivers to it.

Applied, thanks

-- 
~Vinod

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH 1/5] spi: introduce flag for memory mapped read

2015-08-04 Thread Michal Suchanek
Hello,

On 4 August 2015 at 19:59, R, Vignesh  wrote:
>
>
> On 8/4/2015 9:21 PM, Mark Brown wrote:
>> On Mon, Aug 03, 2015 at 10:27:19AM +0530, Vignesh R wrote:
>>
>>> @use_mmap_mode: Some SPI controller chips are optimized for interacting
>>> with serial flash memories. These chips have memory mapped interface,
>>> through which entire serial flash memory slave can be read/written as if
>>> though they are physical memories (like RAM). Using this interface,
>>> flash can be accessed using memcpy() function and the spi controller
>>> hardware will take care of communicating with serial flash over SPI.
>>> Setting this flag will indicate the SPI controller driver that the
>>> spi_message is from mtd layer to read from/write to flash. The SPI
>>> master driver can then appropriately switch the controller to memory
>>> mapped interface to read from/write to flash, based on this flag (See
>>> drivers/spi/spi-ti-qspi.c for example).
>>> NOTE: If the SPI controller chip lacks memory mapped interface, then the
>>> driver will ignore this flag and use normal SPI protocol to read
>>> from/write to flash. Communication with non-flash SPI devices is not
>>> possible using the memory mapped interface.
>>
>> I still can't tell from the above what this interface is supposed to do.
>> It sounds like the use of memory mapped mode is supposed to be
>> transparent to users, it should just affect how the controller interacts
>> with the hardware, but if that's the case why do we need to expose it to
>> users at all?  Shouldn't the driver just use memory mapped mode if it's
>> faster?
>>
>
> TI QSPI controller has two blocks:
> 1. SPI_CORE: This is generic(normal) spi mode. This can be used to
> communicate with any SPI devices (serial flashes as well as non-flash
> devices like touchscreen).
> 2. SFI_MM_IF(SPI memory mapped interface): The SFI_MM_IF block only
> allows reading and writing to an SPI flash device only. Used to speed up
> flash reads. It _cannot_ be used to communicate with non flash devices.
> Now, the spi_message that ti-qspi receives in transfer_one() callback
> can be from mtd device(in which case SFI_MM_IF can be used) or from any
> other non flash SPI device (in which case SFI_MM_IF must not be used
> instead SPI_CORE is to be used) but there is no way(is there?) to
> distinguish where spi_message is from. Therefore I introduced flag
> (use_mmap_mode) to struct spi_message. mtd driver will set flag to true,
> this helps the ti-qspi driver to determine that the user is flash device
> and thus can do read via SFI_MM_IF. If this flag is not set then the
> user is assumed to be non flash SPI driver and will use SPI_CORE block
> to communicate.
>
> On the whole, I just need a way to determine that the user is a flash
> device in order to switch to memory mapped interface.
>

Maybe it can be set on the SPI slave rather than each message.

Thanks

Michal
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


RE: [LINUX RFC 1/2] mtd: spi-nor: add dual parallel mode support

2015-08-04 Thread Ranjit Abhimanyu Waghmode
Hi Mark,

> -Original Message-
> From: Mark Brown [mailto:broo...@kernel.org]
> Sent: Monday, August 03, 2015 9:38 PM
> To: Ranjit Abhimanyu Waghmode
> Cc: dw...@infradead.org; computersforpe...@gmail.com; Michal Simek;
> Soren Brinkmann; zaj...@gmail.com; b...@decadent.org.uk; ma...@denx.de;
> b32...@freescale.com; knut.wohl...@de.bosch.com; juh...@openwrt.org;
> bean...@micron.com; linux-...@lists.infradead.org; linux-
> ker...@vger.kernel.org; linux-...@vger.kernel.org; linux-arm-
> ker...@lists.infradead.org; Harini Katakam; Punnaiah Choudary Kalluri; Ranjit
> Abhimanyu Waghmode; ran27...@gmail.com
> Subject: Re: [LINUX RFC 1/2] mtd: spi-nor: add dual parallel mode support
> 
> On Mon, Aug 03, 2015 at 02:35:06PM +0530, Ranjit Waghmode wrote:
> 
> >  drivers/mtd/devices/m25p80.c  |  1 +
> >  drivers/mtd/spi-nor/spi-nor.c | 92 ++--
> ---
> >  include/linux/mtd/spi-nor.h   |  3 ++
> >  include/linux/spi/spi.h   |  2 +
> >  4 files changed, 79 insertions(+), 19 deletions(-)
> 
> You need to at least split this into two patches, one adding a new SPI 
> interface
> and another using it in MTD.  Probably the MTD core and driver changes need
> splitting too.  Please see SubmittingPatches for discussion of splitting 
> things.
> 

I will split and resend the same.

> > diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index
> > d673072..8dec349 100644
> > --- a/include/linux/spi/spi.h
> > +++ b/include/linux/spi/spi.h
> > @@ -355,6 +355,8 @@ struct spi_master {
> >  #define SPI_MASTER_NO_TX   BIT(2)  /* can't do buffer write */
> >  #define SPI_MASTER_MUST_RX  BIT(3) /* requires rx */
> >  #define SPI_MASTER_MUST_TX  BIT(4) /* requires tx */
> > +#define SPI_MASTER_DATA_STRIPE BIT(7)  /* support
> data stripe */
> > +#define SPI_MASTER_BOTH_CS BIT(8)  /* enable both
> chips */
> 
> This is really not adequate description for a new API, I can't tell what "data
> stripe" is supposed to mean at all and I've got at best a vague idea what 
> "both
> chips" really means.  This means other developers won't be able to tell how to
> use or implement these flags either, and it means I can't really review this. 
>  You
> need to provide more information here, both in the code and in the commit
> message.
> 

I'm sorry about that. I have added description in cover letter, but will add 
more information about the same here too.

> I'd also expect some handling in the core for these, for example error 
> handling if
> they can't be supported.

Will update and send you the updated version.

Thanks,
Ranjit
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:x86/platform] x86/uv/time: Migrate to new set-state interface

2015-08-04 Thread Viresh Kumar
On 04-08-15, 10:25, Nathan Zimmer wrote:
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 907ce01..9c2beb3 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -10885,6 +10885,15 @@ W:   http://en.wikipedia.org/wiki/Util-linux
>  T:   git git://git.kernel.org/pub/scm/utils/util-linux/util-linux.git
>  S:   Maintained
>  
> +UV PLATFORM
> +M: Mike Travis 
> +M: uv_ker...@sgi.com

Should that be L:, as it looks to be a list?

> +S: Supported
> +F: arch/x86/kernel/apic/x2apic_uv_x.c
> +F: arch/x86/platform/uv/
> +F: arch/x86/include/asm/uv/
> +F: drivers/char/uv_mmtimer.c
> +
>  UVESAFB DRIVER
>  M:   Michal Januszewski 
>  L:   linux-fb...@vger.kernel.org
> -- 
> 1.8.2.1
> 


-- 
viresh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Gift

2015-08-04 Thread Mrs Maria-Elisabeth Schaeffler
I intend to give to you a portion of my Wealth as a free-will financial 
donation to you.
Respond now to partake.

Regards
Maria-Elisabeth Schaeffler
Email:charityinquiri...@qq.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] intel_pstate: append more Oracle OEM table id to vendor bypass list

2015-08-04 Thread Viresh Kumar
On 05-08-15, 09:28, Ethan Zhao wrote:
> Append more Oracle X86 servers that have their own power management,
> 
> SUN FIRE X4275 M3
> SUN FIRE X4170 M3
> and
> SUN FIRE X6-2
> 
> Signed-off-by: Ethan Zhao 
> ---
>  drivers/cpufreq/intel_pstate.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index c45d274..c57b011 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1156,6 +1156,10 @@ static struct hw_vendor_info vendor_info[] = {
>   {1, "ORACLE", "X4270M3 ", PPC},
>   {1, "ORACLE", "X4270M2 ", PPC},
>   {1, "ORACLE", "X4170M2 ", PPC},
> + {1, "ORACLE", "X4170 M3", PPC},
> + {1, "ORACLE", "X4275 M3", PPC},
> + {1, "ORACLE", "X6-2", PPC},
> + {1, "ORACLE", "Sudbury ", PPC},
>   {0, "", ""},
>  };

Acked-by: Viresh Kumar 

-- 
viresh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Armadaxp GPIO interrupts

2015-08-04 Thread raghu MG
Hi Andrew,

Thanks for your quick response, don't mind the delay (India, US timings).
I checked with both multi_v7_defconfig & mvebu_v7_defconfig; the result is the same:
"irq: Cannot allocate irq_descs @ IRQ47, assuming pre-allocated"

Pasting here the GPIO driver probe debug statements

armada-xp-pinctrl f1018000.pin-ctrl: registered pinctrl driver
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 680
mvebu_gpio_probe 682 match->name=
mvebu_gpio_probe 683 match->compatible=marvell,orion-gpio
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 727
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 734 res->start=f1018100
mvchip->membase=cf8c0100
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 747 soc_variant=1
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 785
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 788
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 805
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 840
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 844 mvchip->irqbase=47
irq: Cannot allocate irq_descs @ IRQ47, assuming pre-allocated
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 850 mvchip->domain->name=(null)
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 859
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 680
mvebu_gpio_probe 682 match->name=
mvebu_gpio_probe 683 match->compatible=marvell,orion-gpio
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 727
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 734 res->start=f1018140
mvchip->membase=cf8c2140
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 747 soc_variant=1
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 785
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 788
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 805
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 840
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 844 mvchip->irqbase=79
irq: Cannot allocate irq_descs @ IRQ79, assuming pre-allocated
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 850 mvchip->domain->name=(null)
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 859
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 680
mvebu_gpio_probe 682 match->name=
mvebu_gpio_probe 683 match->compatible=marvell,orion-gpio
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 727
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 734 res->start=f1018180
mvchip->membase=cf8c4180
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 747 soc_variant=1
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 785
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 788
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 805
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 840
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 844 mvchip->irqbase=111
irq: Cannot allocate irq_descs @ IRQ111, assuming pre-allocated
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 850 mvchip->domain->name=(null)
drivers/gpio/gpio-mvebu.c mvebu_gpio_probe 859



I will try to dig in more information in the probe & irq_domain_add_simple


On Tue, Aug 4, 2015 at 9:04 PM, Andrew Lunn  wrote:
> On Tue, Aug 04, 2015 at 08:52:17PM +0530, raghu MG wrote:
>> Hello,
>>
>> I am working on a card which as GPIOs connected to external I/O's. The
>> board consists of ARMADAXP 78460 host cpu.
>>
>> Board currently runs Linux-4.1 with modified  armada-xp-gp.dtb for ArmadaXP.
>> I enabled "orion-gpio" driver to initialize GPIOs as given in
>> armada-xp-mv78460.
>
> What kernel configuration are you using? Do you have the same problem
> with multi_v7_defconfig and mvebu_v7_defconfig?
>
>> The driver while initializing calls irq_domain_add_simple which throws up
>> following warning
>> "irq: Cannot allocate irq_descs @ IRQ47, assuming pre-allocated"
>> The warning repeats for next set(32-63  & 64-66).
>>
>> Also the GPIO IRQs are not getting listed in cat /proc/interrupts
>
> irq_domain_add_simple() returning an error is fatal for the probe. The
> driver will not be loaded, so more than interrupts will be missing,
> all the gpios will be missing.
>
> Andrew
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 net-next 3/9] ipv6: Export nf_ct_frag6_gather()

2015-08-04 Thread Joe Stringer
Signed-off-by: Joe Stringer 
Acked-by: Thomas Graf 
---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6d02498..701cd2b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -633,6 +633,7 @@ ret_orig:
kfree_skb(clone);
return skb;
 }
+EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 void nf_ct_frag6_consume_orig(struct sk_buff *skb)
 {
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 net-next 8/9] openvswitch: Allow matching on conntrack label

2015-08-04 Thread Joe Stringer
Allow matching and setting the conntrack label field. As with ct_mark,
this is populated by executing the ct() action, and is a writable field.
The set_field() action may be used to modify the label, which will take
effect on the most recent conntrack entry.

E.g.: actions:ct(zone=1),set_field(1->ct_label)

This will perform conntrack lookup in zone 1, then modify the label for
that entry. The conntrack entry itself must be committed using the
"commit" flag in the conntrack action flags for this change to persist.

Signed-off-by: Joe Stringer 
---
v2: Split out setting the connlabel size for the current namespace.
---
 include/uapi/linux/openvswitch.h |  6 
 net/openvswitch/actions.c|  4 +++
 net/openvswitch/conntrack.c  | 68 
 net/openvswitch/conntrack.h  | 15 +
 net/openvswitch/flow.c   |  1 +
 net/openvswitch/flow.h   |  1 +
 net/openvswitch/flow_netlink.c   | 18 ++-
 7 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 207788c..f360dc9 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -326,6 +326,7 @@ enum ovs_key_attr {
OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
OVS_KEY_ATTR_CT_MARK,   /* u32 connection tracking mark */
+   OVS_KEY_ATTR_CT_LABEL,  /* 16-octet connection tracking label */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -438,6 +439,11 @@ struct ovs_key_nd {
__u8nd_tll[ETH_ALEN];
 };
 
+#define OVS_CT_LABEL_LEN   16
+struct ovs_key_ct_label {
+   __u8ct_label[OVS_CT_LABEL_LEN];
+};
+
 /* OVS_KEY_ATTR_CT_STATE flags */
 #define OVS_CS_F_NEW   0x01 /* Beginning of a new connection. */
 #define OVS_CS_F_ESTABLISHED   0x02 /* Part of an existing connection. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e96e516..2e3ff11 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -944,6 +944,10 @@ static int execute_masked_set_action(struct sk_buff *skb,
  *get_mask(a, u32 *));
break;
 
+   case OVS_KEY_ATTR_CT_LABEL:
+   err = ovs_ct_set_label(skb, flow_key, nla_data(a),
+   get_mask(a, struct ovs_key_ct_label *));
+   break;
}
 
return err;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 81b80da..6a64a32 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include <net/netfilter/nf_conntrack_labels.h>
 #include 
 #include 
 
@@ -110,6 +111,30 @@ u32 ovs_ct_get_mark(const struct sk_buff *skb)
return ct ? ct->mark : 0;
 }
 
+void ovs_ct_get_label(const struct sk_buff *skb,
+ struct ovs_key_ct_label *label)
+{
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn_labels *cl = NULL;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   if (ct)
+   cl = nf_ct_labels_find(ct);
+
+   if (cl) {
+   size_t len = cl->words * sizeof(long);
+
+   if (len > OVS_CT_LABEL_LEN)
+   len = OVS_CT_LABEL_LEN;
+   else if (len < OVS_CT_LABEL_LEN)
+   memset(label, 0, OVS_CT_LABEL_LEN);
+   memcpy(label, cl->bits, len);
+   } else {
+   memset(label, 0, OVS_CT_LABEL_LEN);
+   }
+}
+
 static bool __ovs_ct_state_valid(u8 state)
 {
return (state && !(state & OVS_CS_F_INVALID));
@@ -202,6 +227,7 @@ static void __ovs_ct_update_key(struct sk_buff *skb, struct 
sw_flow_key *key,
key->ct.state = state;
key->ct.zone = zone;
key->ct.mark = ovs_ct_get_mark(skb);
+   ovs_ct_get_label(skb, &key->ct.label);
 }
 
 static void ovs_ct_update_key(struct sk_buff *skb, struct sw_flow_key *key,
@@ -359,6 +385,41 @@ int ovs_ct_set_mark(struct sk_buff *skb, struct 
sw_flow_key *key,
 #endif
 }
 
+int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key,
+const struct ovs_key_ct_label *label,
+const struct ovs_key_ct_label *mask)
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn_labels *cl;
+   struct nf_conn *ct;
+   int err;
+
+   /* This must happen directly after lookup/commit. */
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct)
+   return -EINVAL;
+
+   cl = nf_ct_labels_find(ct);
+   if (!cl) {
+   nf_ct_labels_ext_add(ct);
+   cl = nf_ct_labels_find(ct);
+   }
+   if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN)
+   return -ENOSPC;
+
+   err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask,
+ 

[PATCHv2 net-next 1/9] openvswitch: Serialize acts with original netlink len

2015-08-04 Thread Joe Stringer
Previously, we used the kernel-internal netlink actions length to
calculate the size of messages to serialize back to userspace.
However,the sw_flow_actions may not be formatted exactly the same as the
actions on the wire, so store the original actions length when
de-serializing and re-use the original length when serializing.

Signed-off-by: Joe Stringer 
Acked-by: Thomas Graf 
---
 net/openvswitch/datapath.c | 2 +-
 net/openvswitch/flow.h | 1 +
 net/openvswitch/flow_netlink.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index ffe984f..d5b5473 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -713,7 +713,7 @@ static size_t ovs_flow_cmd_msg_size(const struct 
sw_flow_actions *acts,
 
/* OVS_FLOW_ATTR_ACTIONS */
if (should_fill_actions(ufid_flags))
-   len += nla_total_size(acts->actions_len);
+   len += nla_total_size(acts->orig_len);
 
return len
+ nla_total_size(sizeof(struct ovs_flow_stats)) /* 
OVS_FLOW_ATTR_STATS */
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index b62cdb3..082a87b 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -144,6 +144,7 @@ struct sw_flow_id {
 
 struct sw_flow_actions {
struct rcu_head rcu;
+   size_t orig_len;/* From flow_cmd_new netlink actions size */
u32 actions_len;
struct nlattr actions[];
 };
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index a6eb77a..d536fb7 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1545,6 +1545,7 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int 
size, bool log)
return ERR_PTR(-ENOMEM);
 
sfa->actions_len = 0;
+   sfa->orig_len = size;
return sfa;
 }
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 net-next 9/9] openvswitch: Allow attaching helpers to ct action

2015-08-04 Thread Joe Stringer
Add support for using conntrack helpers to assist protocol detection.
The new OVS_CT_ATTR_HELPER attribute of the ct action specifies a helper
to be used for this connection.

Example ODP flows allowing FTP connections from ports 1->2:
in_port=1,tcp,action=ct(helper=ftp,commit),2
in_port=2,tcp,ct_state=-trk,action=ct(),recirc(1)
recirc_id=1,in_port=2,tcp,ct_state=+trk-new+est,action=1
recirc_id=1,in_port=2,tcp,ct_state=+trk+rel,action=1

Signed-off-by: Joe Stringer 
---
 include/uapi/linux/openvswitch.h |   1 +
 net/openvswitch/conntrack.c  | 109 ++-
 2 files changed, 108 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index f360dc9..e816170 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -626,6 +626,7 @@ enum ovs_ct_attr {
OVS_CT_ATTR_UNSPEC,
OVS_CT_ATTR_FLAGS,  /* u8 bitmask of OVS_CT_F_*. */
OVS_CT_ATTR_ZONE,   /* u16 zone id. */
+   OVS_CT_ATTR_HELPER,
__OVS_CT_ATTR_MAX
 };
 
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 6a64a32..1f2a9bc 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include <net/netfilter/nf_conntrack_helper.h>
 #include 
 #include 
 #include 
@@ -30,6 +31,7 @@ struct ovs_ct_len_tbl {
 };
 
 struct ovs_conntrack_info {
+   struct nf_conntrack_helper *helper;
struct nf_conn *ct;
u32 flags;
u16 zone;
@@ -145,6 +147,51 @@ bool ovs_ct_state_valid(const struct sw_flow_key *key)
return __ovs_ct_state_valid(key->ct.state);
 }
 
+/* 'skb' should already be pulled to nh_ofs. */
+static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
+{
+   const struct nf_conntrack_helper *helper;
+   const struct nf_conn_help *help;
+   enum ip_conntrack_info ctinfo;
+   unsigned int protoff;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+   return NF_ACCEPT;
+
+   help = nfct_help(ct);
+   if (!help)
+   return NF_ACCEPT;
+
+   helper = rcu_dereference(help->helper);
+   if (!helper)
+   return NF_ACCEPT;
+
+   switch (proto) {
+   case NFPROTO_IPV4:
+   protoff = ip_hdrlen(skb);
+   break;
+   case NFPROTO_IPV6: {
+   u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+   __be16 frag_off;
+
+   protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
+  &nexthdr, &frag_off);
+   if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+   pr_debug("proto header not found\n");
+   return NF_ACCEPT;
+   }
+   break;
+   }
+   default:
+   WARN_ONCE(1, "helper invoked on non-IP family!");
+   return NF_DROP;
+   }
+
+   return helper->help(skb, protoff, ct, ctinfo);
+}
+
 static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
 {
@@ -217,6 +264,13 @@ static bool skb_nfct_cached(const struct net *net, const 
struct sk_buff *skb,
return false;
if (info->zone != nf_ct_zone(ct))
return false;
+   if (info->helper) {
+   struct nf_conn_help *help;
+
+   help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
+   if (help && help->helper != info->helper)
+   return false;
+   }
 
return true;
 }
@@ -274,6 +328,11 @@ static int __ovs_ct_lookup(struct net *net, const struct 
sw_flow_key *key,
if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
skb) != NF_ACCEPT)
return -ENOENT;
+
+   if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+   WARN_ONCE(1, "helper rejected packet");
+   return -EINVAL;
+   }
}
 
return 0;
@@ -420,15 +479,41 @@ int ovs_ct_set_label(struct sk_buff *skb, struct 
sw_flow_key *key,
 #endif
 }
 
+static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
+const struct sw_flow_key *key, bool log)
+{
+   struct nf_conntrack_helper *helper;
+   struct nf_conn_help *help;
+
+   helper = nf_conntrack_helper_try_module_get(name, info->family,
+   key->ip.proto);
+   if (!helper) {
+   OVS_NLERR(log, "Unknown helper \"%s\"", name);
+   return -ENOENT;
+   }
+
+   help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+   if (!help) {
+   module_put(helper->me);
+   return -ENOMEM;
+   }
+
+   help->helper = helper;
+   info->helper = helper;
+   return 0;
+}
+
 

[PATCHv2 net-next 5/9] openvswitch: Add conntrack action

2015-08-04 Thread Joe Stringer
Expose the kernel connection tracker via OVS. Userspace components can
make use of the "ct()" action, followed by "recirculate", to populate
the conntracking state in the OVS flow key, and subsequently match on
that state.

Example ODP flows allowing traffic from 1->2, only replies from 2->1:
in_port=1,tcp,action=ct(commit,zone=1),2
in_port=2,ct_state=-trk,tcp,action=ct(zone=1),recirc(1)
recirc_id=1,in_port=2,ct_state=+trk+est-new,tcp,action=1

IP fragments are handled by transparently assembling them as part of the
ct action. The maximum received unit (MRU) size is tracked so that
refragmentation can occur during output.

IP frag handling contributed by Andy Zhou.

Signed-off-by: Joe Stringer 
Signed-off-by: Justin Pettit 
Signed-off-by: Andy Zhou 
---
This can be tested with the corresponding userspace component here:
https://www.github.com/justinpettit/openvswitch conntrack

v2: Don't take references to devs or dsts in output path.
Shift ovs_ct_init()/ovs_ct_exit() into this patch
Handle output case where flow key is invalidated
Store the entire L2 header to apply to fragments
Various minor simplifications
Improve comments/logs
Style fixes
Rebase
---
 include/uapi/linux/openvswitch.h |  41 
 net/openvswitch/Kconfig  |  11 +
 net/openvswitch/Makefile |   2 +
 net/openvswitch/actions.c| 154 -
 net/openvswitch/conntrack.c  | 475 +++
 net/openvswitch/conntrack.h  |  97 
 net/openvswitch/datapath.c   |  73 --
 net/openvswitch/datapath.h   |   8 +
 net/openvswitch/flow.c   |   3 +
 net/openvswitch/flow.h   |   6 +
 net/openvswitch/flow_netlink.c   |  72 --
 net/openvswitch/flow_netlink.h   |   4 +-
 net/openvswitch/vport.c  |   1 +
 13 files changed, 910 insertions(+), 37 deletions(-)
 create mode 100644 net/openvswitch/conntrack.c
 create mode 100644 net/openvswitch/conntrack.h

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index d6b8854..1dae30a 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -164,6 +164,9 @@ enum ovs_packet_cmd {
  * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the
  * output port is actually a tunnel port. Contains the output tunnel key
  * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes.
+ * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and
+ * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
+ * size.
  *
  * These attributes follow the  ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -180,6 +183,7 @@ enum ovs_packet_attr {
OVS_PACKET_ATTR_UNUSED2,
OVS_PACKET_ATTR_PROBE,  /* Packet operation is a feature probe,
   error logging should be suppressed. */
+   OVS_PACKET_ATTR_MRU,/* Maximum received IP fragment size. */
__OVS_PACKET_ATTR_MAX
 };
 
@@ -319,6 +323,8 @@ enum ovs_key_attr {
OVS_KEY_ATTR_MPLS,  /* array of struct ovs_key_mpls.
 * The implementation may restrict
 * the accepted length of the array. */
+   OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
+   OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
@@ -431,6 +437,15 @@ struct ovs_key_nd {
__u8nd_tll[ETH_ALEN];
 };
 
+/* OVS_KEY_ATTR_CT_STATE flags */
+#define OVS_CS_F_NEW   0x01 /* Beginning of a new connection. */
+#define OVS_CS_F_ESTABLISHED   0x02 /* Part of an existing connection. */
+#define OVS_CS_F_RELATED   0x04 /* Related to an established
+* connection. */
+#define OVS_CS_F_INVALID   0x20 /* Could not track connection. */
+#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */
+#define OVS_CS_F_TRACKED   0x80 /* Conntrack has occurred. */
+
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
  * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow
@@ -595,6 +610,29 @@ struct ovs_action_hash {
 };
 
 /**
+ * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action.
+ * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags.
+ * @OVS_CT_ATTR_ZONE: u16 connection tracking zone.
+ * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ */
+enum ovs_ct_attr {
+   OVS_CT_ATTR_UNSPEC,
+   OVS_CT_ATTR_FLAGS,  /* u8 bitmask of OVS_CT_F_*. */
+   OVS_CT_ATTR_ZONE,   /* u16 zone id. */
+   __OVS_CT_ATTR_MAX
+};
+
+#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
+
+/*
+ * OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_*
+ * @OVS_CT_F_COMMIT: Commits the flow to the conntrack hashtable in the
+ * specified zone. Future packets for the 

[PATCHv2 net-next 7/9] netfilter: Always export nf_connlabels_replace()

2015-08-04 Thread Joe Stringer
The following patches will reuse this code from OVS.

Signed-off-by: Joe Stringer 
---
 net/netfilter/nf_conntrack_labels.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_labels.c 
b/net/netfilter/nf_conntrack_labels.c
index 06e71a0..3ce5c31 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -50,7 +50,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
 }
 EXPORT_SYMBOL_GPL(nf_connlabel_set);
 
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 static void replace_u32(u32 *address, u32 mask, u32 new)
 {
u32 old, tmp;
@@ -91,7 +90,6 @@ int nf_connlabels_replace(struct nf_conn *ct,
return 0;
 }
 EXPORT_SYMBOL_GPL(nf_connlabels_replace);
-#endif
 
 int nf_connlabels_get(struct net *net, unsigned int n_bits)
 {
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 net-next 6/9] openvswitch: Allow matching on conntrack mark

2015-08-04 Thread Joe Stringer
From: Justin Pettit 

Allow matching and setting the conntrack mark field. As with conntrack
state and zone, these are populated by executing the ct() action. Unlike
these, the ct_mark is also a writable field. The set_field() action may
be used to modify the mark, which will take effect on the most recent
conntrack entry.

E.g.: actions:ct(zone=0),ct(zone=1),set_field(1->ct_mark)

This will perform conntrack lookup in zone 0, then lookup in zone 1,
then modify the mark for the entry in zone 1. The mark for the entry in
zone 0 is unchanged. The conntrack entry itself must be committed using
the "commit" flag in the conntrack action flags for this change to persist.

Signed-off-by: Justin Pettit 
Signed-off-by: Joe Stringer 
---
 include/uapi/linux/openvswitch.h |  1 +
 net/openvswitch/actions.c|  6 ++
 net/openvswitch/conntrack.c  | 40 
 net/openvswitch/conntrack.h  | 14 ++
 net/openvswitch/flow.c   |  1 +
 net/openvswitch/flow.h   |  1 +
 net/openvswitch/flow_netlink.c   | 15 ++-
 7 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1dae30a..207788c 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -325,6 +325,7 @@ enum ovs_key_attr {
 * the accepted length of the array. */
OVS_KEY_ATTR_CT_STATE,  /* u8 bitmask of OVS_CS_F_* */
OVS_KEY_ATTR_CT_ZONE,   /* u16 connection tracking zone. */
+   OVS_KEY_ATTR_CT_MARK,   /* u32 connection tracking mark */
 
 #ifdef __KERNEL__
OVS_KEY_ATTR_TUNNEL_INFO,  /* struct ip_tunnel_info */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 718b223..e96e516 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -938,6 +938,12 @@ static int execute_masked_set_action(struct sk_buff *skb,
err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
__be32 *));
break;
+
+   case OVS_KEY_ATTR_CT_MARK:
+   err = ovs_ct_set_mark(skb, flow_key, nla_get_u32(a),
+ *get_mask(a, u32 *));
+   break;
+
}
 
return err;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 586ce66..81b80da 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -101,6 +101,15 @@ u16 ovs_ct_get_zone(const struct sk_buff *skb)
return ct ? nf_ct_zone(ct) : NF_CT_DEFAULT_ZONE;
 }
 
+u32 ovs_ct_get_mark(const struct sk_buff *skb)
+{
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn *ct;
+
+   ct = nf_ct_get(skb, &ctinfo);
+   return ct ? ct->mark : 0;
+}
+
 static bool __ovs_ct_state_valid(u8 state)
 {
return (state && !(state & OVS_CS_F_INVALID));
@@ -192,6 +201,7 @@ static void __ovs_ct_update_key(struct sk_buff *skb, struct 
sw_flow_key *key,
 {
key->ct.state = state;
key->ct.zone = zone;
+   key->ct.mark = ovs_ct_get_mark(skb);
 }
 
 static void ovs_ct_update_key(struct sk_buff *skb, struct sw_flow_key *key,
@@ -323,6 +333,32 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
return err;
 }
 
+int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
+   u32 ct_mark, u32 mask)
+{
+#ifdef CONFIG_NF_CONNTRACK_MARK
+   enum ip_conntrack_info ctinfo;
+   struct nf_conn *ct;
+   u32 new_mark;
+
+   /* This must happen directly after lookup/commit. */
+   ct = nf_ct_get(skb, &ctinfo);
+   if (!ct)
+   return -EINVAL;
+
+   new_mark = ct_mark | (ct->mark & ~(mask));
+   if (ct->mark != new_mark) {
+   ct->mark = new_mark;
+   nf_conntrack_event_cache(IPCT_MARK, ct);
+   key->ct.mark = ct_mark;
+   }
+
+   return 0;
+#else
+   return -ENOTSUPP;
+#endif
+}
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32),
.maxlen = sizeof(u32) },
@@ -386,6 +422,10 @@ bool ovs_ct_verify(enum ovs_key_attr attr)
if (attr & OVS_KEY_ATTR_CT_ZONE)
return true;
 #endif
+#ifdef CONFIG_NF_CONNTRACK_MARK
+   if (attr & OVS_KEY_ATTR_CT_MARK)
+   return true;
+#endif
 
return false;
 }
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index 0e09a6d..b0f06b4 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,6 +37,9 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, 
struct sk_buff *);
 int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,
   const struct ovs_conntrack_info *);
 
+int ovs_ct_set_mark(struct sk_buff *, struct sw_flow_key *, u32 ct_mark,
+   

[PATCHv2 net-next 4/9] netfilter: connlabels: Export setting connlabel length

2015-08-04 Thread Joe Stringer
Add functions to change connlabel length into nf_conntrack_labels.c so
they may be reused by other modules like OVS and nftables without
needing to jump through xt_match_check() hoops.

Suggested-by: Florian Westphal 
Signed-off-by: Joe Stringer 
---
v2: Protect connlabel modification with spinlock.
Fix reference leak in error case.
Style fixups.
---
 include/net/netfilter/nf_conntrack_labels.h |  4 
 net/netfilter/nf_conntrack_labels.c | 32 +
 net/netfilter/xt_connlabel.c| 16 ---
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_labels.h 
b/include/net/netfilter/nf_conntrack_labels.h
index dec6336..7e2b1d0 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -54,7 +54,11 @@ int nf_connlabels_replace(struct nf_conn *ct,
 #ifdef CONFIG_NF_CONNTRACK_LABELS
 int nf_conntrack_labels_init(void);
 void nf_conntrack_labels_fini(void);
+int nf_connlabels_get(struct net *net, unsigned int n_bits);
+void nf_connlabels_put(struct net *net);
 #else
 static inline int nf_conntrack_labels_init(void) { return 0; }
 static inline void nf_conntrack_labels_fini(void) {}
+static inline int nf_connlabels_get(struct net *net, unsigned int n_bits) { 
return 0; }
+static inline void nf_connlabels_put(struct net *net) {}
 #endif
diff --git a/net/netfilter/nf_conntrack_labels.c 
b/net/netfilter/nf_conntrack_labels.c
index bb53f12..06e71a0 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -14,6 +14,8 @@
 #include 
 #include 
 
+static spinlock_t nf_connlabels_lock;
+
 static unsigned int label_bits(const struct nf_conn_labels *l)
 {
unsigned int longs = l->words;
@@ -91,6 +93,35 @@ int nf_connlabels_replace(struct nf_conn *ct,
 EXPORT_SYMBOL_GPL(nf_connlabels_replace);
 #endif
 
+int nf_connlabels_get(struct net *net, unsigned int n_bits)
+{
+   size_t words;
+
+   if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE))
+   return -ERANGE;
+
+   words = BITS_TO_LONGS(n_bits);
+
+   spin_lock(&nf_connlabels_lock);
+   net->ct.labels_used++;
+   if (words > net->ct.label_words)
+   net->ct.label_words = words;
+   spin_unlock(&nf_connlabels_lock);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_get);
+
+void nf_connlabels_put(struct net *net)
+{
+   spin_lock(&nf_connlabels_lock);
+   net->ct.labels_used--;
+   if (net->ct.labels_used == 0)
+   net->ct.label_words = 0;
+   spin_unlock(&nf_connlabels_lock);
+}
+EXPORT_SYMBOL_GPL(nf_connlabels_put);
+
 static struct nf_ct_ext_type labels_extend __read_mostly = {
.len= sizeof(struct nf_conn_labels),
.align  = __alignof__(struct nf_conn_labels),
@@ -99,6 +130,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = {
 
 int nf_conntrack_labels_init(void)
 {
+   spin_lock_init(&nf_connlabels_lock);
return nf_ct_extend_register(&labels_extend);
 }
 
diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c
index 9f8719d..bb9cbeb 100644
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param 
*par)
XT_CONNLABEL_OP_SET;
struct xt_connlabel_mtinfo *info = par->matchinfo;
int ret;
-   size_t words;
-
-   if (info->bit > XT_CONNLABEL_MAXBIT)
-   return -ERANGE;
 
if (info->options & ~options) {
pr_err("Unknown options in mask %x\n", info->options);
@@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param 
*par)
return ret;
}
 
-   par->net->ct.labels_used++;
-   words = BITS_TO_LONGS(info->bit+1);
-   if (words > par->net->ct.label_words)
-   par->net->ct.label_words = words;
-
+   ret = nf_connlabels_get(par->net, info->bit + 1);
+   if (ret < 0)
+   nf_ct_l3proto_module_put(par->family);
return ret;
 }
 
 static void connlabel_mt_destroy(const struct xt_mtdtor_param *par)
 {
-   par->net->ct.labels_used--;
-   if (par->net->ct.labels_used == 0)
-   par->net->ct.label_words = 0;
+   nf_connlabels_put(par->net);
nf_ct_l3proto_module_put(par->family);
 }
 
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCHv2 net-next 2/9] openvswitch: Move MASKED* macros to datapath.h

2015-08-04 Thread Joe Stringer
This will allow the ovs-conntrack code to reuse these macros.

Signed-off-by: Joe Stringer 
Acked-by: Thomas Graf 
---
 net/openvswitch/actions.c  | 52 ++
 net/openvswitch/datapath.h |  4 
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index cf04c2f..e50678d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -185,10 +185,6 @@ static int pop_mpls(struct sk_buff *skb, struct 
sw_flow_key *key,
return 0;
 }
 
-/* 'KEY' must not have any bits set outside of the 'MASK' */
-#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
-#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK))
-
 static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
const __be32 *mpls_lse, const __be32 *mask)
 {
@@ -201,7 +197,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
return err;
 
stack = (__be32 *)skb_mpls_header(skb);
-   lse = MASKED(*stack, *mpls_lse, *mask);
+   lse = OVS_MASKED(*stack, *mpls_lse, *mask);
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__be32 diff[] = { ~(*stack), lse };
 
@@ -244,9 +240,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 
*src_, const u8 *mask_)
const u16 *src = (const u16 *)src_;
const u16 *mask = (const u16 *)mask_;
 
-   SET_MASKED(dst[0], src[0], mask[0]);
-   SET_MASKED(dst[1], src[1], mask[1]);
-   SET_MASKED(dst[2], src[2], mask[2]);
+   OVS_SET_MASKED(dst[0], src[0], mask[0]);
+   OVS_SET_MASKED(dst[1], src[1], mask[1]);
+   OVS_SET_MASKED(dst[2], src[2], mask[2]);
 }
 
 static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
@@ -330,10 +326,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 
l4_proto,
 static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
   const __be32 mask[4], __be32 masked[4])
 {
-   masked[0] = MASKED(old[0], addr[0], mask[0]);
-   masked[1] = MASKED(old[1], addr[1], mask[1]);
-   masked[2] = MASKED(old[2], addr[2], mask[2]);
-   masked[3] = MASKED(old[3], addr[3], mask[3]);
+   masked[0] = OVS_MASKED(old[0], addr[0], mask[0]);
+   masked[1] = OVS_MASKED(old[1], addr[1], mask[1]);
+   masked[2] = OVS_MASKED(old[2], addr[2], mask[2]);
+   masked[3] = OVS_MASKED(old[3], addr[3], mask[3]);
 }
 
 static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
@@ -350,15 +346,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 
l4_proto,
 static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
 {
/* Bits 21-24 are always unmasked, so this retains their values. */
-   SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
-   SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
-   SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
+   OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
+   OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
+   OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
 }
 
 static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
   u8 mask)
 {
-   new_ttl = MASKED(nh->ttl, new_ttl, mask);
+   new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask);
 
	csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
nh->ttl = new_ttl;
@@ -384,7 +380,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
 * makes sense to check if the value actually changed.
 */
if (mask->ipv4_src) {
-   new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
+   new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);
 
if (unlikely(new_addr != nh->saddr)) {
set_ip_addr(skb, nh, >saddr, new_addr);
@@ -392,7 +388,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
}
}
if (mask->ipv4_dst) {
-   new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
+   new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);
 
if (unlikely(new_addr != nh->daddr)) {
set_ip_addr(skb, nh, >daddr, new_addr);
@@ -480,7 +476,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
*(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
}
if (mask->ipv6_hlimit) {
-   SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit);
+   OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
+  mask->ipv6_hlimit);
flow_key->ip.ttl = nh->hop_limit;
}
return 0;
@@ -509,8 +506,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key 
*flow_key,
 

[PATCHv2 net-next 0/9] OVS conntrack support

2015-08-04 Thread Joe Stringer
The goal of this series is to allow OVS to send packets through the Linux
kernel connection tracker, and subsequently match on fields populated by
conntrack.

This version addresses the feedback from v2, mostly minor tidyups and a few
corner cases that were missed in v1. The biggest change is rather than
attempting to handle VLANs specifically for header reconstruction after
fragmentation, the entire L2 header is used as a more generic solution which
should handle more corner cases correctly.

This functionality is enabled through the CONFIG_OPENVSWITCH_CONNTRACK option.

The branch below has been updated with the corresponding userspace pieces:
https://github.com/justinpettit/ovs conntrack

v2: Split out per-netns connlabel width setting functions
Simplify reference tracking in output path.
Handle output cases where flow key is invalidated by prior push/pop
Store entire L2 header to apply to fragments
Various bits of refactoring, comments, styles, log improvements
Defer patch to scrub skb
Rebase

v1: First non-RFC post.
Fragment handling.
Conntrack label support.

Joe Stringer (8):
  openvswitch: Serialize acts with original netlink len
  openvswitch: Move MASKED* macros to datapath.h
  ipv6: Export nf_ct_frag6_gather()
  netfilter: connlabels: Export setting connlabel length
  openvswitch: Add conntrack action
  netfilter: Always export nf_connlabels_replace()
  openvswitch: Allow matching on conntrack label
  openvswitch: Allow attaching helpers to ct action

Justin Pettit (1):
  openvswitch: Allow matching on conntrack mark

 include/net/netfilter/nf_conntrack_labels.h |   4 +
 include/uapi/linux/openvswitch.h|  49 ++
 net/ipv6/netfilter/nf_conntrack_reasm.c |   1 +
 net/netfilter/nf_conntrack_labels.c |  34 +-
 net/netfilter/xt_connlabel.c|  16 +-
 net/openvswitch/Kconfig |  11 +
 net/openvswitch/Makefile|   2 +
 net/openvswitch/actions.c   | 216 +++--
 net/openvswitch/conntrack.c | 688 
 net/openvswitch/conntrack.h | 126 +
 net/openvswitch/datapath.c  |  75 ++-
 net/openvswitch/datapath.h  |  12 +
 net/openvswitch/flow.c  |   5 +
 net/openvswitch/flow.h  |   9 +
 net/openvswitch/flow_netlink.c  | 102 -
 net/openvswitch/flow_netlink.h  |   4 +-
 net/openvswitch/vport.c |   1 +
 17 files changed, 1276 insertions(+), 79 deletions(-)
 create mode 100644 net/openvswitch/conntrack.c
 create mode 100644 net/openvswitch/conntrack.h

-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


linux-next: manual merge of the access_once tree with the arm64 tree

2015-08-04 Thread Stephen Rothwell
Hi Christian,

Today's linux-next merge of the access_once tree got a conflict in:

  arch/arm64/include/asm/barrier.h

between commit:

  4b3dc9679cf7 ("arm64: force CONFIG_SMP=y and remove redundant #ifdefs")

from the arm64 tree and commit:

  76695af20c01 ("locking, arch: use WRITE_ONCE()/READ_ONCE() in 
smp_store_release()/smp_load_acquire()")

from the access_once tree.

I fixed it up (the former removed the code modified by the latter, so I
just did that) and can carry the fix as necessary (no action is required).

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net-next 1/9] openvswitch: Scrub packet in ovs_vport_receive()

2015-08-04 Thread Joe Stringer
On 1 August 2015 at 12:17, Thomas Graf  wrote:
> On 07/31/15 at 10:51am, Joe Stringer wrote:
>> On 31 July 2015 at 07:34, Hannes Frederic Sowa  wrote:
>> > In general, this shouldn't be necessary as the packet should already be
>> > scrubbed before they arrive here.
>> >
>> > Could you maybe add a WARN_ON and check how those skbs with conntrack
>> > data traverse the stack? I also didn't understand why make it dependent
>> > on the socket.
>>
>> OK, sure. One case I could think of is with an OVS internal port in
>> another namespace, directly attached to the bridge. I'll have a play
>> around with WARN_ON and see if I can come up with something more
>> trimmed down.
>
> The OVS internal port will definitely pass through an unscrubbed
> packet across namespaces. I think the proper thing to do would be
> to scrub but conditionally keep metadata.

It's only "unscrubbed" when receiving from local stack at the moment.
Some pieces are cleared when handing towards the local stack, and
there's no configuration for that behaviour. Presumably internal port
transmit and receive should mirror each other?

I don't have a specific use case either way. The remaining code for
this series handles this case correctly, it's just a matter of what
behaviour we're looking for. We could implement the flag as you say, I
presume that userspace would need to specify this during vport
creation and the default should work similar to the existing behaviour
(ie, keep metadata). One thing that's not entirely clear to me is
exactly which metadata should be represented by this flag and whether
the single flag is expressive enough.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [Consult] cris: arch-v10: About $dtp0 register

2015-08-04 Thread Hans-Peter Nilsson
> From: Chen Gang 
> Date: Mon, 20 Jul 2015 23:32:52 +0200

> On 7/9/15 08:44, Chen Gang wrote:
> The latest upstream cris gcc will cause issue for next-20150720 with
> allmodconfig (although it can let next-20150702 pass allmodconfig):
> 
> CC [M]  kernel/rcu/rcutorture.o
[...]
>   include/linux/rcutiny.h:55:20: internal compiler error: Segmentation fault
>static inline void rcu_barrier_sched(void)
>   ^
>   0xad879f crash_signal
> ../../gcc/gcc/toplev.c:352
>   0xbf0fb8 tree_check(tree_node*, char const*, int, char const*, tree_code)
> ../../gcc/gcc/tree.h:2857
>   0xbf0fb8 fold_builtin_alloca_with_align
> ../../gcc/gcc/tree-ssa-ccp.c:2110
[...]

> If no any additional response, I shall try to fix it in the next month.

No need to enter a new bug report, this appears to be
.  It is also observed for other
ports, at least with the original C++ test-case there.  I added
a reduced rcutorture.c to the bug-report.  Thanks.

brgds, H-P
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 net-next 2/2] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-08-04 Thread Cong Wang
On Mon, Aug 3, 2015 at 10:29 PM, Sowmini Varadhan
 wrote:
> +static struct pernet_operations rds_tcp_net_ops = {
> +   .init = rds_tcp_init_net,
> +   .exit = rds_tcp_exit_net,
> +   .id = &rds_tcp_netid,
> +   .size = sizeof(struct rds_tcp_net),
> +};
> +
> +static void rds_tcp_kill_sock(struct net *net)
> +{
> +   struct rds_tcp_connection *tc, *_tc;
> +   struct sock *sk;
> +   struct list_head tmp_list;
> +   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
> +
> +   rds_tcp_listen_stop(rtn->rds_tcp_listen_sock);
> +   rtn->rds_tcp_listen_sock = NULL;
> +   flush_work(&rtn->rds_tcp_accept_w);
> +   INIT_LIST_HEAD(&tmp_list);

Can be folded as LIST_HEAD(tmp_list).


> +   spin_lock_irq(&rds_tcp_conn_lock);
> +   list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
> +   struct net *c_net = read_pnet(>conn->c_net);
> +
> +   if (net != c_net || !tc->t_sock)
> +   continue;
> +   list_del(&tc->t_tcp_node);
> +   list_add_tail(&tc->t_tcp_node, &tmp_list);

list_move_tail().


> +   }
> +   spin_unlock_irq(_tcp_conn_lock);
> +   list_for_each_entry_safe(tc, _tc, _list, t_tcp_node) {
> +   sk = tc->t_sock->sk;
> +   sk->sk_prot->disconnect(sk, 0);
> +   tcp_done(sk);
> +   if (tc->conn->c_passive)
> +   rds_conn_destroy(tc->conn->c_passive);
> +   rds_conn_destroy(tc->conn);
> +   }
> +}
> +
> +static int rds_tcp_dev_event(struct notifier_block *this,
> +unsigned long event, void *ptr)
> +{
> +   struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +   /* rds-tcp registers as a pernet subys, so the ->exit will only
> +* get invoked after network activity has quiesced. We need to
> +* clean up all sockets  to quiesce network activity, and use
> +* the unregistration of the per-net loopback device as a trigger
> +* to start that cleanup.
> +*/
> +   if (event == NETDEV_UNREGISTER_FINAL &&
> +   strcmp(dev->name, "lo") == 0)


Shouldn't check device name, check ->ifindex == LOOPBACK_IFINDEX
instead.


> +   rds_tcp_kill_sock(dev_net(dev));
> +
> +   return NOTIFY_DONE;
> +}
> +

...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: windfarm: decrement client count when unregistering

2015-08-04 Thread Michael Ellerman
On Fri, 2015-31-07 at 12:08:58 UTC, Paul Bolle wrote:
> wf_unregister_client() increments the client count when a client
> unregisters. That is obviously incorrect. Decrement that client count
> instead.
> 
> Fixes: 75722d3992f5 ("[PATCH] ppc64: Thermal control for SMU based machines")
> 
> Signed-off-by: Paul Bolle 
> ---
> cross-compiled only. I don't have a PPC machine at hand, sorry. And this
> does need some run-time testing, I'd say.
> 
> windfarm_corex_exit() contains:
> BUG_ON(wf_client_count != 0);
> 
> I wonder why that, apparently, never triggered.

Hmm interesting.

A quick test here on an iMacG5 shows that we get into a state where we can't
remove windfarm_core:

  $ lsmod
  Module  Size  Used by
  windfarm_smu_sensors7549  2
  windfarm_core  15391  1 windfarm_smu_sensors


Which means we can't trigger windfarm_core_exit() and the BUG_ON().

I also get an oops when removing windfarm_lm75_sensor, so I suspect there are
gremlins in the module ref counting for windfarm.

I'll merge this as probably correct.

  [ cut here ]
  WARNING: at ../kernel/module.c:1116
  Modules linked in: windfarm_lm75_sensor(-) windfarm_smu_sensors 
windfarm_smu_controls windfarm_core [last unloaded: windfarm_cpufreq_clamp]
  CPU: 0 PID: 2860 Comm: modprobe Not tainted 
4.2.0-rc2-00043-gf4e908dd3cbe-dirty #2
  task: c0003d9c4fe0 ti: c0003df2 task.ti: c0003df2
  NIP: c00d62d0 LR: d04338bc CTR: c00d62a0
  REGS: c0003df23660 TRAP: 0700   Not tainted  
(4.2.0-rc2-00043-gf4e908dd3cbe-dirty)
  MSR: 90029032   CR: 82002884  XER: 2000
  SOFTE: 1 
  GPR00: d04338b0 c0003df238e0 c0b27800 d0474b00 
  GPR04: c0003d185900 0001 3e5de000 175c 
  GPR08: c0a2c068 0001  d04343c0 
  GPR12: c00d62a0 c000 0004 0001 
  GPR16:    ffe108bc 
  GPR20:  209b0278  0001 
  GPR24: ffe11915 209b0008 209b02ac  
  GPR28:  c0003d1b3c80  d0474b00 
  NIP [c00d62d0] .module_put+0x30/0x40
  LR [d04338bc] .wf_put_sensor+0x9c/0xf0 [windfarm_core]
  Call Trace:
  [c0003df238e0] [d04338b0] .wf_put_sensor+0x90/0xf0 
[windfarm_core] (unreliable)
  [c0003df23960] [d0474020] .wf_lm75_remove+0x20/0x40 
[windfarm_lm75_sensor]
  [c0003df239d0] [c058cb8c] .i2c_device_remove+0x7c/0xb0
  [c0003df23a50] [c0450dd4] .__device_release_driver+0xb4/0x180
  [c0003df23ad0] [c0451a08] .driver_detach+0x138/0x180
  [c0003df23b70] [c0450720] .bus_remove_driver+0x70/0xf0
  [c0003df23bf0] [c04523a8] .driver_unregister+0x38/0x70
  [c0003df23c70] [c058d718] .i2c_del_driver+0x28/0x40
  [c0003df23cf0] [d04743fc] .wf_lm75_driver_exit+0x18/0x2cc 
[windfarm_lm75_sensor]
  [c0003df23d60] [c00d82bc] .SyS_delete_module+0x18c/0x250
  [c0003df23e30] [c0007c98] system_call+0x38/0xd0
  Instruction dump:
  2c23 4d820020 392302e0 7c2004ac 7d404828 2c0a0001 394a 41c00010 
  7d40492d 40c2ffec 7c0004ac 55490ffe <0b09> 4e800020 6000 6000 
  ---[ end trace 013348a741cf9320 ]---


cheers
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/9] KVM: MMU: move FNAME(is_rsvd_bits_set) to mmu.c

2015-08-04 Thread Xiao Guangrong
FNAME(is_rsvd_bits_set) does not depend on guest mmu mode, move it
to mmu.c to stop being compiled multiple times

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c |  8 
 arch/x86/kvm/paging_tmpl.h | 13 ++---
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f432e9b..3f1c403 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3546,6 +3546,14 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
+
+   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
+   ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
 #define PTTYPE_EPT 18 /* arbitrary */
 #define PTTYPE PTTYPE_EPT
 #include "paging_tmpl.h"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 0f67d7e..736e6ab 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -128,14 +128,6 @@ static inline void FNAME(protect_clean_gpte)(unsigned 
*access, unsigned gpte)
*access &= mask;
 }
 
-static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
-
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
-   ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
-}
-
 static inline int FNAME(is_present_gpte)(unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
@@ -172,7 +164,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu 
*vcpu,
  struct kvm_mmu_page *sp, u64 *spte,
  u64 gpte)
 {
-   if (FNAME(is_rsvd_bits_set)(>arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+   if (is_rsvd_bits_set(>arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;
 
if (!FNAME(is_present_gpte)(gpte))
@@ -353,8 +345,7 @@ retry_walk:
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
 
-   if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte,
-walker->level))) {
+   if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/9] KVM: MMU: fix and improve validation of mmio page fault

2015-08-04 Thread Xiao Guangrong
Changelog in v2:
- rename reset_*_rsvds_bits_mask() to reset_*_zero_bits_mask() and
  is_shadow_rsvd_bits_set() to is_shadow_zero_bits_set() to better
  match what we are checking. Thanks for Paolo's suggestion.

Current code validating mmio #PF is buggy, it was spotted by Pavel
Shirshov, the bug is that qemu complained with "KVM: unknown exit,
hardware reason 31" and KVM shown these info:
[84245.284948] EPT: Misconfiguration.
[84245.285056] EPT: GPA: 0xfeda848
[84245.285154] ept_misconfig_inspect_spte: spte 0x5eaef50107 level 4
[84245.285344] ept_misconfig_inspect_spte: spte 0x5f5fadc107 level 3
[84245.285532] ept_misconfig_inspect_spte: spte 0x5141d18107 level 2
[84245.285723] ept_misconfig_inspect_spte: spte 0x52e40dad77 level 1

This is because we got a mmio #PF and the handler see the mmio spte
becomes normal (points to the ram page)

However, this is valid after introducing fast mmio spte invalidation which
increases the generation-number instead of zapping mmio sptes, a example
is as follows:
1. QEMU drops mmio region by adding a new memslot
2. invalidate all mmio sptes
3.

VCPU 0VCPU 1
access the invalid mmio spte

access the region originally was MMIO before
set the spte to the normal ram map

mmio #PF
check the spte and see it becomes normal ram mapping !!!

The first patch simply fixes the bug by dropping the validation in mmio
handler which is good for backport

Later patches enable fully check reserved bits for shadow page table
entries, since shadow page table and guest page table have the some
format, this patches reuse the logic which checks reserved bits on
guest pte to check sptes

In order to catching as many bugs as possible, we not only check the
reserved bits on hardware but also check other bits that spte never used

Xiao Guangrong (9):
  KVM: MMU: fix validation of mmio page fault
  KVM: MMU: move FNAME(is_rsvd_bits_set) to mmu.c
  KVM: MMU: introduce rsvd_bits_validate
  KVM: MMU: split reset_rsvds_bits_mask
  KVM: MMU: split reset_rsvds_bits_mask_ept
  KVM: MMU: introduce the framework to check zero bits on sptes
  KVM: MMU: introduce is_shadow_zero_bits_set()
  KVM: MMU: fully check zero bits for sptes
  KVM: VMX: drop ept misconfig check

 arch/x86/include/asm/kvm_host.h |  16 ++-
 arch/x86/kvm/mmu.c  | 283 
 arch/x86/kvm/mmu.h  |   4 +-
 arch/x86/kvm/paging_tmpl.h  |  13 +-
 arch/x86/kvm/svm.c  |   1 +
 arch/x86/kvm/vmx.c  |  74 +--
 arch/x86/kvm/x86.c  |   3 +-
 7 files changed, 193 insertions(+), 201 deletions(-)

-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/9] KVM: MMU: introduce rsvd_bits_validate

2015-08-04 Thread Xiao Guangrong
These two fields, rsvd_bits_mask and bad_mt_xwr, in "struct kvm_mmu" are
used to check if reserved bits set on guest ptes, move them to a data
struct so that the approach can be applied to check host shadow page
table entries as well

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |  8 +++--
 arch/x86/kvm/mmu.c  | 75 +++--
 arch/x86/kvm/x86.c  |  3 +-
 3 files changed, 50 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6e851d5..3e33c0d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -252,6 +252,11 @@ struct kvm_pio_request {
int size;
 };
 
+struct rsvd_bits_validate {
+   u64 rsvd_bits_mask[2][4];
+   u64 bad_mt_xwr;
+};
+
 /*
  * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
  * 32-bit).  The kvm_mmu structure abstracts the details of the current mmu
@@ -289,8 +294,7 @@ struct kvm_mmu {
 
u64 *pae_root;
u64 *lm_root;
-   u64 rsvd_bits_mask[2][4];
-   u64 bad_mt_xwr;
+   struct rsvd_bits_validate guest_rsvd_check;
 
/*
 * Bitmap: bit set = last pte in walk
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3f1c403..23633f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3548,10 +3548,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
 
 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 {
+   struct rsvd_bits_validate *rsvd_check = &mmu->guest_rsvd_check;
int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
 
-   return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) |
-   ((mmu->bad_mt_xwr & (1ull << low6)) != 0);
+   return (gpte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
+   ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
 }
 
 #define PTTYPE_EPT 18 /* arbitrary */
@@ -3570,12 +3571,13 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 
gpte, int level)
 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
  struct kvm_mmu *context)
 {
+   struct rsvd_bits_validate *rsvd_check = &context->guest_rsvd_check;
int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
 
-   context->bad_mt_xwr = 0;
+   rsvd_check->bad_mt_xwr = 0;
 
if (!context->nx)
exb_bit_rsvd = rsvd_bits(63, 63);
@@ -3592,52 +3594,58 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
switch (context->root_level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
-   context->rsvd_bits_mask[0][1] = 0;
-   context->rsvd_bits_mask[0][0] = 0;
-   context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+   rsvd_check->rsvd_bits_mask[0][1] = 0;
+   rsvd_check->rsvd_bits_mask[0][0] = 0;
+   rsvd_check->rsvd_bits_mask[1][0] =
+   rsvd_check->rsvd_bits_mask[0][0];
 
if (!is_pse(vcpu)) {
-   context->rsvd_bits_mask[1][1] = 0;
+   rsvd_check->rsvd_bits_mask[1][1] = 0;
break;
}
 
if (is_cpuid_PSE36())
/* 36bits PSE 4MB page */
-   context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+   rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
else
/* 32 bits PSE 4MB page */
-   context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+   rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
break;
case PT32E_ROOT_LEVEL:
-   context->rsvd_bits_mask[0][2] =
+   rsvd_check->rsvd_bits_mask[0][2] =
rsvd_bits(maxphyaddr, 63) |
rsvd_bits(5, 8) | rsvd_bits(1, 2);  /* PDPTE */
-   context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+   rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62);  /* PDE */
-   context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+   rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62);  /* PTE */
-   context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+   rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 62) |
rsvd_bits(13, 20);  /* large page */
-   context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+   rsvd_check->rsvd_bits_mask[1][0] =
+   rsvd_check->rsvd_bits_mask[0][0];
break;
   

[PATCH v2 5/9] KVM: MMU: split reset_rsvds_bits_mask_ept

2015-08-04 Thread Xiao Guangrong
Since shadow ept page tables and intel nested guest page tables have the
same format, split reset_rsvds_bits_mask_ept so that the logic can be
reused by later patches which check zero bits on sptes

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 693d565..d11d212 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3660,11 +3660,10 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
is_pse(vcpu));
 }
 
-static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
-   struct kvm_mmu *context, bool execonly)
+static void
+__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+   int maxphyaddr, bool execonly)
 {
-   struct rsvd_bits_validate *rsvd_check = >guest_rsvd_check;
-   int maxphyaddr = cpuid_maxphyaddr(vcpu);
int pte;
 
rsvd_check->rsvd_bits_mask[0][3] =
@@ -3693,6 +3692,13 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu 
*vcpu,
}
 }
 
+static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
+   struct kvm_mmu *context, bool execonly)
+{
+   __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
+   cpuid_maxphyaddr(vcpu), execonly);
+}
+
 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
  struct kvm_mmu *mmu, bool ept)
 {
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 6/9] KVM: MMU: introduce the framework to check zero bits on sptes

2015-08-04 Thread Xiao Guangrong
We have abstracted the data struct and functions which are used to check
reserved bit on guest page tables, now we extend the logic to check
zero bits on shadow page tables

The zero bits on sptes include not only reserved bits on hardware but also
the bits sptes never used

Signed-off-by: Xiao Guangrong 
---
 arch/x86/include/asm/kvm_host.h |  8 +++
 arch/x86/kvm/mmu.c  | 50 +
 arch/x86/kvm/mmu.h  |  3 +++
 arch/x86/kvm/svm.c  |  1 +
 4 files changed, 62 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3e33c0d..09acaa6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -294,6 +294,14 @@ struct kvm_mmu {
 
u64 *pae_root;
u64 *lm_root;
+
+   /*
+* check zero bits on shadow page table entries, these
+* bits include not only hardware reserved bits but also
+* the bits spte never used.
+*/
+   struct rsvd_bits_validate shadow_zero_check;
+
struct rsvd_bits_validate guest_rsvd_check;
 
/*
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d11d212..edf1ec5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3699,6 +3699,53 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu 
*vcpu,
cpuid_maxphyaddr(vcpu), execonly);
 }
 
+/*
+ * the page table on host is the shadow page table for the page
+ * table in guest or amd nested guest, its mmu features completely
+ * follow the features in guest.
+ */
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
+{
+   __reset_rsvds_bits_mask(vcpu, >shadow_zero_check,
+   boot_cpu_data.x86_phys_bits,
+   context->shadow_root_level, context->nx,
+   guest_cpuid_has_gbpages(vcpu), is_pse(vcpu));
+}
+EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
+
+/*
+ * the direct page table on host, use as much mmu features as
+ * possible, however, kvm currently does not do execution-protection.
+ */
+static void
+reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+   struct kvm_mmu *context)
+{
+   if (guest_cpuid_is_amd(vcpu))
+   __reset_rsvds_bits_mask(vcpu, >shadow_zero_check,
+   boot_cpu_data.x86_phys_bits,
+   context->shadow_root_level, false,
+   cpu_has_gbpages, true);
+   else
+   __reset_rsvds_bits_mask_ept(>shadow_zero_check,
+   boot_cpu_data.x86_phys_bits,
+   false);
+
+}
+
+/*
+ * as the comments in reset_shadow_zero_bits_mask() except it
+ * is the shadow page table for intel nested guest.
+ */
+static void
+reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
+   struct kvm_mmu *context, bool execonly)
+{
+   __reset_rsvds_bits_mask_ept(>shadow_zero_check,
+   boot_cpu_data.x86_phys_bits, execonly);
+}
+
 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
  struct kvm_mmu *mmu, bool ept)
 {
@@ -3877,6 +3924,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 
update_permission_bitmask(vcpu, context, false);
update_last_pte_bitmap(vcpu, context);
+   reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
@@ -3904,6 +3952,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
context->base_role.smap_andnot_wp
= smap && !is_write_protection(vcpu);
context->base_role.smm = is_smm(vcpu);
+   reset_shadow_zero_bits_mask(vcpu, context);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
@@ -3927,6 +3976,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool 
execonly)
 
update_permission_bitmask(vcpu, context, true);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
+   reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 398d21c..2299d15 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -53,6 +53,9 @@ static inline u64 rsvd_bits(int s, int e)
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 
+void
+reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
+
 /*
  * Return values of handle_mmio_page_fault_common:
  * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 568cd0f..189e464 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2107,6 +2107,7 @@ static void 

[PATCH v2 1/9] KVM: MMU: fix validation of mmio page fault

2015-08-04 Thread Xiao Guangrong
We got the bug that qemu complained with "KVM: unknown exit, hardware
reason 31" and KVM shown these info:
[84245.284948] EPT: Misconfiguration.
[84245.285056] EPT: GPA: 0xfeda848
[84245.285154] ept_misconfig_inspect_spte: spte 0x5eaef50107 level 4
[84245.285344] ept_misconfig_inspect_spte: spte 0x5f5fadc107 level 3
[84245.285532] ept_misconfig_inspect_spte: spte 0x5141d18107 level 2
[84245.285723] ept_misconfig_inspect_spte: spte 0x52e40dad77 level 1

This is because we got a mmio #PF and the handler see the mmio spte becomes
normal (points to the ram page)

However, this is valid after introducing fast mmio spte invalidation which
increases the generation-number instead of zapping mmio sptes, a example
is as follows:
1. QEMU drops mmio region by adding a new memslot
2. invalidate all mmio sptes
3.

VCPU 0VCPU 1
access the invalid mmio spte
access the region originally was MMIO before
set the spte to the normal ram map

mmio #PF
check the spte and see it becomes normal ram mapping !!!

This patch fixes the bug just by dropping the check in mmio handler, it's
good for backport. Full check will be introduced in later patches

Reported-by: Pavel Shirshov 
Tested-by: Pavel Shirshov 
Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 45 -
 1 file changed, 45 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6de896f..f432e9b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -357,12 +357,6 @@ static u64 __get_spte_lockless(u64 *sptep)
 {
return ACCESS_ONCE(*sptep);
 }
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
-   /* It is valid if the spte is zapped. */
-   return spte == 0ull;
-}
 #else
 union split_spte {
struct {
@@ -478,23 +472,6 @@ retry:
 
return spte.spte;
 }
-
-static bool __check_direct_spte_mmio_pf(u64 spte)
-{
-   union split_spte sspte = (union split_spte)spte;
-   u32 high_mmio_mask = shadow_mmio_mask >> 32;
-
-   /* It is valid if the spte is zapped. */
-   if (spte == 0ull)
-   return true;
-
-   /* It is valid if the spte is being zapped. */
-   if (sspte.spte_low == 0ull &&
-   (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
-   return true;
-
-   return false;
-}
 #endif
 
 static bool spte_is_locklessly_modifiable(u64 spte)
@@ -3299,21 +3276,6 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, 
u64 addr, bool direct)
return vcpu_match_mmio_gva(vcpu, addr);
 }
 
-
-/*
- * On direct hosts, the last spte is only allows two states
- * for mmio page fault:
- *   - It is the mmio spte
- *   - It is zapped or it is being zapped.
- *
- * This function completely checks the spte when the last spte
- * is not the mmio spte.
- */
-static bool check_direct_spte_mmio_pf(u64 spte)
-{
-   return __check_direct_spte_mmio_pf(spte);
-}
-
 static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
 {
struct kvm_shadow_walk_iterator iterator;
@@ -3356,13 +3318,6 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, 
u64 addr, bool direct)
}
 
/*
-* It's ok if the gva is remapped by other cpus on shadow guest,
-* it's a BUG if the gfn is not a mmio page.
-*/
-   if (direct && !check_direct_spte_mmio_pf(spte))
-   return RET_MMIO_PF_BUG;
-
-   /*
 * If the page table is zapped by other cpus, let CPU fault again on
 * the address.
 */
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 8/9] KVM: MMU: fully check zero bits for sptes

2015-08-04 Thread Xiao Guangrong
The #PF with PFEC.RSV = 1 is designed to speed MMIO emulation, however,
it is possible that the RSV #PF is caused by real BUG by mis-configure
shadow page table entries

This patch enables full check for the zero bits on shadow page table
entries which include not only the reserved bit on hardware but also
the bits spte never used, then dump the shadow page table hierarchy
if the real bug is detected

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 41 +++--
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e6a7ed0..1393317 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3295,31 +3295,60 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu 
*vcpu, u64 addr, bool direct)
return vcpu_match_mmio_gva(vcpu, addr);
 }
 
-static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+/* return true if reserved bit is detected on spte. */
+static bool
+walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
struct kvm_shadow_walk_iterator iterator;
-   u64 spte = 0ull;
+   u64 sptes[PT64_ROOT_LEVEL], spte = 0ull;
+   int root = 0, leaf;
+   bool reserved = false;
 
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-   return spte;
+   goto exit;
 
walk_shadow_page_lockless_begin(vcpu);
-   for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+   for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+   leaf = iterator.level;
+
+   if (!root)
+   root = leaf;
+
+   sptes[leaf - 1] = spte;
+
if (!is_shadow_present_pte(spte))
break;
+
+   reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
+   leaf);
+   }
walk_shadow_page_lockless_end(vcpu);
 
-   return spte;
+   if (reserved) {
+   pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump 
hierarchy:\n",
+  __func__, addr);
+   while (root >= leaf) {
+   pr_err("-- spte 0x%llx level %d.\n",
+  sptes[root - 1], root);
+   root--;
+   }
+   }
+exit:
+   *sptep = spte;
+   return reserved;
 }
 
 int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
u64 spte;
+   bool reserved;
 
if (quickly_check_mmio_pf(vcpu, addr, direct))
return RET_MMIO_PF_EMULATE;
 
-   spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+   reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
+   if (unlikely(reserved))
+   return RET_MMIO_PF_BUG;
 
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 9/9] KVM: VMX: drop ept misconfig check

2015-08-04 Thread Xiao Guangrong
The logic used to check ept misconfig is completely contained in common
reserved bits check for sptes, so it can be removed

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 22 
 arch/x86/kvm/mmu.h |  1 -
 arch/x86/kvm/vmx.c | 74 ++
 3 files changed, 2 insertions(+), 95 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1393317..1dabaec 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4937,28 +4937,6 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
 }
 
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
-   struct kvm_shadow_walk_iterator iterator;
-   u64 spte;
-   int nr_sptes = 0;
-
-   if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
-   return nr_sptes;
-
-   walk_shadow_page_lockless_begin(vcpu);
-   for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
-   sptes[iterator.level-1] = spte;
-   nr_sptes++;
-   if (!is_shadow_present_pte(spte))
-   break;
-   }
-   walk_shadow_page_lockless_end(vcpu);
-
-   return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
 {
kvm_mmu_unload(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2299d15..e4202e4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -50,7 +50,6 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
 }
 
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
 
 void
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 217f663..4cf25b9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5748,73 +5748,9 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
-static u64 ept_rsvd_mask(u64 spte, int level)
-{
-   int i;
-   u64 mask = 0;
-
-   for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
-   mask |= (1ULL << i);
-
-   if (level == 4)
-   /* bits 7:3 reserved */
-   mask |= 0xf8;
-   else if (spte & (1ULL << 7))
-   /*
-* 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
-* level == 1 if the hypervisor is using the ignored bit 7.
-*/
-   mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
-   else if (level > 1)
-   /* bits 6:3 reserved */
-   mask |= 0x78;
-
-   return mask;
-}
-
-static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
-  int level)
-{
-   printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
-
-   /* 010b (write-only) */
-   WARN_ON((spte & 0x7) == 0x2);
-
-   /* 110b (write/execute) */
-   WARN_ON((spte & 0x7) == 0x6);
-
-   /* 100b (execute-only) and value not supported by logical processor */
-   if (!cpu_has_vmx_ept_execute_only())
-   WARN_ON((spte & 0x7) == 0x4);
-
-   /* not 000b */
-   if ((spte & 0x7)) {
-   u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
-
-   if (rsvd_bits != 0) {
-   printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
-__func__, rsvd_bits);
-   WARN_ON(1);
-   }
-
-   /* bits 5:3 are _not_ reserved for large page or leaf page */
-   if ((rsvd_bits & 0x38) == 0) {
-   u64 ept_mem_type = (spte & 0x38) >> 3;
-
-   if (ept_mem_type == 2 || ept_mem_type == 3 ||
-   ept_mem_type == 7) {
-   printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
-   __func__, ept_mem_type);
-   WARN_ON(1);
-   }
-   }
-   }
-}
-
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
-   u64 sptes[4];
-   int nr_sptes, i, ret;
+   int ret;
gpa_t gpa;
 
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
@@ -5835,13 +5771,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
return 1;
 
/* It is the real ept misconfig */
-   printk(KERN_ERR "EPT: Misconfiguration.\n");
-   printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
-
-   nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
-
-   for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
-   ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
+   WARN_ON(1);
 
vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
-- 
2.1.0

--
To unsubscribe 

[PATCH v2 4/9] KVM: MMU: split reset_rsvds_bits_mask

2015-08-04 Thread Xiao Guangrong
Since softmmu & AMD nested shadow page tables and guest page tables have
the same format, split reset_rsvds_bits_mask so that the logic can be
reused by later patches which check zero bits on sptes

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 23633f5..693d565 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3568,20 +3568,21 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 
gpte, int level)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *context)
+static void
+__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+   struct rsvd_bits_validate *rsvd_check,
+   int maxphyaddr, int level, bool nx, bool gbpages,
+   bool pse)
 {
-   struct rsvd_bits_validate *rsvd_check = >guest_rsvd_check;
-   int maxphyaddr = cpuid_maxphyaddr(vcpu);
u64 exb_bit_rsvd = 0;
u64 gbpages_bit_rsvd = 0;
u64 nonleaf_bit8_rsvd = 0;
 
rsvd_check->bad_mt_xwr = 0;
 
-   if (!context->nx)
+   if (!nx)
exb_bit_rsvd = rsvd_bits(63, 63);
-   if (!guest_cpuid_has_gbpages(vcpu))
+   if (!gbpages)
gbpages_bit_rsvd = rsvd_bits(7, 7);
 
/*
@@ -3591,7 +3592,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
if (guest_cpuid_is_amd(vcpu))
nonleaf_bit8_rsvd = rsvd_bits(8, 8);
 
-   switch (context->root_level) {
+   switch (level) {
case PT32_ROOT_LEVEL:
/* no rsvd bits for 2 level 4K page table entries */
rsvd_check->rsvd_bits_mask[0][1] = 0;
@@ -3599,7 +3600,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
rsvd_check->rsvd_bits_mask[1][0] =
rsvd_check->rsvd_bits_mask[0][0];
 
-   if (!is_pse(vcpu)) {
+   if (!pse) {
rsvd_check->rsvd_bits_mask[1][1] = 0;
break;
}
@@ -3650,6 +3651,15 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
}
 }
 
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *context)
+{
+   __reset_rsvds_bits_mask(vcpu, >guest_rsvd_check,
+   cpuid_maxphyaddr(vcpu), context->root_level,
+   context->nx, guest_cpuid_has_gbpages(vcpu),
+   is_pse(vcpu));
+}
+
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
struct kvm_mmu *context, bool execonly)
 {
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 7/9] KVM: MMU: introduce is_shadow_zero_bits_set()

2015-08-04 Thread Xiao Guangrong
We have the same data struct to check reserved bits on guest page tables
and shadow page tables, split is_rsvd_bits_set() so that the logic can be
shared between these two paths

Signed-off-by: Xiao Guangrong 
---
 arch/x86/kvm/mmu.c | 28 +++-
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index edf1ec5..e6a7ed0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3268,6 +3268,25 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu 
*vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, 
exception);
 }
 
+static bool
+__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
+{
+   int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
+
+   return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
+   ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
+}
+
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+   return __is_rsvd_bits_set(>guest_rsvd_check, gpte, level);
+}
+
+static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
+{
+   return __is_rsvd_bits_set(>shadow_zero_check, spte, level);
+}
+
 static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
 {
if (direct)
@@ -3546,15 +3565,6 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, 
unsigned level, unsigned gp
return mmu->last_pte_bitmap & (1 << index);
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
-   struct rsvd_bits_validate *rsvd_check = >guest_rsvd_check;
-   int bit7 = (gpte >> 7) & 1, low6 = gpte & 0x3f;
-
-   return (gpte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
-   ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
-}
-
 #define PTTYPE_EPT 18 /* arbitrary */
 #define PTTYPE PTTYPE_EPT
 #include "paging_tmpl.h"
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 0/2] x86, mwaitt: introduce AMD mwaitt support

2015-08-04 Thread Borislav Petkov
On Wed, Aug 05, 2015 at 11:18:50AM +0800, Huang Rui wrote:
> cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc;
> sleep 1s;
> cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc;
> 
> * TSC-based default delay:  485115 uWatts average power
> * MWAITX-based delay:   252738 uWatts average power
> 
> Thus, that's about 240 milliWatts less power consumption. The test
> method relies on the support of AMD CPU accumulated power algorithm in
> fam15_power for which patches are forthcoming.

Cool power consumption drop is actually even measureable.

Also, I think implementing it as a loop, as Peter suggested, was the
right thing to do due to this statement in MWAITX's definition in the
APM:

"There is no indication after exiting MWAITX of why the processor exited
or if the timer expired. It is up to software to check whether the
awaiting store has occurred, and if not, determining how much time
has elapsed if it wants to re-establish the MONITORX with a new timer
value."

So all in all, those patches are starting to shape up nicely. One small
nit I have is using "MWAITT" (with a T) together with MWAITX while the
APM calls it only MWAITX. But I can fix that when applying and drop all
MWAITT occurrences.

Thanks.

-- 
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mtd: Fix switch-bool compilation warning

2015-08-04 Thread Tomer Barletz
With gcc 5.1 I get:
warning: switch condition has boolean value [-Wswitch-bool]

Signed-off-by: Tomer Barletz 
---
 drivers/mtd/mtd_blkdevs.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 41acc50..8830475 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -97,14 +97,13 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
if (req->cmd_flags & REQ_DISCARD)
return tr->discard(dev, block, nsect);
 
-   switch(rq_data_dir(req)) {
-   case READ:
+   if (rq_data_dir(req) == READ) {
for (; nsect > 0; nsect--, block++, buf += tr->blksize)
if (tr->readsect(dev, block, buf))
return -EIO;
rq_flush_dcache_pages(req);
return 0;
-   case WRITE:
+   } else {
if (!tr->writesect)
return -EIO;
 
@@ -113,9 +112,6 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
if (tr->writesect(dev, block, buf))
return -EIO;
return 0;
-   default:
-   printk(KERN_NOTICE "Unknown request %u\n", rq_data_dir(req));
-   return -EIO;
}
 }
 
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] ipv6:Fix concurrent access issue in the function inet6_rtm_deladdr

2015-08-04 Thread Cong Wang
On Tue, Aug 4, 2015 at 7:39 PM, Nicholas Krause  wrote:
> From: Nicholas Krause 
>
> This fixes the issue with conncurrent access when calling the function
> inte6_addr_del due to this function using non locked wrapper versions
> of certain functions by locking the routing mutex before and after this
> call with rtnl_lock/unlock. After the unlocking just return the error
> code as normal to signal success or failure to the caller of the function
> inet_6_rtm_addr.
>

Huh? Isn't inet6_rtm_deladdr() already called with rtnl lock?

What bug are you trying to fix?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 08/27] power: Export I2C module alias information in missing drivers

2015-08-04 Thread Sebastian Reichel
Hi,

On Thu, Jul 30, 2015 at 06:18:33PM +0200, Javier Martinez Canillas wrote:
> The I2C core always reports the MODALIAS uevent as "i2c:<client name>"
> regardless if the driver was matched using the I2C id_table or the
> of_match_table. So the driver needs to export the I2C table and this
> be built into the module or udev won't have the necessary information
> to auto load the correct module when the device is added.

Thanks, queued.

-- Sebastian


signature.asc
Description: Digital signature


Re: [RFC] kcore:change kcore_read to make sure the kernel read is safe

2015-08-04 Thread yalin wang

> On Aug 5, 2015, at 05:18, Dave Hansen  wrote:
> 
> On 08/03/2015 08:37 PM, yalin wang wrote:
>> This change kcore_read() to use __copy_from_user_inatomic() to
>> copy data from kernel address, because kern_addr_valid() just make sure
>> page table is valid during call it, whne it return, the page table may
>> change, for example, like set_fixmap() function will change kernel page
>> table, then maybe trigger kernel crash if encounter this unluckily.
> 
> I don't see any cases at the moment that will crash.  set_fixmap()
> doesn't ever clear out any ptes, right?
> 
> I guess the root problem here is that we don't have any good (generic)
> locking of kernel page tables inside the linear map.  Can you come up
> with a case where this will _actually_ crash?
> 
Thanks for your comments.
i don’t have crash for this, but when i read code, i see this part not safe,
so i make this patch :).

>> fs/proc/kcore.c | 30 --
>> 1 file changed, 24 insertions(+), 6 deletions(-)
>> 
>> diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
>> index 92e6726..b085fde 100644
>> --- a/fs/proc/kcore.c
>> +++ b/fs/proc/kcore.c
>> @@ -86,8 +86,8 @@ static size_t get_kcore_size(int *nphdr, size_t 
>> *elf_buflen)
>>  size = try;
>>  *nphdr = *nphdr + 1;
>>  }
>> -*elf_buflen =   sizeof(struct elfhdr) + 
>> -(*nphdr + 2)*sizeof(struct elf_phdr) + 
>> +*elf_buflen =   sizeof(struct elfhdr) +
>> +(*nphdr + 2)*sizeof(struct elf_phdr) +
> 
> I'm having a hard time spotting the change here.  Whitespace?
i  will seperate in another patch for format correctness.
> 
>>  3 * ((sizeof(struct elf_note)) +
>>   roundup(sizeof(CORE_STR), 4)) +
>>  roundup(sizeof(struct elf_prstatus), 4) +
>> @@ -435,6 +435,7 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  size_t elf_buflen;
>>  int nphdr;
>>  unsigned long start;
>> +unsigned long page = 0;
>> 
>>  read_lock(&kclist_lock);
>>  size = get_kcore_size(&nphdr, &elf_buflen);
>> @@ -485,7 +486,7 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  start = kc_offset_to_vaddr(*fpos - elf_buflen);
>>  if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
>>  tsz = buflen;
>> -
>> +
> 
> Please keep the unnecessary whitespace changes for another patch.
> 
>>  while (buflen) {
>>  struct kcore_list *m;
>> 
>> @@ -515,15 +516,32 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  } else {
>>  if (kern_addr_valid(start)) {
>>  unsigned long n;
>> +mm_segment_t old_fs = get_fs();
>> +
>> +if (page == 0) {
>> +page = __get_free_page(GFP_KERNEL);
>> +if (page == 0)
>> +return -ENOMEM;
> 
> FWIW, we usually code this as "!page" instead of "page == 0".  I also
> wouldn't call it 'page'.
> 
> Also, why is this using a raw __get_free_page() while the code above it
> uses a kmalloc()?
> 
because i am using a page size buffer, more efficient  to use __get_free_page()
than  kmalloc() here .

>> -n = copy_to_user(buffer, (char *)start, tsz);
>> +}
>> +set_fs(KERNEL_DS);
>> +pagefault_disable();
>> +n = __copy_from_user_inatomic((void *)page,
>> +(__force const void __user *)start,
>> +tsz);
>> +pagefault_enable();
>> +set_fs(old_fs);
>> +if (n)
>> +memset((void *)page + tsz - n, 0, n);
>> +
>> +n = copy_to_user(buffer, (char *)page, tsz);
> 
> So, first of all, we are using __copy_from_user_inatomic() to copy to
> and from a *kernel* addresses, and it doesn't even get a comment? :)
> 
i will add comment.
> Fundamentally, we're trying to be able to safely survive faults in the
> kernel linear map here.  I think we've got to get a better handle on
> when that happens rather than just paper over it when it does.  (Aside:
> There might actually be a missing use of get_online_mems() here.)
> 
ok.

> Maybe we should just be walking the kernel page tables ourselves and do
> a kmap().  We might have a stale pte but we don't have to worry about
> actual racy updates while we are doing the copy.
> 
so if do like this, we can remove kern_addr_valid() function, and i just walk 
pte and use get_page_unelss_zero()
to grab the valid page  ?


>>  /*
>>   * 

Re: [PATCH 00/13] Enhance twl4030_charger functionality. - V3

2015-08-04 Thread Sebastian Reichel
Hi,

On Thu, Jul 30, 2015 at 10:11:24AM +1000, NeilBrown wrote:
> Following is most of my twl4030_charger patches, rebased against
>  git://git.infradead.org/battery-2.6
> 
> Since the previous set I have added the conversion to
> module_platform_driver so EPROBE_DEFER can be used, and fixed
> a few minor typos.
> 
> This does not include the changes to add extcon support, in part
> because extcon has seen some changes lately which leave me even more
> confused about how best to use it than before.
> I need to sort that out before I can resolve the rest of my usb phy
> patches and then add a few more charger patches.

Thanks, queued.

-- Sebastian


signature.asc
Description: Digital signature


[PATCH v6 0/2] x86, mwaitt: introduce AMD mwaitt support

2015-08-04 Thread Huang Rui
Hi,

This patch set introduces a new instruction support on AMD Carrizo (Family
15h, Model 60h-6fh). It adds mwaitx delay function with a configurable
timer.

Andy and Boris provide a suggestion which uses mwaitx on delay method.

As Peter's suggestion of last version (v5), the serial of patch set
provides a test result.
http://marc.info/?l=linux-kernel&m=143436586513713&w=2

Some discussions of the background, please see:
http://marc.info/?l=linux-kernel&m=143202042530498&w=2
http://marc.info/?l=linux-kernel&m=143161327003541&w=2
http://marc.info/?l=linux-kernel&m=143222815331016&w=2

Patch set is rebased on tip/master.

Changes from v1 -> v2
- Remove mwaitx idle implementation since some disputes without power
  improvement.
- Add a patch which implement another use case on delay.
- Introduce a kernel parameter (delay) to make delay method configurable.

Changes from v2 -> v3
- Add compared data on commit message
- Remove kernel parameter
- Add hint to avoid to access deep state in future
- Update mwaitx delay method as Petter's suggestion

Changes from v3 -> v4
- Put the MONITORX/MWAITX description into comments

Changes from v4 -> v5
- Remove mwaitx function
- Use mwaitx_delay at init_amd
- Use cpu_tts as montioring address scope

Changes from v5 -> v6
- Move definitions into patch 1
- Completed the power consumption testing both with MWAITX and without
  MWAITX
- Use mwaitx_delay at bsp_init_amd

In MWAITX delay, the CPU core will be quiesced in a waiting phase,
diminishing its power consumption.

Run a simple test to measure power consumption:

cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc;
sleep 1s;
cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc;

* TSC-based default delay:  485115 uWatts average power
* MWAITX-based delay:   252738 uWatts average power

Thus, that's about 240 milliWatts less power consumption. The test
method relies on the support of AMD CPU accumulated power algorithm in
fam15_power for which patches are forthcoming.

Thanks,
Rui

Huang Rui (2):
  x86, mwaitt: add monitorx and mwaitx instruction
  x86, mwaitt: introduce mwaitx delay with a configurable timer

 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/include/asm/delay.h  |  1 +
 arch/x86/include/asm/mwait.h  | 43 +++
 arch/x86/kernel/cpu/amd.c |  4 
 arch/x86/lib/delay.c  | 48 ++-
 5 files changed, 96 insertions(+), 1 deletion(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v6 1/2] x86, mwaitt: add monitorx and mwaitx instruction

2015-08-04 Thread Huang Rui
On AMD Carrizo processors (Family 15h, Model 60h-6fh), there is a new
feature called MWAITT (MWAIT with a timer) as an extension of
MONITOR/MWAIT.

MWAITT, another name is MWAITX (MWAIT with extensions), has a configurable
timer that causes MWAITX to exit on expiration.

Compared with MONITOR/MWAIT, there are minor differences in opcode and
input parameters.

MWAITX ECX[1]: enable timer if set
MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks

MWAIT   MWAITX
opcode  0f 01 c9   |0f 01 fb
ECX[0]  value of RFLAGS.IF seen by instruction
ECX[1]  unused/#GP if set  |enable timer if set
ECX[31:2] unused/#GP if set
EAX   unused (reserve for hint)
EBX[31:0]   unused |max wait time (loops)

MONITOR MONITORX
opcode  0f 01 c8   |0f 01 fa
EAX (logical) address to monitor
ECX #GP if not zero

The software P0 frequency is the same as the TSC frequency.

Max timeout = EBX/(TSC frequency)

Signed-off-by: Huang Rui 
Cc: Borislav Petkov 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: Andreas Herrmann 
---
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/include/asm/mwait.h  | 43 +++
 2 files changed, 44 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 4b11974..9978a98 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -177,6 +177,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter 
extensions */
 #define X86_FEATURE_BPEXT  (6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions 
*/
+#define X86_FEATURE_MWAITT ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) 
*/
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 653dfa7..47f3540 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,9 @@
 #define CPUID5_ECX_INTERRUPT_BREAK 0x2
 
 #define MWAIT_ECX_INTERRUPT_BREAK  0x1
+#define MWAITX_ECX_TIMER_ENABLEBIT(1)
+#define MWAITX_MAX_LOOPS   ((u32)-1)
+#define MWAITX_DISABLE_CSTATES 0xf
 
 static inline void __monitor(const void *eax, unsigned long ecx,
 unsigned long edx)
@@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long 
ecx,
 :: "a" (eax), "c" (ecx), "d"(edx));
 }
 
+static inline void __monitorx(const void *eax, unsigned long ecx,
+ unsigned long edx)
+{
+   /* "monitorx %eax, %ecx, %edx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xfa;"
+:: "a" (eax), "c" (ecx), "d"(edx));
+}
+
 static inline void __mwait(unsigned long eax, unsigned long ecx)
 {
/* "mwait %eax, %ecx;" */
@@ -30,6 +41,38 @@ static inline void __mwait(unsigned long eax, unsigned long 
ecx)
 :: "a" (eax), "c" (ecx));
 }
 
+/*
+ * MWAITT allows for both a timer value to get you out of the MWAIT as
+ * well as the normal exit conditions.
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks
+ *
+ * Below is the compared data between MWAIT and MWAITX on AMD
+ * processors:
+ * MWAIT   MWAITX
+ * opcode  0f 01 c9   |0f 01 fb
+ * ECX[0]  value of RFLAGS.IF seen by instruction
+ * ECX[1]  unused/#GP if set  |enable timer if set
+ * ECX[31:2] unused/#GP if set
+ * EAX   unused (reserve for hint)
+ * EBX[31:0]   unused |max wait time (loops)
+ *
+ * MONITOR MONITORX
+ * opcode  0f 01 c8   |0f 01 fa
+ * EAX (logical) address to monitor
+ * ECX #GP if not zero
+ *
+ * The software P0 frequency is the same as the TSC frequency.
+ */
+static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+   unsigned long ecx)
+{
+   /* "mwaitx %eax, %ebx, %ecx;" */
+   asm volatile(".byte 0x0f, 0x01, 0xfb;"
+:: "a" (eax), "b" (ebx), "c" (ecx));
+}
+
 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 {
trace_hardirqs_on();
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: lustre: Fix coding style errors

2015-08-04 Thread Joe Perches
On Tue, 2015-08-04 at 16:10 +0800, Jandy Gou wrote:
> Signed-off-by: Jandy Gou 
[]
> diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h 
> b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
[]
> @@ -721,7 +721,7 @@ kiblnd_nid2peerlist (lnet_nid_t nid)
>   unsigned int hash =
>   ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size;
>  
> - return (&kiblnd_data.kib_peers [hash]);
> + return (&kiblnd_data.kib_peers[hash]);

Please run your own patches through checkpatch and
please remove the unnecessary parenthesis around
the return value at the same time.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] kcore:change kcore_read to make sure the kernel read is safe

2015-08-04 Thread yalin wang

> On Aug 5, 2015, at 06:38, Andrew Morton  wrote:
> 
> On Tue, 4 Aug 2015 11:37:57 +0800 yalin wang  wrote:
> 
>> This change kcore_read() to use __copy_from_user_inatomic() to
>> copy data from kernel address, because kern_addr_valid() just make sure
>> page table is valid during call it, whne it return, the page table may
>> change, for example, like set_fixmap() function will change kernel page
>> table, then maybe trigger kernel crash if encounter this unluckily.
> 
> The changelog is a bit hard to follow.  How does this version look?
> 
> : read_kcore() does a copy_to_user() from kernel memory.  This could cause a
> : crash if the source (kernel) address is concurrently unmapped via, say,
> : set_fixmap().  The kern_addr_valid() check is racy and won't reliably
> : prevent this.
> : 
> : Change kcore_read() to use __copy_from_user_inatomic() via a temporary
> : buffer to catch such situations.
> 
> What actually happens when copy_to_user() gets a fault on the source
> address?  It *could* handle it and return -EFAULT.  I forget...
> 
> Also...  what is special about this particular copy_to_user()?  Isn't
> every copy_to_user() in the kernel vulnerable to a concurrent
> set_fixmap()?  Is it that only read_kcore() will read pages which are
> subject to set_fixmap() alteration?
> 
Thanks for your great comment .
i agree with your git change log,
one more question, at first i only focus on arm64 arch,
it only check __user* address during copy_from{to}_user,
but other architectures like X86 check both source and dest address
in copy_from{to}_user, is there some special reason do like this?
in my view , just need check __user* address is enough, and if also
have ex_table for kernel address access maybe hide some BUG in kernel ,
i think kernel don’t need it, or am i miss something ?

if copy_from{to}_user both check source and dest address,
we don’t need this patch, it is safe .
Maybe we need one more API , like:
copy_data_in_user(__user *source, __user *dest, size_t size)  ??
>> --- a/fs/proc/kcore.c
>> +++ b/fs/proc/kcore.c
>> @@ -86,8 +86,8 @@ static size_t get_kcore_size(int *nphdr, size_t 
>> *elf_buflen)
>>  size = try;
>>  *nphdr = *nphdr + 1;
>>  }
>> -*elf_buflen =   sizeof(struct elfhdr) + 
>> -(*nphdr + 2)*sizeof(struct elf_phdr) + 
>> +*elf_buflen =   sizeof(struct elfhdr) +
>> +(*nphdr + 2)*sizeof(struct elf_phdr) +
> 
> Unrelated whitespace fixes really shouldn't be in here.  They don't
> bother me too much, but some people get upset ;)
> 

i will separate it into another patch for format correctness.

>>  3 * ((sizeof(struct elf_note)) +
>>   roundup(sizeof(CORE_STR), 4)) +
>>  roundup(sizeof(struct elf_prstatus), 4) +
>> @@ -435,6 +435,7 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  size_t elf_buflen;
>>  int nphdr;
>>  unsigned long start;
>> +unsigned long page = 0;
> 
> "page" isn't a very good name - when we see that identifier we expect
> it to be a `struct page *'.  Maybe call it copy_buf or something.
> 
> (And incoming argument "buffer" was poorly named.  "buffer" implies some
> temporary intermediate thing, which is inappropriate here!)
> 
will change name.

>>  read_lock(&kclist_lock);
>>  size = get_kcore_size(&nphdr, &elf_buflen);
>> @@ -485,7 +486,7 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  start = kc_offset_to_vaddr(*fpos - elf_buflen);
>>  if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
>>  tsz = buflen;
>> -
>> +
>>  while (buflen) {
>>  struct kcore_list *m;
>> 
>> @@ -515,15 +516,32 @@ read_kcore(struct file *file, char __user *buffer, 
>> size_t buflen, loff_t *fpos)
>>  } else {
>>  if (kern_addr_valid(start)) {
> 
> Do we still need the (racy) kern_addr_valid() test?  The code should
> work OK if this is removed?
> 
Yes, can remove

>>  unsigned long n;
>> +mm_segment_t old_fs = get_fs();
>> +
>> +if (page == 0) {
>> +page = __get_free_page(GFP_KERNEL);
>> +if (page == 0)
>> +return -ENOMEM;
>> 
>> -n = copy_to_user(buffer, (char *)start, tsz);
>> +}
>> +set_fs(KERNEL_DS);
>> +pagefault_disable();
>> +n = __copy_from_user_inatomic((void *)page,
>> +(__force const void __user *)start,
>> +tsz);
>> +pagefault_enable();
>> +set_fs(old_fs);
> 
> We should have a code comment in here 

Re: veths often slow to come up

2015-08-04 Thread Cong Wang
(Cc'ing netdev for network issues)

On Tue, Aug 4, 2015 at 6:42 AM, Shaun Crampton
 wrote:
> Please CC me on any responses, thanks.
>
> Setting both ends of a veth to be oper UP completes very quickly but I
> find that pings only start flowing over the veth after about a second.
> This seems to correlate with the NO-CARRIER flag being set or the
> interface being in "state UNKNOWN" or "state DOWN" for about a second
> (demo script below).
>
> If I run the script repeatedly then sometimes it completes very quickly on
> subsequent runs as if there's a hot cache somewhere.
>
> Could this be a bug or is there a configuration to speed this up?  Seems
> odd that it's almost exactly 1s on the first run.
>
> Seen on these kernels:
> * 3.13.0-57-generic #95-Ubuntu SMP Fri Jun 19 09:28:15 UTC 2015 x86_64
> x86_64 x86_64 GNU/Linux
> * 4.0.9-coreos #2 SMP Thu Jul 30 01:07:55 UTC 2015 x86_64 Intel(R) Xeon(R)
> CPU @ 2.50GHz GenuineIntel GNU/Linux
>
> Regards,
>
> -Shaun
>
>
> Running my test script below (Assumes veth0/1 do not already exist):
>
> $ sudo ./veth-test.sh
> Time to create veth:
>
> real0m0.019s
> user0m0.002s
> sys 0m0.010s
>
> Time to wait for carrier:
>
> real0m1.005s
> user0m0.007s
> sys 0m0.123s
>
>
>
> # veth-test.sh
>
> #!/bin/bash
> function create_veth {
>   ip link add type veth
>   ip link set veth0 up
>   ip link set veth1 up
> }
> function wait_for_carrier {
>   while ! ip link show | grep -qE 'veth[01]';
>   do
> sleep 0.05
>   done
>   while ip link show | grep -E 'veth[01]' | \
> grep -Eq 'NO-CARRIER|state DOWN|state UNKNOWN';
>   do
> sleep 0.05
>   done
> }
> echo "Time to create veth:"
> time create_veth
> echo
> echo "Time to wait for carrier:"
> time wait_for_carrier
> ip link del veth0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/3] media: atmel-isi: move configure_geometry() to start_streaming()

2015-08-04 Thread Josh Wu
As in the set_fmt() function we only need to know which format has been set,
we don't need to access the ISI hardware at this moment.

So move the configure_geometry(), which access the ISI hardware, to
start_streaming() will make code more consistent and simpler.

Signed-off-by: Josh Wu 
Reviewed-by: Laurent Pinchart 
---

Changes in v2:
- Add Laurent's reviewed-by tag.

 drivers/media/platform/soc_camera/atmel-isi.c | 17 +
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/media/platform/soc_camera/atmel-isi.c 
b/drivers/media/platform/soc_camera/atmel-isi.c
index 0fd6bc9..cb46aec 100644
--- a/drivers/media/platform/soc_camera/atmel-isi.c
+++ b/drivers/media/platform/soc_camera/atmel-isi.c
@@ -391,6 +391,11 @@ static int start_streaming(struct vb2_queue *vq, unsigned 
int count)
/* Disable all interrupts */
isi_writel(isi, ISI_INTDIS, (u32)~0UL);
 
+   ret = configure_geometry(isi, icd->user_width, icd->user_height,
+   icd->current_fmt->code);
+   if (ret < 0)
+   return ret;
+
	spin_lock_irq(&isi->lock);
/* Clear any pending interrupt */
isi_readl(isi, ISI_STATUS);
@@ -478,8 +483,6 @@ static int isi_camera_init_videobuf(struct vb2_queue *q,
 static int isi_camera_set_fmt(struct soc_camera_device *icd,
  struct v4l2_format *f)
 {
-   struct soc_camera_host *ici = to_soc_camera_host(icd->parent);
-   struct atmel_isi *isi = ici->priv;
struct v4l2_subdev *sd = soc_camera_to_subdev(icd);
const struct soc_camera_format_xlate *xlate;
	struct v4l2_pix_format *pix = &f->fmt.pix;
@@ -512,16 +515,6 @@ static int isi_camera_set_fmt(struct soc_camera_device 
*icd,
if (mf->code != xlate->code)
return -EINVAL;
 
-   /* Enable PM and peripheral clock before operate isi registers */
-   pm_runtime_get_sync(ici->v4l2_dev.dev);
-
-   ret = configure_geometry(isi, pix->width, pix->height, xlate->code);
-
-   pm_runtime_put(ici->v4l2_dev.dev);
-
-   if (ret < 0)
-   return ret;
-
pix->width  = mf->width;
pix->height = mf->height;
pix->field  = mf->field;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 3/3] media: atmel-isi: add sanity check for supported formats in set_fmt()

2015-08-04 Thread Josh Wu
After adding the format check in set_fmt(), we don't need any format check
in configure_geometry(). So make configure_geometry() as void type.

Signed-off-by: Josh Wu 
---

Changes in v2:
- new added patch

 drivers/media/platform/soc_camera/atmel-isi.c | 39 +--
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/drivers/media/platform/soc_camera/atmel-isi.c 
b/drivers/media/platform/soc_camera/atmel-isi.c
index cb46aec..d0df518 100644
--- a/drivers/media/platform/soc_camera/atmel-isi.c
+++ b/drivers/media/platform/soc_camera/atmel-isi.c
@@ -103,17 +103,19 @@ static u32 isi_readl(struct atmel_isi *isi, u32 reg)
return readl(isi->regs + reg);
 }
 
-static int configure_geometry(struct atmel_isi *isi, u32 width,
+static void configure_geometry(struct atmel_isi *isi, u32 width,
u32 height, u32 code)
 {
u32 cfg2;
 
/* According to sensor's output format to set cfg2 */
switch (code) {
-   /* YUV, including grey */
+   default:
+   /* Grey */
case MEDIA_BUS_FMT_Y8_1X8:
cfg2 = ISI_CFG2_GRAYSCALE;
break;
+   /* YUV */
case MEDIA_BUS_FMT_VYUY8_2X8:
cfg2 = ISI_CFG2_YCC_SWAP_MODE_3;
break;
@@ -127,8 +129,6 @@ static int configure_geometry(struct atmel_isi *isi, u32 
width,
cfg2 = ISI_CFG2_YCC_SWAP_DEFAULT;
break;
/* RGB, TODO */
-   default:
-   return -EINVAL;
}
 
isi_writel(isi, ISI_CTRL, ISI_CTRL_DIS);
@@ -139,8 +139,29 @@ static int configure_geometry(struct atmel_isi *isi, u32 
width,
cfg2 |= ((height - 1) << ISI_CFG2_IM_VSIZE_OFFSET)
& ISI_CFG2_IM_VSIZE_MASK;
isi_writel(isi, ISI_CFG2, cfg2);
+}
 
-   return 0;
+static bool is_supported(struct soc_camera_device *icd,
+   const struct soc_camera_format_xlate *xlate)
+{
+   bool ret = true;
+
+   switch (xlate->code) {
+   /* YUV, including grey */
+   case MEDIA_BUS_FMT_Y8_1X8:
+   case MEDIA_BUS_FMT_VYUY8_2X8:
+   case MEDIA_BUS_FMT_UYVY8_2X8:
+   case MEDIA_BUS_FMT_YVYU8_2X8:
+   case MEDIA_BUS_FMT_YUYV8_2X8:
+   break;
+   /* RGB, TODO */
+   default:
+   dev_err(icd->parent, "not supported format: %d\n",
+   xlate->code);
+   ret = false;
+   }
+
+   return ret;
 }
 
 static irqreturn_t atmel_isi_handle_streaming(struct atmel_isi *isi)
@@ -391,10 +412,8 @@ static int start_streaming(struct vb2_queue *vq, unsigned 
int count)
/* Disable all interrupts */
isi_writel(isi, ISI_INTDIS, (u32)~0UL);
 
-   ret = configure_geometry(isi, icd->user_width, icd->user_height,
+   configure_geometry(isi, icd->user_width, icd->user_height,
icd->current_fmt->code);
-   if (ret < 0)
-   return ret;
 
	spin_lock_irq(&isi->lock);
/* Clear any pending interrupt */
@@ -515,6 +534,10 @@ static int isi_camera_set_fmt(struct soc_camera_device 
*icd,
if (mf->code != xlate->code)
return -EINVAL;
 
+   /* check with atmel-isi support format */
+   if (!is_supported(icd, xlate))
+   return -EINVAL;
+
pix->width  = mf->width;
pix->height = mf->height;
pix->field  = mf->field;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/3] media: atmel-isi: setup the ISI_CFG2 register directly

2015-08-04 Thread Josh Wu
In the function configure_geometry(), we will setup the ISI CFG2
according to the sensor output format.

It makes no sense to just read back the CFG2 register and just set part
of it.

So just set up this register directly makes things simpler.
Currently only support YUV format from camera sensor.

Signed-off-by: Josh Wu 
Reviewed-by: Laurent Pinchart 
---

Changes in v2:
- add Laurent's reviewed-by tag.

 drivers/media/platform/soc_camera/atmel-isi.c | 20 +++-
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/media/platform/soc_camera/atmel-isi.c 
b/drivers/media/platform/soc_camera/atmel-isi.c
index 274a6f7..0fd6bc9 100644
--- a/drivers/media/platform/soc_camera/atmel-isi.c
+++ b/drivers/media/platform/soc_camera/atmel-isi.c
@@ -106,24 +106,25 @@ static u32 isi_readl(struct atmel_isi *isi, u32 reg)
 static int configure_geometry(struct atmel_isi *isi, u32 width,
u32 height, u32 code)
 {
-   u32 cfg2, cr;
+   u32 cfg2;
 
+   /* According to sensor's output format to set cfg2 */
switch (code) {
/* YUV, including grey */
case MEDIA_BUS_FMT_Y8_1X8:
-   cr = ISI_CFG2_GRAYSCALE;
+   cfg2 = ISI_CFG2_GRAYSCALE;
break;
case MEDIA_BUS_FMT_VYUY8_2X8:
-   cr = ISI_CFG2_YCC_SWAP_MODE_3;
+   cfg2 = ISI_CFG2_YCC_SWAP_MODE_3;
break;
case MEDIA_BUS_FMT_UYVY8_2X8:
-   cr = ISI_CFG2_YCC_SWAP_MODE_2;
+   cfg2 = ISI_CFG2_YCC_SWAP_MODE_2;
break;
case MEDIA_BUS_FMT_YVYU8_2X8:
-   cr = ISI_CFG2_YCC_SWAP_MODE_1;
+   cfg2 = ISI_CFG2_YCC_SWAP_MODE_1;
break;
case MEDIA_BUS_FMT_YUYV8_2X8:
-   cr = ISI_CFG2_YCC_SWAP_DEFAULT;
+   cfg2 = ISI_CFG2_YCC_SWAP_DEFAULT;
break;
/* RGB, TODO */
default:
@@ -131,17 +132,10 @@ static int configure_geometry(struct atmel_isi *isi, u32 
width,
}
 
isi_writel(isi, ISI_CTRL, ISI_CTRL_DIS);
-
-   cfg2 = isi_readl(isi, ISI_CFG2);
-   /* Set YCC swap mode */
-   cfg2 &= ~ISI_CFG2_YCC_SWAP_MODE_MASK;
-   cfg2 |= cr;
/* Set width */
-   cfg2 &= ~(ISI_CFG2_IM_HSIZE_MASK);
cfg2 |= ((width - 1) << ISI_CFG2_IM_HSIZE_OFFSET) &
ISI_CFG2_IM_HSIZE_MASK;
/* Set height */
-   cfg2 &= ~(ISI_CFG2_IM_VSIZE_MASK);
cfg2 |= ((height - 1) << ISI_CFG2_IM_VSIZE_OFFSET)
& ISI_CFG2_IM_VSIZE_MASK;
isi_writel(isi, ISI_CFG2, cfg2);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v6 2/2] x86, mwaitt: introduce mwaitx delay with a configurable timer

2015-08-04 Thread Huang Rui
MWAITX can enable a timer and a corresponding timer value specified in
SW P0 clocks. The SW P0 frequency is the same as TSC. The timer
provides an upper bound on how long the instruction waits before
exiting.

The implementation of delay function in kernel can leverage the timer
of MWAITX. This patch provides a new method (delay_mwaitx) to measure
delay time.

In MWAITX delay, the CPU core will be quiesced in a waiting phase,
diminishing its power consumption.

Run a simple test to measure power consumption:

cat /sys/bus/pci/devices/\:00\:18.4/hwmon/hwmon0/power1_acc;
sleep 1s;
cat /sys/bus/pci/devices/\:00\:18.4/hwmon/hwmon0/power1_acc;

* TSC-based default delay:  485115 uWatts average power
* MWAITX-based delay:   252738 uWatts average power

Thus, that's about 240 milliWatts less power consumption. The test
method relies on the support of AMD CPU accumulated power algorithm in
fam15_power for which patches are forthcoming.

Suggested-by: Andy Lutomirski 
Suggested-by: Borislav Petkov 
Suggested-by: Peter Zijlstra 
Signed-off-by: Huang Rui 
Cc: Ingo Molnar 
Cc: Thomas Gleixner 
Cc: Andreas Herrmann 
---
 arch/x86/include/asm/delay.h |  1 +
 arch/x86/kernel/cpu/amd.c|  4 
 arch/x86/lib/delay.c | 48 +++-
 3 files changed, 52 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2..36a760b 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
 #include 
 
 void use_tsc_delay(void);
+void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 51ad2af..730e620 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_X86_64
 # include 
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
/* A random value per boot for bit slice [12:upper_bit) */
va_align.bits = get_random_int() & va_align.mask;
}
+
+   if (cpu_has(c, X86_FEATURE_MWAITT))
+   use_mwaitx_delay();
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 4453d52..f8236cb 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_SMP
 # include 
@@ -84,6 +85,45 @@ static void delay_tsc(unsigned long __loops)
 }
 
 /*
+ * On AMD platforms MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. And the input value is the loop of the
+ * counter, it will exit when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+   u32 delay, loops = __loops;
+   u64 end, start;
+
+   start = rdtsc_ordered();
+
+   for (;;) {
+   delay = min(MWAITX_MAX_LOOPS, loops);
+
+   /*
+* Use cpu_tss as a cacheline-aligned, seldomly
+* accessed per-cpu variable as the monitor target.
+*/
+   __monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+   /*
+* AMD, like Intel, supports the EAX hint and EAX=0xf
+* means, do not enter any deep C-state and we use it
+* here in delay() to minimize wakeup latency.
+*/
+   __mwaitx(MWAITX_DISABLE_CSTATES, delay,
+MWAITX_ECX_TIMER_ENABLE);
+
+   end = rdtsc_ordered();
+
+   if (loops <= end - start)
+   break;
+
+   loops -= end - start;
+
+   start = end;
+   }
+}
+
+/*
  * Since we calibrate only once at boot, this
  * function should be set once at boot and not changed
  */
@@ -91,7 +131,13 @@ static void (*delay_fn)(unsigned long) = delay_loop;
 
 void use_tsc_delay(void)
 {
-   delay_fn = delay_tsc;
+   if (delay_fn == delay_loop)
+   delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+   delay_fn = delay_mwaitx;
 }
 
 int read_current_timer(unsigned long *timer_val)
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V3] dmaengine: imx-sdma: Add device to device support

2015-08-04 Thread Vinod Koul
On Fri, Jul 10, 2015 at 05:08:16PM +0800, Shengjiu Wang wrote:
> This patch adds DEV_TO_DEV support for i.MX SDMA driver to support data
> transfer between two peripheral FIFOs.
> The per_2_per script requires two peripheral addresses and two DMA
> requests, and it needs to check whether the src addr and dst addr are in the SPBA
> bus space or in the AIPS bus space.
Applied, thanks

-- 
~Vinod

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 2/2] Documentation: dt: binding: atmel-sama5d4-wdt: for SAMA5D4 watchdog driver

2015-08-04 Thread Wenyou Yang
The compatible "atmel,sama5d4-wdt" supports the SAMA5D4 watchdog driver
and the watchdog's WDT_MR register can be written more than once.

Signed-off-by: Wenyou Yang 
---
 .../bindings/watchdog/atmel-sama5d4-wdt.txt|   35 
 1 file changed, 35 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt

diff --git a/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt 
b/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt
new file mode 100644
index 000..f7cc7c0
--- /dev/null
+++ b/Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt
@@ -0,0 +1,35 @@
+* Atmel SAMA5D4 Watchdog Timer (WDT) Controller
+
+Required properties:
+- compatible: "atmel,sama5d4-wdt"
+- reg: base physical address and length of memory mapped region.
+
+Optional properties:
+- timeout-sec: watchdog timeout value (in seconds).
+- interrupts: interrupt number to the CPU.
+- atmel,watchdog-type: should be "hardware" or "software".
+   "hardware": enable watchdog fault reset. A watchdog fault triggers
+   watchdog reset.
+   "software": enable watchdog fault interrupt. A watchdog fault asserts
+   watchdog interrupt.
+- atmel,idle-halt: present if you want to stop the watchdog when the CPU is
+  in idle state.
+   CAUTION: This property should be used with care, it actually makes the
+   watchdog not counting when the CPU is in idle state, therefore the
+   watchdog reset time depends on mean CPU usage and will not reset at all
+   if the CPU stop working while it is in idle state, which is probably
+   not what you want.
+- atmel,dbg-halt: present if you want to stop the watchdog when the CPU is
+ in debug state.
+
+Example:
+   watchdog@fc068640 {
+   compatible = "atmel,sama5d4-wdt";
+   reg = <0xfc068640 0x10>;
+   interrupts = <4 IRQ_TYPE_LEVEL_HIGH 5>;
+   timeout-sec = <10>;
+   atmel,watchdog-type = "hardware";
+   atmel,dbg-halt;
+   atmel,idle-halt;
+   status = "okay";
+   };
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 1/2] drivers: watchdog: add a driver to support SAMA5D4 watchdog timer

2015-08-04 Thread Wenyou Yang
From SAMA5D4, the watchdog timer is upgraded with a new feature,
which is described as in the datasheet, "WDT_MR can be written
until a LOCKMR command is issued in WDT_CR".
That is to say, as long as the bootstrap and u-boot don't issue
a LOCKMR command, WDT_MR can be written more than once in the driver.

So the SAMA5D4 watchdog driver's implementation is different from
the previous: the user application open the device file to enable
the watchdog timer hardware, and close to disable it, and set the
watchdog timer timeout by setting WDV and WDD fields of WDT_MR register,
and ping the watchdog by issuing WDRSTT command to WDT_CR register
with hard-coded key.

Signed-off-by: Wenyou Yang 
---
 drivers/watchdog/Kconfig|9 ++
 drivers/watchdog/Makefile   |1 +
 drivers/watchdog/at91sam9_wdt.h |4 +
 drivers/watchdog/atmel_wdt.c|  290 +++
 4 files changed, 304 insertions(+)
 create mode 100644 drivers/watchdog/atmel_wdt.c

diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index e5e7c55..4425813 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -152,6 +152,15 @@ config ARM_SP805_WATCHDOG
  ARM Primecell SP805 Watchdog timer. This will reboot your system when
  the timeout is reached.
 
+config ATMEL_WATCHDOG
+   tristate "Atmel SAMA5D4 Watchdog Timer"
+   depends on ARCH_AT91
+   select WATCHDOG_CORE
+   help
+ Atmel watchdog timer embedded into SAMA5D4 chips. Its Watchdog Timer
+ Mode Register(WDT_MR) can be written more than once.
+ This will reboot your system when the timeout is reached.
+
 config AT91RM9200_WATCHDOG
tristate "AT91RM9200 watchdog"
depends on SOC_AT91RM9200 && MFD_SYSCON
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index 5c19294..c24a8fc 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_USBPCWATCHDOG) += pcwd_usb.o
 
 # ARM Architecture
 obj-$(CONFIG_ARM_SP805_WATCHDOG) += sp805_wdt.o
+obj-$(CONFIG_ATMEL_WATCHDOG) += atmel_wdt.o
 obj-$(CONFIG_AT91RM9200_WATCHDOG) += at91rm9200_wdt.o
 obj-$(CONFIG_AT91SAM9X_WATCHDOG) += at91sam9_wdt.o
 obj-$(CONFIG_CADENCE_WATCHDOG) += cadence_wdt.o
diff --git a/drivers/watchdog/at91sam9_wdt.h b/drivers/watchdog/at91sam9_wdt.h
index c6fbb2e6..79add4f 100644
--- a/drivers/watchdog/at91sam9_wdt.h
+++ b/drivers/watchdog/at91sam9_wdt.h
@@ -22,11 +22,15 @@
 
 #define AT91_WDT_MR0x04/* Watchdog Mode 
Register */
 #defineAT91_WDT_WDV(0xfff << 0)/* 
Counter Value */
+#defineAT91_WDT_WDV_MSK(0xfff)
+#defineAT91_WDT_WDV_(x)(((x) & 
AT91_WDT_WDV_MSK) << 0)
 #defineAT91_WDT_WDFIEN (1 << 12)   /* 
Fault Interrupt Enable */
 #defineAT91_WDT_WDRSTEN(1 << 13)   /* 
Reset Processor */
 #defineAT91_WDT_WDRPROC(1 << 14)   /* 
Timer Restart */
 #defineAT91_WDT_WDDIS  (1 << 15)   /* 
Watchdog Disable */
 #defineAT91_WDT_WDD(0xfff << 16)   /* 
Delta Value */
+#defineAT91_WDT_WDD_MSK(0xfff)
+#defineAT91_WDT_WDD_(x)(((x) & 
AT91_WDT_WDD_MSK) << 16)
 #defineAT91_WDT_WDDBGHLT   (1 << 28)   /* 
Debug Halt */
 #defineAT91_WDT_WDIDLEHLT  (1 << 29)   /* Idle 
Halt */
 
diff --git a/drivers/watchdog/atmel_wdt.c b/drivers/watchdog/atmel_wdt.c
new file mode 100644
index 000..e1cdc84
--- /dev/null
+++ b/drivers/watchdog/atmel_wdt.c
@@ -0,0 +1,290 @@
+/*
+ * Driver for Atmel SAMA5D4 Watchdog Timer
+ *
+ * Copyright (C) 2015 Atmel Corporation
+ *
+ * Licensed under GPLv2.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "at91sam9_wdt.h"
+
+/* minimum and maximum watchdog timeout, in seconds */
+#define MIN_WDT_TIMEOUT		1
+#define MAX_WDT_TIMEOUT		16
+#define WDT_DEFAULT_TIMEOUT	MAX_WDT_TIMEOUT
+
+#define WDT_MAX_WDV		0xFFF
+
+#define WDT_SEC2TICKS(s)   ((s) ? (((s) << 8) - 1) : 0)
+
+struct atmel_wdt {
+   struct watchdog_device  wdd;
+   void __iomem*reg_base;
+   u32 config;
+};
+
+static int wdt_timeout = WDT_DEFAULT_TIMEOUT;
+static bool nowayout = WATCHDOG_NOWAYOUT;
+
+module_param(wdt_timeout, int, 0);
+MODULE_PARM_DESC(wdt_timeout,
+   "Watchdog wdt_timeout in seconds. (default = "
+   __MODULE_STRING(WDT_DEFAULT_TIMEOUT) ")");
+
+module_param(nowayout, bool, 0);
+MODULE_PARM_DESC(nowayout,
+   "Watchdog cannot be stopped once started (default="
+   __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
+
+#define wdt_read(wdt, field) \
+   

[PATCH v2 0/2] add a new driver to support SAMA5D4 watchdog timer

2015-08-04 Thread Wenyou Yang
Hello,

Thanks to Guenter's advice, a new driver is added to support the SAMA5D4 watchdog timer.

Because the watchdog WDT_MR register can be written more than once, its work
mechanism is different from the previous one. Open the device file to enable
the watchdog hardware, close to disable it, and ping it from the user space
directly to keep it alive.

Best Regards,
Wenyou Yang

Wenyou Yang (2):
  drivers: watchdog: add a driver to support SAMA5D4 watchdog timer
  Documentation: dt: binding: atmel-sama5d4-wdt: for SAMA5D4 watchdog
driver

 .../bindings/watchdog/atmel-sama5d4-wdt.txt|   35 +++
 drivers/watchdog/Kconfig   |9 +
 drivers/watchdog/Makefile  |1 +
 drivers/watchdog/at91sam9_wdt.h|4 +
 drivers/watchdog/atmel_wdt.c   |  290 
 5 files changed, 339 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/watchdog/atmel-sama5d4-wdt.txt
 create mode 100644 drivers/watchdog/atmel_wdt.c

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] csiostor:Fix error handling in the function csio_hws_ready

2015-08-04 Thread Nicholas Krause
From: Nicholas Krause 

This fixes error handling in the function csio_hws_ready for when
this function calls csio_scsim_cleanup_io to clean up outstanding
commands, by checking if it returned an error code to signal internal
failure and, if so, telling the user we are unable to clean up the
outstanding io commands by printing this to the console before returning.

Signed-off-by: Nicholas Krause 
---
 drivers/scsi/csiostor/csio_hw.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/csiostor/csio_hw.c b/drivers/scsi/csiostor/csio_hw.c
index 622bdab..49c47d3 100644
--- a/drivers/scsi/csiostor/csio_hw.c
+++ b/drivers/scsi/csiostor/csio_hw.c
@@ -2445,11 +2445,17 @@ csio_hws_ready(struct csio_hw *hw, enum csio_hw_ev evt)
csio_set_state(>sm, csio_hws_quiescing);
/* cleanup all outstanding cmds */
if (evt == CSIO_HWE_HBA_RESET ||
-   evt == CSIO_HWE_PCIERR_DETECTED)
-   csio_scsim_cleanup_io(csio_hw_to_scsim(hw), false);
-   else
-   csio_scsim_cleanup_io(csio_hw_to_scsim(hw), true);
-
+   evt == CSIO_HWE_PCIERR_DETECTED) {
+   if (csio_scsim_cleanup_io(csio_hw_to_scsim(hw), false)) 
{
+   csio_err(hw, "Unable to properly cleanup 
outstanding commands on this device\n");
+   return;
+   }
+   } else {
+   if (csio_scsim_cleanup_io(csio_hw_to_scsim(hw), true)) {
+   csio_err(hw, "Unable to properly cleanup 
outstanding commands on this device\n");
+   return;
+   }
+   }
csio_hw_intr_disable(hw);
csio_hw_mbm_cleanup(hw);
csio_evtq_stop(hw);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ipv6:Fix concurrent access issue in the function inet6_rtm_deladdr

2015-08-04 Thread Nicholas Krause
From: Nicholas Krause 

This fixes the issue with concurrent access when calling the function
inet6_addr_del due to this function using non-locked wrapper versions
of certain functions, by locking the routing mutex before and after this
call with rtnl_lock/unlock. After the unlocking just return the error
code as normal to signal success or failure to the caller of the function
inet6_rtm_deladdr.

Signed-off-by: Nicholas Krause 
---
 net/ipv6/addrconf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 21c2c81..b6103e0 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4006,8 +4006,11 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr 
*nlh)
/* We ignore other flags so far. */
ifa_flags &= IFA_F_MANAGETEMPADDR;
 
-   return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
+   rtnl_lock();
+   err =  inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
  ifm->ifa_prefixlen);
+   rtnl_unlock();
+   return err;
 }
 
 static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH net V2] virtio-net: drop NETIF_F_FRAGLIST

2015-08-04 Thread Jason Wang
virtio declares support for NETIF_F_FRAGLIST, but assumes
that there are at most MAX_SKB_FRAGS + 2 fragments which isn't
always true with a fraglist.

A longer fraglist in the skb will make the call to skb_to_sgvec overflow
the sg array, leading to memory corruption.

Drop NETIF_F_FRAGLIST so we only get what we can handle.

Cc: Michael S. Tsirkin 
Signed-off-by: Jason Wang 
---
- Change from V1: coding style fixes.
- The patch is needed for stable.
---
 drivers/net/virtio_net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7fbca37..237f8e5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1756,9 +1756,9 @@ static int virtnet_probe(struct virtio_device *vdev)
/* Do we support "hardware" checksums? */
if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
/* This opens up the world of extra features. */
-   dev->hw_features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
+   dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
if (csum)
-   dev->features |= 
NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
+   dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
 
if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] perf: Fix multi-segment problem of perf_event_intel_uncore

2015-08-04 Thread Taku Izumi
In multi-segment system, uncore devices may belong to buses whose segment
number is other than 0.

  
  :ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:7f:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:bf:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03)
  ...
  0001:ff:10.5 System peripheral: Intel Corporation Xeon E5 v3/Core i7 
Scratchpad & Semaphore Registers (rev 03
  ...

In that case relation of bus number and physical id may be broken
because "uncore_pcibus_to_physid" doesn't take account of PCI segment.
For example, bus :ff and 0001:ff uses the same entry of
"uncore_pcibus_to_physid" array.

This patch fixes this problem by introducing a segment-aware pci2phy_map instead.

 v1->v2:
  - Extract a method named uncore_pcibus_to_physid to avoid repetition of
the phys_id retrieval code

Signed-off-by: Taku Izumi 
---
 arch/x86/kernel/cpu/perf_event_intel_uncore.c  | 25 --
 arch/x86/kernel/cpu/perf_event_intel_uncore.h  | 11 -
 arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c  | 23 +-
 .../x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 53 --
 4 files changed, 94 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 21b5e38..0ed6f2b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -7,7 +7,8 @@ struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
 static bool pcidrv_registered;
 struct pci_driver *uncore_pci_driver;
 /* pci bus to socket mapping */
-int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
+DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
+struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 
 static DEFINE_RAW_SPINLOCK(uncore_box_lock);
@@ -20,6 +21,23 @@ static struct event_constraint uncore_constraint_fixed =
 struct event_constraint uncore_constraint_empty =
EVENT_CONSTRAINT(0, 0, 0);
 
+int uncore_pcibus_to_physid(struct pci_bus *bus)
+{
+   int phys_id = -1;
+   struct pci2phy_map *map;
+
+   raw_spin_lock(&pci2phy_map_lock);
+   list_for_each_entry(map, &pci2phy_map_head, list) {
+   if (map->segment == pci_domain_nr(bus)) {
+   phys_id = map->pbus_to_physid[bus->number];
+   break;
+   }
+   }
+   raw_spin_unlock(&pci2phy_map_lock);
+
+   return phys_id;
+}
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf)
 {
@@ -809,7 +827,7 @@ static int uncore_pci_probe(struct pci_dev *pdev, const 
struct pci_device_id *id
int phys_id;
bool first_box = false;
 
-   phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
if (phys_id < 0)
return -ENODEV;
 
@@ -856,9 +874,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
 {
struct intel_uncore_box *box = pci_get_drvdata(pdev);
struct intel_uncore_pmu *pmu;
-   int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
+   int i, cpu, phys_id;
bool last_box = false;
 
+   phys_id = uncore_pcibus_to_physid(pdev->bus);
box = pci_get_drvdata(pdev);
if (!box) {
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h 
b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
index 0f77f0a..6c96ee9 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -117,6 +117,14 @@ struct uncore_event_desc {
const char *config;
 };
 
+struct pci2phy_map {
+   struct list_head list;
+   int segment;
+   int pbus_to_physid[256];
+};
+
+int uncore_pcibus_to_physid(struct pci_bus *bus);
+
 ssize_t uncore_event_show(struct kobject *kobj,
  struct kobj_attribute *attr, char *buf);
 
@@ -317,7 +325,8 @@ u64 uncore_shared_reg_config(struct intel_uncore_box *box, 
int idx);
 extern struct intel_uncore_type **uncore_msr_uncores;
 extern struct intel_uncore_type **uncore_pci_uncores;
 extern struct pci_driver *uncore_pci_driver;
-extern int uncore_pcibus_to_physid[256];
+extern raw_spinlock_t pci2phy_map_lock;
+extern struct list_head pci2phy_map_head;
 extern struct pci_dev 
*uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];
 extern struct event_constraint uncore_constraint_empty;
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c 
b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index b005a78..ccbc817 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ 

Re: [PATCH net] virtio-net: drop NETIF_F_FRAGLIST

2015-08-04 Thread Jason Wang


On 08/04/2015 07:11 PM, Sergei Shtylyov wrote:
> Hello.
>
> On 8/4/2015 12:55 PM, Jason Wang wrote:
>
>> virtio declares support for NETIF_F_FRAGLIST, but assumes
>> that there are at most MAX_SKB_FRAGS + 2 fragments which isn't
>> always true with a fraglist.
>
>> A longer fraglist in the skb will make the call to skb_to_sgvec overflow
>> the sg array, leading to memory corruption.
>
>> Drop NETIF_F_FRAGLIST so we only get what we can handle.
>
>> Cc: Michael S. Tsirkin 
>> Signed-off-by: Jason Wang 
>> ---
>> The patch is needed for stable.
>> ---
>>   drivers/net/virtio_net.c | 4 ++--
>>   1 file changed, 2 insertions(+), 2 deletions(-)
>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 7fbca37..2347a73 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -1756,9 +1756,9 @@ static int virtnet_probe(struct virtio_device
>> *vdev)
>>   /* Do we support "hardware" checksums? */
>>   if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
>>   /* This opens up the world of extra features. */
>> -dev->hw_features |=
>> NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
>> +dev->hw_features |= NETIF_F_HW_CSUM|NETIF_F_SG;
>>   if (csum)
>> -dev->features |=
>> NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
>> +dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG;
>
>I'd have added spaces around | to match the style seen below.
>

Ok, will fix this in V2.

>>   if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
>>   dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO
>
> MBR, Sergei
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc: Add an inline function to update HID0

2015-08-04 Thread Segher Boessenkool
On Tue, Aug 04, 2015 at 08:08:58PM +1000, Michael Ellerman wrote:
> > +static inline void update_hid0(unsigned long hid0)
> > +{
> > +   /*
> > +*  The HID0 update should at the very least be preceded by a
> > +*  a SYNC instruction followed by an ISYNC instruction
> > +*/
> > +   mb();
> > +   mtspr(SPRN_HID0, hid0);
> > +   isync();
> 
> That's going to turn into three separate inline asm blocks, which is maybe a
> bit unfortunate. Have you checked the generated code is what we want, ie. just
> sync, mtspr, isync ?

The "mb()" is not such a great name anyway: you don't want a memory
barrier, you want an actual sync instruction ("sync 0", "hwsync",
whatever the currently preferred spelling is).

The function name should also say this is for POWER8 (the required
sequences are different for some other processors; and some others
might not even _have_ a HID0, or not at 1008).  power8_write_hid0
or such?

For writing it as one asm, why not just

  asm volatile("sync ; mtspr %0,%1 ; isync" : : "i"(SPRN_HID0), "r"(hid0));

instead of the stringify stuff?


Segher
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] KVM: MTRR: Use default type for non-MTRR-covered gfn before WARN_ON

2015-08-04 Thread Xiao Guangrong



On 08/05/2015 12:58 AM, Alex Williamson wrote:

The patch was munged on commit to re-order these tests resulting in
excessive warnings when trying to do device assignment.  Return to
original ordering: https://lkml.org/lkml/2015/7/15/769



Reviewed-by: Xiao Guangrong 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/9] x86/intel_rdt: Add new cgroup and Class of service management

2015-08-04 Thread Vikas Shivappa



On Tue, 4 Aug 2015, Tejun Heo wrote:


Hello, Vikas.

On Tue, Aug 04, 2015 at 11:50:16AM -0700, Vikas Shivappa wrote:

I will make this more clear in the documentation - We intend this cgroup
interface to be used by a root or superuser - more like a system
administrator being able to control the allocation of the threads , the one
who has the knowledge of the usage and being able to decide.


I get that this would be an easier "bolt-on" solution but isn't a good
solution by itself in the long term.  As I wrote multiple times
before, this is a really bad programmable interface.  Unless you're
sure that this doesn't have to be programmable for threads of an
individual applications,


Yes, this doesn't have to be a programmable interface for threads. May not be a 
good idea to let the threads decide the cache allocation by themselves using this direct 
interface. We are transferring the decision-maker responsibility to the system 
administrator.


- This interface like you said can easily bolt-on. basically an easy to use 
interface without worrying about the architectural details.
- But still does the job. root user can allocate exclusive or overlapping cache 
lines to threads or group of threads.
- No major roadblocks for usage as we can make the allocations like mentioned 
above and still keep the hierarchy etc and use it when needed.
- An important factor is that it can co-exist with other interfaces like #2 and 
#3 for the same easily. So I do not see a reason why we should not use this.
This is not meant to be a programmable interface, however it does not prevent 
co-existence.
- If root user has to set affinity of threads that he is allocating cache, he 
can do so using other cgroups like cpuset or set the masks seperately using 
taskset. This would let him configure the cache allocation on a socket.


this is a pretty bad interface by itself.



There is already a lot of such usage among different enterprise users at
Intel/google/cisco etc who have been testing the patches posted to lkml and
academically there is plenty of usage as well.


I mean, that's the tool you gave them.  Of course they'd be using it
but I suspect most of them would do fine with a programmable interface
too.  Again, please think of cpu affinity.


All the methodology to support the feature may need an arbitrator/agent to 
decide the allocation.


1. Let the root user or system administrator be the one who decides the
allocation based on the current usage. We assume this to be one with
administrative privileges. He could use the cgroup interface to perform the
task. One way to do the cpu affinity is by mounting cpuset and rdt cgroup 
together.


2. Kernel automatically assigning the cache based on the priority of the apps
etc. This is something which could be designed to co-exist with the #1 above
much like how the cpusets cgroup co-exist with the kernel assigning cpus to 
tasks. (the task could be having a cache capacity mask 
just like the cpu affinity mask)


3. User programmable interface , where say a resource management program
x (and hence apps) could link a library which supports cache alloc/monitoring
etc and then try to control and monitor the resources. The arbitrator could just
be the resource management interface itself or the kernel could decide.

If users use this programmable interface, we need to 
make sure all the apps just cannot allocate resources without some interfacing 
agent (in which case they could interface with #2 ?).


Do you think there are any issues for the user programmable interface to 
co-exist with the cgroup interface ?


Thanks,
Vikas



Thanks.

--
tejun


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] Staging: android: timed_gpio.c: fix coding style errors

2015-08-04 Thread Jandy Gou
remove an extra space and replace a tab with a space after a variable

Signed-off-by: Jandy Gou 
---
 drivers/staging/android/timed_gpio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/android/timed_gpio.c 
b/drivers/staging/android/timed_gpio.c
index 938a35c..ce11726 100644
--- a/drivers/staging/android/timed_gpio.c
+++ b/drivers/staging/android/timed_gpio.c
@@ -61,9 +61,9 @@ static int gpio_get_time(struct timed_output_dev *dev)
 
 static void gpio_enable(struct timed_output_dev *dev, int value)
 {
-   struct timed_gpio_data  *data =
+   struct timed_gpio_data *data =
container_of(dev, struct timed_gpio_data, dev);
-   unsigned long   flags;
+   unsigned long flags;
 
spin_lock_irqsave(&data->lock, flags);
 
-- 
1.9.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] Staging: android: timed_gpio.c: fix coding style errors

2015-08-04 Thread Jandy Gou
remove an extra space
replace a tab with a space after a variable

Jandy Gou (1):
  Staging: android: timed_gpio.c: fix coding style errors

 drivers/staging/android/timed_gpio.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
1.9.1


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 3/4] bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter

2015-08-04 Thread xiakaixu
于 2015/8/5 1:55, Alexei Starovoitov 写道:
> On 8/4/15 1:58 AM, Kaixu Xia wrote:
>> +static int check_func_limit(struct bpf_map **mapp, int func_id)
> 
> how about 'check_map_func_compatibility' or 'check_map_func_affinity' ?
> 
>> +{
>> +struct bpf_map *map = *mapp;
> 
> why pass pointer to a pointer? single pointer would be be fine.
> 
>> +bool bool_map, bool_func;
>> +int i;
>> +
>> +if (!map)
>> +return 0;
>> +
>> +for (i = 0; i <= ARRAY_SIZE(func_limit); i++) {
>> +bool_map = (map->map_type == func_limit[i].map_type);
>> +bool_func = (func_id == func_limit[i].func_id);
>> +/* only when map & func pair match it can continue.
>> + * don't allow any other map type to be passed into
>> + * the special func;
>> + */
>> +if (bool_map != bool_func)
>> +return -EINVAL;
>> +}
> 
> nice simplification!
> 
> the rest of the changes look good.
> please respin your next set against net-next.

Thanks for your review! I will follow them in the next set.
> 
> 
> .
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Cc llvmdev: Re: llvm bpf debug info. Re: [RFC PATCH v4 3/3] bpf: Introduce function for outputing data to perf event

2015-08-04 Thread Wangnan (F)

Send again since llvmdev is moved to llvm-...@lists.llvm.org

On 2015/8/5 9:58, Wangnan (F) wrote:



On 2015/8/4 17:01, Wangnan (F) wrote:

For people who in llvmdev:

This mail is belong to a thread in linux kernel mailing list, the 
first message

can be retrived from:

 http://lkml.kernel.org/r/55b1535e.8090...@plumgrid.com

Our goal is to find a way to make a BPF program get a unique ID for 
each type
so it can pass the ID to other parts of the kernel; then we can retrieve 
the type and
decode the structure using DWARF information. Currently we have two 
problems

that need to be solved:

1. Dwarf information generated by BPF backend lost all DW_AT_name field;

2. How to get typeid from local variable? I tried llvm.eh_typeid_for
   but it support global variable only.

Following is my response to Alexei.

On 2015/8/4 3:44, Alexei Starovoitov wrote:

On 7/31/15 3:18 AM, Wangnan (F) wrote:



[SNIP]


didn't have time to look at it.
from your llvm patches looks like you've got quite experienced
with it already :)


I'll post 2 LLVM patches by replying this mail. Please have a look and
help me
send them to LLVM if you think my code is correct.


patch 1:
I don't quite understand the purpose of builtin_dwarf_cfa
returning R11. It's a special register seen inside llvm codegen
only. It doesn't have kernel meaning.



Kernel side verifier allows us to do arithmetic computation using two 
local variable
address or local variable address and R11. Therefore, we can compute 
the location

of a local variable using:

  mark = _var_a - __builtin_frame_address(0);

If the stack allocation is fixed (if the location is never reused), 
the above 'mark'
can uniquely identify a local variable. That's why I'm interested 
in it. However

I'm not sure whether the prerequisite holds.


patch 2:
do we really need to hack clang?
Can you just define a function that aliases to intrinsic,
like we do for ld_abs/ld_ind ?
void bpf_store_half(void *skb, u64 off, u64 val) 
asm("llvm.bpf.store.half");

then no extra patches necessary.


struct my_str {
 int x;
 int y;
};
struct my_str __str_my_str;

struct my_str2 {
 int x;
 int y;
 int z;
};
struct my_str2 __str_my_str2;

 test_func(__builtin_bpf_typeid(&__str_my_str));
test_func(__builtin_bpf_typeid(&__str_my_str2));
 mov r1, 1
 call4660
 mov r1, 2
 call4660


this part looks good. I think it's usable.

> 1. llvm.eh_typeid_for can be used on global variables only. So for 
each

> output
> structure we have to define a global varable.

why? I think it should work with local pointers as well.



It is defined by LLVM, in lib/CodeGen/Analysis.cpp:

/// ExtractTypeInfo - Returns the type info, possibly bitcast, 
encoded in V.

GlobalValue *llvm::ExtractTypeInfo(Value *V) {
  ...
  assert((GV || isa(V)) &&
 "TypeInfo must be a global variable or NULL");   <-- we can 
use only constant pointers

  return GV;
}

So from llvm::Intrinsic::eh_typeid_for we can get type of global 
variables only.


We may need a new intrinsic for that.


> 2. We still need to find a way to connect the fetchd typeid with 
DWARF

> info.
> Inserting that ID into DWARF may workable?

hmm, that id should be the same id we're seeing in dwarf, right?


There's no 'typeid' field in dwarf originally. I'm still looking for 
a way

to inject this ID into dwarf information.


I think it's used in exception handling which is reusing some of
the dwarf stuff for this, so the must be a way to connect that id
to actual type info. Though last time I looked at EH was
during g++ hacking days. No idea how llvm does it exactly, but
I'm assuming the logic for rtti should be similar.



I'm not sure whether RTTI use dwarf to deduce type information. I 
think not,

because dwarf infos can be stripped out.



Hi Alexei,

Just found that llvm::Intrinsic::eh_typeid_for is function specific. 
ID of same type in

different functions may be different. Here is an example:

static int (*bpf_output_event)(unsigned long, void *buf, int size) =
(void *) 0x1234;

struct my_str {
int x;
int y;
};
struct my_str __str_my_str;

struct my_str2 {
int x;
int y;
int z;
};
struct my_str2 __str_my_str2;

int func(int *ctx)
{
struct my_str var_a;
struct my_str2 var_b;
bpf_output_event(__builtin_bpf_typeid(&__str_my_str), _a, 
sizeof(var_a));
bpf_output_event(__builtin_bpf_typeid(&__str_my_str2), _b, 
sizeof(var_b));

return 0;
}

int func2(int *ctx)
{
struct my_str var_a;
struct my_str2 var_b;

/* change order here */
bpf_output_event(__builtin_bpf_typeid(&__str_my_str2), _b, 
sizeof(var_b));
bpf_output_event(__builtin_bpf_typeid(&__str_my_str), _a, 
sizeof(var_a))

return 0;
}

This program uses __builtin_bpf_typeid(llvm::Intrinsic::eh_typeid_for) 
in func and func2
for same two types but in different order. We expect 

Re: [PATCH 04/11] ext4: Add ext4_get_block_dax()

2015-08-04 Thread Dave Chinner
On Tue, Aug 04, 2015 at 03:57:58PM -0400, Matthew Wilcox wrote:
> From: Matthew Wilcox 
> 
> DAX wants different semantics from any currently-existing ext4
> get_block callback.  Unlike ext4_get_block_write(), it needs to honour
> the 'create' flag, and unlike ext4_get_block(), it needs to be able
> to return unwritten extents.  So introduce a new ext4_get_block_dax()
> which has those semantics.  We could also change ext4_get_block_write()
> to honour the 'create' flag, but that might have consequences on other
> users that I do not currently understand.
> 
> Signed-off-by: Matthew Wilcox 

Doesn't this make the first patch in the series redundant?

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] serial: don't announce CIR serial ports

2015-08-04 Thread Peter Hurley
On 08/04/2015 07:25 PM, Maciej S. Szmigiero wrote:
> Hi Peter,
> 
> Thanks for looking into it.
> 
> On 04.08.2015 03:46, Peter Hurley wrote:
>> Hi Maciej,
>>
>> On 08/02/2015 05:09 PM, Maciej S. Szmigiero wrote:
>>> CIR type serial ports aren't real serial ports.
>>> This is just a way to prevent legacy serial driver
>>> from probing and eventually binding some resources
>>> so don't announce them like normal serial ports.
>>
>> I'd like to keep some form of reporting so that we know the
>> port was properly probed; what about extending uart_report_port()
>> to including CIR + disabled status?
> 
> Currently the printed message looks like this:
> "00:01: ttyS2 at I/O 0x3e8 (irq = 7, base_baud = 115200) is a CIR port".
> 
> I think it would be best to skip a device file name in this case,
> since this is how user sees (and uses) a real serial port.
> The message would be then:
> "00:01 at I/O 0x3e8 (irq = 7, base_baud = 115200) is a CIR port".
> 
> The dev name will always be present since the only current
> "source" of CIR ports is PNP 8250 driver which sets 
> dev pointer uncondtionally.
> 
>> Secondly, good catch! Because we should not be trying to
>> register a console on this port, nor driving modem signals.
>>
>> So maybe an early exit after uart_report_port?
> 
> All right, I will resubmit updated patch tomorrow.

In re-reviewing this, I think the proper solution is actually not
to add the uart port for a CIR port at all. It doesn't make
sense because the tty cannot be changed by setserial/ioctl(TIOCSSERIAL),
so the device node serves no purpose.

This problem is really an artifact of the 8250 driver port management,
and shouldn't involve the serial core at all.

An additional benefit of this approach is that a simple one-line
banner noting the port skip could be emitted instead from
serial8250_register_8250_port().

Regards,
Peter Hurley

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: powerpc: Add an inline function to update HID0

2015-08-04 Thread Michael Ellerman
On Tue, 2015-08-04 at 19:36 +0530, Madhavan Srinivasan wrote:
> 
> On Tuesday 04 August 2015 03:38 PM, Michael Ellerman wrote:
> > On Tue, 2015-04-08 at 08:30:58 UTC, "Gautham R. Shenoy" wrote:
> >> Section 3.7 of Version 1.2 of the Power8 Processor User's Manual
> >> prescribes that updates to HID0 be preceded by a SYNC instruction and
> >> followed by an ISYNC instruction (Page 91).
> >>
> >> Create a function name update_hid0() which follows this recipe and
> >> invoke it from the static split core path.
> >>
> >> Signed-off-by: Gautham R. Shenoy 
> >> ---
> >>  arch/powerpc/include/asm/kvm_ppc.h   | 11 +++
> > Why is it in there? It's not KVM related per se.
> >
> > Where should it go? I think reg.h would be best, ideally near the definition
> > for HID0, though that's probably not possible because of ASSEMBLY 
> > requirements.
> > So at the bottom of reg.h ?
> 
> just to understand, Something like this will not do?
> 
> #define update_hid0(x)  __asm__ __volatile__(   "sync\n"\
> "mtspr " 
> __stringify(SPRN_HID0)", %0\n"\
> "isync"::"r"(x));
> 

Yeah we could do that also.

The static inline is less ugly though.

cheers


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/



Re: Cc llvmdev: Re: llvm bpf debug info. Re: [RFC PATCH v4 3/3] bpf: Introduce function for outputing data to perf event

2015-08-04 Thread Wangnan (F)



On 2015/8/4 17:01, Wangnan (F) wrote:

For people who in llvmdev:

This mail is belong to a thread in linux kernel mailing list, the 
first message

can be retrived from:

 http://lkml.kernel.org/r/55b1535e.8090...@plumgrid.com

Our goal is to find a way to make a BPF program get a unique ID for 
each type
so it can pass the ID to other parts of the kernel; then we can retrieve the 
type and
decode the structure using DWARF information. Currently we have two 
problems

that need to be solved:

1. Dwarf information generated by BPF backend lost all DW_AT_name field;

2. How to get typeid from local variable? I tried llvm.eh_typeid_for
   but it support global variable only.

Following is my response to Alexei.

On 2015/8/4 3:44, Alexei Starovoitov wrote:

On 7/31/15 3:18 AM, Wangnan (F) wrote:



[SNIP]


didn't have time to look at it.
from your llvm patches looks like you've got quite experienced
with it already :)


I'll post 2 LLVM patches by replying this mail. Please have a look and
help me
send them to LLVM if you think my code is correct.


patch 1:
I don't quite understand the purpose of builtin_dwarf_cfa
returning R11. It's a special register seen inside llvm codegen
only. It doesn't have kernel meaning.



Kernel side verifier allows us to do arithmetic computation using two 
local variable
address or local variable address and R11. Therefore, we can compute 
the location

of a local variable using:

  mark = _var_a - __builtin_frame_address(0);

If the stack allocation is fixed (if the location is never reused), 
the above 'mark'
can uniquely identify a local variable. That's why I'm interested 
in it. However

I'm not sure whether the prerequisite holds.


patch 2:
do we really need to hack clang?
Can you just define a function that aliases to intrinsic,
like we do for ld_abs/ld_ind ?
void bpf_store_half(void *skb, u64 off, u64 val) 
asm("llvm.bpf.store.half");

then no extra patches necessary.


struct my_str {
 int x;
 int y;
};
struct my_str __str_my_str;

struct my_str2 {
 int x;
 int y;
 int z;
};
struct my_str2 __str_my_str2;

 test_func(__builtin_bpf_typeid(&__str_my_str));
 test_func(__builtin_bpf_typeid(&__str_my_str2));
 mov r1, 1
 call4660
 mov r1, 2
 call4660


this part looks good. I think it's usable.

> 1. llvm.eh_typeid_for can be used on global variables only. So for 
each

> output
> structure we have to define a global varable.

why? I think it should work with local pointers as well.



It is defined by LLVM, in lib/CodeGen/Analysis.cpp:

/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded 
in V.

GlobalValue *llvm::ExtractTypeInfo(Value *V) {
  ...
  assert((GV || isa(V)) &&
 "TypeInfo must be a global variable or NULL");   <-- we can 
use only constant pointers

  return GV;
}

So from llvm::Intrinsic::eh_typeid_for we can get type of global 
variables only.


We may need a new intrinsic for that.



> 2. We still need to find a way to connect the fetchd typeid with DWARF
> info.
> Inserting that ID into DWARF may workable?

hmm, that id should be the same id we're seeing in dwarf, right?


There's no 'typeid' field in dwarf originally. I'm still looking for a 
way

to inject this ID into dwarf information.


I think it's used in exception handling which is reusing some of
the dwarf stuff for this, so the must be a way to connect that id
to actual type info. Though last time I looked at EH was
during g++ hacking days. No idea how llvm does it exactly, but
I'm assuming the logic for rtti should be similar.



I'm not sure whether RTTI use dwarf to deduce type information. I 
think not,

because dwarf infos can be stripped out.



Hi Alexei,

Just found that llvm::Intrinsic::eh_typeid_for is function specific. ID 
of same type in

different functions may be different. Here is an example:

static int (*bpf_output_event)(unsigned long, void *buf, int size) =
(void *) 0x1234;

struct my_str {
int x;
int y;
};
struct my_str __str_my_str;

struct my_str2 {
int x;
int y;
int z;
};
struct my_str2 __str_my_str2;

int func(int *ctx)
{
struct my_str var_a;
struct my_str2 var_b;
bpf_output_event(__builtin_bpf_typeid(&__str_my_str), _a, 
sizeof(var_a));
bpf_output_event(__builtin_bpf_typeid(&__str_my_str2), _b, 
sizeof(var_b));

return 0;
}

int func2(int *ctx)
{
struct my_str var_a;
struct my_str2 var_b;

/* change order here */
bpf_output_event(__builtin_bpf_typeid(&__str_my_str2), &var_b, 
sizeof(var_b));
bpf_output_event(__builtin_bpf_typeid(&__str_my_str), &var_a, 
sizeof(var_a));

return 0;
}

This program uses __builtin_bpf_typeid(llvm::Intrinsic::eh_typeid_for) 
in func and func2

for same two types but in different order. We expect same type get same ID.

Compiled with:

 $ clang -target bpf -S -O2 -c ./test_bpf_typeid.c


Re: [PATCH V3 3/3] rtc: da9063: Add DA9062 RTC capability to DA9063 RTC driver

2015-08-04 Thread Alexandre Belloni
On 21/07/2015 at 11:29:07 +0100, S Twiss wrote :
> From: S Twiss 
> 
> Add DA9062 RTC support into the existing DA9063 RTC driver component by
> using generic access tables for common register and bit mask definitions.
> 
> The following change will add generic register and bit mask support to the
> DA9063 RTC. The changes are slightly complicated by requiring support for
> three register sets: DA9063-AD, DA9063-BB and DA9062-AA.
> 
> The following alterations have been made to the DA9063 RTC:
> 
> - Addition of a da9063_compatible_rtc_regmap structure to hold all generic
>   registers and bitmasks for this type of RTC component.
> - A re-write of struct da9063 to use pointers for regmap and compatible
>   registers/masks definitions
> - Addition of a of_device_id table for DA9063 and DA9062 defaults
> - Refactoring functions to use struct da9063_compatible_rtc accesses to
>   generic registers/masks instead of using defines from registers.h
> - Re-work of da9063_rtc_probe() to use of_match_node() and dev_get_regmap()
>   to provide initialisation of generic registers and masks and access to
>   regmap
> 
> Signed-off-by: Steve Twiss 
> 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] rtc: mt6397: implement suspend/resume function in rtc-mt6397 driver

2015-08-04 Thread Alexandre Belloni
On 30/07/2015 at 22:53:14 +0800, Henry Chen wrote :
> Implement the suspend/resume function in order to control rtc's irq_wake flag 
> and handle as wakeup source.
> 
> Signed-off-by: Henry Chen 
> ---
>  drivers/rtc/rtc-mt6397.c | 26 ++
>  1 file changed, 26 insertions(+)
> 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ARM: BCM: Enable ARM erratum 798181 for BRCMSTB

2015-08-04 Thread Gregory Fong
Commit 04fcab32d3fa1d3f6afe97e0ab431c5572e07a2c ("ARM: 8111/1: Enable
erratum 798181 for Broadcom Brahma-B15") enables this erratum for
affected Broadcom Brahma-B15 CPUs when CONFIG_ARM_ERRATA_798181=y.
Let's make sure that config option is actually set.

Signed-off-by: Gregory Fong 
---
 arch/arm/mach-bcm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig
index 0ac9e4b3..eed8746 100644
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -140,6 +140,7 @@ config ARCH_BCM_63XX
 config ARCH_BRCMSTB
bool "Broadcom BCM7XXX based boards" if ARCH_MULTI_V7
select ARM_GIC
+   select ARM_ERRATA_798181 if SMP
select HAVE_ARM_ARCH_TIMER
select BRCMSTB_GISB_ARB
select BRCMSTB_L2_IRQ
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 3/3] RTC: switch to using is_visible() to control sysfs attributes

2015-08-04 Thread Alexandre Belloni
On 23/07/2015 at 16:01:08 -0700, Dmitry Torokhov wrote :
> Instead of creating wakealarm attribute manually, after the device has been
> registered, let's rely on facilities provided by the attribute groups to
> control which attributes are visible and which are not. This allows us to
> create all needed attributes at once, at the same time that we register RTC
> class device.
> 
> Signed-off-by: Dmitry Torokhov 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] RTC: make rtc_does_wakealarm() return boolean

2015-08-04 Thread Alexandre Belloni
On 23/07/2015 at 16:01:06 -0700, Dmitry Torokhov wrote :
> Users of rtc_does_wakealarm() return value treat it as boolean so let's
> change the signature accordingly.
> 
> Signed-off-by: Dmitry Torokhov 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 2/3] RTC: switch wakealarm attribute to DEVICE_ATTR_RW

2015-08-04 Thread Alexandre Belloni
On 23/07/2015 at 16:01:07 -0700, Dmitry Torokhov wrote :
> Instead of using older style DEVICE_ATTR for wakealarm attribute let's
> switch to using DEVICE_ATTR_RW that ensures consistent across the kernel
> permissions on the attribute.
> 
> Signed-off-by: Dmitry Torokhov 
Applied, thanks.

-- 
Alexandre Belloni, Free Electrons
Embedded Linux, Kernel and Android engineering
http://free-electrons.com
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH net] netfilter: conntrack: Use flags in nf_ct_tmpl_alloc()

2015-08-04 Thread Joe Stringer
On 4 August 2015 at 18:34, Joe Stringer  wrote:
> The flags were ignored for this function when it was introduced. Also
> fix the style problem in kzalloc.
>
> Fixes: 0838aa7fc (netfilter: fix netns dependencies with conntrack
> templates)
> Signed-off-by: Joe Stringer 

s/net/nf/ in subject line.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/3] Make workingset detection logic memcg aware

2015-08-04 Thread Kamezawa Hiroyuki
On 2015/08/03 21:04, Vladimir Davydov wrote:
> Hi,
> 
> Currently, workingset detection logic is not memcg aware - inactive_age
> is maintained per zone. As a result, if memory cgroups are used,
> refaulted file pages are activated randomly. This patch set makes
> inactive_age per lruvec so that workingset detection will work correctly
> for memory cgroup reclaim.
> 
> Thanks,
> 

Reading discussion, I feel storing more data is difficult, too.

I wonder, rather than collecting more data, rough calculation can help the 
situation.
for example,

   (refault_distance calculated in zone) * memcg_reclaim_ratio < memcg's 
active list

If one of per-zone calc or per-memcg calc returns true, refault should be true.

memcg_reclaim_ratio is the percentage of scan in a memcg against in a zone.


Thanks,
-Kame



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH net] netfilter: conntrack: Use flags in nf_ct_tmpl_alloc()

2015-08-04 Thread Joe Stringer
The flags were ignored for this function when it was introduced. Also
fix the style problem in kzalloc.

Fixes: 0838aa7fc (netfilter: fix netns dependencies with conntrack
templates)
Signed-off-by: Joe Stringer 
---
 net/netfilter/nf_conntrack_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index f168099..3c20d02 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -292,7 +292,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, 
gfp_t flags)
 {
struct nf_conn *tmpl;
 
-   tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL);
+   tmpl = kzalloc(sizeof(*tmpl), flags);
if (tmpl == NULL)
return NULL;
 
@@ -303,7 +303,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, 
gfp_t flags)
if (zone) {
struct nf_conntrack_zone *nf_ct_zone;
 
-   nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC);
+   nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, flags);
if (!nf_ct_zone)
goto out_free;
nf_ct_zone->id = zone;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RT 5/7] ipc/mqueue: Implement lockless pipelined wakeups

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Davidlohr Bueso 

This patch moves the wakeup_process() invocation so it is not done under
the info->lock by making use of a lockless wake_q. With this change, the
waiter is woken up once it is STATE_READY and it does not need to loop
on SMP if it is still in STATE_PENDING. In the timeout case we still need
to grab the info->lock to verify the state.

This change should also avoid the introduction of preempt_disable() in -rt
which avoids a busy-loop which polls for the STATE_PENDING -> STATE_READY
change if the waiter has a higher priority compared to the waker.

Additionally, this patch micro-optimizes wq_sleep by using the cheaper
cousin of set_current_state(TASK_INTERRUPTIBLE) as we will block no
matter what, thus get rid of the implied barrier.

Cc: stable...@vger.kernel.org
Signed-off-by: Davidlohr Bueso 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: George Spelvin 
Acked-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: Borislav Petkov 
Cc: Chris Mason 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Manfred Spraul 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Steven Rostedt 
Cc: d...@stgolabs.net
Link: http://lkml.kernel.org/r/1430748166.1940.17.ca...@stgolabs.net
Signed-off-by: Ingo Molnar 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt 
---
 ipc/mqueue.c | 53 -
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 516902313dc3..79351b5dd0a1 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -47,8 +47,7 @@
 #define RECV   1
 
 #define STATE_NONE 0
-#define STATE_PENDING  1
-#define STATE_READY2
+#define STATE_READY1
 
 struct posix_msg_tree_node {
struct rb_node  rb_node;
@@ -571,15 +570,12 @@ static int wq_sleep(struct mqueue_inode_info *info, int 
sr,
wq_add(info, sr, ewp);
 
for (;;) {
-   set_current_state(TASK_INTERRUPTIBLE);
+   __set_current_state(TASK_INTERRUPTIBLE);
 
spin_unlock(&info->lock);
time = schedule_hrtimeout_range_clock(timeout, 0,
HRTIMER_MODE_ABS, CLOCK_REALTIME);
 
-   while (ewp->state == STATE_PENDING)
-   cpu_relax();
-
if (ewp->state == STATE_READY) {
retval = 0;
goto out;
@@ -907,11 +903,15 @@ out_name:
  * list of waiting receivers. A sender checks that list before adding the new
  * message into the message array. If there is a waiting receiver, then it
  * bypasses the message array and directly hands the message over to the
- * receiver.
- * The receiver accepts the message and returns without grabbing the queue
- * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
- * are necessary. The same algorithm is used for sysv semaphores, see
- * ipc/sem.c for more details.
+ * receiver. The receiver accepts the message and returns without grabbing the
+ * queue spinlock:
+ *
+ * - Set pointer to message.
+ * - Queue the receiver task for later wakeup (without the info->lock).
+ * - Update its state to STATE_READY. Now the receiver can continue.
+ * - Wake up the process after the lock is dropped. Should the process wake up
+ *   before this wakeup (due to a timeout or a signal) it will either see
+ *   STATE_READY and continue or acquire the lock to check the state again.
  *
  * The same algorithm is used for senders.
  */
@@ -919,7 +919,8 @@ out_name:
 /* pipelined_send() - send a message directly to the task waiting in
  * sys_mq_timedreceive() (without inserting message into a queue).
  */
-static inline void pipelined_send(struct mqueue_inode_info *info,
+static inline void pipelined_send(struct wake_q_head *wake_q,
+ struct mqueue_inode_info *info,
  struct msg_msg *message,
  struct ext_wait_queue *receiver)
 {
@@ -929,16 +930,23 @@ static inline void pipelined_send(struct 
mqueue_inode_info *info,
preempt_disable_rt();
receiver->msg = message;
list_del(&receiver->list);
-   receiver->state = STATE_PENDING;
-   wake_up_process(receiver->task);
-   smp_wmb();
+   wake_q_add(wake_q, receiver->task);
+   /*
+* Rely on the implicit cmpxchg barrier from wake_q_add such
+* that we can ensure that updating receiver->state is the last
+* write operation: As once set, the receiver can continue,
+* and if we don't have the reference count from the wake_q,
+* yet, at that point we can later have a use-after-free
+* condition and bogus wakeup.
+*/
receiver->state = STATE_READY;
preempt_enable_rt();
 }
 
 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
  * gets its message and put 

[PATCH RT 4/7] futex: Implement lockless wakeups

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Davidlohr Bueso 

Given the overall futex architecture, any chance of reducing
hb->lock contention is welcome. In this particular case, using
wake-queues to enable lockless wakeups addresses very much real
world performance concerns, even cases of soft-lockups in cases
of large amounts of blocked tasks (which is not hard to find in
large boxes, using but just a handful of futex).

At the lowest level, this patch can reduce latency of a single thread
attempting to acquire hb->lock in highly contended scenarios by a
up to 2x. At lower counts of nr_wake there are no regressions,
confirming, of course, that the wake_q handling overhead is practically
non existent. For instance, while a fair amount of variation,
the extended pef-bench wakeup benchmark shows for a 20 core machine
the following avg per-thread time to wakeup its share of tasks:

nr_thr  ms-before   ms-after
16  0.0590  0.0215
32  0.0396  0.0220
48  0.0417  0.0182
64  0.0536  0.0236
80  0.0414  0.0097
96  0.0672  0.0152

Naturally, this can cause spurious wakeups. However there is no core code
that cannot handle them afaict, and furthermore tglx does have the point
that other events can already trigger them anyway.

Cc: stable...@vger.kernel.org
Signed-off-by: Davidlohr Bueso 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Thomas Gleixner 
Cc: Andrew Morton 
Cc: Borislav Petkov 
Cc: Chris Mason 
Cc: Davidlohr Bueso 
Cc: George Spelvin 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Manfred Spraul 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Steven Rostedt 
Link: 
http://lkml.kernel.org/r/1430494072-30283-3-git-send-email-d...@stgolabs.net
Signed-off-by: Ingo Molnar 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt 
---
 kernel/futex.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 647ff4b3a150..f9172a5ee332 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1092,9 +1092,11 @@ static void __unqueue_futex(struct futex_q *q)
 
 /*
  * The hash bucket lock must be held when this is called.
- * Afterwards, the futex_q must not be accessed.
+ * Afterwards, the futex_q must not be accessed. Callers
+ * must ensure to later call wake_up_q() for the actual
+ * wakeups to occur.
  */
-static void wake_futex(struct futex_q *q)
+static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
 {
struct task_struct *p = q->task;
 
@@ -1102,14 +1104,10 @@ static void wake_futex(struct futex_q *q)
return;
 
/*
-* We set q->lock_ptr = NULL _before_ we wake up the task. If
-* a non-futex wake up happens on another CPU then the task
-* might exit and p would dereference a non-existing task
-* struct. Prevent this by holding a reference on p across the
-* wake up.
+* Queue the task for later wakeup for after we've released
+* the hb->lock. wake_q_add() grabs reference to p.
 */
-   get_task_struct(p);
-
+   wake_q_add(wake_q, p);
__unqueue_futex(q);
/*
 * The waiting task can free the futex_q as soon as
@@ -1119,9 +1117,6 @@ static void wake_futex(struct futex_q *q)
 */
smp_wmb();
q->lock_ptr = NULL;
-
-   wake_up_state(p, TASK_NORMAL);
-   put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1219,6 +1214,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int 
nr_wake, u32 bitset)
struct futex_q *this, *next;
union futex_key key = FUTEX_KEY_INIT;
int ret;
+   WAKE_Q(wake_q);
 
if (!bitset)
return -EINVAL;
@@ -1246,13 +1242,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int 
nr_wake, u32 bitset)
if (!(this->bitset & bitset))
continue;
 
-   wake_futex(this);
+   mark_wake_futex(_q, this);
if (++ret >= nr_wake)
break;
}
}
 
spin_unlock(&hb->lock);
+   wake_up_q(&wake_q);
 out_put_key:
put_futex_key(&key);
 out:
@@ -1271,6 +1268,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 
__user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
int ret, op_ret;
+   WAKE_Q(wake_q);
 
 retry:
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
@@ -1322,7 +1320,7 @@ retry_private:
ret = -EINVAL;
goto out_unlock;
}
-   wake_futex(this);
+  

[PATCH RT 2/7] mm/slub: move slab initialization into irq enabled region

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Thomas Gleixner 

Initializing a new slab can introduce rather large latencies because most
of the initialization runs always with interrupts disabled.

There is no point in doing so.  The newly allocated slab is not visible
yet, so there is no reason to protect it against concurrent alloc/free.

Move the expensive parts of the initialization into allocate_slab(), so
for all allocations with GFP_WAIT set, interrupts are enabled.

Signed-off-by: Thomas Gleixner 
Acked-by: Christoph Lameter 
Cc: Pekka Enberg 
Cc: David Rientjes 
Cc: Joonsoo Kim 
Cc: Sebastian Andrzej Siewior 
Cc: Steven Rostedt 
Cc: Peter Zijlstra 
Signed-off-by: Andrew Morton 
Signed-off-by: Steven Rostedt 
---
 mm/slub.c | 79 ++-
 1 file changed, 38 insertions(+), 41 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 534609a0326a..e48bca049f21 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1279,6 +1279,13 @@ static inline void slab_free_hook(struct kmem_cache *s, 
void *x)
debug_check_no_obj_freed(x, s->object_size);
 }
 
+static void setup_object(struct kmem_cache *s, struct page *page,
+   void *object)
+{
+   setup_object_debug(s, page, object);
+   if (unlikely(s->ctor))
+   s->ctor(object);
+}
 /*
  * Slab allocation and freeing
  */
@@ -1310,6 +1317,8 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
bool enableirqs;
+   void *start, *p;
+   int idx, order;
 
flags &= gfp_allowed_mask;
 
@@ -1337,13 +1346,13 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
 * Try a lower order alloc if possible
 */
page = alloc_slab_page(s, alloc_gfp, node, oo);
-
-   if (page)
-   stat(s, ORDER_FALLBACK);
+   if (unlikely(!page))
+   goto out;
+   stat(s, ORDER_FALLBACK);
}
 
-   if (kmemcheck_enabled && page
-   && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
+   if (kmemcheck_enabled &&
+   !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
 
kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
@@ -1358,45 +1367,9 @@ static struct page *allocate_slab(struct kmem_cache *s, 
gfp_t flags, int node)
kmemcheck_mark_unallocated_pages(page, pages);
}
 
-   if (enableirqs)
-   local_irq_disable();
-   if (!page)
-   return NULL;
-
page->objects = oo_objects(oo);
-   mod_zone_page_state(page_zone(page),
-   (s->flags & SLAB_RECLAIM_ACCOUNT) ?
-   NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-   1 << oo_order(oo));
-
-   return page;
-}
-
-static void setup_object(struct kmem_cache *s, struct page *page,
-   void *object)
-{
-   setup_object_debug(s, page, object);
-   if (unlikely(s->ctor))
-   s->ctor(object);
-}
-
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
-{
-   struct page *page;
-   void *start;
-   void *p;
-   int order;
-   int idx;
-
-   BUG_ON(flags & GFP_SLAB_BUG_MASK);
-
-   page = allocate_slab(s,
-   flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
-   if (!page)
-   goto out;
 
order = compound_order(page);
-   inc_slabs_node(s, page_to_nid(page), page->objects);
page->slab_cache = s;
__SetPageSlab(page);
if (page->pfmemalloc)
@@ -1418,10 +1391,34 @@ static struct page *new_slab(struct kmem_cache *s, 
gfp_t flags, int node)
page->freelist = start;
page->inuse = page->objects;
page->frozen = 1;
+
 out:
+   if (enableirqs)
+   local_irq_disable();
+   if (!page)
+   return NULL;
+
+   mod_zone_page_state(page_zone(page),
+   (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+   NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
+   1 << oo_order(oo));
+
+   inc_slabs_node(s, page_to_nid(page), page->objects);
+
return page;
 }
 
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+   if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
+   pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
+   BUG();
+   }
+
+   return allocate_slab(s,
+   flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
+}
+
 static void __free_slab(struct kmem_cache *s, struct page *page)
 {
int order = compound_order(page);
-- 
2.1.4


--
To unsubscribe from this list: send the 

[PATCH RT 6/7] kernel/irq_work: fix non RT case

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

After the deadlock fixed, the checked got somehow away and broke the non-RT
case which could invoke IRQ-work from softirq context.

Cc: stable...@vger.kernel.org
Reported-by: Steven Rostedt 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt 
---
 kernel/time/timer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a29ab1a17023..3a978d000fce 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1452,7 +1452,8 @@ void update_process_times(int user_tick)
rcu_check_callbacks(cpu, user_tick);
 
 #if defined(CONFIG_IRQ_WORK) && !defined(CONFIG_PREEMPT_RT_FULL)
-   irq_work_tick();
+   if (in_irq())
+   irq_work_tick();
 #endif
run_posix_cpu_timers(p);
 }
-- 
2.1.4


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RT 3/7] sched: Implement lockless wake-queues

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Peter Zijlstra 

This is useful for locking primitives that can effect multiple
wakeups per operation and want to avoid lock internal lock contention
by delaying the wakeups until we've released the lock internal locks.

Alternatively it can be used to avoid issuing multiple wakeups, and
thus save a few cycles, in packet processing. Queue all target tasks
and wakeup once you've processed all packets. That way you avoid
waking the target task multiple times if there were multiple packets
for the same task.

Properties of a wake_q are:
- Lockless, as queue head must reside on the stack.
- Being a queue, maintains wakeup order passed by the callers. This can
  be important for otherwise, in scenarios where highly contended locks
  could affect any reliance on lock fairness.
- A queued task cannot be added again until it is woken up.

This patch adds the needed infrastructure into the scheduler code
and uses the new wake_list to delay the futex wakeups until
after we've released the hash bucket locks.

Cc: stable...@vger.kernel.org
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Steven Rostedt 
[tweaks, adjustments, comments, etc.]
Signed-off-by: Davidlohr Bueso 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: Chris Mason 
Cc: Davidlohr Bueso 
Cc: George Spelvin 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Manfred Spraul 
Cc: Sebastian Andrzej Siewior 
Cc: Steven Rostedt 
Link: 
http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-d...@stgolabs.net
Signed-off-by: Ingo Molnar 
Signed-off-by: Sebastian Andrzej Siewior 
---
 include/linux/sched.h | 46 ++
 kernel/sched/core.c   | 46 ++
 2 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 05353a40a462..97056d557b06 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -855,6 +855,50 @@ enum cpu_idle_type {
 #define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
 
 /*
+ * Wake-queues are lists of tasks with a pending wakeup, whose
+ * callers have already marked the task as woken internally,
+ * and can thus carry on. A common use case is being able to
+ * do the wakeups once the corresponding user lock as been
+ * released.
+ *
+ * We hold reference to each task in the list across the wakeup,
+ * thus guaranteeing that the memory is still valid by the time
+ * the actual wakeups are performed in wake_up_q().
+ *
+ * One per task suffices, because there's never a need for a task to be
+ * in two wake queues simultaneously; it is forbidden to abandon a task
+ * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
+ * already in a wake queue, the wakeup will happen soon and the second
+ * waker can just skip it.
+ *
+ * The WAKE_Q macro declares and initializes the list head.
+ * wake_up_q() does NOT reinitialize the list; it's expected to be
+ * called near the end of a function, where the fact that the queue is
+ * not used again will be easy to see by inspection.
+ *
+ * Note that this can cause spurious wakeups. schedule() callers
+ * must ensure the call is done inside a loop, confirming that the
+ * wakeup condition has in fact occurred.
+ */
+struct wake_q_node {
+   struct wake_q_node *next;
+};
+
+struct wake_q_head {
+   struct wake_q_node *first;
+   struct wake_q_node **lastp;
+};
+
+#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
+
+#define WAKE_Q(name)   \
+   struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
+
+extern void wake_q_add(struct wake_q_head *head,
+  struct task_struct *task);
+extern void wake_up_q(struct wake_q_head *head);
+
+/*
  * sched-domains (multiprocessor balancing) declarations:
  */
 #ifdef CONFIG_SMP
@@ -1463,6 +1507,8 @@ struct task_struct {
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
 
+   struct wake_q_node wake_q;
+
 #ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct rb_root pi_waiters;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8ad9dcc8270e..cd25ced2208e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -601,6 +601,52 @@ static bool set_nr_if_polling(struct task_struct *p)
 #endif
 #endif
 
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+   struct wake_q_node *node = &task->wake_q;
+
+   /*
+* Atomically grab the task, if ->wake_q is !nil already it means
+* its already queued (either by us or someone else) and will get the
+* wakeup due to that.
+*
+* This cmpxchg() implies a full barrier, which pairs with the write
+* barrier implied by the wakeup in wake_up_list().
+*/
+   if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))

[PATCH RT 1/7] Revert "slub: delay ctor until the object is requested"

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: Sebastian Andrzej Siewior 

This approach is broken with SLAB_DESTROY_BY_RCU allocations.
Reported by Steven Rostedt and Koehrer Mathias.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Steven Rostedt 
---
 mm/slub.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 72bb06beaabc..534609a0326a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1376,10 +1376,8 @@ static void setup_object(struct kmem_cache *s, struct 
page *page,
void *object)
 {
setup_object_debug(s, page, object);
-#ifndef CONFIG_PREEMPT_RT_FULL
if (unlikely(s->ctor))
s->ctor(object);
-#endif
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2501,10 +2499,6 @@ redo:
 
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->object_size);
-#ifdef CONFIG_PREEMPT_RT_FULL
-   if (unlikely(s->ctor) && object)
-   s->ctor(object);
-#endif
 
slab_post_alloc_hook(s, gfpflags, object);
 
-- 
2.1.4


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RT 7/7] Linux 3.18.18-rt16-rc1

2015-08-04 Thread Steven Rostedt
3.18.18-rt16-rc1 stable review patch.
If anyone has any objections, please let me know.

--

From: "Steven Rostedt (Red Hat)" 

---
 localversion-rt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/localversion-rt b/localversion-rt
index 18777ec0c27d..5d3eaf36ded1 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt15
+-rt16-rc1
-- 
2.1.4


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH RT 0/7] Linux 3.18.18-rt16-rc1

2015-08-04 Thread Steven Rostedt

Dear RT Folks,

This is the RT stable review cycle of patch 3.18.18-rt16-rc1.

Please scream at me if I messed something up. Please test the patches too.

The -rc release will be uploaded to kernel.org and will be deleted when
the final release is out. This is just a review release (or release candidate).

The pre-releases will not be pushed to the git repository, only the
final release is.

If all goes well, this patch will be converted to the next main release
on 8/10/2015.

Enjoy,

-- Steve


To build 3.18.18-rt16-rc1 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v3.x/linux-3.18.tar.xz

  http://www.kernel.org/pub/linux/kernel/v3.x/patch-3.18.18.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.18/patch-3.18.18-rt16-rc1.patch.xz

You can also build from 3.18.18-rt15 by applying the incremental patch:

http://www.kernel.org/pub/linux/kernel/projects/rt/3.18/incr/patch-3.18.18-rt15-rt16-rc1.patch.xz


Changes from 3.18.18-rt15:

---


Davidlohr Bueso (2):
  futex: Implement lockless wakeups
  ipc/mqueue: Implement lockless pipelined wakeups

Peter Zijlstra (1):
  sched: Implement lockless wake-queues

Sebastian Andrzej Siewior (2):
  Revert "slub: delay ctor until the object is requested"
  kernel/irq_work: fix non RT case

Steven Rostedt (Red Hat) (1):
  Linux 3.18.18-rt16-rc1

Thomas Gleixner (1):
  mm/slub: move slab initialization into irq enabled region


 include/linux/sched.h | 46 
 ipc/mqueue.c  | 53 +++-
 kernel/futex.c| 33 ++--
 kernel/sched/core.c   | 46 
 kernel/time/timer.c   |  3 +-
 localversion-rt   |  2 +-
 mm/slub.c | 85 +++
 7 files changed, 182 insertions(+), 86 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] zram: Replace pr_* with dev_*

2015-08-04 Thread Joe Perches
On Wed, 2015-08-05 at 08:57 +0900, Sergey Senozhatsky wrote:
> On (08/05/15 08:42), Sergey Senozhatsky wrote:
> > what's the benefit?
> 
> and apart from that I don't understand why do you replace some
> pr_info() with dev_warn(). f.e.

And besides that, the formats should not be changed
and still require a '\n' termination to avoid
possible interleaving.

> [..]
> > >   if (!zcomp_set_max_streams(zram->comp, num)) {
> > > - pr_info("Cannot change max compression streams\n");
> > > + dev_warn(dev, "Cannot change max compression streams to 
> > > %d",
> > > + num);

etc...

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v4 00/16] ARM: shmobile: Add CPG/MSTP Clock Domain

2015-08-04 Thread Simon Horman
On Tue, Aug 04, 2015 at 10:48:28PM +0900, Simon Horman wrote:
> Thanks Geert,
> 
> I have tentatively queued this up in its own branch,
> cpg-mstp-clock-domain-for-v4.3.

Where possible I prefer not to apply non-DTS/DTSI patches on top of
DTS/DTSI patches, I believe this is in keeping with how the ARM SoC
maintainers like things handled.  With this in mind I have shuffled things
around a little, the result is:

1. The following are (still) queued up for v4.3 in their own
   cpg-mstp-clock-domain-for-v4.3 branch:

   clk: shmobile: Add CPG/MSTP Clock Domain support
   clk: shmobile: r8a7778: Add CPG/MSTP Clock Domain support
   clk: shmobile: r8a7779: Add CPG/MSTP Clock Domain support
   clk: shmobile: rcar-gen2: Add CPG/MSTP Clock Domain support
   clk: shmobile: rz: Add CPG/MSTP Clock Domain support
   ARM: shmobile: r7s72100 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7778 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7779 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7790 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7791 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7793 dtsi: Add CPG/MSTP Clock Domain
   ARM: shmobile: r8a7794 dtsi: Add CPG/MSTP Clock Domain

2. The following are now queued up for v4.3 in a separate
   sh-drivers-for-v4.3 branch. That branch is on top of the
   cpg-mstp-clock-domain-for-v4.3 branch and I intend to send it directly
   to Linus if/when a v4.3-rc has been released with the patches
   listed in 1. present.

   drivers: sh: Disable legacy default PM Domain on emev2
   drivers: sh: Disable PM runtime for multi-platform ARM with genpd

3. I have deferred the remaining two patches, listed below, for v4.4.
   they currently reside in cpg-mstp-clock-domain-for-v4.4. That
   branch is based on the sh-drivers-for-v4.3 branch.

   clk: shmobile: mstp: Consider "zb_clk" suitable for power management
   ARM: shmobile: R-Mobile: Use CPG/MSTP Clock Domain attach/detach helpers


This means that the last two patches will disappear from next until
after v4.3-rc1 is released but they, along with all the other patches,
will be present in the renesas devel branch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] vmscan: fix increasing nr_isolated incurred by putback unevictable pages

2015-08-04 Thread Jaewon Kim


On 2015년 08월 05일 08:31, Minchan Kim wrote:
> Hello,
> 
> On Tue, Aug 04, 2015 at 03:09:37PM -0700, Andrew Morton wrote:
>> On Tue, 04 Aug 2015 19:40:08 +0900 Jaewon Kim  
>> wrote:
>>
>>> reclaim_clean_pages_from_list() assumes that shrink_page_list() returns
>>> number of pages removed from the candidate list. But shrink_page_list()
>>> puts back mlocked pages without passing it to caller and without
>>> counting as nr_reclaimed. This incurrs increasing nr_isolated.
>>> To fix this, this patch changes shrink_page_list() to pass unevictable
>>> pages back to caller. Caller will take care those pages.
>>>
>>> ..
>>>
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1157,7 +1157,7 @@ cull_mlocked:
>>> if (PageSwapCache(page))
>>> try_to_free_swap(page);
>>> unlock_page(page);
>>> -   putback_lru_page(page);
>>> +   list_add(&page->lru, &ret_pages);
>>> continue;
>>>  
>>>  activate_locked:
>>
>> Is this going to cause a whole bunch of mlocked pages to be migrated
>> whereas in current kernels they stay where they are?
>>
> 
> It fixes two issues.
> 
> 1. With unevictable page, cma_alloc will be successful.
> 
> Exactly speaking, cma_alloc of current kernel will fail due to unevictable 
> pages.
> 
> 2. fix leaking of NR_ISOLATED counter of vmstat
> 
> With it, too_many_isolated works. Otherwise, it could make hang until
> the process get SIGKILL.
> 
> So, I think it's stable material.
> 
> Acked-by: Minchan Kim 
> 
> 
> 
Hello

Traditional shrink_inactive_list will put back the unevictable pages as it does 
through putback_inactive_pages.
However as Minchan Kim said, cma_alloc will be more successful by migrating 
unevictable pages.
In current kernel, I think, cma_alloc is already trying to migrate unevictable 
pages except clean page cache.
This patch will allow clean page cache also to be migrated in cma_alloc.

Thank you
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/3] cgroup: define controller file conventions

2015-08-04 Thread Kamezawa Hiroyuki

On 2015/08/05 4:31, Tejun Heo wrote:

 From 6abc8ca19df0078de17dc38340db3002ed489ce7 Mon Sep 17 00:00:00 2001
From: Tejun Heo 
Date: Tue, 4 Aug 2015 15:20:55 -0400

Traditionally, each cgroup controller implemented whatever interface
it wanted leading to interfaces which are widely inconsistent.
Examining the requirements of the controllers readily yield that there
are only a few control schemes shared among all.

Two major controllers already had to implement new interface for the
unified hierarchy due to significant structural changes.  Let's take
the chance to establish common conventions throughout all controllers.

This patch defines CGROUP_WEIGHT_MIN/DFL/MAX to be used on all weight
based control knobs and documents the conventions that controllers
should follow on the unified hierarchy.  Except for io.weight knob,
all existing unified hierarchy knobs are already compliant.  A
follow-up patch will update io.weight.

v2: Added descriptions of min, low and high knobs.

Signed-off-by: Tejun Heo 
Acked-by: Johannes Weiner 
Cc: Li Zefan 
Cc: Peter Zijlstra 
---
Hello,

Added low/high descriptions and applied to the following git branch.

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git 
for-4.3-unified-base

The branch currently only contains this patch and will stay stable so
that it can be pulled from.  I kept the base weight as DFL for now.
If we decide to change it, I'll apply the change on top.

Thanks.

  Documentation/cgroups/unified-hierarchy.txt | 80 ++---
  include/linux/cgroup.h  |  9 
  2 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt 
b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7..1ee9caf 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -23,10 +23,13 @@ CONTENTS
  5. Other Changes
5-1. [Un]populated Notification
5-2. Other Core Changes
-  5-3. Per-Controller Changes
-5-3-1. blkio
-5-3-2. cpuset
-5-3-3. memory
+  5-3. Controller File Conventions
+5-3-1. Format
+5-3-2. Control Knobs
+  5-4. Per-Controller Changes
+5-4-1. blkio
+5-4-2. cpuset
+5-4-3. memory
  6. Planned Changes
6-1. CAP for resource control

@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and
  - The "cgroup.clone_children" file is removed.


-5-3. Per-Controller Changes
+5-3. Controller File Conventions

-5-3-1. blkio
+5-3-1. Format
+
+In general, all controller files should be in one of the following
+formats whenever possible.
+
+- Values only files
+
+  VAL0 VAL1...\n
+
+- Flat keyed files
+
+  KEY0 VAL0\n
+  KEY1 VAL1\n
+  ...
+
+- Nested keyed files
+
+  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+  ...
+
+For a writeable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+5-3-2. Control Knobs
+
+- Settings for a single feature should generally be implemented in a
+  single file.
+
+- In general, the root cgroup should be exempt from resource control
+  and thus shouldn't have resource control knobs.
+
+- If a controller implements ratio based resource distribution, the
+  control knob should be named "weight" and have the range [1, 10000]
+  and 100 should be the default value.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the control knobs should be named "min" and "max"
+  respectively.  If a controller implements best effort resource
+  guarantee and/or limit, the control knobs should be named "low" and
+  "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+

so, for memory controller, we'll have

(in alphabet order)
memory.failcnt
memory.force_empty  (<= should this be removed ?)
memory.kmem.failcnt
memory.kmem.max
memory.kmem.max_usage
memory.kmem.slabinfo
memory.kmem.tcp.failcnt
memory.kmem.tcp.max
memory.kmem.tcp.max_usage
memory.kmem.tcp.usage
memory.kmem.usage
memory.max
memory.max_usage
memory.move_charge_at_immigrate
memory.numa_stat
memory.oom_control
memory.pressure_level
memory.high
memory.swappiness
memory.usage
memory.use_hierarchy (<= removed)

?
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   3   4   5   6   7   8   9   10   >