[PATCH] ARM: KVM: Add missing break;

2013-04-13 Thread Joe Perches
commit 3401d54696f ("KVM: ARM: Introduce KVM_ARM_SET_DEVICE_ADDR ioctl")
added the case, but omitted adding break;

Add it.

Found with grep version 2.54 pattern:

$ grep -rP --include=*.[ch] 
"\b(\w+)\s*=[^;]+;\s*(?:case\s+\w+:|default:)\s*\1\s*="

Signed-off-by: Joe Perches 
---
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 5777e0c..b9f2228 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -198,6 +198,7 @@ int kvm_dev_ioctl_check_extension(long ext)
break;
case KVM_CAP_ARM_SET_DEVICE_ADDR:
r = 1;
+   break;
case KVM_CAP_NR_VCPUS:
r = num_online_cpus();
break;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] clk: vt8500: Missing breaks in vtwm_pll_round_rate/_set_rate.

2013-04-13 Thread Tony Prisk
The case of PLL_TYPE_WM8750 in both these functions is missing a break
statement causing a fall-through to the default: case.

Insert the missing break statements.

Signed-off-by: Tony Prisk 
---
Mike,

Any chance this can still go in as a fix for 3.9
The fault makes it impossible to set the PLL clocks on WM8750 and later SoCs.

Regards
Tony P
 drivers/clk/clk-vt8500.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/clk/clk-vt8500.c b/drivers/clk/clk-vt8500.c
index 09c6331..debf688 100644
--- a/drivers/clk/clk-vt8500.c
+++ b/drivers/clk/clk-vt8500.c
@@ -488,6 +488,7 @@ static int vtwm_pll_set_rate(struct clk_hw *hw, unsigned 
long rate,
case PLL_TYPE_WM8750:
wm8750_find_pll_bits(rate, parent_rate, &filter, &mul, &div1,
&div2);
pll_val = WM8750_BITS_TO_VAL(filter, mul, div1, div2);
+   break;
default:
pr_err("%s: invalid pll type\n", __func__);
return 0;
@@ -523,6 +524,7 @@ static long vtwm_pll_round_rate(struct clk_hw *hw, unsigned 
long rate,
case PLL_TYPE_WM8750:
wm8750_find_pll_bits(rate, *prate, &filter, &mul, &div1, &div2);
round_rate = WM8750_BITS_TO_FREQ(*prate, mul, div1, div2);
+   break;
default:
round_rate = 0;
}
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch v7 0/21] sched: power aware scheduling

2013-04-13 Thread Alex Shi
On 04/14/2013 09:28 AM, Alex Shi wrote:
 >> > These numbers suggest that this patch series simultaneously
 >> > has a negative impact on performance and energy required
 >> > to retire the workload.  Why do it?
> Even some scenario the total energy cost more, at least the avg watts
> dropped in that scenarios. Len said he has low p-state which can work
> there. but that's is different. I had sent some data in another email
> list to show the difference:
> 
> The following is 2 times kbuild testing result for 3 kinds condiation on
> SNB EP box, the middle column is the lowest p-state testing result, we
> can see, it has the lowest power consumption, also has the lowest
> performance/watts value.
> At least for kbuild benchmark, powersaving policy has the best
> compromise on powersaving and power efficient. Further more, due to cpu
> boost feature, it has better performance in some scenarios.

BTW, another benefit of the powersaving policy is that it is very
flexible with respect to system load. When the number of tasks in a sched
domain exceeds the LCPU number, it falls back to performance-oriented
balancing. That yields similar performance when the system is busy.

-- 
Thanks
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/3] gianfar: Use netdev_<level> when possible

2013-04-13 Thread Joe Perches
Use a more current logging style.

Convert pr_<level> to netdev_<level> when a struct net_device is
available.  Add pr_fmt and neaten other formats too.

Signed-off-by: Joe Perches 
---
 drivers/net/ethernet/freescale/gianfar_ethtool.c | 24 +---
 drivers/net/ethernet/freescale/gianfar_ptp.c |  3 +++
 drivers/net/ethernet/freescale/gianfar_sysfs.c   |  2 +-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c 
b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 4e7118f..083603f 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -389,14 +389,14 @@ static int gfar_scoalesce(struct net_device *dev,
 
/* Check the bounds of the values */
if (cvals->rx_coalesce_usecs > GFAR_MAX_COAL_USECS) {
-   pr_info("Coalescing is limited to %d microseconds\n",
-   GFAR_MAX_COAL_USECS);
+   netdev_info(dev, "Coalescing is limited to %d microseconds\n",
+   GFAR_MAX_COAL_USECS);
return -EINVAL;
}
 
if (cvals->rx_max_coalesced_frames > GFAR_MAX_COAL_FRAMES) {
-   pr_info("Coalescing is limited to %d frames\n",
-   GFAR_MAX_COAL_FRAMES);
+   netdev_info(dev, "Coalescing is limited to %d frames\n",
+   GFAR_MAX_COAL_FRAMES);
return -EINVAL;
}
 
@@ -418,14 +418,14 @@ static int gfar_scoalesce(struct net_device *dev,
 
/* Check the bounds of the values */
if (cvals->tx_coalesce_usecs > GFAR_MAX_COAL_USECS) {
-   pr_info("Coalescing is limited to %d microseconds\n",
-   GFAR_MAX_COAL_USECS);
+   netdev_info(dev, "Coalescing is limited to %d microseconds\n",
+   GFAR_MAX_COAL_USECS);
return -EINVAL;
}
 
if (cvals->tx_max_coalesced_frames > GFAR_MAX_COAL_FRAMES) {
-   pr_info("Coalescing is limited to %d frames\n",
-   GFAR_MAX_COAL_FRAMES);
+   netdev_info(dev, "Coalescing is limited to %d frames\n",
+   GFAR_MAX_COAL_FRAMES);
return -EINVAL;
}
 
@@ -735,7 +735,8 @@ static int gfar_ethflow_to_filer_table(struct gfar_private 
*priv, u64 ethflow,
cmp_rqfpr = RQFPR_IPV6 |RQFPR_UDP;
break;
default:
-   pr_err("Right now this class is not supported\n");
+   netdev_err(priv->ndev,
+  "Right now this class is not supported\n");
ret = 0;
goto err;
}
@@ -751,7 +752,8 @@ static int gfar_ethflow_to_filer_table(struct gfar_private 
*priv, u64 ethflow,
}
 
if (i == MAX_FILER_IDX + 1) {
-   pr_err("No parse rule found, can't create hash rules\n");
+   netdev_err(priv->ndev,
+  "No parse rule found, can't create hash rules\n");
ret = 0;
goto err;
}
@@ -1568,7 +1570,7 @@ static int gfar_process_filer_changes(struct gfar_private 
*priv)
gfar_cluster_filer(tab);
gfar_optimize_filer_masks(tab);
 
-   pr_debug("\n\tSummary:\n"
+   pr_debug("\tSummary:\n"
 "\tData on hardware: %d\n"
 "\tCompression rate: %d%%\n",
 tab->index, 100 - (100 * tab->index) / i);
diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c 
b/drivers/net/ethernet/freescale/gianfar_ptp.c
index 2e5daee..fe8e9e5 100644
--- a/drivers/net/ethernet/freescale/gianfar_ptp.c
+++ b/drivers/net/ethernet/freescale/gianfar_ptp.c
@@ -17,6 +17,9 @@
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include 
 #include 
 #include 
diff --git a/drivers/net/ethernet/freescale/gianfar_sysfs.c 
b/drivers/net/ethernet/freescale/gianfar_sysfs.c
index cd14a4d..acb55af 100644
--- a/drivers/net/ethernet/freescale/gianfar_sysfs.c
+++ b/drivers/net/ethernet/freescale/gianfar_sysfs.c
@@ -337,5 +337,5 @@ void gfar_init_sysfs(struct net_device *dev)
rc |= device_create_file(&dev->dev, &dev_attr_fifo_starve);
rc |= device_create_file(&dev->dev, &dev_attr_fifo_starve_off);
if (rc)
-   dev_err(&dev->dev, "Error creating gianfar sysfs files.\n");
+   dev_err(&dev->dev, "Error creating gianfar sysfs files\n");
 }
-- 
1.8.1.2.459.gbcd45b4.dirty

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/3] freescale: Update logging style

2013-04-13 Thread Joe Perches
Convert various printk logging styles to current styles.

Uncompiled, untested.

Joe Perches (3):
  fec: Convert printks to netdev_<level>
  gianfar: Use netdev_<level> when possible
  ucc_geth: Convert ugeth_<level> to pr_<level>

 drivers/net/ethernet/freescale/fec_main.c |  26 +-
 drivers/net/ethernet/freescale/fec_mpc52xx.c  |  16 +-
 drivers/net/ethernet/freescale/fec_ptp.c  |   2 +
 drivers/net/ethernet/freescale/gianfar_ethtool.c  |  24 +-
 drivers/net/ethernet/freescale/gianfar_ptp.c  |   3 +
 drivers/net/ethernet/freescale/gianfar_sysfs.c|   2 +-
 drivers/net/ethernet/freescale/ucc_geth.c | 881 ++
 drivers/net/ethernet/freescale/ucc_geth_ethtool.c |  24 +-
 8 files changed, 441 insertions(+), 537 deletions(-)

-- 
1.8.1.2.459.gbcd45b4.dirty

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/3] fec: Convert printks to netdev_<level>

2013-04-13 Thread Joe Perches
Use a more current logging message style.

Convert the printks where a struct net_device is available to
netdev_<level>.  Convert the other printks to pr_<level> and
add pr_fmt where appropriate.

Signed-off-by: Joe Perches 
---
 drivers/net/ethernet/freescale/fec_main.c| 26 +++---
 drivers/net/ethernet/freescale/fec_mpc52xx.c | 16 
 drivers/net/ethernet/freescale/fec_ptp.c |  2 ++
 3 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 153437b..d7657a4 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -266,7 +266,7 @@ fec_enet_start_xmit(struct sk_buff *skb, struct net_device 
*ndev)
/* Ooops.  All transmit buffers are full.  Bail out.
 * This should not happen, since ndev->tbusy should be set.
 */
-   printk("%s: tx queue full!.\n", ndev->name);
+   netdev_err(ndev, "tx queue full!\n");
return NETDEV_TX_BUSY;
}
 
@@ -578,7 +578,7 @@ fec_stop(struct net_device *ndev)
writel(1, fep->hwp + FEC_X_CNTRL); /* Graceful transmit stop */
udelay(10);
if (!(readl(fep->hwp + FEC_IEVENT) & FEC_ENET_GRA))
-   printk("fec_stop : Graceful transmit stop did not 
complete !\n");
+   netdev_err(ndev, "Graceful transmit stop did not 
complete!\n");
}
 
/* Whack a reset.  We should wait for this. */
@@ -676,7 +676,7 @@ fec_enet_tx(struct net_device *ndev)
}
 
if (status & BD_ENET_TX_READY)
-   printk("HEY! Enet xmit interrupt and TX_READY.\n");
+   netdev_err(ndev, "HEY! Enet xmit interrupt and 
TX_READY\n");
 
/* Deferred means some collisions occurred during transmit,
 * but we eventually sent the packet OK.
@@ -744,7 +744,7 @@ fec_enet_rx(struct net_device *ndev, int budget)
 * the last indicator should be set.
 */
if ((status & BD_ENET_RX_LAST) == 0)
-   printk("FEC ENET: rcv is not +last\n");
+   netdev_err(ndev, "rcv is not +last\n");
 
if (!fep->opened)
goto rx_processing_done;
@@ -1031,7 +1031,7 @@ static int fec_enet_mdio_read(struct mii_bus *bus, int 
mii_id, int regnum)
usecs_to_jiffies(FEC_MII_TIMEOUT));
if (time_left == 0) {
fep->mii_timeout = 1;
-   printk(KERN_ERR "FEC: MDIO read timeout\n");
+   netdev_err(fep->netdev, "MDIO read timeout\n");
return -ETIMEDOUT;
}
 
@@ -1059,7 +1059,7 @@ static int fec_enet_mdio_write(struct mii_bus *bus, int 
mii_id, int regnum,
usecs_to_jiffies(FEC_MII_TIMEOUT));
if (time_left == 0) {
fep->mii_timeout = 1;
-   printk(KERN_ERR "FEC: MDIO write timeout\n");
+   netdev_err(fep->netdev, "MDIO write timeout\n");
return -ETIMEDOUT;
}
 
@@ -1099,9 +1099,7 @@ static int fec_enet_mii_probe(struct net_device *ndev)
}
 
if (phy_id >= PHY_MAX_ADDR) {
-   printk(KERN_INFO
-   "%s: no PHY, assuming direct connection to switch\n",
-   ndev->name);
+   netdev_info(ndev, "no PHY, assuming direct connection to 
switch\n");
strncpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
phy_id = 0;
}
@@ -1110,7 +1108,7 @@ static int fec_enet_mii_probe(struct net_device *ndev)
phy_dev = phy_connect(ndev, phy_name, _enet_adjust_link,
  fep->phy_interface);
if (IS_ERR(phy_dev)) {
-   printk(KERN_ERR "%s: could not attach to PHY\n", ndev->name);
+   netdev_err(ndev, "could not attach to PHY\n");
return PTR_ERR(phy_dev);
}
 
@@ -1128,11 +1126,9 @@ static int fec_enet_mii_probe(struct net_device *ndev)
fep->link = 0;
fep->full_duplex = 0;
 
-   printk(KERN_INFO
-   "%s: Freescale FEC PHY driver [%s] (mii_bus:phy_addr=%s, 
irq=%d)\n",
-   ndev->name,
-   fep->phy_dev->drv->name, dev_name(>phy_dev->dev),
-   fep->phy_dev->irq);
+   netdev_info(ndev, "Freescale FEC PHY driver [%s] (mii_bus:phy_addr=%s, 
irq=%d)\n",
+   fep->phy_dev->drv->name, dev_name(>phy_dev->dev),
+   fep->phy_dev->irq);
 
return 0;
 }
diff --git a/drivers/net/ethernet/freescale/fec_mpc52xx.c 
b/drivers/net/ethernet/freescale/fec_mpc52xx.c
index 77943a6..9bc15e2 100644
--- a/drivers/net/ethernet/freescale/fec_mpc52xx.c
+++ b/drivers/net/ethernet/freescale/fec_mpc52xx.c
@@ -14,6 +14,8 @@
  *
  */
 
+#define pr_fmt(fmt) 

affordable shuttle service

2013-04-13 Thread satish arora
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Al Viro
On Sat, Apr 13, 2013 at 09:42:06PM -0700, Anatol Pomozov wrote:

> > in kobject_cleanup().  Why don't we require kobject_del() before the final
> > kobject_put(), if the sucker had been added?  FWIW, I thought it *was*
> > required all along...
> 
> But kobject_release/kobject_cleanup function is called as a result of
> atomic decrement_compare. Until we perform the atomic operation we
> don't know whether it is final kobject_put() or not.
> 
> kobject_put() {
> if (atomic_sub_and_test(kobj->kref->refcount)) {
> // refcounter is decremented to 0 so cleanup sysfs
> kobject_release(kobj)
> }
> }

Yes, of course, but WTF do we play with kobject_del() on that path at all?
Let the caller do it when it decides that object shouldn't be possible to
see anymore.  Which is not the same thing as "the last reference is gone"...

Sigh...  kobject model sucks, film at 11... ;-/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Anatol Pomozov
Hi

On Sat, Apr 13, 2013 at 8:35 PM, Al Viro  wrote:
> On Fri, Apr 12, 2013 at 04:47:50PM -0700, Linus Torvalds wrote:
>> This is a much more generic bug in kobjects, and I would hate to add
>> some random workaround for just one case of this bug like you do. The
>> more fundamental bug needs to be fixed too.
>>
>> I think the more fundamental bugfix is to just fix kobject_get() to
>> return NULL if the refcount was zero, because in that case the kobject
>> no longer really exists.
>>
>> So instead of having
>>
>> kref_get(&kobj->kref);
>>
>> it should do
>>
>> if (!atomic_inc_not_zero(&kobj->kref.refcount))
>> kobj = NULL;
>>
>> and I think that should fix your race automatically, no? Proper patch
>> attached (but TOTALLY UNTESTED - it seems to compile, though).
>>
>> The problem is that we lose the warning for when the refcount is zero
>> and somebody does a kobject_get(), but that is ok *assuming* that
>> people actually check the return value of kobject_get() rather than
>> just "know" that if they passed in a non-NULL kobj, they'll get it
>> right back.
>>
>> Greg - please take a look... I'm adding Al to the discussion too,
>> because Al just *loooves* these kinds of races ;)
>
> Unless I'm misreading what's going on, we have the following to thank for 
> that:
> /* remove from sysfs if the caller did not do it */
> if (kobj->state_in_sysfs) {
> pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
>  kobject_name(kobj), kobj);
> kobject_del(kobj);
> }
> in kobject_cleanup().  Why don't we require kobject_del() before the final
> kobject_put(), if the sucker had been added?  FWIW, I thought it *was*
> required all along...

But kobject_release/kobject_cleanup function is called as a result of
atomic decrement_compare. Until we perform the atomic operation we
don't know whether it is final kobject_put() or not.

kobject_put() {
if (atomic_sub_and_test(kobj->kref->refcount)) {
// refcounter is decremented to 0 so cleanup sysfs
kobject_release(kobj)
}
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Latest randconfig build errors

2013-04-13 Thread Rob Clark
On Sat, Apr 13, 2013 at 5:45 PM, Thierry Reding
 wrote:
> On Sat, Apr 13, 2013 at 08:54:22AM -0400, Rob Clark wrote:
>> On Mon, Mar 4, 2013 at 1:46 PM, Tony Lindgren  wrote:
>> >
>> >> drivers/gpu/drm/tilcdc/tilcdc_slave.o:(.data+0x54): multiple definition 
>> >> of `__mod_of_device_table'
>> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
>> >> drivers/gpu/drm/tilcdc/tilcdc_panel.o:(.data+0x54): multiple definition 
>> >> of `__mod_of_device_table'
>> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
>> >> drivers/gpu/drm/tilcdc/tilcdc_drv.o:(.data+0x184): multiple definition of 
>> >> `__mod_of_device_table'
>> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
>> >
>> > Rob, I assume you'll do a patch for this one?
>>
>>
>> oh, I apologize for the late reply, I didn't see this email...
>>
>> There is a patch that we can merge to make tilcdc bool instead of
>> tristate[1], which I suppose is ok for a temporary fix.  Although I'm
>> all-ears if someone has a better idea about how to fix this.  The
>> problem is that we have multiple sub-devices for different possible
>> panel drivers, so that depending on DT tables, the correct ones get
>> loaded for the hw.  But they are all built into a single module.
>> Splitting them into multiple modules will be problematic, as panel
>> drivers which are present really need to get probed before the
>> toplevel drm device..
>
> You could look at the Tegra driver. I had to solve a similar problem
> there. What I did is basically parse the DT in the host1x driver and add
> all device nodes which are required by DRM to a list. Later when the
> individual devices are probed they are removed from that list, so that
> when the list becomes empty we are sure that all required devices are
> there and only then call the drm_platform_init() function.

thx, ok, I'll have a look at this

> This fits very well with how Tegra hardware is designed because host1x
> is the parent for all DRM subdevices (DC, RGB/LVDS, HDMI, ...). So it is
> probed before any of its children and it can easily parse the DT upfront
> and initialize the list of required devices.
>
>> I suppose in theory it is possible to make drm
>> cope better with dynamically loaded outputs, but I'm not sure that
>> there is any way to do this without breaking userspace which expects
>> that all of the connectors/encoders are present once the drm device is
>> loaded.
>
> I had been thinking about this on and off for a while, but I haven't
> come up with anything concrete. Ideally we could just have some kind of
> event that userspace would listen for, so that new outputs can be
> dynamically added and userspace informed about them. Last time I checked
> most of the helpers assumed that the complete output configuration is
> known when the DRM device is registered, so some major rework will be
> required to efficiently make use of such dynamicity.

I'm less worried about the kernel re-work.. more worried about the
fact that we have no way to know whether userspace knows to listen for
this new event.  So anything down this path could, I think, be
considered as breaking userspace.

I think in the end, we need some way to have sort of "dummy"
connectors for output drivers which might or might not be probed, so
that from userspace perspective, non-present panels appear as displays
that are not plugged in.

BR,
-R


> Thierry
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: helping with tracking commits across repos

2013-04-13 Thread Ben Hutchings
I notice that where a commit is cherry-picked cleanly on a stable
branch, like 6b90466cfec2a2fe027187d675d8d14217c12d82, your script finds
the corresponding commit on the stable branch.  This is useful.

But where some backporting changes are needed, such as for
f01fc1a82c2ee68726b400fadb156bd623b5f2f1, which became
8ebfe28181b02766ac41d9d841801c146e6161c1 on the 3.2.y branch, the
corresponding commit isn't found.

It should be possible to find such backported commits based on a simple
regex search over the commit message:

for (<$body>) {
if (/^commit (.*) upstream\.\n/) {
$upstream = $1;
} elsif (/^\[ Upstream commit (.*) \]\n/) {
$upstream = $1;
} elsif (/^\(cherry picked from commit (.*)\)\n/) {
$upstream = $1;
}
}

This covers all formats in current use to show a direct correspondence
between a single mainline and stable branch commit.  (Really we should
settle on just one format...)

Ben.

-- 
Ben Hutchings
It is impossible to make anything foolproof because fools are so ingenious.


signature.asc
Description: This is a digitally signed message part


[PATCH RESEND v4] fat: editions to support fat_fallocate

2013-04-13 Thread Namjae Jeon
From: Namjae Jeon 

Implement preallocation via the fallocate syscall on VFAT partitions.

Change Log:
v4: Rework based on review comments.
Add check in fat_setattr to release fallocated blocks on a truncate

v3: Release preallocated blocks at file release.

With FALLOC_FL_KEEP_SIZE, there is no way to distinguish if the
mismatch between i_size and no. of clusters allocated is a consequence
of fallocate or just plain corruption. When a non fallocate aware (old)
linux fat driver tries to write to such a file, it throws an error. Also,
fsck detects this as inconsistency and truncates the prealloc'd blocks.

To avoid this, as suggested by OGAWA, remove changes that make fallocate
persistent across mounts and restrict lifetime of blocks from
fallocate(2) to file release.

v2: On an area preallocated with FALLOC_FL_KEEP_SIZE, when a seek was
done to an offset beyond i_size, the old (garbage) data was exposed as
we did not zero out the area at allocation time. Added
fat_zero_falloc_area() to fix this.

v1: Reworked an earlier patch of the same name
(https://lkml.org/lkml/2007/12/22/130) to fix some bugs:
 i) Preallocated space was not persistent and was lost on remount. Fixed
it.
 ii) Did not zero out allocated clusters when FALLOC_FL_KEEP_SIZE was set,
thereby speeding up preallocation time.

Signed-off-by: Namjae Jeon 
Signed-off-by: Ravishankar N 
Signed-off-by: Amit Sahrawat 
---
 fs/fat/file.c  |  108 +++-
 fs/fat/inode.c |   53 +++
 2 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/fs/fat/file.c b/fs/fat/file.c
index b0b632e..7326439 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -17,8 +17,11 @@
 #include 
 #include 
 #include 
+#include 
 #include "fat.h"
 
+static long fat_fallocate(struct file *file, int mode,
+   loff_t offset, loff_t len);
 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
u32 attr;
@@ -140,6 +143,22 @@ static long fat_generic_compat_ioctl(struct file *filp, 
unsigned int cmd,
 
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
+
+   struct super_block *sb = inode->i_sb;
+   loff_t mmu_private_ideal;
+
+   /*
+* Release unwritten fallocated blocks on file release.
+* Do this only when the last open file descriptor is closed.
+*/
+   mutex_lock(&inode->i_mutex);
+   mmu_private_ideal = round_up(inode->i_size, sb->s_blocksize);
+
+   if (mmu_private_ideal < MSDOS_I(inode)->mmu_private &&
+   filp->f_dentry->d_count == 1)
+   fat_truncate_blocks(inode, inode->i_size);
+   mutex_unlock(&inode->i_mutex);
+
if ((filp->f_mode & FMODE_WRITE) &&
 MSDOS_SB(inode->i_sb)->options.flush) {
fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -174,6 +193,7 @@ const struct file_operations fat_file_operations = {
 #endif
.fsync  = fat_file_fsync,
.splice_read= generic_file_splice_read,
+   .fallocate  = fat_fallocate,
 };
 
 static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -212,6 +232,88 @@ out:
return err;
 }
 
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate. The
+ * allocated clusters are freed in fat_file_release().
+ */
+static long fat_fallocate(struct file *file, int mode,
+   loff_t offset, loff_t len)
+{
+   int cluster, fclus, dclus;
+   int nr_cluster; /* Number of clusters to be allocated */
+   loff_t nr_bytes; /* Number of bytes to be allocated*/
+   loff_t free_bytes; /* Unused bytes in the last cluster of file*/
+   struct inode *inode = file->f_mapping->host;
+   struct super_block *sb = inode->i_sb;
+   struct msdos_sb_info *sbi = MSDOS_SB(sb);
+   int err = 0;
+
+   /* No support for hole punch or other fallocate flags. */
+   if (mode & ~FALLOC_FL_KEEP_SIZE)
+   return -EOPNOTSUPP;
+
+   mutex_lock(&inode->i_mutex);
+   if ((offset + len) <= MSDOS_I(inode)->mmu_private) {
+   fat_msg(sb, KERN_ERR,
+   "fat_fallocate(): Blocks already allocated");
+   err = -EINVAL;
+   goto error;
+   }
+
+   if (mode & FALLOC_FL_KEEP_SIZE) {
+   /* First compute the number of clusters to be allocated */
+   if (inode->i_size > 0) {
+   err = fat_get_cluster(inode, FAT_ENT_EOF,
+ , );
+   if (err < 0) {
+   fat_msg(sb, KERN_ERR,
+   "fat_fallocate(): fat_get_cluster() 

Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Al Viro
On Fri, Apr 12, 2013 at 04:47:50PM -0700, Linus Torvalds wrote:
> This is a much more generic bug in kobjects, and I would hate to add
> some random workaround for just one case of this bug like you do. The
> more fundamental bug needs to be fixed too.
> 
> I think the more fundamental bugfix is to just fix kobject_get() to
> return NULL if the refcount was zero, because in that case the kobject
> no longer really exists.
> 
> So instead of having
> 
> kref_get(&kobj->kref);
> 
> it should do
> 
> if (!atomic_inc_not_zero(&kobj->kref.refcount))
> kobj = NULL;
> 
> and I think that should fix your race automatically, no? Proper patch
> attached (but TOTALLY UNTESTED - it seems to compile, though).
> 
> The problem is that we lose the warning for when the refcount is zero
> and somebody does a kobject_get(), but that is ok *assuming* that
> people actually check the return value of kobject_get() rather than
> just "know" that if they passed in a non-NULL kobj, they'll get it
> right back.
> 
> Greg - please take a look... I'm adding Al to the discussion too,
> because Al just *loooves* these kinds of races ;)

Unless I'm misreading what's going on, we have the following to thank for that:
/* remove from sysfs if the caller did not do it */
if (kobj->state_in_sysfs) {
pr_debug("kobject: '%s' (%p): auto cleanup kobject_del\n",
 kobject_name(kobj), kobj);
kobject_del(kobj);
}
in kobject_cleanup().  Why don't we require kobject_del() before the final
kobject_put(), if the sucker had been added?  FWIW, I thought it *was*
required all along...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/6] x86: kaslr: relocate base offset at boot

2013-04-13 Thread H. Peter Anvin
On 04/13/2013 05:37 PM, Yinghai Lu wrote:
> 
> so decompress code position is changed?
> 
> You may push out bss and other data area of run-time kernel of limit
> that boot loader
> chose according to setup_header.init_size.
> aka that make those area overlap with ram hole or other area like
> boot command line or initrd
> 

Is there a strong reason to randomize the physical address on 64 bits
(and if so, shouldn't we do it right?)

-hpa


-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] move exit_task_namespaces() outside of exit_notify()

2013-04-13 Thread Eric W. Biederman
Oleg Nesterov  writes:

> exit_notify() does exit_task_namespaces() after
> forget_original_parent(). This was needed to ensure that ->nsproxy
> can't be cleared prematurely, an exiting child we are going to
> reparent can do do_notify_parent() and use the parent's (ours) pid_ns.
>
> However, after 32084504 "pidns: use task_active_pid_ns in
> do_notify_parent" ->nsproxy != NULL is no longer needed, we rely
> on task_active_pid_ns().
>
> Move exit_task_namespaces() from exit_notify() to do_exit(), after
> exit_fs() and before exit_task_work().
>
> This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy()
> does fput() which needs task_work_add(). And this allows us do simplify
> exit_notify(), we can avoid unlock/lock(tasklist) and we can change
> ->exit_state instead of PF_EXITING in forget_original_parent().

It feels like this ought to work, certainly the pid namespace should not
need this, and the pid namespace was the motivating case for most of the
movement.  However we haven't called exit_task_namespaces this early
since 2006.

Ugh. I goofed and used that field in scm.c. Sigh.  I will push a patch
to rename that field nsproxy->childrens_pid_ns so it is harder to
make the mistake I just made.

None of the uses of nsproxy->net_ns look like they will be used on the
exit path.

The /proc/<pid>/ns/{uts,ipc,net,mnt,pid} files are fine as nsproxy
itself is what becomes NULL and they test for that.  Well except the pid
file uses task_active_pid_ns.

nsproxy->ipc_ns is isolated to files under ipc so it is probably fine.

Likewise the nsproxy->uts_ns uses look like they will be fine.

Likewise the nsproxy->mnt_ns uses look like they will be fine.

So in a quick skim through the uses no problem cases stick out, nor
can I think of anything that would cause trouble.  This looks like a
good patch.

Acked-by: "Eric W. Biederman" 

> Reported-by: Andrey Vagin 
> Signed-off-by: Oleg Nesterov 
>
> --- x/kernel/exit.c
> +++ x/kernel/exit.c
> @@ -649,7 +649,6 @@ static void exit_notify(struct task_stru
>*  jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
>*/
>   forget_original_parent(tsk);
> - exit_task_namespaces(tsk);
>  
>   write_lock_irq(&tasklist_lock);
>   if (group_dead)
> @@ -795,6 +794,7 @@ void do_exit(long code)
>   exit_shm(tsk);
>   exit_files(tsk);
>   exit_fs(tsk);
> + exit_task_namespaces(tsk);
>   exit_task_work(tsk);
>   check_stack_usage();
>   exit_thread();
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch v7 0/21] sched: power aware scheduling

2013-04-13 Thread Alex Shi
On 04/13/2013 01:12 AM, Borislav Petkov wrote:
> On Fri, Apr 12, 2013 at 06:48:31PM +0200, Mike Galbraith wrote:
>> (just saying there are other aspects besides joules in there)
> 
> Yeah, but we don't allow any regressions in sched*, do we? Can we pick
> only the good cherries? :-)
> 

Thanks for all of the discussion on this thread. :)
I think we can bear a small loss in power efficiency when we want powersaving.

As for the second question, the performance increase comes from the cpu
boost feature. As that hardware feature is defined, if some cores in a cpu
socket are idle, the other cores have more chance to boost to a higher
frequency. Task packing tries to pack tasks so that more cores are left idle.

The difficulty in merging this feature into the current performance policy
is that the current balance policy tries to give as many cpu resources as
possible to each task. That just conflicts with the cpu boost condition.

-- 
Thanks
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch v7 0/21] sched: power aware scheduling

2013-04-13 Thread Alex Shi
On 04/13/2013 12:23 AM, Borislav Petkov wrote:
> On Fri, Apr 12, 2013 at 04:46:50PM +0800, Alex Shi wrote:
>> > Thanks a lot for comments, Len!
> AFAICT, you kinda forgot to answer his most important question:
> 
>> > These numbers suggest that this patch series simultaneously
>> > has a negative impact on performance and energy required
>> > to retire the workload.  Why do it?

Even if the total energy cost is higher in some scenarios, at least the avg
watts dropped in those scenarios. Len said he has a low p-state which can
work there, but that is different. I had sent some data to another email
list to show the difference:

The following are the results of 2 kbuild test runs under 3 kinds of
conditions on an SNB EP box. The middle column is the lowest p-state testing
result; as we can see, it has the lowest power consumption, and also the
lowest performance/watts value.
At least for the kbuild benchmark, the powersaving policy has the best
compromise between powersaving and power efficiency. Furthermore, due to the
cpu boost feature, it has better performance in some scenarios.

         powersaving + ondemand  userspace + fixed 1.2GHz  performance + ondemand
x = 8    231.318 /75 57          165.063 /166 36           253.552 /63 62
x = 16   280.357 /49 72          174.408 /106 54           296.776 /41 82
x = 32   325.206 /34 90          178.675 /90 62            314.153 /37 86

x = 8    233.623 /74 57          164.507 /168 36           254.775 /65 60
x = 16   272.54  /38 96          174.364 /106 54           297.731 /42 79
x = 32   320.758 /34 91          177.917 /91 61            317.875 /35 89
x = 64   326.837 /33 92          179.037 /90 62            320.615 /36 86

-- 
Thanks
Alex
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Error in linux-3.0.73 build

2013-04-13 Thread Akemi Yagi
Just tried to build from the linux-3.0.73 source tarball and got the following 
error:

arch/x86/mm/numa_32.c:107: error: redefinition of 'alloc_remap'
include/linux/bootmem.h:144: note: previous definition of 'alloc_remap' was here
make[2]: *** [arch/x86/mm/numa_32.o] Error 1
make[1]: *** [arch/x86/mm] Error 2
make: *** [arch/x86] Error 2

This only affects the 32-bit build of the kernel with the RHEL-6 configuration.

Akemi
The ELRepo Team.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv9 7/8] zswap: add swap page writeback support

2013-04-13 Thread Mel Gorman
On Wed, Apr 10, 2013 at 01:18:59PM -0500, Seth Jennings wrote:
> This patch adds support for evicting swap pages that are currently
> compressed in zswap to the swap device.  This functionality is very
> important and make zswap a true cache in that, once the cache is full
> or can't grow due to memory pressure, the oldest pages can be moved
> out of zswap to the swap device so newer pages can be compressed and
> stored in zswap.
> 

Oh great, this may cover one of my larger objections from an earlier patch!
I had not guessed from the leader mail or the subject that this patch
implemented zswap page aging of some sort.

> This introduces a good amount of new code to guarantee coherency.
> Most notably, and LRU list is added to the zswap_tree structure,
> and refcounts are added to each entry to ensure that one code path
> doesn't free then entry while another code path is operating on it.
> 
> Signed-off-by: Seth Jennings 
> ---
>  mm/zswap.c | 530 
> ++---
>  1 file changed, 508 insertions(+), 22 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index db283c4..edb354b 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -36,6 +36,12 @@
>  #include 
>  #include 
>  
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
>  /*
>  * statistics
>  **/
> @@ -43,6 +49,8 @@
>  static atomic_t zswap_pool_pages = ATOMIC_INIT(0);
>  /* The number of compressed pages currently stored in zswap */
>  static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> +/* The number of outstanding pages awaiting writeback */
> +static atomic_t zswap_outstanding_writebacks = ATOMIC_INIT(0);
>  
>  /*
>   * The statistics below are not protected from concurrent access for
> @@ -51,9 +59,13 @@ static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
>   * certain event is occurring.
>  */
>  static u64 zswap_pool_limit_hit;
> +static u64 zswap_written_back_pages;
>  static u64 zswap_reject_compress_poor;
> +static u64 zswap_writeback_attempted;
> +static u64 zswap_reject_tmppage_fail;
>  static u64 zswap_reject_zsmalloc_fail;
>  static u64 zswap_reject_kmemcache_fail;
> +static u64 zswap_saved_by_writeback;
>  static u64 zswap_duplicate_entry;
>  

At some point it would be nice to document what these mean. I know what
they mean now because I read the code recently but I'll have forgotten in
6 months time.

>  /*
> @@ -82,6 +94,14 @@ static unsigned int zswap_max_compression_ratio = 80;
>  module_param_named(max_compression_ratio,
>   zswap_max_compression_ratio, uint, 0644);
>  
> +/*
> + * Maximum number of outstanding writebacks allowed at any given time.
> + * This is to prevent decompressing an unbounded number of compressed
> + * pages into the swap cache all at once, and to help with writeback
> + * congestion.
> +*/
> +#define ZSWAP_MAX_OUTSTANDING_FLUSHES 64
> +

Why 64?

>  /*
>  * compression functions
>  **/
> @@ -144,18 +164,49 @@ static void zswap_comp_exit(void)
>  /*
>  * data structures
>  **/
> +
> +/*
> + * struct zswap_entry
> + *
> + * This structure contains the metadata for tracking a single compressed
> + * page within zswap.
> + *
> + * rbnode - links the entry into red-black tree for the appropriate swap type
> + * lru - links the entry into the lru list for the appropriate swap type
> + * refcount - the number of outstanding reference to the entry. This is 
> needed
> + *to protect against premature freeing of the entry by code
> + *concurent calls to load, invalidate, and writeback.  The lock

s/concurent/concurrent/

> + *for the zswap_tree structure that contains the entry must
> + *be held while changing the refcount.  Since the lock must
> + *be held, there is no reason to also make refcount atomic.
> + * type - the swap type for the entry.  Used to map back to the zswap_tree
> + *structure that contains the entry.
> + * offset - the swap offset for the entry.  Index into the red-black tree.
> + * handle - zsmalloc allocation handle that stores the compressed page data
> + * length - the length in bytes of the compressed page data.  Needed during
> + *   decompression
> + */

It's good that you document the fields but from a review perspective it
would be easier if the documentation was introduced in an earlier patch
and then update it here. Note for example that you document "type" here
even though this patch removes it.

>  struct zswap_entry {
>   struct rb_node rbnode;
> - unsigned type;
> + struct list_head lru;
> + int refcount;

Any particular reason you did not use struct kref (include/linux/kref.h)
for the refcount? I suppose it's because your refcount is protected by
the lock and the atomics 

Re: [PATCHv9 4/8] zswap: add to mm/

2013-04-13 Thread Mel Gorman
On Wed, Apr 10, 2013 at 01:18:56PM -0500, Seth Jennings wrote:
> zswap is a thin compression backend for frontswap. It receives
> pages from frontswap and attempts to store them in a compressed
> memory pool, resulting in an effective partial memory reclaim and
> dramatically reduced swap device I/O.
> 
> Additionally, in most cases, pages can be retrieved from this
> compressed store much more quickly than reading from tradition
> swap devices resulting in faster performance for many workloads.
> 

Except in the case where the zswap pool is externally fragmented, occupies
its maximum configured size and a workload that would otherwise have fit
in memory gets pushed to swap.

Yes, it's a corner case but the changelog portrays zswap as an unconditional
win and while it certainly is going to help some cases, it won't help
them all.

> This patch adds the zswap driver to mm/
> 
> Signed-off-by: Seth Jennings 
> ---
>  mm/Kconfig  |  15 ++
>  mm/Makefile |   1 +
>  mm/zswap.c  | 665 
> 
>  3 files changed, 681 insertions(+)
>  create mode 100644 mm/zswap.c
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index aa054fc..36d93b0 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -495,3 +495,18 @@ config PGTABLE_MAPPING
>  
> You can check speed with zsmalloc benchmark[1].
> [1] https://github.com/spartacus06/zsmalloc
> +
> +config ZSWAP
> + bool "In-kernel swap page compression"
> + depends on FRONTSWAP && CRYPTO
> + select CRYPTO_LZO
> + select ZSMALLOC
> + default n
> + help
> +   Zswap is a backend for the frontswap mechanism in the VMM.
> +   It receives pages from frontswap and attempts to store them
> +   in a compressed memory pool, resulting in an effective
> +   partial memory reclaim.  In addition, pages and be retrieved
> +   from this compressed store much faster than most tradition
> +   swap devices resulting in reduced I/O and faster performance
> +   for many workloads.
> diff --git a/mm/Makefile b/mm/Makefile
> index 0f6ef0a..1e0198f 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
>  obj-$(CONFIG_BOUNCE) += bounce.o
>  obj-$(CONFIG_SWAP)   += page_io.o swap_state.o swapfile.o
>  obj-$(CONFIG_FRONTSWAP)  += frontswap.o
> +obj-$(CONFIG_ZSWAP)  += zswap.o
>  obj-$(CONFIG_HAS_DMA)+= dmapool.o
>  obj-$(CONFIG_HUGETLBFS)  += hugetlb.o
>  obj-$(CONFIG_NUMA)   += mempolicy.o
> diff --git a/mm/zswap.c b/mm/zswap.c
> new file mode 100644
> index 000..db283c4
> --- /dev/null
> +++ b/mm/zswap.c
> @@ -0,0 +1,665 @@
> +/*
> + * zswap.c - zswap driver file
> + *
> + * zswap is a backend for frontswap that takes pages that are in the
> + * process of being swapped out and attempts to compress them and store
> + * them in a RAM-based memory pool.  This results in a significant I/O
> + * reduction on the real swap device and, in the case of a slow swap
> + * device, can also improve workload performance.
> + *
> + * Copyright (C) 2012  Seth Jennings 
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> +*/
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +/*
> +* statistics
> +**/
> +/* Number of memory pages used by the compressed pool */
> +static atomic_t zswap_pool_pages = ATOMIC_INIT(0);
> +/* The number of compressed pages currently stored in zswap */
> +static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> +
> +/*
> + * The statistics below are not protected from concurrent access for
> + * performance reasons so they may not be a 100% accurate.  However,
> + * they do provide useful information on roughly how many times a
> + * certain event is occurring.
> +*/
> +static u64 zswap_pool_limit_hit;
> +static u64 zswap_reject_compress_poor;
> +static u64 zswap_reject_zsmalloc_fail;
> +static u64 zswap_reject_kmemcache_fail;
> +static u64 zswap_duplicate_entry;
> +

Ok. Initially I thought "vmstat" but it would be overkill in this case
and the fact zswap can be a module would be a problem.

> +/*
> +* tunables
> +**/
> +/* Enable/disable zswap (disabled by default, fixed at boot for now) */
> +static bool zswap_enabled;
> 

Re: [PATCHv9 3/8] debugfs: add get/set for atomic types

2013-04-13 Thread Mel Gorman
On Wed, Apr 10, 2013 at 01:18:55PM -0500, Seth Jennings wrote:
> debugfs currently lack the ability to create attributes
> that set/get atomic_t values.
> 
> This patch adds support for this through a new
> debugfs_create_atomic_t() function.
> 
> Acked-by: Greg Kroah-Hartman 
> Signed-off-by: Seth Jennings 

Acked-by: Mel Gorman 

-- 
Mel Gorman
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv9 1/8] zsmalloc: add to mm/

2013-04-13 Thread Mel Gorman
I no longer remember any of the previous z* discussions, including my
own review and I was not online as I wrote this. I may repeat myself,
contradict myself or rehash topics that were visited already and have
been concluded. If I do any of that then sorry.

On Wed, Apr 10, 2013 at 01:18:53PM -0500, Seth Jennings wrote:
> 
>
> Also, zsmalloc allows objects to span page boundaries within the
> zspage.  This allows for lower fragmentation than could be had
> with the kernel slab allocator for objects between PAGE_SIZE/2
> and PAGE_SIZE. 

Be aware that this reduces *internal* fragmentation but not necessarily
external fragmentation. If a page portion cannot be freed for some reason
then the entire page cannot be freed. If it is possible for a page fragment
to be pinned then it is potentially a serious problem because the zswap
portion of memory does not necessarily shrink forever. This means that a
large process exiting that had been pushed to swap may not free any
physical memory due to fragmentation within zsmalloc which might be a
big surprise to the OOM killer.

Even assuming though that a page can be forcibly evicted then moving data
from zswap to disk has two strange effects.

1. Reclaiming a single page requires an unpredictable amount of
   page frames to be uncompressed and written to swap. Swapout times may
   vary considerably as a result.

2. It may cause aging inversions. If an old page fragment and new page
   fragment are co-located then a new page can be written to swap before
   there was an opportunity to refault it.

Both yield unpredictable performance characteristics for zswap.
zbud conceptually (I can't remember any of the code details) suffers from
internal fragmentation wastage but it would have more predictable performance
characteristics. The worst of the fragmentation problems may be mitigated
if a zero-filled page was special cased (if it hasn't already). If the
compressed page cannot fit into PAGE_SIZE/2 then too bad, dump it to swap.
It still would suffer from an age inversion but at worst it only affects
one other swap page so at least it's bound to a known value.

I think I said it before but I worry that testing has seen the ideal
behaviour for zsmalloc because it is based on kernel compiles which has
data that compresses easily and processes that are relatively short lived.

I recognise that a lot of work has gone into zsmalloc and that it exists
for a reason. I'm not going to make it a blocker for merging because frankly
I'm not familiar enough with zbud to know it actually can be used by zswap
and my performance characteristic objections have not been proven. However,
my gut feeling says that the allocators should have had compatible APIs
or an operations struct with a default to zbud for predictable performance
characteristics (assuming zbud is not completely broken of course).

Furthermore if any of this is accurate then the limitations of the
allocator should be described in the changelog (copy and paste this if
you wish). When/if this gets deployed and a vendor is handed a bug about
unpredictable performance characteristics of zswap then there is a remote
chance they learn why.

> With the kernel slab allocator, if a page compresses
> to 60% of it original size, the memory savings gained through
> compression is lost in fragmentation because another object of
> the same size can't be stored in the leftover space.
> 
> This ability to span pages results in zsmalloc allocations not being
> directly addressable by the user.  The user is given an
> non-dereferencable handle in response to an allocation request.
> That handle must be mapped, using zs_map_object(), which returns
> a pointer to the mapped region that can be used.  The mapping is
> necessary since the object data may reside in two different
> noncontigious pages.
> 
> zsmalloc fulfills the allocation needs for zram and zswap.
> 
> Acked-by: Nitin Gupta 
> Acked-by: Minchan Kim 
> Signed-off-by: Seth Jennings 
> ---
>  include/linux/zsmalloc.h |   56 +++
>  mm/Kconfig   |   24 +
>  mm/Makefile  |1 +
>  mm/zsmalloc.c| 1117 
> ++
>  4 files changed, 1198 insertions(+)
>  create mode 100644 include/linux/zsmalloc.h
>  create mode 100644 mm/zsmalloc.c
> 
> diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
> new file mode 100644
> index 000..398dae3
> --- /dev/null
> +++ b/include/linux/zsmalloc.h
> @@ -0,0 +1,56 @@
> +/*
> + * zsmalloc memory allocator
> + *
> + * Copyright (C) 2011  Nitin Gupta
> + *

git blame indicates there are more people than Nitin involved although
the bulk of the code does appear to be his.

> + * This code is released using a dual license strategy: BSD/GPL
> + * You can choose the license that better fits your requirements.
> + *
> + * Released under the terms of 3-clause BSD License
> + * Released under the terms of GNU General Public License Version 2.0
> + */
> +
> +#ifndef 

Re: [tip:smp/hotplug] idle: Provide a generic entry point for the idle code

2013-04-13 Thread Yinghai Lu
On Mon, Apr 8, 2013 at 1:12 PM, tip-bot for Thomas Gleixner
 wrote:
> Commit-ID:  a1a04ec3c7c27a682473fd9beb2c996316a64649
> Gitweb: http://git.kernel.org/tip/a1a04ec3c7c27a682473fd9beb2c996316a64649
> Author: Thomas Gleixner 
> AuthorDate: Thu, 21 Mar 2013 22:49:34 +0100
> Committer:  Thomas Gleixner 
> CommitDate: Mon, 8 Apr 2013 17:39:23 +0200
>
> idle: Provide a generic entry point for the idle code
>
> For now this calls cpu_idle(), but in the long run we want to move the
> cpu bringup code to the core and therefor we add a state argument.
>
> Signed-off-by: Thomas Gleixner 
...
> Reviewed-by: Cc: Srivatsa S. Bhat 

What is that ?

It looks like your script has a problem: it converted Cc to Reviewed-by.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/6] x86: kaslr: relocate base offset at boot

2013-04-13 Thread Yinghai Lu
On Fri, Apr 12, 2013 at 1:13 PM, Kees Cook  wrote:
[...]
> diff --git a/arch/x86/boot/compressed/head_64.S 
> b/arch/x86/boot/compressed/head_64.S
> index c1d383d..fc37910 100644
> --- a/arch/x86/boot/compressed/head_64.S
> +++ b/arch/x86/boot/compressed/head_64.S
> @@ -59,7 +59,7 @@ ENTRY(startup_32)
>  1:
>
>  /*
> - * Calculate the delta between where we were compiled to run
> + * Calculate the delta between where we were linked to load
>   * at and where we were actually loaded at.  This can only be done
>   * with a short local call on x86.  Nothing  else will tell us what
>   * address we are running at.  The reserved chunk of the real-mode
> @@ -78,10 +78,10 @@ ENTRY(startup_32)
>
> callverify_cpu
> testl   %eax, %eax
> -   jnz no_longmode
> +   jnz hang
>
>  /*
> - * Compute the delta between where we were compiled to run at
> + * Compute the delta between where we were linked to load at
>   * and where the code will actually run at.
>   *
>   * %ebp contains the address we are loaded at by the boot loader and %ebx
> @@ -90,15 +90,32 @@ ENTRY(startup_32)
>   */
>
>  #ifdef CONFIG_RELOCATABLE
> +#ifdef CONFIG_RANDOMIZE_BASE
> +   callselect_aslr_address /* Select ASLR offset */
> +   movl%eax, %ebx
> +   /* LOAD_PHYSICAL_ADDR is the minimum safe address we can
> +* decompress at */
> +   cmpl$LOAD_PHYSICAL_ADDR, %ebx
> +   jae 1f
> +   movl$LOAD_PHYSICAL_ADDR, %ebx
> +#else /* CONFIG_RANDOMIZE_BASE */
> movl%ebp, %ebx
> movlBP_kernel_alignment(%esi), %eax
> decl%eax
> addl%eax, %ebx
> notl%eax
> andl%eax, %ebx
> -#else
> +#endif /* CONFIG_RANDOMIZE_BASE */
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> +1: movl%ebx, %eax
> +   subl$LOAD_PHYSICAL_ADDR, %eax
> +movl   %eax, aslr_offset(%ebp)
> +   inclaslr_in_32bit(%ebp) /* say 32 bit code ran */
> +#endif /* CONFIG_RANDOMIZE_BASE */
> +#else /* CONFIG_RELOCATABLE */
> movl$LOAD_PHYSICAL_ADDR, %ebx
> -#endif
> +#endif /* CONFIG_RELOCATABLE */
>
> /* Target address to relocate to for decompression */
> addl$z_extract_offset, %ebx
> @@ -266,14 +283,30 @@ preferred_addr:
> /* Start with the delta to where the kernel will run at. */
>  #ifdef CONFIG_RELOCATABLE
> leaqstartup_32(%rip) /* - $startup_32 */, %rbp
> +#ifdef CONFIG_RANDOMIZE_BASE
> +   leaqboot_stack_end(%rip), %rsp
> +   testl   $1, aslr_in_32bit(%rip)
> +   jne 1f
> +   callselect_aslr_address
> +   movq%rax, %rbp

select_aslr_address only play %ebp, so you assume bzImage is loaded under 4G?

Can you just run select_aslr_address in 64bit only?

> +   jmp 2f
> +1: movlaslr_offset(%rip), %eax
> +   addq%rax, %rbp
> +   /* LOAD_PHYSICAL_ADDR is the minimum safe address we can
> +* decompress at. */
> +   cmpq$LOAD_PHYSICAL_ADDR, %rbp
> +   jae 2f
> +   movq$LOAD_PHYSICAL_ADDR, %rbp

Should this use the old value from before select_aslr_address?

> +2:
> +#endif /* CONFIG_RANDOMIZE_BASE */
> movlBP_kernel_alignment(%rsi), %eax
> decl%eax
> addq%rax, %rbp
> notq%rax
> andq%rax, %rbp
> -#else
> +#else /* CONFIG_RELOCATABLE */
> movq$LOAD_PHYSICAL_ADDR, %rbp
> -#endif
> +#endif /* CONFIG_RELOCATABLE */
>
> /* Target address to relocate to for decompression */
> leaqz_extract_offset(%rbp), %rbx
> @@ -343,13 +376,85 @@ relocated:
> calldecompress_kernel
> popq%rsi
>
> +#ifdef CONFIG_RANDOMIZE_BASE
> +/*
> + * Find the address of the relocations.
> + */
> +   leaqz_output_len(%rbp), %rdi
> +
> +/*
> + * Calculate the delta between where vmlinux was linked to load
> + * and where it was actually loaded.
> + */
> +   movq%rbp, %rbx
> +   subq$LOAD_PHYSICAL_ADDR, %rbx
> +   je  3f  /* Nothing to be done if loaded at linked addr. */
> +/*
> + * The kernel contains a table of relocation addresses. Those addresses
> + * have the final load address of the kernel in virtual memory.
> + * We are currently working in the self map. So we need to create an
> + * adjustment for kernel memory addresses to the self map. This will
> + * involve subtracting out the base address of the kernel.
> + */
> +   movq$-__START_KERNEL_map, %rdx /* Literal is too big for add etc 
> */
> +   addq%rbx, %rdx
> +/*
> + * Process relocations. 32 bit relocations first then 64 bit after.
> + * Two sets of binary relocations are added to the end of the
> + * kernel before compression. Each relocation table entry is the kernel
> + * address of the location which needs to be updated stored as a 32 bit
> + * value which is sign extended to 64 bits.
> + *
> + * Format is:
> + *
> + * kernel bits...
> + * 0 - zero terminator for 64 bit relocations
> + * 64 bit 

Re: Return value of __mm_populate

2013-04-13 Thread KOSAKI Motohiro
(4/13/13 5:14 AM), Marco Stornelli wrote:
> Hi,
> 
> I was seeing the code of __mm_populate (in -next) and I've got a doubt 
> about the return value. The function __mlock_posix_error_return should 
> return a proper error for mlock, converting the return value from 
> __get_user_pages. It checks for EFAULT and ENOMEM. Actually 
> __get_user_pages could return, in addition, ERESTARTSYS and EHWPOISON. 

__get_user_pages doesn't return EHWPOISON if FOLL_HWPOISON is not specified.
I'm not an expert on ERESTARTSYS. If I understand correctly, ERESTARTSYS is
only returned when a signal is received, and the signal handling routine
(e.g. do_signal) modifies EIP and hides ERESTARTSYS from userland generically.


> So it seems to me that we could return to user space not expected value. 
> I can't see them on the man page. In addition we shouldn't ever return 
> ERESTARTSYS to the user space but EINTR. According to the man pages 
> maybe we should return EAGAIN in these cases. Am I missing something?


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/6] x86: kaslr: routines to choose random base offset

2013-04-13 Thread Yinghai Lu
On Fri, Apr 12, 2013 at 1:13 PM, Kees Cook  wrote:
> This provides routines for selecting a randomized kernel base offset,
> bounded by the e820 entries. It tries to use RDRAND and falls back to
> RDTSC. If "noaslr" is on the kernel command line, no offset will be used.
>
> Heavily based on work by Dan Rosenberg and Neill Clift.
>
> Signed-off-by: Kees Cook 
> Cc: Eric Northup 
> ---
>  arch/x86/boot/compressed/Makefile |2 +-
>  arch/x86/boot/compressed/aslr.S   |  228 
> +
>  2 files changed, 229 insertions(+), 1 deletion(-)
>  create mode 100644 arch/x86/boot/compressed/aslr.S
>
> diff --git a/arch/x86/boot/compressed/Makefile 
> b/arch/x86/boot/compressed/Makefile
> index 0dac175..feaf203 100644
> --- a/arch/x86/boot/compressed/Makefile
> +++ b/arch/x86/boot/compressed/Makefile
> @@ -26,7 +26,7 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include
>
>  VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \
> $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
> -   $(obj)/piggy.o
> +   $(obj)/piggy.o $(obj)/aslr.o
>
>  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
>
> diff --git a/arch/x86/boot/compressed/aslr.S b/arch/x86/boot/compressed/aslr.S
> new file mode 100644
> index 000..37cdef4
> --- /dev/null
> +++ b/arch/x86/boot/compressed/aslr.S
> @@ -0,0 +1,228 @@
> +/*
> + *  arch/x86/boot/compressed/aslr.S
> + *
> + * Support routine for Kernel Address Space Layout Randomization used by both
> + * the 32 and 64 bit boot code.
> + *
> + */
> +   .text
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> +
> +   .globl  select_aslr_address
> +   .code32
> +
> +/*
> + * Get the physical memory limit for the run from the physical load position 
> of
> + * the kernel. The kernel loads at LOAD_PHYSICAL_ADDR and we need to know how
> + * much physical memory is available for use after that point to make sure 
> the
> + * relocated kernel will fit. Returns the limit in eax.
> + */
> +get_physical_run_end:
> +   pushl   %edi
> +   pushl   %esi
> +   pushl   %ebx
> +   pushl   %edx
> +   pushl   %ecx
> +   movzbl  BP_e820_entries(%esi), %edi
> +   lealBP_e820_map(%esi), %esi
> +   testl   %edi, %edi
> +   jz  5f
> +1: cmpl$E820_RAM, E820_type(%esi)
> +   jnz 4f
> +   movlE820_addr(%esi), %eax
> +   movlE820_addr+4(%esi), %edx
> +   testl   %edx, %edx /* Start address is too big for 32 bit */
> +   jnz 4f
> +   cmpl$LOAD_PHYSICAL_ADDR, %eax
> +   ja  4f
> +   movlE820_size(%esi), %ecx
> +   movlE820_size+4(%esi), %ebx
> +   addl%eax, %ecx
> +   adcl%edx, %ebx
> +   jz  2f /* end address not beyond 32bit*/
> +/* For a large run set the limit as 2^32-1 */
> +   xorl%ecx, %ecx
> +   decl%ecx
> +   jmp 3f
> +2: cmpl$LOAD_PHYSICAL_ADDR, %ecx
> +   jb  4f
> +3:
> +   movl%ecx, %eax
> +   jmp 6f
> +
> +4: addl$E820_entry_size, %esi
> +   decl%edi
> +   jnz 1b
> +5: xorl%eax, %eax /* Fail */
> +6: popl%ecx
> +   popl%edx
> +   popl%ebx
> +   popl%esi
> +   popl%edi
> +   ret
> +
> +/*
> + * Get a random value to be used for the ASLR kernel offset.
> + * Returns the value in eax.
> + */
> +get_aslr_offset:
> +   pushl   %ebx
> +   pushl   %edx
> +   pushl   %ecx
> +   callfind_cmdline_option
> +   testl   %eax, %eax
> +   jne 4f
> +   /* Standard check for cpuid */
> +   pushfl  /* Push original flags */
> +   pushfl
> +   popl%eax
> +   movl%eax, %ebx
> +   xorl$X86_EFLAGS_ID, %eax
> +   pushl   %eax
> +   popfl
> +   pushfl
> +   popl%eax
> +   popfl   /* Pop original flags */
> +   cmpl%eax, %ebx
> +   /* Say zero offset if we can't change the flag */
> +   movl$0, %eax
> +   je  4f
> +
> +   /* Check for cpuid 1 */
> +   cpuid
> +   cmpl$0x1, %eax
> +   jb  4f
> +
> +   movl$0x1, %eax
> +   cpuid
> +   xor %eax, %eax
> +
> +   /* RDRAND is bit 30 */
> +   btl $(X86_FEATURE_RDRAND & 31), %ecx
> +   jc  1f
> +
> +   /* RDTSC is bit 4 */
> +   btl $(X86_FEATURE_TSC & 31), %edx
> +   jc  3f
> +
> +   /* Nothing is supported */
> +   jmp 4f
> +1:
> +   /*
> +* RDRAND sets carry bit on success, otherwise we should try
> +* again up to 16 times.
> +*/
> +   movl$0x10, %ecx
> +2:
> +   /* rdrand %eax */
> +   .byte   0x0f, 0xc7, 0xf0
> +   jc  4f
> +   loop2b
> +
> +   /* Fall through: if RDRAND is supported but fails, use RDTSC,
> +* which is guaranteed to be supported.
> +*/
> +3:
> +   rdtsc
> 

Re: [RFC PATCH 0/2] sched: move content out of core files for load average

2013-04-13 Thread Paul Gortmaker
On Sat, Apr 13, 2013 at 12:30 AM, Rakib Mullick  wrote:
> On Sat, Apr 13, 2013 at 6:04 AM, Paul Gortmaker
>  wrote:
>> Recent activity has had a focus on moving functionally related blocks of 
>> stuff
>> out of sched/core.c into stand-alone files.  The code relating to load 
>> average
>> calculations has grown significantly enough recently to warrant placing it in
>> a separate file.
>>
>> Here we do that, and in doing so, we shed ~20k of code from sched/core.c 
>> (~10%).

[...]

>> Paul Gortmaker (2):
>>   sched: fork load calculation code from sched/core --> sched/load_avg
>>   sched: move update_load_[add/sub/set] from sched.h to fair.c
>>
>>  kernel/sched/Makefile   |   2 +-
>>  kernel/sched/core.c | 569 
>> ---
>>  kernel/sched/fair.c |  18 ++
>>  kernel/sched/load_avg.c | 577 
>> 
>>  kernel/sched/sched.h|  26 +--
>>  5 files changed, 604 insertions(+), 588 deletions(-)
>>  create mode 100644 kernel/sched/load_avg.c
>>
>
> Is there any impact positive over vmlinuz size after these changes?

As per the above description and diffstat, it is just a straight
up code relocation, so aside from trivial differences in what
the optimizer does, I'd expect no real change at all in the
size or anything else

Paul.
--

>
> Thanks,
> Rakib
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: still in 3.9-rc6 - fan speed at 100% after suspend/resume regression

2013-04-13 Thread Ville Syrjälä
On Sun, Apr 14, 2013 at 12:44:00AM +0300, Ville Syrjälä wrote:
> On Sat, Apr 13, 2013 at 10:02:21AM -0600, Jake Edge wrote:
> > Hi Zhang Rui,
> > 
> > The problem reported in https://lkml.org/lkml/2012/12/4/428 (and
> > incorrectly attributed to a suspend patch by me here:
> > https://lkml.org/lkml/2013/4/12/314) still exists in 3.9-rc6, at least
> > for my HP/Compaq 2510p laptop. It appeared that some folks were seeing
> > some improvement with earlier 3.9-rcs (?) but I just tested rc6 and saw
> > substantially the same behavior.  This all works fine in <= 3.6.11.
> > 
> > A brief recap: after resuming, the fan on the laptop spins up to full
> > speed and stays there. "temp6" in "acpitz-virtual-0" (as shown by
> > "sensors") is 100°C and stays there, which is presumably what is making
> > the fan stay on.
> 
> My HP Compaq NC6000 exhibits slightly different behaviour. For me the
> reported temp is always accurate but the trip points get out of sync
> with the actual temperature.
> 
> With 3.7 I saw two different kinds of problems coming out of resume.
> In one case the fan stays on until the temp rises high enough to get the
> trip points back into sync. In the other case the fan goes off, and stays
> off even when the temperature rises above the highest active trip point,
> but it does appear to get back into sync when the temp starts to come back
> down. I think the difference might stem from resuming when the laptop has
> cooled down fully vs. when it's still warm.
> 
> I also just tried 3.9-rc6, and that one appears to behave differently
> to 3.7, but still wrong. There the fan goes off after resume, and comes
> back on as the temperature rises, but it never slows back down after
> that.
> 
> I'll try to collect some more detailed dumps of all three cases.

I filed a new bug and attached all my logs:
https://bugzilla.kernel.org/show_bug.cgi?id=56591

-- 
Ville Syrjälä
syrj...@sci.fi
http://www.sci.fi/~syrjala/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/9] AMD IOMMU cleanups, fixes and IVRS bug workarounds

2013-04-13 Thread Shuah Khan
On Sat, Apr 13, 2013 at 9:26 AM, Joerg Roedel  wrote:
> On Sat, Apr 13, 2013 at 11:06:22PM +0800, Andrew Cooks wrote:
>> On Fri, Apr 12, 2013 at 4:06 PM, Joerg Roedel  wrote:
>
>> > Oh, that's sad. You were the only one having a machine which actually has
>> > unity-mapped ranges defined in the BIOS table. The code for those
>> > mappings was basically untested before you ran it on that machine.
>> >
>> What is the machine in question? Maybe someone else has access to one,
>> if it's not too exotic.
>
> Shuah had access to a HP server machine (don't know which one) that
> defined unity-map ranges in the BIOS table. Shuah certainly knows the
> details about that machine.
>

Joerg/Andrew,

It is a DL385 Gen8 server. Unfortunately I left HP as of yesterday and no longer
have access to the system. Maybe there are others that have access to one.

-- Shuah
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] vfs: Revert spurious fix to spinning prevention in prune_icache_sb

2013-04-13 Thread Hugh Dickins
From: Suleiman Souhlal 

Revert v3.2's 62a3dde ("vfs: fix spinning prevention in prune_icache_sb").

This commit doesn't look right:
Since we are looking at the tail of the list (sb->s_inode_lru.prev)
if we want to skip an inode, we should put it back at the head of
the list instead of the tail, otherwise we will keep spinning on it.

Discovered when investigating why prune_icache_sb came top in perf
reports of a swapping load.
 
Signed-off-by: Suleiman Souhlal 
Signed-off-by: Hugh Dickins 
Cc: sta...@vger.kernel.org
---

 fs/inode.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/inode.c
+++ b/fs/inode.c
@@ -725,7 +725,7 @@ void prune_icache_sb(struct super_block *sb, int 
nr_to_scan, int priority)
 * inode to the back of the list so we don't spin on it.
 */
if (!spin_trylock(&inode->i_lock)) {
-   list_move_tail(&inode->i_lru, &sb->s_inode_lru);
+   list_move(&inode->i_lru, &sb->s_inode_lru);
continue;
}
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 6/7] rcu: Drive quiescent-state-forcing delay from HZ

2013-04-13 Thread Paul E. McKenney
On Sat, Apr 13, 2013 at 12:53:36PM -0700, Josh Triplett wrote:
> On Sat, Apr 13, 2013 at 12:34:25PM -0700, Paul E. McKenney wrote:
> > On Sat, Apr 13, 2013 at 11:18:00AM -0700, Josh Triplett wrote:
> > > On Fri, Apr 12, 2013 at 11:38:04PM -0700, Paul E. McKenney wrote:
> > > > On Fri, Apr 12, 2013 at 04:54:02PM -0700, Josh Triplett wrote:
> > > > > On Fri, Apr 12, 2013 at 04:19:13PM -0700, Paul E. McKenney wrote:
> > > > > > From: "Paul E. McKenney" 
> > > > > > 
> > > > > > Systems with HZ=100 can have slow bootup times due to the default
> > > > > > three-jiffy delays between quiescent-state forcing attempts.  This
> > > > > > commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value 
> > > > > > based
> > > > > > on the value of HZ.  However, this would break very large systems 
> > > > > > that
> > > > > > require more time between quiescent-state forcing attempts.  This
> > > > > > commit therefore also ups the default delay by one jiffy for each
> > > > > > 256 CPUs that might be on the system (based off of nr_cpu_ids at
> > > > > > runtime, -not- NR_CPUS at build time).
> > > > > > 
> > > > > > Reported-by: Paul Mackerras 
> > > > > > Signed-off-by: Paul E. McKenney 
> > > > > 
> > > > > Something seems very wrong if RCU regularly hits the fqs code during
> > > > > boot; feels like there's some more straightforward solution we're
> > > > > missing.  What causes these CPUs to fall under RCU's scrutiny during
> > > > > boot yet not actually hit the RCU codepaths naturally?
> > > > 
> > > > The problem is that they are running HZ=100, so that RCU will often
> > > > take 30-60 milliseconds per grace period.  At that point, you only
> > > > need 16-30 grace periods to chew up a full second, so it is not all
> > > > that hard to eat up the additional 8-12 seconds of boot time that
> > > > they were seeing.  IIRC, UP boot was costing them 4 seconds.
> > > > 
> > > > For HZ=1000, this would translate to 800ms to 1.2s, which is nowhere
> > > > near as annoying.
> > > 
> > > That raises two questions, though.  First, who calls synchronize_rcu()
> > > repeatedly during boot, and could they call call_rcu() instead to avoid
> > > blocking for an RCU grace period?  Second, why does RCU need 3-6 jiffies
> > > to resolve a grace period during boot?  That suggests that RCU doesn't
> > > actually resolve a grace period until the force-quiescent-state
> > > machinery kicks in, meaning that the normal quiescent-state mechanism
> > > didn't work.
> > 
> > Indeed, converting synchronize_rcu() to call_rcu() might also be
> > helpful.  The reason that RCU often does not resolve grace periods until
> > force_quiescent_state() is that it is often the case during boot that
> > all but one CPU is idle.  RCU tries hard to avoid waking up idle CPUs,
> > so it must scan them.  Scanning is relatively expensive, so there is
> > reason to wait.
> 
> How are those CPUs going idle without first telling RCU that they're
> quiesced?  Seems like, during boot at least, you want RCU to use its
> idle==quiesced logic to proactively note continuously-quiescent states.
> Ideally, you should not hit the FQS code at all during boot.

FQS is RCU's idle==quiesced logic.  ;-)

In theory, RCU could add logic at idle entry to report a quiescent state,
in fact CONFIG_RCU_FAST_NO_HZ used to do exactly that.  In practice,
this is not good for energy efficiency at runtime for a goodly number
of workloads, which is why CONFIG_RCU_FAST_NO_HZ now relies on callback
numbering and FQS.

I understand that at boot time, energy efficiency is best served by
making boot go faster, but that means that something has to tell RCU
when boot is complete.

> > One thing that could be done would be to scan immediately during boot,
> > and then back off once boot has completed.  Of course, RCU has no idea
> > when boot has completed, but one way to get this effect is to boot
> > with rcutree.jiffies_till_first_fqs=0, and then use sysfs to set it
> > to 3 once boot has completed.
> 
> What do you mean by "boot has completed" here?  The kernel's early
> initialization, the kernel's initialization up to running /sbin/init, or
> userspace initialization up through supporting user login?

That is exactly the question.  After all, if RCU is going to do something
special during boot, it needs to know when boot ends.  People normally
count boot as up to user login, but RCU currently has no way to know
when this is, at least as far as I know.  Which is why I suggested that
something tell RCU via sysfs.

Regardless, for the usual definition of "boot is complete", user space has
to decide when boot is complete.  The kernel is out of the loop early on.

> In any case, I don't think it makes sense to do this with FQS.

OK, let's go through the possibilities I can imagine at the moment:

1.  Force the scheduling-clock interrupt to remain on during
boot.  This way, each CPU could tell RCU of its idle/non-idle
state.  Of course, something then needs to tell the 

[PATCH] [media] cx88: Fix unsafe locking in suspend-resume

2013-04-13 Thread Alexey Khoroshilov
Legacy PCI suspend-resume handlers are called with interrupts enabled.
But cx8800_suspend/cx8800_resume and cx8802_suspend_common/cx8802_resume_common
use spin_lock/spin_unlock functions to acquire dev->slock, while the same lock 
is acquired in
the corresponding irq-handlers: cx8800_irq and cx8802_irq.
That means a deadlock is possible if an interrupt happens while suspend or 
resume owns the lock.

The patch replaces spin_lock/spin_unlock with 
spin_lock_irqsave/spin_unlock_irqrestore.

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov 
---
 drivers/media/pci/cx88/cx88-mpeg.c  |   10 ++
 drivers/media/pci/cx88/cx88-video.c |   10 ++
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/media/pci/cx88/cx88-mpeg.c 
b/drivers/media/pci/cx88/cx88-mpeg.c
index c9d3182..c6cdbdb 100644
--- a/drivers/media/pci/cx88/cx88-mpeg.c
+++ b/drivers/media/pci/cx88/cx88-mpeg.c
@@ -532,16 +532,17 @@ static int cx8802_suspend_common(struct pci_dev *pci_dev, 
pm_message_t state)
 {
struct cx8802_dev *dev = pci_get_drvdata(pci_dev);
struct cx88_core *core = dev->core;
+   unsigned long flags;
 
/* stop mpeg dma */
-   spin_lock(>slock);
+   spin_lock_irqsave(>slock,flags);
if (!list_empty(>mpegq.active)) {
dprintk( 2, "suspend\n" );
printk("%s: suspend mpeg\n", core->name);
cx8802_stop_dma(dev);
del_timer(>mpegq.timeout);
}
-   spin_unlock(>slock);
+   spin_unlock_irqrestore(>slock,flags);
 
/* FIXME -- shutdown device */
cx88_shutdown(dev->core);
@@ -558,6 +559,7 @@ static int cx8802_resume_common(struct pci_dev *pci_dev)
 {
struct cx8802_dev *dev = pci_get_drvdata(pci_dev);
struct cx88_core *core = dev->core;
+   unsigned long flags;
int err;
 
if (dev->state.disabled) {
@@ -584,12 +586,12 @@ static int cx8802_resume_common(struct pci_dev *pci_dev)
cx88_reset(dev->core);
 
/* restart video+vbi capture */
-   spin_lock(>slock);
+   spin_lock_irqsave(>slock,flags);
if (!list_empty(>mpegq.active)) {
printk("%s: resume mpeg\n", core->name);
cx8802_restart_queue(dev,>mpegq);
}
-   spin_unlock(>slock);
+   spin_unlock_irqrestore(>slock,flags);
 
return 0;
 }
diff --git a/drivers/media/pci/cx88/cx88-video.c 
b/drivers/media/pci/cx88/cx88-video.c
index bc78354..d72b403 100644
--- a/drivers/media/pci/cx88/cx88-video.c
+++ b/drivers/media/pci/cx88/cx88-video.c
@@ -1957,9 +1957,10 @@ static int cx8800_suspend(struct pci_dev *pci_dev, 
pm_message_t state)
 {
struct cx8800_dev *dev = pci_get_drvdata(pci_dev);
struct cx88_core *core = dev->core;
+   unsigned long flags;
 
/* stop video+vbi capture */
-   spin_lock(>slock);
+   spin_lock_irqsave(>slock,flags);
if (!list_empty(>vidq.active)) {
printk("%s/0: suspend video\n", core->name);
stop_video_dma(dev);
@@ -1970,7 +1971,7 @@ static int cx8800_suspend(struct pci_dev *pci_dev, 
pm_message_t state)
cx8800_stop_vbi_dma(dev);
del_timer(>vbiq.timeout);
}
-   spin_unlock(>slock);
+   spin_unlock_irqrestore(>slock,flags);
 
if (core->ir)
cx88_ir_stop(core);
@@ -1989,6 +1990,7 @@ static int cx8800_resume(struct pci_dev *pci_dev)
 {
struct cx8800_dev *dev = pci_get_drvdata(pci_dev);
struct cx88_core *core = dev->core;
+   unsigned long flags;
int err;
 
if (dev->state.disabled) {
@@ -2019,7 +2021,7 @@ static int cx8800_resume(struct pci_dev *pci_dev)
cx_set(MO_PCI_INTMSK, core->pci_irqmask);
 
/* restart video+vbi capture */
-   spin_lock(>slock);
+   spin_lock_irqsave(>slock,flags);
if (!list_empty(>vidq.active)) {
printk("%s/0: resume video\n", core->name);
restart_video_queue(dev,>vidq);
@@ -2028,7 +2030,7 @@ static int cx8800_resume(struct pci_dev *pci_dev)
printk("%s/0: resume vbi\n", core->name);
cx8800_restart_vbi_queue(dev,>vbiq);
}
-   spin_unlock(>slock);
+   spin_unlock_irqrestore(>slock,flags);
 
return 0;
 }
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Latest randconfig build errors

2013-04-13 Thread Thierry Reding
On Sat, Apr 13, 2013 at 08:54:22AM -0400, Rob Clark wrote:
> On Mon, Mar 4, 2013 at 1:46 PM, Tony Lindgren  wrote:
> >
> >> drivers/gpu/drm/tilcdc/tilcdc_slave.o:(.data+0x54): multiple definition of 
> >> `__mod_of_device_table'
> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
> >> drivers/gpu/drm/tilcdc/tilcdc_panel.o:(.data+0x54): multiple definition of 
> >> `__mod_of_device_table'
> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
> >> drivers/gpu/drm/tilcdc/tilcdc_drv.o:(.data+0x184): multiple definition of 
> >> `__mod_of_device_table'
> >> drivers/gpu/drm/tilcdc/tilcdc_tfp410.o:(.data+0x54): first defined here
> >
> > Rob, I assume you'll do a patch for this one?
> 
> 
> oh, I apologize for the late reply, I didn't see this email...
> 
> There is a patch that we can merge to make tilcdc bool instead of
> tristate[1], which I suppose is ok for a temporary fix.  Although I'm
> all-ears if someone has a better idea about how to fix this.  The
> problem is that we have multiple sub-devices for different possible
> panel drivers, so that depending on DT tables, the correct ones get
> loaded for the hw.  But they are all built into a single module.
> Splitting them into multiple modules will be problematic, as panel
> drivers which are present really need to get probed before the
> toplevel drm device..

You could look at the Tegra driver. I had to solve a similar problem
there. What I did is basically parse the DT in the host1x driver and add
all device nodes which are required by DRM to a list. Later when the
individual devices are probed they are removed from that list, so that
when the list becomes empty we are sure that all required devices are
there and only then call the drm_platform_init() function.

This fits very well with how Tegra hardware is designed because host1x
is the parent for all DRM subdevices (DC, RGB/LVDS, HDMI, ...). So it is
probed before any of its children and it can easily parse the DT upfront
and initialize the list of required devices.

> I suppose in theory it is possible to make drm
> cope better with dynamically loaded outputs, but I'm not sure that
> there is any way to do this without breaking userspace which expects
> that all of the connectors/encoders are present once the drm device is
> loaded.

I had been thinking about this on and off for a while, but I haven't
come up with anything concrete. Ideally we could just have some kind of
event that userspace would listen for, so that new outputs can be
dynamically added and userspace informed about them. Last time I checked
most of the helpers assumed that the complete output configuration is
known when the DRM device is registered, so some major rework will be
required to efficiently make use of such dynamicity.

Thierry


pgpqlba7rroGM.pgp
Description: PGP signature


Re: still in 3.9-rc6 - fan speed at 100% after suspend/resume regression

2013-04-13 Thread Ville Syrjälä
On Sat, Apr 13, 2013 at 10:02:21AM -0600, Jake Edge wrote:
> Hi Zhang Rui,
> 
> The problem reported in https://lkml.org/lkml/2012/12/4/428 (and
> incorrectly attributed to a suspend patch by me here:
> https://lkml.org/lkml/2013/4/12/314) still exists in 3.9-rc6, at least
> for my HP/Compaq 2510p laptop. It appeared that some folks were seeing
> some improvement with earlier 3.9-rcs (?) but I just tested rc6 and saw
> substantially the same behavior.  This all works fine in <= 3.6.11.
> 
> A brief recap: after resuming, the fan on the laptop spins up to full
> speed and stays there. "temp6" in "acpitz-virtual-0" (as shown by
> "sensors") is 100°C and stays there, which is presumably what is making
> the fan stay on.

My HP Compaq NC6000 exhibits slightly different behaviour. For me the
reported temp is always accurate but the trip points get out of sync
with the actual temperature.

With 3.7 I saw two different kinds of problems coming out of resume.
In one case the fan stays on until the temp rises high enough to get the
trip points back into sync. In the other case the fan goes off, and stays
off even when the temperature rises above the highest active trip point,
but it does appear to get back into sync when the temp starts to come back
down. I think the difference might stem from resuming when the laptop has
cooled down fully vs. when it's still warm.

I also just tried 3.9-rc6, and that one appears to behave differently
to 3.7, but still wrong. There the fan goes off after resume, and comes
back on as the temperature rises, but it never slows back down after
that.

I'll try to collect some more detailed dumps of all three cases.

-- 
Ville Syrjälä
syrj...@sci.fi
http://www.sci.fi/~syrjala/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Anatol Pomozov
Hi

On Sat, Apr 13, 2013 at 10:53 AM, Linus Torvalds
 wrote:
> On Sat, Apr 13, 2013 at 8:41 AM, Anatol Pomozov
>  wrote:
>>
>> Does it make sense to move it to a separate function in kref.h?
>>
>> /** Useful when kref_get is racing with kref_put and refcounter might be 0 */
>> int kref_get_not_zero(kref* ref) {
>> return atomic_inc_not_zero(>refcount);
>> }
>
> It turns out we have that, except it's called "unless_zero", because
> it uses "atomic_add_unless(x,1,0)", rather than the simplified
> "atomic_inc_not_zero(x)".
>
>> or maybe instead change default behavior of kref_get() to
>> atomic_inc_not_zero and force callers check the return value from
>> kref_get()?
>
> That would be painful, and _most_ users should have a preexisting
> refcount. So it's probably better in the long run to just keep the
> warning (but perhaps fix it to be SMP-safe). So I think the part of
> your patch that made kref_get() use atomic_inc_return() is probably a
> good idea regardless.
>
> Also, I changed my patch to be minimal, and not change other users of
> kobject_get(). So other users (not kset_find_obj()) will continue to
> get the warning, and kset_find_obj() uses the safe version.
Looks good to me.

> So this is
> what I'm planning on committing as the minimal patch and marking for
> stable. The rest (including that atomic_inc_return() in kref_get)
> would be cleanup.
>
> Can you give this a quick test?

I ran the test case for ~60 minutes with XFS tests in parallel - no issues.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 6/7] backports: add media subsystem drivers

2013-04-13 Thread Johannes Berg
On Sat, 2013-04-13 at 07:13 -0700, Luis R. Rodriguez wrote:
> From: "Luis R. Rodriguez" 
> 
> This adds backport support for all media subsystem
> drivers. This is enabled only for >= 3.2. Some media
> drivers rely on the new probe deferral mechanism
> (-EPROBE_DEFER see commit d1c3414c), those are only
> enabled for kernels >= 3.4. Some media drivers only
> depend on the regulatory but since we only support
> backporting the regulatory on kernels >= 3.4 we only
> enable those media drivers for >= 3.4.
> 
> This backports 433 media drivers.

Heh. Applied. Good thing I can kill the pr_fmt patches again soon.

johannes

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 5/7] backports: add support for voltage / current regulator drivers

2013-04-13 Thread Johannes Berg
On Sat, 2013-04-13 at 07:13 -0700, Luis R. Rodriguez wrote:
> From: "Luis R. Rodriguez" 
> 
> This backports the latest regulator drivers for kernels >= 3.4.
> We enable the regulator only on kernels >= 3.4 given that
> it relies on the new probe deferral mechanism which would
> otherwise mean having to support drivers that do not probe
> correctly. Note that 3.2 had a base regulator implementation
> but that was just stubs.

Applied.

johannes

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[char-misc-next] mei: wd: fix line over 80 characters

2013-04-13 Thread Tomas Winkler
Fix checkpatch warning:
WARNING: line over 80 characters

Signed-off-by: Tomas Winkler 

---
 drivers/misc/mei/wd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/mei/wd.c b/drivers/misc/mei/wd.c
index 2413247..eb3f05c 100644
--- a/drivers/misc/mei/wd.c
+++ b/drivers/misc/mei/wd.c
@@ -317,7 +317,8 @@ end:
  *
  * returns 0 if success, negative errno code for failure
  */
-static int mei_wd_ops_set_timeout(struct watchdog_device *wd_dev, unsigned int 
timeout)
+static int mei_wd_ops_set_timeout(struct watchdog_device *wd_dev,
+   unsigned int timeout)
 {
struct mei_device *dev;
 
-- 
1.8.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/4] mfd: Kontron PLD mfd driver

2013-04-13 Thread Thomas Gleixner
On Mon, 8 Apr 2013, Kevin Strasser wrote:
> --- /dev/null
> +++ b/drivers/mfd/kempld-core.c
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 

I seriously doubt, that all these includes are required.

> +#define KEMPLD_MAINTAIN_EFT_COMPATIBILITY1

What's the point of this define ?

> +static int kempld_platform_device_register(const struct dmi_system_id *id);
> +static int kempld_get_mutex_set_index_generic(struct kempld_device_data *pld,
> + u8 index, unsigned int timeout);
> +static void kempld_release_mutex_generic(struct kempld_device_data *pld);
> +
> +static int kempld_get_info(struct kempld_device_data *pld);
> +static int kempld_get_info_NOW1(struct kempld_device_data *pld);

Can you please get rid of the CamelCase ?

> +static int kempld_get_info_generic(struct kempld_device_data *pld);
> +static int kempld_get_features(struct kempld_device_data *pld);
> +static int kempld_register_cells_generic(struct kempld_device_data *pld);
> +static int kempld_register_cells_NOW1(struct kempld_device_data *pld);

Can you please reshuffle the code, so that we can get rid of all these
forward declarations ?

> +#define MAX_IDENT_LEN 4
> +static char force_ident[MAX_IDENT_LEN + 1] = "";
> +module_param_string(force_ident, force_ident, sizeof(force_ident), 0);
> +MODULE_PARM_DESC(force_ident, "Force detection of specific product");

Please change this to something which is ad hoc understandable w/o
reading the code. e.g. "kempld_device_id". 

> +/* this option is only here for debugging and should never be needed in
> + * production environments */

/*
 * Please use standard multiline comment style and proper
 * sentences starting with a capital letter
 */

> +static bool force_unlock;
> +module_param(force_unlock, bool, 0);
> +MODULE_PARM_DESC(force_unlock, "Force breaking the semaphore on driver 
> load");

Is it really necessary to carry this in the kernel? If yes, then please put it 
under

#ifdef DEBUG

We really can do without random debug code. And the comment should be
a little more elaborate about what the heck this is doing. "Force
breaking the semaphore ..." makes me shudder, w/o reading the code
which uses this.

> +/**
> + * kempld_read8 - read 8 bit register
> + * @pld: kempld_device_data structure describing the PLD
> + * @index: register index on the chip
> + *
> + * This function reads an 8 bit register of the PLD and returns its value.
> + *
> + * In order for this function to work correctly, 
> kempld_try_get_mutex_set_index
> + * or kempld_get_mutex_set_index has to be called before calling the function
> + * to acquire the mutex. Afterwards the mutex has to be released with
> + * kempld_release_mutex.
> + */
> +u8 kempld_read8(struct kempld_device_data *pld, u8 index)
> +{
> + kempld_set_index(pld, index);
> +
> + return ioread8(pld->io_data);
> +}
> +EXPORT_SYMBOL(kempld_read8);

EXPORT_SYMBOL_GPL please. All over the place.

> +/**
> + * kempld_read16 - read 16 bit register
> + * @pld: kempld_device_data structure describing the PLD
> + * @index: register index on the chip
> + *
> + * This function reads a 16 bit register of the PLD and returns its value.
> + *
> + * In order for this function to work correctly, 
> kempld_try_get_mutex_set_index
> + * or kempld_get_mutex_set_index has to be called before calling the function
> + * to acquire the mutex. Afterwards the mutex has to be released with
> + * kempld_release_mutex.
> + */
> +u16 kempld_read16(struct kempld_device_data *pld, u8 index)
> +{
> + BUG_ON(index+1 < index);

Yuck. What kind of problem are you catching here? Just the corner case
that someone hands in 0xff as index?

I'd rather assume that you tried to catch the case where someone hands
in an index with BIT0 set. So that would be:
   
BUG_ON(index & 0x01);

Aside of that, do you really want to kill the machine here? A
WARN_ON[_ONCE] would be more appropriate.

WARN_ON_ONCE(index & 0x01);

> + return kempld_read8(pld, index) | kempld_read8(pld, index+1) << 8;

index + 1)
Please

> +void kempld_write16(struct kempld_device_data *pld, u8 index, u16 data)
> +{
> + BUG_ON(index+1 < index);

See above. And all other functions which use that silly BUG_ON as well.

> +/**
> + * kempld_set_index - change the current register index of the PLD
> + * @pld: kempld_device_data structure describing the PLD
> + * @index: register index on the chip
> + *
> + * This function changes the register index of the PLD.

That's really important information after reading the above function
descriptor...

> + *
> + * If the PLD mutex has been acquired the whole time and the desired index is

-ENOPARSE

> + * already set there might be no actual hardware access done in this 
> function.
> + *
> + * In order for this function to work correctly, 
> 

Re: [PATCH V2] cpufreq: ARM big LITTLE: Add generic cpufreq driver and its DT glue

2013-04-13 Thread Francesco Lavra
On 03/26/2013 10:51 AM, Viresh Kumar wrote:
> big LITTLE is ARM's new Architecture focussing power/performance needs of 
> modern
> world. More information about big LITTLE can be found here:
> 
> http://www.arm.com/products/processors/technologies/biglittleprocessing.php
> http://lwn.net/Articles/481055/
> 
> In order to keep cpufreq support for all big LITTLE platforms simple/generic,
> this patch tries to add a generic cpufreq driver layer for all big LITTLE
> platforms.
> 
> The driver is divided into two parts:
> - Core driver: Generic and shared across all big LITTLE SoC's
> - Glue drivers: Per platform drivers providing ops to the core driver
> 
> This patch adds in a generic glue driver which would extract information from
> Device Tree.
> 
> Future SoC's can either reuse the DT glue or write their own depending on the
> need.
> 
> Signed-off-by: Sudeep KarkadaNagesha 
> Signed-off-by: Viresh Kumar 
[...]
> diff --git a/drivers/cpufreq/arm_big_little_dt.c 
> b/drivers/cpufreq/arm_big_little_dt.c
> new file mode 100644
> index 000..452ff46
> --- /dev/null
> +++ b/drivers/cpufreq/arm_big_little_dt.c
> @@ -0,0 +1,92 @@
> +/*
> + * Generic big.LITTLE CPUFreq Interface driver
> + *
> + * It provides necessary ops to arm_big_little cpufreq driver and gets
> + * Frequency information from Device Tree. Freq table in DT must be in KHz.
> + *
> + * Copyright (C) 2013 Linaro.
> + * Viresh Kumar 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed "as is" WITHOUT ANY WARRANTY of any
> + * kind, whether express or implied; without even the implied warranty
> + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "arm_big_little.h"
> +
> +static int dt_init_opp_table(struct device *cpu_dev)
> +{
> + struct device_node *np = NULL;
> + int count = 0, ret;
> +
> + for_each_child_of_node(of_find_node_by_path("/cpus"), np) {

If of_find_node_by_path() returns NULL, there will be a NULL pointer
dereference.

> + if (count++ != cpu_dev->id)
> + continue;
> + if (!of_get_property(np, "operating-points", NULL))
> + return -ENODATA;
> +
> + cpu_dev->of_node = np;
> +
> + ret = of_init_opp_table(cpu_dev);
> + if (ret)
> + return ret;
> +
> + return 0;

of_node_put() should be called on np before returning.
Also, the reference count of the parent node should be decremented as well.

These comments apply to the below function dt_get_transition_latency() too.

> + }
> +
> + return -ENODEV;
> +}
> +
> +static int dt_get_transition_latency(struct device *cpu_dev)
> +{
> + struct device_node *np = NULL;
> + u32 transition_latency = CPUFREQ_ETERNAL;
> + int count = 0;
> +
> + for_each_child_of_node(of_find_node_by_path("/cpus"), np) {
> + if (count++ != cpu_dev->id)
> + continue;
> +
> + of_property_read_u32(np, "clock-latency", _latency);
> + return 0;
> + }
> +
> + return -ENODEV;
> +}

--
Francesco
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: still in 3.9-rc6 - fan speed at 100% after suspend/resume regression

2013-04-13 Thread Jake Edge
[ not sure why, but the lkml address got munged in Rui's reply ...
fixed now -- sorry for the dupe ]

On Sun, 14 Apr 2013 02:20:15 +0800 Zhang Rui wrote:

> > A brief recap: after resuming, the fan on the laptop spins up to
> > full speed and stays there. "temp6" in "acpitz-virtual-0" (as shown
> > by "sensors") is 100°C and stays there, which is presumably what is
> > making the fan stay on.
> > 
> first, this seems like the same problem reported at
> https://bugzilla.kernel.org/show_bug.cgi?id=50041
> please refer to comment #17 and #27.

Yes, it does seem like the same bug that Matthias is talking about in
those comments ...
 
> > Is more information needed?
> > 
> please attach the acpidump output of your laptop.

I have the dump, do you want it on the old closed bug or shall I wait
until Matthias opens a new one?

> can you please test thermal -thermal branch at
> git://git.kernel.org/pub/scm/linux/kernel/git/rzhang/linux.git

I just tried it, no difference in behavior (fan on full, temp6 ==
100°C) ...

I took all the defaults when I did a 'make oldconfig', some of which
looked thermal-related ... was I supposed to set them differently?

thanks,

jake

-- 
Jake Edge - LWN - j...@lwn.net - http://lwn.net
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[tip:smp/hotplug] sparc: Use generic idle loop

2013-04-13 Thread tip-bot for Sam Ravnborg
Commit-ID:  87fa05aeb3a5e8e21b1a5510eef6983650eff092
Gitweb: http://git.kernel.org/tip/87fa05aeb3a5e8e21b1a5510eef6983650eff092
Author: Sam Ravnborg 
AuthorDate: Thu, 11 Apr 2013 21:38:50 +0200
Committer:  Thomas Gleixner 
CommitDate: Sat, 13 Apr 2013 21:36:27 +0200

sparc: Use generic idle loop

Add generic cpu_idle support

sparc32:
- replace call to cpu_idle() with cpu_startup_entry()
- add arch_cpu_idle()

sparc64:
- smp_callin() now includes the cpu_startup_entry() call so we can
  skip calling cpu_idle from assembler
- add arch_cpu_idle() and arch_cpu_idle_dead()

Signed-off-by: Sam Ravnborg 
Reviewed-by: "Srivatsa S. Bhat" 
Cc: torva...@linux-foundation.org
Cc: ru...@rustcorp.com.au
Cc: paul...@linux.vnet.ibm.com
Cc: pet...@infradead.org
Cc: magnus.d...@gmail.com
Acked-by: David Miller 
Link: http://lkml.kernel.org/r/20130411193850.ga2...@merkur.ravnborg.org
Signed-off-by: Thomas Gleixner 
---
 arch/sparc/Kconfig|  1 +
 arch/sparc/kernel/hvtramp.S   |  3 +--
 arch/sparc/kernel/process_32.c| 21 -
 arch/sparc/kernel/process_64.c| 49 +++
 arch/sparc/kernel/smp_32.c|  2 +-
 arch/sparc/kernel/smp_64.c|  2 ++
 arch/sparc/kernel/trampoline_64.S |  3 +--
 7 files changed, 24 insertions(+), 57 deletions(-)

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 3d361f2..ee5eacc 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -37,6 +37,7 @@ config SPARC
select GENERIC_SMP_IDLE_THREAD
select GENERIC_CMOS_UPDATE
select GENERIC_CLOCKEVENTS
+   select GENERIC_IDLE_LOOP
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select MODULES_USE_ELF_RELA
diff --git a/arch/sparc/kernel/hvtramp.S b/arch/sparc/kernel/hvtramp.S
index 9365432..605c960 100644
--- a/arch/sparc/kernel/hvtramp.S
+++ b/arch/sparc/kernel/hvtramp.S
@@ -128,8 +128,7 @@ hv_cpu_startup:
 
call    smp_callin
 nop
-   call    cpu_idle
-    mov    0, %o0
+
call    cpu_panic
 nop
 
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 62eede1..c852410 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -64,23 +64,12 @@ extern void fpsave(unsigned long *, unsigned long *, void 
*, unsigned long *);
 struct task_struct *last_task_used_math = NULL;
 struct thread_info *current_set[NR_CPUS];
 
-/*
- * the idle loop on a Sparc... ;)
- */
-void cpu_idle(void)
+/* Idle loop support. */
+void arch_cpu_idle(void)
 {
-   set_thread_flag(TIF_POLLING_NRFLAG);
-
-   /* endless idle loop with no priority at all */
-   for (;;) {
-   while (!need_resched()) {
-   if (sparc_idle)
-   (*sparc_idle)();
-   else
-   cpu_relax();
-   }
-   schedule_preempt_disabled();
-   }
+   if (sparc_idle)
+   (*sparc_idle)();
+   local_irq_enable();
 }
 
 /* XXX cli/sti -> local_irq_xxx here, check this works once SMP is fixed. */
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index cdb80b2..9fbf0d1 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -52,20 +52,17 @@
 
 #include "kstack.h"
 
-static void sparc64_yield(int cpu)
+/* Idle loop support on sparc64. */
+void arch_cpu_idle(void)
 {
if (tlb_type != hypervisor) {
touch_nmi_watchdog();
-   return;
-   }
-
-   clear_thread_flag(TIF_POLLING_NRFLAG);
-   smp_mb__after_clear_bit();
-
-   while (!need_resched() && !cpu_is_offline(cpu)) {
+   } else {
unsigned long pstate;
 
-   /* Disable interrupts. */
+/* The sun4v sleeping code requires that we have PSTATE.IE cleared over
+ * the cpu sleep hypervisor call.
+ */
__asm__ __volatile__(
"rdpr %%pstate, %0\n\t"
"andn %0, %1, %0\n\t"
@@ -73,7 +70,7 @@ static void sparc64_yield(int cpu)
: "=&r" (pstate)
: "i" (PSTATE_IE));
 
-   if (!need_resched() && !cpu_is_offline(cpu))
+   if (!need_resched() && !cpu_is_offline(smp_processor_id()))
sun4v_cpu_yield();
 
/* Re-enable interrupts. */
@@ -84,36 +81,16 @@ static void sparc64_yield(int cpu)
: "=&r" (pstate)
: "i" (PSTATE_IE));
}
-
-   set_thread_flag(TIF_POLLING_NRFLAG);
+   local_irq_enable();
 }
 
-/* The idle loop on sparc64. */
-void cpu_idle(void)
-{
-   int cpu = smp_processor_id();
-
-   set_thread_flag(TIF_POLLING_NRFLAG);
-
-   while(1) {
-   tick_nohz_idle_enter();
-   rcu_idle_enter();
-
-   

Re: [PATCH tip/core/rcu 6/7] rcu: Drive quiescent-state-forcing delay from HZ

2013-04-13 Thread Josh Triplett
On Sat, Apr 13, 2013 at 12:34:25PM -0700, Paul E. McKenney wrote:
> On Sat, Apr 13, 2013 at 11:18:00AM -0700, Josh Triplett wrote:
> > On Fri, Apr 12, 2013 at 11:38:04PM -0700, Paul E. McKenney wrote:
> > > On Fri, Apr 12, 2013 at 04:54:02PM -0700, Josh Triplett wrote:
> > > > On Fri, Apr 12, 2013 at 04:19:13PM -0700, Paul E. McKenney wrote:
> > > > > From: "Paul E. McKenney" 
> > > > > 
> > > > > Systems with HZ=100 can have slow bootup times due to the default
> > > > > three-jiffy delays between quiescent-state forcing attempts.  This
> > > > > commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value based
> > > > > on the value of HZ.  However, this would break very large systems that
> > > > > require more time between quiescent-state forcing attempts.  This
> > > > > commit therefore also ups the default delay by one jiffy for each
> > > > > 256 CPUs that might be on the system (based off of nr_cpu_ids at
> > > > > runtime, -not- NR_CPUS at build time).
> > > > > 
> > > > > Reported-by: Paul Mackerras 
> > > > > Signed-off-by: Paul E. McKenney 
> > > > 
> > > > Something seems very wrong if RCU regularly hits the fqs code during
> > > > boot; feels like there's some more straightforward solution we're
> > > > missing.  What causes these CPUs to fall under RCU's scrutiny during
> > > > boot yet not actually hit the RCU codepaths naturally?
> > > 
> > > The problem is that they are running HZ=100, so that RCU will often
> > > take 30-60 milliseconds per grace period.  At that point, you only
> > > need 16-30 grace periods to chew up a full second, so it is not all
> > > that hard to eat up the additional 8-12 seconds of boot time that
> > > they were seeing.  IIRC, UP boot was costing them 4 seconds.
> > > 
> > > For HZ=1000, this would translate to 800ms to 1.2s, which is nowhere
> > > near as annoying.
> > 
> > That raises two questions, though.  First, who calls synchronize_rcu()
> > repeatedly during boot, and could they call call_rcu() instead to avoid
> > blocking for an RCU grace period?  Second, why does RCU need 3-6 jiffies
> > to resolve a grace period during boot?  That suggests that RCU doesn't
> > actually resolve a grace period until the force-quiescent-state
> > machinery kicks in, meaning that the normal quiescent-state mechanism
> > didn't work.
> 
> Indeed, converting synchronize_rcu() to call_rcu() might also be
> helpful.  The reason that RCU often does not resolve grace periods until
> force_quiescent_state() is that it is often the case during boot that
> all but one CPU is idle.  RCU tries hard to avoid waking up idle CPUs,
> so it must scan them.  Scanning is relatively expensive, so there is
> reason to wait.

How are those CPUs going idle without first telling RCU that they're
quiesced?  Seems like, during boot at least, you want RCU to use its
idle==quiesced logic to proactively note continuously-quiescent states.
Ideally, you should not hit the FQS code at all during boot.

> One thing that could be done would be to scan immediately during boot,
> and then back off once boot has completed.  Of course, RCU has no idea
> when boot has completed, but one way to get this effect is to boot
> with rcutree.jiffies_till_first_fqs=0, and then use sysfs to set it
> to 3 once boot has completed.

What do you mean by "boot has completed" here?  The kernel's early
initialization, the kernel's initialization up to running /sbin/init, or
userspace initialization up through supporting user login?

In any case, I don't think it makes sense to do this with FQS.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] perf: treat attr.config as u64 in perf_swevent_init()

2013-04-13 Thread Tommi Rantala
Trinity discovered that we fail to check all 64 bits of attr.config
passed by user space, resulting in an out-of-bounds access of the
perf_swevent_enabled array in sw_perf_event_destroy().

Introduced in commit b0a873ebb ("perf: Register PMU implementations").

Signed-off-by: Tommi Rantala 
Cc: Peter Zijlstra 
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59412d0..fff6420 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5330,7 +5330,7 @@ static void sw_perf_event_destroy(struct perf_event 
*event)
 
 static int perf_swevent_init(struct perf_event *event)
 {
-   int event_id = event->attr.config;
+   u64 event_id = event->attr.config;
 
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 0/12] TINY_RCU changes for 3.11

2013-04-13 Thread Paul E. McKenney
On Sat, Apr 13, 2013 at 11:30:49AM -0700, Josh Triplett wrote:
> On Fri, Apr 12, 2013 at 04:48:10PM -0700, Paul E. McKenney wrote:
> > This series removes TINY_PREEMPT_RCU, as promised/threatened at
> > http://lwn.net/Articles/541037/ and https://lkml.org/lkml/2012/11/12/545.
> > 
> > 1.  Remove TINY_PREEMPT_RCU.  This is a straight syntactic removal,
> > with no attempt at cleanup.  The remaining patches do the cleanup.
> > 
> > 2.  Inline the now-empty show_tiny_preempt_stats() function.
> > 
> > 3.  Inline the now-empty rcu_preempt_check_callbacks() function.
> > 
> > 4.  Inline the now-empty rcu_preempt_remove_callbacks() function.
> > 
> > 5.  Inline the now-empty rcu_preempt_process_callbacks() function.
> > 
> > 6.  Because TINY_RCU no longer has kthreads, remove the code that
> > used to abstract away kthread vs. softirq invocation.
> > 
> > 7.  Inline the now-empty check_cpu_stall_preempt() function.
> > 
> > 8.  Remove CONFIG_TINY_RCU ifdefs from include/linux/rcutiny.h
> > 
> > 9.  Inline the now-empty rcu_preempt_note_context_switch() function.
> > 
> > 10. Move code to allow consolidating ifdefs in kernel/rcutiny_plugin.h.
> > 
> > 11. Remove TINY_PREEMPT_RCU's tracing formats from documentation.
> > 
> > 12. Shrink TINY_RCU a bit by moving exit_rcu() to TREE_RCU, leaving
> > TINY_RCU with a static inline empty function.
> 
> For 2-7 and 9-12:
> Reviewed-by: Josh Triplett 
> 
> I responded to patch 8 with a note about moving part of it to patch 1;
> with that changed,
> Reviewed-by: Josh Triplett 
> for those two as well.

Thank you for the review, and good point on merging patch 8 into patch 1,
will do!

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 6/7] rcu: Drive quiescent-state-forcing delay from HZ

2013-04-13 Thread Paul E. McKenney
On Sat, Apr 13, 2013 at 11:18:00AM -0700, Josh Triplett wrote:
> On Fri, Apr 12, 2013 at 11:38:04PM -0700, Paul E. McKenney wrote:
> > On Fri, Apr 12, 2013 at 04:54:02PM -0700, Josh Triplett wrote:
> > > On Fri, Apr 12, 2013 at 04:19:13PM -0700, Paul E. McKenney wrote:
> > > > From: "Paul E. McKenney" 
> > > > 
> > > > Systems with HZ=100 can have slow bootup times due to the default
> > > > three-jiffy delays between quiescent-state forcing attempts.  This
> > > > commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value based
> > > > on the value of HZ.  However, this would break very large systems that
> > > > require more time between quiescent-state forcing attempts.  This
> > > > commit therefore also ups the default delay by one jiffy for each
> > > > 256 CPUs that might be on the system (based off of nr_cpu_ids at
> > > > runtime, -not- NR_CPUS at build time).
> > > > 
> > > > Reported-by: Paul Mackerras 
> > > > Signed-off-by: Paul E. McKenney 
> > > 
> > > Something seems very wrong if RCU regularly hits the fqs code during
> > > boot; feels like there's some more straightforward solution we're
> > > missing.  What causes these CPUs to fall under RCU's scrutiny during
> > > boot yet not actually hit the RCU codepaths naturally?
> > 
> > The problem is that they are running HZ=100, so that RCU will often
> > take 30-60 milliseconds per grace period.  At that point, you only
> > need 16-30 grace periods to chew up a full second, so it is not all
> > that hard to eat up the additional 8-12 seconds of boot time that
> > they were seeing.  IIRC, UP boot was costing them 4 seconds.
> > 
> > For HZ=1000, this would translate to 800ms to 1.2s, which is nowhere
> > near as annoying.
> 
> That raises two questions, though.  First, who calls synchronize_rcu()
> repeatedly during boot, and could they call call_rcu() instead to avoid
> blocking for an RCU grace period?  Second, why does RCU need 3-6 jiffies
> to resolve a grace period during boot?  That suggests that RCU doesn't
> actually resolve a grace period until the force-quiescent-state
> machinery kicks in, meaning that the normal quiescent-state mechanism
> didn't work.

Indeed, converting synchronize_rcu() to call_rcu() might also be
helpful.  The reason that RCU often does not resolve grace periods until
force_quiescent_state() is that it is often the case during boot that
all but one CPU is idle.  RCU tries hard to avoid waking up idle CPUs,
so it must scan them.  Scanning is relatively expensive, so there is
reason to wait.

One thing that could be done would be to scan immediately during boot,
and then back off once boot has completed.  Of course, RCU has no idea
when boot has completed, but one way to get this effect is to boot
with rcutree.jiffies_till_first_fqs=0, and then use sysfs to set it
to 3 once boot has completed.

> > > Also, a comment below.
> > > 
> > > > --- a/kernel/rcutree.h
> > > > +++ b/kernel/rcutree.h
> > > > @@ -342,7 +342,17 @@ struct rcu_data {
> > > >  #define RCU_FORCE_QS   3   /* Need to force quiescent 
> > > > state. */
> > > > >  #define RCU_SIGNAL_INIT  RCU_SAVE_DYNTICK
> > > >  
> > > > -#define RCU_JIFFIES_TILL_FORCE_QS   3  /* for 
> > > > rsp->jiffies_force_qs */
> > > > +#if HZ > 500
> > > > +#define RCU_JIFFIES_TILL_FORCE_QS   3  /* for 
> > > > jiffies_till_first_fqs */
> > > > +#elif HZ > 250
> > > > +#define RCU_JIFFIES_TILL_FORCE_QS   2
> > > > +#else
> > > > +#define RCU_JIFFIES_TILL_FORCE_QS   1
> > > > +#endif
> > > 
> > > This seems like it really wants to use a duration calculated directly
> > > from HZ; perhaps (HZ/100)?
> > 
> > Very possibly to the direct calculation, but HZ/100 would get 10 ticks
> > delay at HZ=1000, which is too high -- the value of 3 ticks for HZ=1000
> > works well.  But I could do something like this:
> > 
> > #define RCU_JIFFIES_TILL_FORCE_QS (((HZ + 199) / 300) + ((HZ + 199) / 300 ? 
> > 0 : 1))
> > 
> > Or maybe a bit better:
> > 
> > #define RCU_JTFQS_SE ((HZ + 199) / 300)
> > #define RCU_JIFFIES_TILL_FORCE_QS (RCU_JTFQS_SE + (RCU_JTFQS_SE ? 0 : 1))
> > 
> > This would come reasonably close to the values shown above.  Would
> > this work for you?
> 
> I'd argue that if you need something that complex, you should just
> explicitly write it as a step function:
> 
> #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))

Yeah, I couldn't resist handling HZ>1000, but that doesn't sound all
that likely.  I will use your suggested approach.

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: bcache/dmcache/enhanceio bake-off

2013-04-13 Thread Mike Snitzer
On Sat, Apr 13 2013 at 12:09pm -0400,
Joe Thornber  wrote:

> Hi Darrick,
> 
> On Thu, Apr 11, 2013 at 12:22:39AM -0700, Darrick J. Wong wrote:
> > Hi all,
> > 
> > Lately I've been having some fun playing with bcache, dmcache, and 
> > enhanceio.
> 
> I pushed some tweaks to the mq policy today to my thin-dev tree.  They
> show some improvements to these fio based tests.
> 
> In addition I've written a blog post trying to explain what's going on in 
> dm-cache:
> http://device-mapper.org/blog/2013/04/13/benchmarking-dm-cache-with-fio/

Darrick,

Joe has a few other dm-cache-target.c changes in his thin-dev branch
that are required in order to realize the gains from his mq changes.  I
haven't yet isolated which changes are important but if I just use the
3.9-rc6's dm-cache-target.c with thin-dev's mq changes I cannot
reproduce the improved performance Joe mentions in his blog post.

Also, even before these changes I wasn't able to reproduce your dm-cache
results (either the spike in performance or the inconsistencies you
saw across runs).

BTW, I have added 'test_fio_database_funtime' to both the cache and
bcache testsuites in my thinp-test-suite repo (master branch):
git://github.com/snitm/thinp-test-suite.git

You'd run it with something like:
./run_tests --profile mix_fio --suite cache -n /test_fio_database_funtime/
or
./run_tests --profile mix_fio --suite bcache -n /test_fio_database_funtime/

I've been testing against the v3.9-rc6 kernel with Jens' for-next bcache
code merged in, see 'thin-dev-bcache' branch of my linux repo:
git://github.com/snitm/linux.git
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Heads up on a device tree change

2013-04-13 Thread Grant Likely
On Thu, 7 Feb 2013 10:32:13 +, James Hogan  wrote:
> On 06/02/13 14:28, Grant Likely wrote:
> > On Wed, Feb 6, 2013 at 1:32 PM, James Hogan  wrote:
> >> On 06/02/13 13:11, Grant Likely wrote:
> >>> - Resources on platform_devices get registered so they appear in
> >>> /proc/iomem and /proc/ioports and so that device drivers get the added
> >>> protection of request_region. This will cause breakage on device trees
> >>> nodes with partially overlapping memory regions. (ie. 0x100..0x1ff and
> >>> 0x180..0x27f). I also have a workaround for this, but I doubt that it
> >>> will be necessary.
> >>
> >> Hi Grant,
> >>
> >> If I understand you correctly, the non-overlapping memory regions thing
> >> could be a problem for me. We have a Meta based SoC that has various SoC
> >> registers grouped together for doing GPIOs and Pin control things. I'm
> >> still in the process of converting it to device tree, but the way I've
> >> been handling it is to provide overlapping registers to both the gpio
> >> and pinctl DT nodes. Each GPIO bank's registers are also interleaved
> >> with the others, so I've been providing overlapping register ranges
> >> (offset by 4 for each bank) to the DT node for each gpio bank too, so
> >> each bank can function independently and the driver doesn't have to
> >> worry about multiple banks. Does that sound like a reasonable use case?
> >>
> >> I guess I could cheat with the length, or specify each register in its
> >> own memory resource, but it seems like overkill.
> > 
> > Note that overlapping regions are fine /provided/ that they are the
> > same size or one fits nicely inside another. It's partial overlap that
> > is a problem
> 
> It still feels a bit artificial to impose that limitation on something
> that is supposed to be implementation independent. Having said that it
> doesn't particularly bother me having to work around it.

I've backed out on this. It broke too much.

g.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] checkpatch: whitelist SUPPORTED_*/ADVERTISED_* defines from ethtool.h

2013-04-13 Thread Joe Perches
On Sat, 2013-04-13 at 16:10 +0200, Jonas Gorski wrote:
> Don't complain about camelcase when using SUPPORTED_*/ADVERTISED_*
> defines, they are part of the user api so can't be (easily) fixed.

CamelCase was downgraded recently to a --strict test.

https://lkml.org/lkml/2013/4/11/273

Still, there's a large quantity of CamelCase names in
include/... that would still exist in any case.

A complete whitelist would be very long.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: helping with tracking commits across repos

2013-04-13 Thread D M German
 D M German twisted the bytes to say:

 dmg> One thing that will help me is that if any of you feel I am not tracking
 dmg> your repository, please send me an email with its address.

 dmg> thank you!

 dmg> --daniel

I have now listed all the repositories I am tracking:

http://o.cs.uvic.ca:20810/perl/repos.pl

if your repo is not in the list, please let me know. I spend a fair
amount of time tracking them down, and that will really help.

Thanks!



--
Daniel M. German  "Mathematics belong to God."
   Donald Knuth
http://turingmachine.org/
http://silvernegative.com/
dmg (at) uvic (dot) ca
replace (at) with @ and (dot) with .

 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:sched/core] sched: Lower chances of cputime scaling overflow

2013-04-13 Thread Linus Torvalds
On Sat, Apr 13, 2013 at 7:49 AM, Stanislaw Gruszka  wrote:
>
> It works fine - gives relative error less than 0.1% for very big
> numbers.

So I was assuming that the values were all roughly in the same range.

When that is not true (say, very big rtime and very big total, very
small stime), it's possible that we end up first normalizing
rtime-vs-stime, and then dropping precision due to big 'total'. And
that might cause excessive precision loss (and I think "0.1%" is
excessive, even if it's probably perfectly fine).

I suspect I should have done the "drop precision due to 'total' being
out of range" step *first*. And then only balance rtime/stime after we've
scaled down total. That might avoid any unnecessary precision loss,
because bringing 'total' into the 32-bit range will continually shrink
the much larger (unbalanced) number, rather than shrink something that
was already balanced.

But I doubt it ever matters in practice.

The *big* loss (well, relatively - the worst case I've seen with your
test program is with 'err' being 0.021%) comes because if we have to
drop precision, and both rtime and stime are big, we have to scale
down 'total' by one bit every time. And we scale down the bigger of
rtime/stime too,  but we basically have twice as many bits to shave
off rtime/stime, since there are two values (even if we pick the
biggest one, eventually we'll start alternating because shaving bits
will make the other one bigger).

So we may end up scaling 'total' down to much less than 32 bits, and
that's how you get the "big" errors in the 0.02% range.

The good news is that
 (a) this requires that rtime/stime really both are big numbers
 (b) this only happens with really really big numbers (i.e. very much
your "10 years of 4096 threads" at 1000 Hz kind of numbers)
 (c) even then the error isn't catastrophic.

So I think my algorithm could be improved a bit (to do the total
scaling *before* doing the scaling of rtime-vs-stime), but I think
it's quite usable.

 Linus

PS. This is the "Make sure 'total' fits in 32 bits first" version. Not
really tested, but it's just changing the order of operations a bit.

/* We know one of the values has a bit set in the high 32 bits */
for (;;) {
/* Make sure "rtime" is the bigger of stime/rtime */
if (stime > rtime) {
u64 tmp = rtime; rtime = stime; stime = tmp;
}

/* Make sure 'total' fits in 32 bits */
if (total >> 32)
goto drop_precision;

/* Does rtime (and thus stime) fit in 32 bits? */
if (!(rtime >> 32))
break;

/* Can we just balance rtime/stime rather than dropping bits? */
if (stime >> 31)
goto drop_precision;

/* We can grow stime and shrink rtime and try to make them both fit */
stime <<= 1;
rtime >>= 1;
continue;

drop_precision:
/* We drop from rtime, it has more bits than stime */
rtime >>= 1;
total >>= 1;
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: sw_perf_event_destroy() oops while fuzzing

2013-04-13 Thread Tommi Rantala
2013/4/12 Tommi Rantala :
> 2013/4/12 Peter Zijlstra :
>> perf_swevent_init() only sets event->destroy() (to
>> sw_perf_event_destroy) _after_ it increments the static key thing and
>> enqueues (and allocates) the hash list stuff.
>>
>> Obviously something is funny, but I'm not seeing it.
>
> Might this help... ? (untested)

I can reproduce the bug on my machine with:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
struct perf_event_attr attr = {
.type = PERF_TYPE_SOFTWARE,
.size = sizeof(struct perf_event_attr),
.config = 0x,
};

syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
return 0;
}

The patch below fixes the oops. I'll send it properly.

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 59412d0..fff6420 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -5330,7 +5330,7 @@ static void sw_perf_event_destroy(struct
> perf_event *event)
>
>  static int perf_swevent_init(struct perf_event *event)
>  {
> -   int event_id = event->attr.config;
> +   u64 event_id = event->attr.config;
>
> if (event->attr.type != PERF_TYPE_SOFTWARE)
> return -ENOENT;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 0/8] RCU callback-numbering simplifications for 3.11

2013-04-13 Thread Josh Triplett
On Fri, Apr 12, 2013 at 04:32:11PM -0700, Paul E. McKenney wrote:
> This series takes advantage of callback numbering to simplify RCU's
> grace-period machinery, in some cases also reducing the number of
> lock acquisitions (though the resulting change in performance is not
> perceptible).  The individual patches are as follows:
> 
> 1.Move code to make way for the code-combining in later patches.
>   This commit makes no changes, just moves code.
> 
> 2.Make __note_new_gpnum() also check for the ends of prior grace
>   periods, thus eliminating the earlier possibility of a given
>   CPU becoming aware of the start of the next grace period before
>   becoming aware of the end of the previous grace period.  Yes,
>   the code did handle this correctly, but now it doesn't need to.
>   More important, now I don't need to think about how it handles
>   this correctly.
> 
> 3.Rename note_new_gpnum() to note_gp_changes() in preparation for
>   later merge of rcu_process_gp_end() into this function.
> 
> 4.Change calls to rcu_process_gp_end() to instead call
>   note_gp_changes(), and also remove the now-unused rcu_process_gp_end().
> 
> 5.Remove duplicate code by merging __rcu_process_gp_end() into
>   __note_gp_changes().
> 
> 6.Eliminate now-redundant call to check_for_new_grace_period().  This
>   leaves only a single caller, so inline check_for_new_grace_period().
> 
> 7.Given that rcu_start_gp_per_cpu() is a trivial wrapper function
>   with only one caller, inline it into its sole remaining call site.
> 
> 8.Eliminate now-redundant call to note_gp_changes().

For all 8:
Reviewed-by: Josh Triplett 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 0/12] TINY_RCU changes for 3.11

2013-04-13 Thread Josh Triplett
On Fri, Apr 12, 2013 at 04:48:10PM -0700, Paul E. McKenney wrote:
> This series removes TINY_PREEMPT_RCU, as promised/threatened at
> http://lwn.net/Articles/541037/ and https://lkml.org/lkml/2012/11/12/545.
> 
> 1.Remove TINY_PREEMPT_RCU.  This is a straight syntactic removal,
>   with no attempt at cleanup.  The remaining patches do the cleanup.
> 
> 2.Inline the now-empty show_tiny_preempt_stats() function.
> 
> 3.Inline the now-empty rcu_preempt_check_callbacks() function.
> 
> 4.Inline the now-empty rcu_preempt_remove_callbacks() function.
> 
> 5.Inline the now-empty rcu_preempt_process_callbacks() function.
> 
> 6.Because TINY_RCU no longer has kthreads, remove the code that
>   used to abstract away kthread vs. softirq invocation.
> 
> 7.Inline the now-empty check_cpu_stall_preempt() function.
> 
> 8.Remove CONFIG_TINY_RCU ifdefs from include/linux/rcutiny.h
> 
> 9.Inline the now-empty rcu_preempt_note_context_switch() function.
> 
> 10.   Move code to allow consolidating ifdefs in kernel/rcutiny_plugin.h.
> 
> 11.   Remove TINY_PREEMPT_RCU's tracing formats from documentation.
> 
> 12.   Shrink TINY_RCU a bit by moving exit_rcu() to TREE_RCU, leaving
>   TINY_RCU with a static inline empty function.

For 2-7 and 9-12:
Reviewed-by: Josh Triplett 

I responded to patch 8 with a note about moving part of it to patch 1;
with that changed,
Reviewed-by: Josh Triplett 
for those two as well.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 08/12] rcu: Remove the CONFIG_TINY_RCU ifdefs in rcutiny.h

2013-04-13 Thread Josh Triplett
On Fri, Apr 12, 2013 at 04:48:27PM -0700, Paul E. McKenney wrote:
> From: "Paul E. McKenney" 
> 
> Now that CONFIG_TINY_PREEMPT_RCU is no more, this commit removes
> the CONFIG_TINY_RCU ifdefs from include/linux/rcutiny.h in favor of
> unconditionally compiling the CONFIG_TINY_RCU legs of those ifdefs.
> 
> Signed-off-by: Paul E. McKenney 

The #else branches of these ifdefs ought to disappear in the first patch
of the series, since they cover the CONFIG_TINY_PREEMPT_RCU case.

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/1] uprobes/perf: Avoid perf_trace_buf_prepare/submit if ->perf_events is empty

2013-04-13 Thread Oleg Nesterov
On 04/13, Oleg Nesterov wrote:
>
> On 04/12, Steven Rostedt wrote:
> >
> > Can you make the necessary changes elsewhere? I talked with Frederic on
> > IRC and he's a bit busy with other work. But he did say he would review
> > changes that you make.
>
> Sure, will be happy to do.

Everything looks trivial except DECLARE_EVENT_CLASS()->perf_trace_*() which
should also check __task != NULL.

In fact this _looks_ simple too, we could move TP_perf_assign() logic into
TP_ARGS(), but probably this is too ugly. I'll try think more.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 6/7] rcu: Drive quiescent-state-forcing delay from HZ

2013-04-13 Thread Josh Triplett
On Fri, Apr 12, 2013 at 11:38:04PM -0700, Paul E. McKenney wrote:
> On Fri, Apr 12, 2013 at 04:54:02PM -0700, Josh Triplett wrote:
> > On Fri, Apr 12, 2013 at 04:19:13PM -0700, Paul E. McKenney wrote:
> > > From: "Paul E. McKenney" 
> > > 
> > > Systems with HZ=100 can have slow bootup times due to the default
> > > three-jiffy delays between quiescent-state forcing attempts.  This
> > > commit therefore auto-tunes the RCU_JIFFIES_TILL_FORCE_QS value based
> > > on the value of HZ.  However, this would break very large systems that
> > > require more time between quiescent-state forcing attempts.  This
> > > commit therefore also ups the default delay by one jiffy for each
> > > 256 CPUs that might be on the system (based off of nr_cpu_ids at
> > > runtime, -not- NR_CPUS at build time).
> > > 
> > > Reported-by: Paul Mackerras 
> > > Signed-off-by: Paul E. McKenney 
> > 
> > Something seems very wrong if RCU regularly hits the fqs code during
> > boot; feels like there's some more straightforward solution we're
> > missing.  What causes these CPUs to fall under RCU's scrutiny during
> > boot yet not actually hit the RCU codepaths naturally?
> 
> The problem is that they are running HZ=100, so that RCU will often
> take 30-60 milliseconds per grace period.  At that point, you only
> need 16-30 grace periods to chew up a full second, so it is not all
> that hard to eat up the additional 8-12 seconds of boot time that
> they were seeing.  IIRC, UP boot was costing them 4 seconds.
> 
> For HZ=1000, this would translate to 800ms to 1.2s, which is nowhere
> near as annoying.

That raises two questions, though.  First, who calls synchronize_rcu()
repeatedly during boot, and could they call call_rcu() instead to avoid
blocking for an RCU grace period?  Second, why does RCU need 3-6 jiffies
to resolve a grace period during boot?  That suggests that RCU doesn't
actually resolve a grace period until the force-quiescent-state
machinery kicks in, meaning that the normal quiescent-state mechanism
didn't work.

> > Also, a comment below.
> > 
> > > --- a/kernel/rcutree.h
> > > +++ b/kernel/rcutree.h
> > > @@ -342,7 +342,17 @@ struct rcu_data {
> > >  #define RCU_FORCE_QS 3   /* Need to force quiescent 
> > > state. */
> > >  #define RCU_SIGNAL_INIT  RCU_SAVE_DYNTICK
> > >  
> > > -#define RCU_JIFFIES_TILL_FORCE_QS 3  /* for 
> > > rsp->jiffies_force_qs */
> > > +#if HZ > 500
> > > +#define RCU_JIFFIES_TILL_FORCE_QS 3  /* for 
> > > jiffies_till_first_fqs */
> > > +#elif HZ > 250
> > > +#define RCU_JIFFIES_TILL_FORCE_QS 2
> > > +#else
> > > +#define RCU_JIFFIES_TILL_FORCE_QS 1
> > > +#endif
> > 
> > This seems like it really wants to use a duration calculated directly
> > from HZ; perhaps (HZ/100)?
> 
> Very possibly to the direct calculation, but HZ/100 would get 10 ticks
> delay at HZ=1000, which is too high -- the value of 3 ticks for HZ=1000
> works well.  But I could do something like this:
> 
> #define RCU_JIFFIES_TILL_FORCE_QS (((HZ + 199) / 300) + ((HZ + 199) / 300 ? 0 
> : 1))
> 
> Or maybe a bit better:
> 
> #define RCU_JTFQS_SE ((HZ + 199) / 300)
> #define RCU_JIFFIES_TILL_FORCE_QS (RCU_JTFQS_SE + (RCU_JTFQS_SE ? 0 : 1))
> 
> This would come reasonably close to the values shown above.  Would
> this work for you?

I'd argue that if you need something that complex, you should just
explicitly write it as a step function:

#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))

- Josh Triplett
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: helping with tracking commits across repos

2013-04-13 Thread D M German
 vinod> 
 vinod> 
 vinod> 
 vinod> On Fri, 2013-04-12 at 13:22 -0700, D M German wrote:
 vinod> > Hi Everybody,
 vinod> > 
 vinod> > I am professor of computer science at the University of Victoria
 vinod> > (Canada).
 vinod> > 
 vinod> > During the last year and a half, we have been trying to track the
 vinod> > commits as they move in the entire linux git repos ecosystem. We have
 vinod> > amassed a good amount of data that tell us for every commit (and in 
fact
 vinod> > for every unique patch inside a commit) where it has been and whether 
it
 vinod> > has reached linus or not ---or any other repository, as a matter of
 vinod> > fact.
 vinod> i see some of the commits shown not in linus tree, although they are...
 vinod> perhaps a bug?
 vinod> 
http://o.cs.uvic.ca:20810/perl/cid.pl?cid=765024697807ad1e1cac332aa891253ca4a339da
 vinod> 
 vinod> It shows the same for linus's merge!
 vinod> 
http://o.cs.uvic.ca:20810/perl/cid.pl?cid=cfb63bafdb87bbcdc5d6dbbca623d3f69475f118
 
Hi Vinod,

the tracking of the path-to-linus is something that is not done
automatically yet (I have to start the process manually, as there are
some issues I need to verify--it is a heuristic), but I plan to run it
automatically.

Nonetheless, it might be run once a day, so the commits of the day will
always be slightly behind.

One thing that will help me is that if any of you feel I am not tracking
your repository, please send me an email with its address.

thank you!

--daniel


--
Daniel M. German  "Don't try to be like Jackie.
   There is only one Jackie...
   Jackie Chan ->  Study computers instead"
http://turingmachine.org/
http://silvernegative.com/
dmg (at) uvic (dot) ca
replace (at) with @ and (dot) with .

 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Linus Torvalds
On Sat, Apr 13, 2013 at 8:41 AM, Anatol Pomozov
 wrote:
>
> Does it make sense to move it to a separate function in kref.h?
>
> /** Useful when kref_get is racing with kref_put and refcounter might be 0 */
> int kref_get_not_zero(kref* ref) {
> return atomic_inc_not_zero(&ref->refcount);
> }

It turns out we have that, except it's called "unless_zero", because
it uses "atomic_add_unless(x,1,0)", rather than the simplified
"atomic_inc_not_zero(x)".

> or maybe instead change default behavior of kref_get() to
> atomic_inc_not_zero and force callers check the return value from
> kref_get()?

That would be painful, and _most_ users should have a preexisting
refcount. So it's probably better in the long run to just keep the
warning (but perhaps fix it to be SMP-safe). So I think the part of
your patch that made kref_get() use atomic_inc_return() is probably a
good idea regardless.

Also, I changed my patch to be minimal, and not change other users of
kobject_get(). So other users (not kset_find_obj()) will continue to
get the warning, and kset_find_obj() uses the safe version. So this is
what I'm planning on committing as the minimal patch and marking for
stable. The rest (including that atomic_inc_return() in kref_get)
would be cleanup.

Can you give this a quick test?

   Linus


patch.diff
Description: Binary data


Re: BUG: Fn keys not working on EliteBook 8460p after fabf85e3ca15d5b94058f391dac8df870cdd427a

2013-04-13 Thread Matthew Garrett
On Sat, 2013-04-13 at 13:39 -0400, Kyle Evans wrote:
> On 04/13/2013 12:21 PM, Matthew Garrett wrote:
> > On Sat, 2013-04-13 at 08:36 -0400, Kyle Evans wrote:
> >> Sure, sorry about that. I was hoping the GUID would be enough. I'll see
> >> what I can come up with.
> > Sure there's no WMI method that makes the EC write? It's a little weird
> > for WMI drivers to have to hit the EC directly.
> >
> I have no idea, I didn't know what a DSDT was before trying to get these 
> buttons working.

Got a copy of your ACPI tables?

-- 
Matthew Garrett | mj...@srcf.ucam.org


Re: BUG: Fn keys not working on EliteBook 8460p after fabf85e3ca15d5b94058f391dac8df870cdd427a

2013-04-13 Thread Kyle Evans

On 04/13/2013 12:21 PM, Matthew Garrett wrote:

On Sat, 2013-04-13 at 08:36 -0400, Kyle Evans wrote:

Sure, sorry about that. I was hoping the GUID would be enough. I'll see
what I can come up with.

Sure there's no WMI method that makes the EC write? It's a little weird
for WMI drivers to have to hit the EC directly.

I have no idea, I didn't know what a DSDT was before trying to get these 
buttons working.


...A quick grep reveals acpi_wmi_ec_space_handler, is that what I should 
use? It calls ec_write itself, but has more function parameters and of 
course error checking to make sure you don't screw up those extra 
parameters. Seems inefficient to me. Or, maybe like it was designed for 
an automated code routine.


Looking further, I don't see any other drivers that use it, ec_write 
seems to be the standard.


Your call though, you are the master in this domain and you wrote the 
driver.




/*
 * acpi_wmi_ec_space_handler - byte-wide EC address-space handler
 * @function:        ACPI_READ or ACPI_WRITE; anything else is rejected
 * @address:         EC offset, must not exceed 0xFF
 * @bits:            access width, only 8 is accepted
 * @value:           reads OR the byte obtained from ec_read() into *value;
 *                   writes pass the low byte of *value to ec_write()
 * @handler_context: not used by this handler
 * @region_context:  not used by this handler
 *
 * Validates the request, forwards it to ec_read()/ec_write() and
 * translates the errno-style result into an acpi_status.
 *
 * Return: AE_BAD_PARAMETER for a rejected request or an -EINVAL result,
 * AE_NOT_FOUND for -ENODEV, AE_TIME for -ETIME, AE_OK for anything else.
 */
static acpi_status
acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address,
			  u32 bits, u64 *value,
			  void *handler_context, void *region_context)
{
	int result;
	u8 temp = 0;

	if ((address > 0xFF) || !value)
		return AE_BAD_PARAMETER;

	if (function != ACPI_READ && function != ACPI_WRITE)
		return AE_BAD_PARAMETER;

	if (bits != 8)
		return AE_BAD_PARAMETER;

	if (function == ACPI_READ) {
		/* ec_read() stores the byte through its second argument. */
		result = ec_read(address, &temp);
		/* OR-in (not assign) to keep the existing accumulate semantics. */
		*value |= (u64)temp;
	} else {
		temp = 0xff & *value;
		result = ec_write(address, temp);
	}

	switch (result) {
	case -EINVAL:
		return AE_BAD_PARAMETER;
	case -ENODEV:
		return AE_NOT_FOUND;
	case -ETIME:
		return AE_TIME;
	default:
		return AE_OK;
	}
}

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V4 1/6] clk: OMAP: introduce device tree binding to kernel clock data

2013-04-13 Thread Tony Lindgren
* Nishanth Menon  [130412 16:43]:
> Thanks for checking up. Fixed all of them below, will post part of
> series again, only if I need to address further comments in other
> patches..

Thanks it seems that the other ones are ready to go, just one
more comment below.

> --- /dev/null
> +++ b/drivers/clk/omap/clk.c
> @@ -0,0 +1,95 @@
> +/*
> + * Texas Instruments OMAP Clock driver
> + *
> + * Copyright (C) 2013 Texas Instruments Incorporated - http://www.ti.com/
> + *   Nishanth Menon 
> + *   Tony Lindgren 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed "as is" WITHOUT ANY WARRANTY of any
> + * kind, whether express or implied; without even the implied warranty
> + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static const struct of_device_id omap_clk_of_match[] = {
> + {.compatible = "ti,omap-clock",},
> + {},
> +};
> +
> +/**
> + * omap_clk_src_get() - Get OMAP clock from node name when needed
> + * @clkspec: clkspec argument
> + * @data:unused
> + *
> + * REVISIT: We assume the following:
> + * 1. omap clock names end with _ck
> + * 2. omap clock names are under 32 characters in length
> + */
> +static struct clk *omap_clk_src_get(struct of_phandle_args *clkspec, void 
> *data)
> +{
> + struct clk *clk;
> + char clk_name[32];
> + struct device_node *np = clkspec->np;
> +
> + snprintf(clk_name, 32, "%s_ck", np->name);
> + clk = clk_get(NULL, clk_name);
> + if (IS_ERR(clk)) {
> + pr_err("%s: could not get clock %s(%ld)\n", __func__,
> +clk_name, PTR_ERR(clk));
> + goto out;
> + }
> + clk_put(clk);

It seems that clk_put() is actually wrong here. That's because
of_clk_get() should boil down to just the lookup of the clock
and then clk_get() on it, so no double clk_get() is done in this
case. Once the consumer driver is done, it will just call clk_put()
on it.

> +out:
> + return clk;
> +}
> +
> +/**
> + * omap_clk_probe() - create link from DT definition to clock data
> + * @pdev:device node
> + *
> + * NOTE: we look up the clock lazily when the consumer driver does
> + * of_clk_get() and initialize a NULL clock here.
> + */
> +static int omap_clk_probe(struct platform_device *pdev)
> +{
> + int res;
> + struct device_node *np = pdev->dev.of_node;
> +
> + /* This allows the driver to of_clk_get() */
> + res = of_clk_add_provider(np, omap_clk_src_get, NULL);
> + if (res)
> + dev_err(&pdev->dev, "could not add provider(%d)\n", res);
> +
> + return res;
> +}
> +
> +static struct platform_driver omap_clk_driver = {
> + .probe = omap_clk_probe,
> + .driver = {
> +.name = "omap_clk",
> +.of_match_table = of_match_ptr(omap_clk_of_match),
> +},
> +};
> +
> +static int __init omap_clk_init(void)
> +{
> + return platform_driver_register(&omap_clk_driver);
> +}
> +arch_initcall(omap_clk_init);
> +
> +MODULE_DESCRIPTION("OMAP Clock driver");
> +MODULE_AUTHOR("Texas Instruments Inc.");
> +MODULE_LICENSE("GPL v2");

Other than that looks OK to me.

Tony
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Summary of security bugs (now fixed) in user namespaces

2013-04-13 Thread Andy Lutomirski
I previously reported these bugs privately.  I'm summarizing them for
the historical record.  These bugs were never exploitable on a
default-configured released kernel, but some 3.8 versions are
vulnerable depending on configuration.

=== Bug 1: chroot bypass ===

It was possible for a chrooted program to create a new user namespace
and a new mount namespace.  It could keep an fd to the old root, which
is outside the new root, and therefore use it to escape, like this:

--- begin ---
/* break_chroot.c by */
/* Copyright (c) 2013 Andrew Lutomirski.  All rights reserved. */

#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

/* Fallback for C library headers that do not yet define the user-namespace
 * clone flag; the value is the one used by the kernel's <linux/sched.h>. */
#ifndef CLONE_NEWUSER
#define CLONE_NEWUSER 0x10000000
#endif

/*
 * Print the current working directory on its own line, but only when the
 * length of the path differs from the length remembered from the previous
 * successful call.  If getcwd() fails, a warning is emitted and the
 * remembered length is left untouched.
 */
static void printcwd(void)
{
  static int prev_len = -1;
  char path[8192];
  size_t len;

  if (!getcwd(path, sizeof(path))) {
    warn("getcwd");
    return;
  }

  len = strlen(path);
  if (len != prev_len)
    printf("%s\n", path);
  prev_len = len;
}

/*
 * Child entry point handed to clone().
 *
 * Steps, in order:
 *   1. open "/" and keep the descriptor;
 *   2. unshare into a new user namespace, then a new mount namespace;
 *   3. fchdir() back to the saved descriptor and close it;
 *   4. walk upwards with chdir("..") at most 100 times, printing the
 *      working directory via printcwd() before each step;
 *   5. open "." with O_PATH and make sure that descriptor ends up as fd 3.
 *
 * Any failing step other than chdir("..") terminates the child through
 * err(1, ...).  On success the child leaves with _exit(0).
 *
 * @unused: ignored.
 */
int fn(void *unused)
{
  int i;
  int fd;

  (void)unused;

  fd = open("/", O_RDONLY | O_DIRECTORY);
  if (fd == -1)
    err(1, "open(\"/\")");  /* message now names the path actually opened */
  if (unshare(CLONE_NEWUSER) != 0)
    err(1, "unshare(CLONE_NEWUSER)");
  if (unshare(CLONE_NEWNS) != 0)
    err(1, "unshare(CLONE_NEWNS)");
  if (fchdir(fd) != 0)
    err(1, "fchdir");
  close(fd);

  for (i = 0; i < 100; i++) {
    printcwd();
    if (chdir("..") != 0) {
      warn("chdir");
      break;
    }
  }

  fd = open(".", O_PATH | O_DIRECTORY);
  if (fd == -1)
    err(1, "open(\".\")");

  /* Park the directory handle on descriptor 3. */
  if (fd != 3) {
    if (dup2(fd, 3) == -1)
      err(1, "dup2");
    close(fd);
  }
  _exit(0);
}

/*
 * Usage: break_chroot COMMAND ARGS...
 *
 * Runs fn() in a clone()d child that shares this process's descriptor
 * table (CLONE_FILES), waits for it, then fchdir()s to descriptor 3 and
 * exec()s COMMAND with the remaining arguments.
 *
 * Returns 1 when no command is given; every other failure exits through
 * err()/errx() with status 1.  On success execv() does not return.
 */
int main(int argc, char **argv)
{
  /* Dedicated child stack; clone() takes the topmost address because the
   * stack grows downwards. */
  static char child_stack[64 * 1024] __attribute__((aligned(16)));
  int status;

  if (argc < 2) {
    printf("usage: break_chroot COMMAND ARGS...\n\n"
           "You won't be entirely out of jail.  / is still the jail root.\n");
    return 1;
  }

  /* fn() places its directory handle on descriptor 3, so free it first. */
  close(3);

  /* signal() returns the previous handler; only SIG_ERR means failure. */
  if (signal(SIGCHLD, SIG_DFL) == SIG_ERR)
    err(1, "signal");

  if (clone(fn, child_stack + sizeof(child_stack),
            CLONE_FILES | SIGCHLD, 0) == -1)
    err(1, "clone");

  if (wait(&status) == -1)
    err(1, "wait");
  if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
    errx(1, "child failed");
  if (fchdir(3) != 0)
    err(1, "fchdir");
  close(3);

  execv(argv[1], argv+1);
  /* Never pass a caller-supplied string as the format itself. */
  err(1, "%s", argv[1]);

  return 0;
}
--- end ---

$ ls /
bin   dev  home  lib64   media  opt   root  sbin  sys  usr
boot  etc  lib   lost+found  mnt    proc  run   srv   tmp  var
$ /path/to/break_chroot /bin/sh
(unreachable)/hostfs
(unreachable)/
sh-4.2$ pwd
(unreachable)/
sh-4.2$ ls
bin  dev  etc  hostfs  init  lib  lib64  proc  root  run  sbin  sys  usr  var

=== Bug 2: read-only bind mount bypass ===

This one was straightforward: create a new userns and mount namespace,
then remount a previously read-only bind mount as read-write.  It
worked.

=== Bug 3: SCM_CREDENTIALS pid spoofing ===

This one was also straightforward: create a new userns and then spoof
the pid.  The capability check was on the wrong namespace.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: drm: i915+fb: crtc->lock recursive locking deadlock on VT switch [>= 3.9-rc1 regresion]

2013-04-13 Thread Chris Wilson
On Sat, Apr 13, 2013 at 05:41:46PM +0200, Krzysztof Mazur wrote:
> Hi,
> 
> the drm_fb_helper_hotplug_event() locks all crtc->mutex locks by calling
> drm_modeset_lock_all() and later calls drm_fb_helper_probe_connector_modes(),
> which in case of i915 DRM driver effectively calls
> intel_get_load_detect_pipe() that tries to lock crtc->mutex again.
> This causes a deadlock, and can be in some cases triggered by VT
> switch to framebuffer console on i915.
> 
> This bug is introduced in Linux 3.9-rc1 and still exists
> in v3.9-rc6-183-gbf81710. Linux 3.8 is ok.

In Dave's drm-fixes branch:

commit 89ced125472b8551c65526934b7f6c733a6864fa
Author: Daniel Vetter 
Date:   Thu Apr 11 14:26:55 2013 +

drm/fb-helper: Fix locking in drm_fb_helper_hotplug_event

Driver's and ->fill_modes functions are allowed to grab crtc mutexes
(for e.g. load detect). Hence we need to first only grab the general
kms mutex, and only in a second step grab all locks to do the
modesets.

This prevents a deadlock on my gm45 in the tv load detect code called
by drm_helper_probe_single_connector_modes.

Signed-off-by: Daniel Vetter 
Signed-off-by: Dave Airlie 

-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Bulk CPU Hotplug (Was Re: [PATCH] Do not force shutdown/reboot to boot cpu.)

2013-04-13 Thread Srivatsa S. Bhat
On 04/12/2013 03:01 PM, Robin Holt wrote:
>  kernel/sys.c | 17 +++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 0da73cf..4d1047d 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -357,6 +357,19 @@ int unregister_reboot_notifier(struct notifier_block *nb)
>  }
>  EXPORT_SYMBOL(unregister_reboot_notifier);
> 
> +void migrate_to_boot_cpu(void)
> +{
> + /* The boot cpu is always logical cpu 0 */
> + int reboot_cpu_id = 0;
> +
> + /* Make certain the cpu I'm about to reboot on is online */
> + if (!cpu_online(reboot_cpu_id))
> + reboot_cpu_id = smp_processor_id();
> +

If CPU 0 is offline, there is no point in binding, right?

[Fenghua (in CC) added the support to offline CPU0 on x86 Intel platforms.
So it's possible that CPU0 is offline when you try a reboot.]

> + /* Make certain I only run on the appropriate processor */
> + set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
> +}
> +
>  /**
>   *   kernel_restart - reboot the system
>   *   @cmd: pointer to buffer containing command to execute for restart
> @@ -368,7 +381,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
>  void kernel_restart(char *cmd)
>  {
>   kernel_restart_prepare(cmd);
> - disable_nonboot_cpus();
> + migrate_to_boot_cpu();
>   syscore_shutdown();
>   if (!cmd)
>   printk(KERN_EMERG "Restarting system.\n");
> @@ -414,7 +427,7 @@ void kernel_power_off(void)
>   kernel_shutdown_prepare(SYSTEM_POWER_OFF);
>   if (pm_power_off_prepare)
>   pm_power_off_prepare();
> - disable_nonboot_cpus();
> + migrate_to_boot_cpu();

Okay, so you are touching poweroff also. Restart was only recently altered
by Shawn, so we can assume that his fix was necessary only to his platform.
However, for poweroff, I see the commit below in the git log, which added
the disable_nonboot_cpus() call.

commit 4047727e5ae33f9b8d2b7766d1994ea6e5ec2991
Author: Mark Lord 
Date:   Mon Oct 1 01:20:10 2007 -0700

Fix SMP poweroff hangs

It's an old commit, so perhaps the issue no longer holds good, but I thought
I should bring this to notice, just in case.

>   syscore_shutdown();
>   printk(KERN_EMERG "Power down.\n");
>   kmsg_dump(KMSG_DUMP_POWEROFF);
> 

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: helping with tracking commits across repos

2013-04-13 Thread Vinod Koul
On Fri, 2013-04-12 at 13:22 -0700, D M German wrote:
> Hi Everybody,
> 
> I am professor of computer science at the University of Victoria
> (Canada).
> 
> During the last year and a half, we have been trying to track the
> commits as they move in the entire linux git repos ecosystem. We have
> amassed a good amount of data that tell us for every commit (and in fact
> for every unique patch inside a commit) where it has been and whether it
> has reached linus or not ---or any other repository, as a matter of
> fact.
i see some of the commits shown not in linus tree, although they are...
perhaps a bug?
http://o.cs.uvic.ca:20810/perl/cid.pl?cid=765024697807ad1e1cac332aa891253ca4a339da

It shows the same for linus's merge!
http://o.cs.uvic.ca:20810/perl/cid.pl?cid=cfb63bafdb87bbcdc5d6dbbca623d3f69475f118

-- 
Vinod Koul
Intel Corp.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Bulk CPU Hotplug (Was Re: [PATCH] Do not force shutdown/reboot to boot cpu.)

2013-04-13 Thread Oleg Nesterov
On 04/12, Robin Holt wrote:
>
> +void migrate_to_boot_cpu(void)
> +{
> + /* The boot cpu is always logical cpu 0 */
> + int reboot_cpu_id = 0;
> +
> + /* Make certain the cpu I'm about to reboot on is online */
> + if (!cpu_online(reboot_cpu_id))
> + reboot_cpu_id = smp_processor_id();
> +
> + /* Make certain I only run on the appropriate processor */
> + set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));

This is only theoretical, but perhaps it makes sense to set
PF_THREAD_BOUND before set_cpus_allowed_ptr() ? To prevent the
race with another thread doing sched_setaffinity().

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: BUG: Fn keys not working on EliteBook 8460p after fabf85e3ca15d5b94058f391dac8df870cdd427a

2013-04-13 Thread Matthew Garrett
On Sat, 2013-04-13 at 08:36 -0400, Kyle Evans wrote:
> Sure, sorry about that. I was hoping the GUID would be enough. I'll see 
> what I can come up with.

Sure there's no WMI method that makes the EC write? It's a little weird
for WMI drivers to have to hit the EC directly.

-- 
Matthew Garrett | mj...@srcf.ucam.org


Re: [PATCH v1 6/9] uretprobes: Return probe exit, invoke handlers

2013-04-13 Thread Oleg Nesterov
On 04/13, Srikar Dronamraju wrote:
>
> > > Oh yes, this should be documented more explicitly in the changelog of
> > > this patch or 7/9 (which tries to document the limitations but should
> > > be more clear).
> > >
> > > Currently we do not support longjmp() and we assume that the probed
> > > function should do the regular return. We should certainly try to improve
> > > this, but I really think that this should go into the next series.
> > >
> > > Because this is nontrivial, needs more discussion, and I'm afraid should
> > > be per-arch. Even on x86 (which can check the stack) this is not simple,
> > > in general we can't know how to check that (to simplify) the first frame
> > > is already invalid. Just for example, we could check regs->sp and detect
> > > that longjmp() was called but sigaltstack() can easily fool this logic.
> > >
>
> Yes, its perfectly fine to keep this logic for the next patchset.

OK, great.

> Can you tell me why sigaltstack() can fool us if we rely on regs->sp.

Because we can't simply compare regs->sp and ret_instance->sp and decide
if we should ignore this ri or not, the task can hit retprobe, then take
a signal, switch to altstack and hit another rp. I'll write another email
(hopefully patches) later.

> Acked-by: Srikar Dronamraju 

Thanks Srikar.

OK. Everything is acked, I'll send git-pull-request.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [dm-devel] bcache/dmcache/enhanceio bake-off

2013-04-13 Thread Joe Thornber
Hi Darrick,

On Thu, Apr 11, 2013 at 12:22:39AM -0700, Darrick J. Wong wrote:
> Hi all,
> 
> Lately I've been having some fun playing with bcache, dmcache, and enhanceio.

I pushed some tweaks to the mq policy today to my thin-dev tree.  They
show some improvements to these fio based tests.

In addition I've written a blog post trying to explain what's going on in 
dm-cache:
http://device-mapper.org/blog/2013/04/13/benchmarking-dm-cache-with-fio/

- Joe
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/1] (Was: kernel: move exit_task_work() past exit_notify())

2013-04-13 Thread Oleg Nesterov
On 04/13, Oleg Nesterov wrote:
>
> > --- a/kernel/exit.c
> > +++ b/kernel/exit.c
> > @@ -795,7 +795,6 @@ void do_exit(long code)
> > exit_shm(tsk);
> > exit_files(tsk);
> > exit_fs(tsk);
> > -   exit_task_work(tsk);
> > check_stack_usage();
> > exit_thread();
> >
> > @@ -822,6 +821,7 @@ void do_exit(long code)
> > ptrace_put_breakpoints(tsk);
> >
> > exit_notify(tsk, group_dead);
> > +   exit_task_work(tsk);
>
> I am not comfortable with this change...
>
> The task is "really dead" after exit_notify(), even release_task(current)
> can be called.
>
> Let me think a bit... It seems that we have the alternative.

Andrey, Eric, how about this patch?

COMPLETELY UNTESTED and I need to recheck, but perhaps you can review?

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/1] move exit_task_namespaces() outside of exit_notify()

2013-04-13 Thread Oleg Nesterov
exit_notify() does exit_task_namespaces() after
forget_original_parent(). This was needed to ensure that ->nsproxy
can't be cleared prematurely, an exiting child we are going to
reparent can do do_notify_parent() and use the parent's (ours) pid_ns.

However, after 32084504 "pidns: use task_active_pid_ns in
do_notify_parent" ->nsproxy != NULL is no longer needed, we rely
on task_active_pid_ns().

Move exit_task_namespaces() from exit_notify() to do_exit(), after
exit_fs() and before exit_task_work().

This solves the problem reported by Andrey, free_ipc_ns()->shm_destroy()
does fput() which needs task_work_add(). And this allows us to simplify
exit_notify(), we can avoid unlock/lock(tasklist) and we can change
->exit_state instead of PF_EXITING in forget_original_parent().

Reported-by: Andrey Vagin 
Signed-off-by: Oleg Nesterov 

--- x/kernel/exit.c
+++ x/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_stru
 *  jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 */
forget_original_parent(tsk);
-   exit_task_namespaces(tsk);
 
write_lock_irq(&tasklist_lock);
if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
+   exit_task_namespaces(tsk);
exit_task_work(tsk);
check_stack_usage();
exit_thread();

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 18/19] mm/alpha: unify mem_init() for both UMA and NUMA architectures

2013-04-13 Thread Jiang Liu
Now mem_init() for both Alpha UMA and Alpha NUMA are the same,
so unify it to reduce duplicated code.

Signed-off-by: Jiang Liu 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
Cc: linux-al...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/alpha/mm/init.c |7 ++-
 arch/alpha/mm/numa.c |   10 --
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 04c933c..39de408 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -276,17 +276,14 @@ srm_paging_stop (void)
 }
 #endif
 
-#ifndef CONFIG_DISCONTIGMEM
 void __init
 mem_init(void)
 {
-   max_mapnr = max_low_pfn;
-   free_all_bootmem();
+   set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
+   free_all_bootmem();
mem_init_print_info(NULL);
 }
-#endif /* CONFIG_DISCONTIGMEM */
 
 void
 free_initmem(void)
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index 0894b3a8..d543d71 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -319,13 +319,3 @@ void __init paging_init(void)
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
 }
-
-void __init mem_init(void)
-{
-   high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
-   free_all_bootmem();
-   mem_init_print_info(NULL);
-#if 0
-   mem_stress();
-#endif
-}
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 19/19] mm: call register_page_bootmem_info_node() from mm core

2013-04-13 Thread Jiang Liu
Function register_page_bootmem_info_node() is suitably defined for
both HOTPLUG and non-HOTPLUG configurations, so we could call it
from mm core instead of arch specific code. This could simplify
arch implementations.

Signed-off-by: Jiang Liu 
Cc: "David S. Miller" 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Yasuaki Ishimatsu 
Cc: Michal Hocko 
Cc: Yinghai Lu 
Cc: Wen Congyang 
Cc: Johannes Weiner 
Cc: Tejun Heo 
Cc: sparcli...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux...@kvack.org
---
 arch/sparc/mm/init_64.c |   12 
 arch/x86/mm/init_64.c   |   12 
 mm/bootmem.c|6 ++
 mm/nobootmem.c  |6 ++
 4 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index b1e35b7..5530c09 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2027,16 +2027,6 @@ static void __init patch_tlb_miss_handler_bitmap(void)
flushi(&valid_addr_bitmap_insn[0]);
 }
 
-static void __init register_page_bootmem_info(void)
-{
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-   int i;
-
-   for_each_online_node(i)
-   if (NODE_DATA(i)->node_spanned_pages)
-   register_page_bootmem_info_node(NODE_DATA(i));
-#endif
-}
 void __init mem_init(void)
 {
unsigned long addr, last;
@@ -2052,8 +2042,6 @@ void __init mem_init(void)
patch_tlb_miss_handler_bitmap();
 
high_memory = __va(last_valid_pfn << PAGE_SHIFT);
-
-   register_page_bootmem_info();
free_all_bootmem();
 
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 650264b..72b5141 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1031,24 +1031,12 @@ int __ref arch_remove_memory(u64 start, u64 size)
 
 static struct kcore_list kcore_vsyscall;
 
-static void __init register_page_bootmem_info(void)
-{
-#ifdef CONFIG_NUMA
-   int i;
-
-   for_each_online_node(i)
-   register_page_bootmem_info_node(NODE_DATA(i));
-#endif
-}
-
 void __init mem_init(void)
 {
pci_iommu_alloc();
 
/* clear_bss() already clear the empty_zero_page */
 
-   register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index fab8f63..3cf36ac 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -273,6 +273,12 @@ unsigned long __init free_all_bootmem(void)
 {
unsigned long total_pages = 0;
bootmem_data_t *bdata;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+   pg_data_t *pgdat;
+
+   for_each_online_pgdat(pgdat)
+   register_page_bootmem_info_node(pgdat);
+#endif
 
reset_all_zones_managed_pages();
 
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6b63cd6..ccc6630 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -166,6 +166,12 @@ void __init reset_all_zones_managed_pages(void)
 unsigned long __init free_all_bootmem(void)
 {
unsigned long pages;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+   pg_data_t *pgdat;
+
+   for_each_online_pgdat(pgdat)
+   register_page_bootmem_info_node(pgdat);
+#endif
 
reset_all_zones_managed_pages();
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 17/19] mm/m68k: fix build warning of unused variable

2013-04-13 Thread Jiang Liu
Fix build warning of unused variable:
arch/m68k/mm/init.c: In function 'mem_init':
arch/m68k/mm/init.c:151:6: warning: unused variable 'i' [-Wunused-variable]

Signed-off-by: Jiang Liu 
Cc: Geert Uytterhoeven 
Cc: Greg Ungerer 
Cc: Thadeu Lima de Souza Cascardo 
Cc: linux-m...@lists.linux-m68k.org
Cc: linux-kernel@vger.kernel.org
---
 arch/m68k/mm/init.c |   13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 20d0ae2..e4cb0af 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -146,14 +146,11 @@ void __init print_memmap(void)
MLK_ROUNDUP(__bss_start, __bss_stop));
 }
 
-void __init mem_init(void)
+static inline void init_pointer_tables(void)
 {
+#if defined(CONFIG_MMU) && !defined(CONFIG_SUN3) && !defined(CONFIG_COLDFIRE)
int i;
 
-   /* this will put all memory onto the freelists */
-   free_all_bootmem();
-
-#if defined(CONFIG_MMU) && !defined(CONFIG_SUN3) && !defined(CONFIG_COLDFIRE)
/* insert pointer tables allocated so far into the tablelist */
init_pointer_table((unsigned long)kernel_pg_dir);
for (i = 0; i < PTRS_PER_PGD; i++) {
@@ -165,7 +162,13 @@ void __init mem_init(void)
if (zero_pgtable)
init_pointer_table((unsigned long)zero_pgtable);
 #endif
+}
 
+void __init mem_init(void)
+{
+   /* this will put all memory onto the freelists */
+   free_all_bootmem();
+   init_pointer_tables();
mem_init_print_info(NULL);
print_memmap();
 }
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 16/19] mm/unicore32: fix stale comment about VALID_PAGE()

2013-04-13 Thread Jiang Liu
VALID_PAGE() was removed from the kernel a long time ago,
so fix the comment.

Signed-off-by: Jiang Liu 
Cc: Guan Xuetao 
Cc: linux-kernel@vger.kernel.org
---
 arch/unicore32/include/asm/memory.h |6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/unicore32/include/asm/memory.h 
b/arch/unicore32/include/asm/memory.h
index 5eddb99..debafc4 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -98,12 +98,6 @@
 /*
  * Conversion between a struct page and a physical address.
  *
- * Note: when converting an unknown physical address to a
- * struct page, the resulting pointer must be validated
- * using VALID_PAGE().  It must return an invalid struct page
- * for any physical address not corresponding to a system
- * RAM address.
- *
  *  page_to_pfn(page)  convert a struct page * to a PFN number
  *  pfn_to_page(pfn)   convert a _valid_ PFN number to struct page *
  *
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 01/19] mm: introduce accessor function set_max_mapnr()

2013-04-13 Thread Jiang Liu
Introduce accessor function set_max_mapnr() to set global variable
max_mapnr.

Also unify the conditional compilation of max_mapnr to use
CONFIG_NEED_MULTIPLE_NODES instead of CONFIG_DISCONTIGMEM.

Signed-off-by: Jiang Liu 
---
 include/linux/mm.h |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f9f9f3c..497ebaf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -25,8 +25,15 @@ struct file_ra_state;
 struct user_struct;
 struct writeback_control;
 
-#ifndef CONFIG_DISCONTIGMEM  /* Don't use mapnrs, do it properly */
+#ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
+
+static inline void set_max_mapnr(unsigned long limit)
+{
+   max_mapnr = limit;
+}
+#else
+static inline void set_max_mapnr(unsigned long limit) { }
 #endif
 
 extern unsigned long totalram_pages;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 15/19] mm/ARM: fix stale comment about VALID_PAGE()

2013-04-13 Thread Jiang Liu
VALID_PAGE() was removed from the kernel a long time ago,
so fix the comment.

Signed-off-by: Jiang Liu 
Cc: Russell King 
Cc: Will Deacon 
Cc: Nicolas Pitre 
Cc: Stephen Boyd 
Cc: Giancarlo Asnaghi 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/include/asm/memory.h |6 --
 1 file changed, 6 deletions(-)

diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index 57870ab..0cd2a3d 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -260,12 +260,6 @@ static inline __deprecated void *bus_to_virt(unsigned long 
x)
 /*
  * Conversion between a struct page and a physical address.
  *
- * Note: when converting an unknown physical address to a
- * struct page, the resulting pointer must be validated
- * using VALID_PAGE().  It must return an invalid struct page
- * for any physical address not corresponding to a system
- * RAM address.
- *
  *  page_to_pfn(page)  convert a struct page * to a PFN number
  *  pfn_to_page(pfn)   convert a _valid_ PFN number to struct page *
  *
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 13/19] mm/CRIS: clean up unused VALID_PAGE()

2013-04-13 Thread Jiang Liu
VALID_PAGE() was removed from the kernel a long time ago, so clean it up.

Signed-off-by: Jiang Liu 
Cc: Mikael Starvik 
Cc: Jesper Nilsson 
Cc: Jiang Liu 
Cc: linux-cris-ker...@axis.com
Cc: linux-kernel@vger.kernel.org
---
 arch/cris/include/asm/page.h |1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/cris/include/asm/page.h b/arch/cris/include/asm/page.h
index be45ee3..dfc53f9 100644
--- a/arch/cris/include/asm/page.h
+++ b/arch/cris/include/asm/page.h
@@ -51,7 +51,6 @@ typedef struct page *pgtable_t;
  */ 
 
 #define virt_to_page(kaddr)(mem_map + (((unsigned long)(kaddr) - 
PAGE_OFFSET) >> PAGE_SHIFT))
-#define VALID_PAGE(page)   (((page) - mem_map) < max_mapnr)
 #define virt_addr_valid(kaddr) pfn_valid((unsigned)(kaddr) >> PAGE_SHIFT)
 
 /* convert a page (based on mem_map and forward) to a physical address
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 14/19] mm/microblaze: clean up unused VALID_PAGE()

2013-04-13 Thread Jiang Liu
VALID_PAGE() was removed from the kernel a long time ago, so clean it up.

Signed-off-by: Jiang Liu 
Cc: Michal Simek 
Cc: microblaze-ucli...@itee.uq.edu.au
Cc: linux-kernel@vger.kernel.org
---
 arch/microblaze/include/asm/page.h |1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/microblaze/include/asm/page.h 
b/arch/microblaze/include/asm/page.h
index 85a5ae8..fd85087 100644
--- a/arch/microblaze/include/asm/page.h
+++ b/arch/microblaze/include/asm/page.h
@@ -168,7 +168,6 @@ extern int page_is_ram(unsigned long pfn);
 #  else /* CONFIG_MMU */
 #  define ARCH_PFN_OFFSET  (memory_start >> PAGE_SHIFT)
 #  define pfn_valid(pfn)   ((pfn) < (max_mapnr + ARCH_PFN_OFFSET))
-#  define VALID_PAGE(page) ((page - mem_map) < max_mapnr)
 #  endif /* CONFIG_MMU */
 
 # endif /* __ASSEMBLY__ */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 11/19] mm: kill free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Now nobody makes use of free_all_bootmem_node(), kill it.

Signed-off-by: Jiang Liu 
Cc: Andrew Morton 
Cc: Johannes Weiner 
Cc: "David S. Miller" 
Cc: Yinghai Lu 
Cc: Tejun Heo 
Cc: linux-kernel@vger.kernel.org
Cc: linux...@kvack.org
---
 include/linux/bootmem.h |1 -
 mm/bootmem.c|   18 --
 2 files changed, 19 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index b585f57..a8866c5 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -45,7 +45,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
 
 extern unsigned long free_low_memory_core_early(int nodeid);
-extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
 extern void reset_all_zones_managed_pages(void);
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a19404b..fab8f63 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -265,24 +265,6 @@ void __init reset_all_zones_managed_pages(void)
 }
 
 /**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
-   unsigned long pages;
-
-   register_page_bootmem_info_node(pgdat);
-   reset_node_managed_pages(pgdat);
-   pages = free_all_bootmem_core(pgdat->bdata);
-   totalram_pages += pages;
-
-   return pages;
-}
-
-/**
  * free_all_bootmem - release free pages to the buddy allocator
  *
  * Returns the number of pages actually released.
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 12/19] mm/ALPHA: clean up unused VALID_PAGE()

2013-04-13 Thread Jiang Liu
VALID_PAGE() was removed from the kernel a long time ago, so clean it up.

Signed-off-by: Jiang Liu 
---
 arch/alpha/include/asm/mmzone.h |2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/alpha/include/asm/mmzone.h b/arch/alpha/include/asm/mmzone.h
index c5b5d6b..14ce27b 100644
--- a/arch/alpha/include/asm/mmzone.h
+++ b/arch/alpha/include/asm/mmzone.h
@@ -71,8 +71,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 
 #define virt_to_page(kaddr)pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
 
-#define VALID_PAGE(page)   (((page) - mem_map) < max_mapnr)
-
 #define pmd_page(pmd)  (pfn_to_page(pmd_val(pmd) >> 32))
 #define pgd_page(pgd)  (pfn_to_page(pgd_val(pgd) >> 32))
 #define pte_pfn(pte)   (pte_val(pte) >> 32)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 10/19] mm/SH: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Paul Mundt 
Cc: Wen Congyang 
Cc: Tang Chen 
Cc: linux...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/sh/mm/init.c |   16 
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 3826596..485c858 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -412,19 +412,11 @@ void __init mem_init(void)
iommu_init();
 
high_memory = NULL;
+   for_each_online_pgdat(pgdat)
+   high_memory = max_t(void *, high_memory,
+   (void *)__va(pgdat_end_pfn(pgdat) << PAGE_SHIFT));
 
-   for_each_online_pgdat(pgdat) {
-   void *node_high_memory;
-
-   if (pgdat->node_spanned_pages)
-   free_all_bootmem_node(pgdat);
-
-   node_high_memory = (void *)__va((pgdat->node_start_pfn +
-pgdat->node_spanned_pages) <<
-PAGE_SHIFT);
-   if (node_high_memory > high_memory)
-   high_memory = node_high_memory;
-   }
+   free_all_bootmem();
 
/* Set this up early, so we can take care of the zero page */
cpu_cache_init();
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


drm: i915+fb: crtc->lock recursive locking deadlock on VT switch [>= 3.9-rc1 regression]

2013-04-13 Thread Krzysztof Mazur
Hi,

the drm_fb_helper_hotplug_event() locks all crtc->mutex locks by calling
drm_modeset_lock_all() and later calls drm_fb_helper_probe_connector_modes(),
which in case of i915 DRM driver effectively calls
intel_get_load_detect_pipe() that tries to lock crtc->mutex again.
This causes a deadlock, and can be in some cases triggered by VT
switch to framebuffer console on i915.

This bug was introduced in Linux 3.9-rc1 and still exists
in v3.9-rc6-183-gbf81710. Linux 3.8 is ok.

This deadlock was probably introduced by
commit 7b24056be6db7ce907baffdd4cf142ab774ea60c
(drm: don't hold crtc mutexes for connector ->detect callbacks).


Steps to reproduce (the deadlock occurs in almost all cases):
LCD panel connected to LVDS1, 915GM, HP nc6120
1. power on
2. $ startx
3. connect monitor to VGA1
4. $ xrandr --output VGA1 --mode 1600x1200 --right-of LVDS1
5. change VT, ALT + CTRL + F2

Krzysiek

...
[   62.995861] [drm:drm_helper_probe_single_connector_modes], 
[CONNECTOR:13:SVIDEO-1]
[   62.995870] [drm:intel_get_load_detect_pipe], [CONNECTOR:13:SVIDEO-1], 
[ENCODER:14:TV-14]
[   62.995873] 
[   62.995875] =
[   62.995877] [ INFO: possible recursive locking detected ]
[   62.995881] 3.9.0-rc6-00184-gc875d9b #14 Not tainted
# v3.9-rc6-183-gbf81710 + one unrelated local patch.
[   62.995883] -
[   62.995885] X/1759 is trying to acquire lock:
[   62.995903]  (&crtc->mutex){+.+.+.}, at: [<803649e4>] 
intel_get_load_detect_pipe+0x164/0x3f0
[   62.995905] 
[   62.995905] but task is already holding lock:
[   62.995916]  (&crtc->mutex){+.+.+.}, at: [<8032d1da>] 
drm_modeset_lock_all+0x3a/0x50
[   62.995918] 
[   62.995918] other info that might help us debug this:
[   62.995920]  Possible unsafe locking scenario:
[   62.995920] 
[   62.995921]CPU0
[   62.995923]
[   62.995926]   lock(&crtc->mutex);
[   62.995930]   lock(&crtc->mutex);
[   62.995931] 
[   62.995931]  *** DEADLOCK ***
[   62.995931] 
[   62.995933]  May be due to missing lock nesting notation
[   62.995933] 
[   62.995936] 3 locks held by X/1759:
[   62.995948]  #0:  (console_lock){+.+.+.}, at: [<802fe726>] 
vt_ioctl+0xe26/0x1220
[   62.995958]  #1:  (&dev->mode_config.mutex){+.+.+.}, at: [<8032d1b5>] 
drm_modeset_lock_all+0x15/0x50
[   62.995968]  #2:  (&crtc->mutex){+.+.+.}, at: [<8032d1da>] 
drm_modeset_lock_all+0x3a/0x50
[   62.995970] 
[   62.995970] stack backtrace:
[   62.995974] Pid: 1759, comm: X Not tainted 3.9.0-rc6-00184-gc875d9b #14
[   62.995976] Call Trace:
[   62.995985]  [<8016d9f8>] __lock_acquire+0x748/0x19e0
[   62.995991]  [<80160008>] ? ktime_get+0xb8/0xf0
[   62.995998]  [<80155f6e>] ? local_clock+0x4e/0x60
[   62.996005]  [<8012c2c5>] ? log_store+0x2d5/0x3b0
[   62.996011]  [<8014fc83>] ? down_trylock+0x13/0x40
[   62.996017]  [<8016b655>] ? mark_held_locks+0x75/0xe0
[   62.996020]  [<8012d9b9>] ? vprintk_emit+0x159/0x4e0
[   62.996020]  [<8016f189>] lock_acquire+0x79/0x90
[   62.996020]  [<803649e4>] ? intel_get_load_detect_pipe+0x164/0x3f0
[   62.996020]  [<8050acf4>] mutex_lock_nested+0x54/0x310
[   62.996020]  [<803649e4>] ? intel_get_load_detect_pipe+0x164/0x3f0
[   62.996020]  [<803649e4>] ? intel_get_load_detect_pipe+0x164/0x3f0
[   62.996020]  [<80327c7a>] ? drm_ut_debug_printk+0x2a/0x50
[   62.996020]  [<803649e4>] intel_get_load_detect_pipe+0x164/0x3f0
[   62.996020]  [<80108707>] ? native_sched_clock+0x27/0xb0
[   62.996020]  [<80155bcc>] ? sched_clock_local.constprop.2+0x3c/0x170
[   62.996020]  [<8038061d>] intel_tv_detect+0x15d/0x570
[   62.996020]  [<8016b98b>] ? trace_hardirqs_off+0xb/0x10
[   62.996020]  [<80155f6e>] ? local_clock+0x4e/0x60
[   62.996020]  [<8012c2c5>] ? log_store+0x2d5/0x3b0
[   62.996020]  [<8014fc83>] ? down_trylock+0x13/0x40
[   62.996020]  [<8031dbd8>] drm_helper_probe_single_connector_modes+0x278/0x330
[   62.996020]  [<80319fc7>] 
drm_fb_helper_probe_connector_modes.isra.3+0x37/0x60
[   62.996020]  [<8031bfec>] drm_fb_helper_hotplug_event+0x6c/0xc0
[   62.996020]  [<8031c0d3>] drm_fb_helper_set_par+0x93/0xc0
[   62.996020]  [<802afdaa>] fb_set_var+0x1ea/0x4b0
[   62.996020]  [<8016d6bb>] ? __lock_acquire+0x40b/0x19e0
[   62.996020]  [<80108707>] ? native_sched_clock+0x27/0xb0
[   62.996020]  [<80155bcc>] ? sched_clock_local.constprop.2+0x3c/0x170
[   62.996020]  [<802b964a>] fbcon_blank+0x2aa/0x300
[   62.996020]  [<80307371>] do_unblank_screen+0x91/0x1a0
[   62.996020]  [<802fd876>] complete_change_console+0x56/0xe0
[   62.996020]  [<802fea3f>] vt_ioctl+0x113f/0x1220
[   62.996020]  [<8050d8c7>] ? _raw_spin_unlock+0x27/0x50
[   62.996020]  [<802fd900>] ? complete_change_console+0xe0/0xe0
[   62.996020]  [<802f438a>] tty_ioctl+0x26a/0xb70
[   62.996020]  [<80155bcc>] ? sched_clock_local.constprop.2+0x3c/0x170
[   62.996020]  [<802fd900>] ? complete_change_console+0xe0/0xe0
[   62.996020]  [<80155db5>] ? sched_clock_cpu+0x75/0xd0
[   62.996020]  [<8016b98b>] ? trace_hardirqs_off+0xb/0x10
[   62.996020]  [<80155f6e>] ? 

[RFC PATCH v1 08/19] mm/PARISC: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
Cc: Michal Hocko 
Cc: David Rientjes 
Cc: linux-par...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/parisc/mm/init.c |   12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index f80c175..ab76b84 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -585,18 +585,8 @@ void __init mem_init(void)
> BITS_PER_LONG);
 
high_memory = __va((max_pfn << PAGE_SHIFT));
-
-#ifndef CONFIG_DISCONTIGMEM
-   max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1;
+   set_max_mapnr(page_to_pfn(virt_to_page(high_memory - 1)) + 1);
free_all_bootmem();
-#else
-   {
-   int i;
-
-   for (i = 0; i < npmem_ranges; i++)
-   free_all_bootmem_node(NODE_DATA(i));
-   }
-#endif
 
 #ifdef CONFIG_PA11
if (hppa_dma_ops == _dma_ops) {
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 09/19] mm/PPC: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Alexander Graf 
Cc: "Suzuki K. Poulose" 
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-kernel@vger.kernel.org
---
 arch/powerpc/mm/mem.c |   16 +---
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 07663de..22e46db 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -305,22 +305,8 @@ void __init mem_init(void)
 #endif
 
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
-
-#ifdef CONFIG_NEED_MULTIPLE_NODES
-   {
-   pg_data_t *pgdat;
-
-   for_each_online_pgdat(pgdat)
-   if (pgdat->node_spanned_pages != 0) {
-   printk("freeing bootmem node %d\n",
-   pgdat->node_id);
-   free_all_bootmem_node(pgdat);
-   }
-   }
-#else
-   max_mapnr = max_pfn;
+   set_max_mapnr(max_pfn);
free_all_bootmem();
-#endif
 
 #ifdef CONFIG_HIGHMEM
{
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 07/19] mm/MIPS: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Ralf Baechle 
Cc: Minchan Kim 
Cc: linux-m...@linux-mips.org
Cc: linux-kernel@vger.kernel.org
---
 arch/mips/sgi-ip27/ip27-memory.c |   12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index d074680..0ebea6f 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -477,18 +477,8 @@ void __init paging_init(void)
 
 void __init mem_init(void)
 {
-   unsigned node;
-
high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT);
-
-   for_each_online_node(node) {
-   /*
-* This will free up the bootmem, ie, slot 0 memory.
-*/
-   free_all_bootmem_node(NODE_DATA(node));
-   }
-
+   free_all_bootmem();
setup_zero_pages(); /* This comes from node 0 */
-
mem_init_print_info(NULL);
 }
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 06/19] mm/metag: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: James Hogan 
Cc: linux-kernel@vger.kernel.org
---
 arch/metag/mm/init.c |   14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index e00586f..096d022 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -376,31 +376,21 @@ void __init paging_init(unsigned long mem_end)
 
 void __init mem_init(void)
 {
-   int nid;
-
 #ifdef CONFIG_HIGHMEM
unsigned long tmp;
 
/*
 * Explicitly reset zone->managed_pages because highmem pages are
-* freed before calling free_all_bootmem_node();
+* freed before calling free_all_bootmem();
 */
reset_all_zones_managed_pages();
for (tmp = highstart_pfn; tmp < highend_pfn; tmp++)
free_highmem_page(pfn_to_page(tmp));
 #endif /* CONFIG_HIGHMEM */
 
-   for_each_online_node(nid) {
-   pg_data_t *pgdat = NODE_DATA(nid);
-
-   if (pgdat->node_spanned_pages)
-   free_all_bootmem_node(pgdat);
-   }
-
+   free_all_bootmem();
mem_init_print_info(NULL);
show_mem(0);
-
-   return;
 }
 
 void free_initmem(void)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 05/19] mm/m68k: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Geert Uytterhoeven 
Cc: Greg Ungerer 
Cc: linux-m...@lists.linux-m68k.org
Cc: linux-kernel@vger.kernel.org
---
 arch/m68k/mm/init.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 0723141..20d0ae2 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -148,12 +148,10 @@ void __init print_memmap(void)
 
 void __init mem_init(void)
 {
-   pg_data_t *pgdat;
int i;
 
/* this will put all memory onto the freelists */
-   for_each_online_pgdat(pgdat)
-   free_all_bootmem_node(pgdat);
+   free_all_bootmem();
 
 #if defined(CONFIG_MMU) && !defined(CONFIG_SUN3) && !defined(CONFIG_COLDFIRE)
/* insert pointer tables allocated so far into the tablelist */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] module: Fix race condition between load and unload module

2013-04-13 Thread Anatol Pomozov
Hi

On Fri, Apr 12, 2013 at 4:47 PM, Linus Torvalds
 wrote:
> On Fri, Apr 12, 2013 at 3:32 PM, Anatol Pomozov
>  wrote:
>>
>> Here is the timeline for the crash when kset_find_obj() searches for
>> an object that nobody holds and another thread is doing kobject_put()
>> on the same kobject:
>>
>> THREAD A (calls kset_find_obj()) THREAD B (calls kobject_put())
>> spin_lock()
>>  atomic_dec_return(kobj->kref), counter 
>> gets zero here
>>  ... starts kobject cleanup 
>>  spin_lock() // WAIT thread A in 
>> kobj_kset_leave()
>> iterate over kset->list
>> atomic_inc(kobj->kref) (counter becomes 1)
>> spin_unlock()
>>  spin_lock() // taken
>>  // it does not know that thread A 
>> increased counter so it
>>  remove obj from list
>>  spin_unlock()
>>  vfree(module) // frees module object 
>> with containing kobj
>>
>> // kobj points to freed memory area!!
>> kobject_put(kobj) // OOPS
>
> This is a much more generic bug in kobjects, and I would hate to add
> some random workaround for just one case of this bug like you do. The
> more fundamental bug needs to be fixed too.
>
> I think the more fundamental bugfix is to just fix kobject_get() to
> return NULL if the refcount was zero, because in that case the kobject
> no longer really exists.
>
> So instead of having
>
> kref_get(&kobj->kref);
>
> it should do
>
> if (!atomic_inc_not_zero(&kobj->kref.refcount))
> kobj = NULL;

Does it make sense to move it to a separate function in kref.h?

/** Useful when kref_get is racing with kref_put and refcounter might be 0 */
int kref_get_not_zero(kref* ref) {
return atomic_inc_not_zero(&ref->refcount);
}

or maybe instead change the default behavior of kref_get() to
atomic_inc_not_zero() and force callers to check the return value of
kref_get()?

>
> and I think that should fix your race automatically, no? Proper patch
> attached (but TOTALLY UNTESTED - it seems to compile, though).
>
> The problem is that we lose the warning for when the refcount is zero
> and somebody does a kobject_get(), but that is ok *assuming* that
> people actually check the return value of kobject_get() rather than
> just "know" that if they passed in a non-NULL kobj, they'll get it
> right back.
>
> Greg - please take a look... I'm adding Al to the discussion too,
> because Al just *loooves* these kinds of races ;)
>
>   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 03/19] mm/IA64: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Tony Luck 
Cc: Fenghua Yu 
Cc: Tang Chen 
Cc: David Rientjes 
Cc: linux-i...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/ia64/mm/init.c |9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d4382dc..26eeb74 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -584,7 +584,6 @@ __setup("nolwsys", nolwsys_setup);
 void __init
 mem_init (void)
 {
-   pg_data_t *pgdat;
int i;
 
BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
@@ -602,15 +601,11 @@ mem_init (void)
 
 #ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
-   max_mapnr = max_low_pfn;
 #endif
 
+   set_max_mapnr(max_low_pfn);
high_memory = __va(max_low_pfn * PAGE_SIZE);
-
-   for_each_online_pgdat(pgdat)
-   if (pgdat->bdata->node_bootmem_map)
-   free_all_bootmem_node(pgdat);
-
+   free_all_bootmem();
mem_init_print_info(NULL);
 
/*
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 04/19] mm/m32r: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem().

Signed-off-by: Jiang Liu 
Cc: Hirokazu Takata 
Cc: linux-m...@ml.linux-m32r.org
Cc: linux-m32r...@ml.linux-m32r.org
Cc: linux-kernel@vger.kernel.org
---
 arch/m32r/mm/init.c |   17 -
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 9c94839..3113c85 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -111,28 +111,19 @@ void __init paging_init(void)
  *==*/
 void __init mem_init(void)
 {
-   int nid;
 #ifndef CONFIG_MMU
extern unsigned long memory_end;
-#endif
 
-#ifndef CONFIG_DISCONTIGMEM
-   max_mapnr = get_num_physpages();
-#endif /* CONFIG_DISCONTIGMEM */
-
-#ifdef CONFIG_MMU
-   high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0)));
-#else
high_memory = (void *)(memory_end & PAGE_MASK);
+#else
+   high_memory = (void *)__va(PFN_PHYS(MAX_LOW_PFN(0)));
 #endif /* CONFIG_MMU */
 
/* clear the zero-page */
memset(empty_zero_page, 0, PAGE_SIZE);
 
-   /* this will put all low memory onto the freelists */
-   for_each_online_node(nid)
-   free_all_bootmem_node(NODE_DATA(nid));
-
+   set_max_mapnr(get_num_physpages());
+   free_all_bootmem();
mem_init_print_info(NULL);
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 00/19] kill free_all_bootmem_node() and clean up VALID_PAGE()

2013-04-13 Thread Jiang Liu
Commit 600cc5b7f6 "mm: Kill NO_BOOTMEM version free_all_bootmem_node()"
has killed free_all_bootmem_node() for NO_BOOTMEM.

Currently the usage pattern for free_all_bootmem_node() is like:
for_each_online_pgdat(pgdat)
free_all_bootmem_node(pgdat);

It's equivalent to free_all_bootmem(), so this patchset goes one
step further to kill free_all_bootmem_node() for BOOTMEM too.

This patchset also tries to clean up code and comments related to
VALID_PAGE() because it was removed from the kernel a long time ago.

Patch 1-11:
Kill free_all_bootmem_node()
Patch 12-16:
Clean up code and comments related to VALID_PAGE()
Patch 17:
Fix a minor build warning for m68k
Patch 18:
merge Alpha's mem_init() for UMA and NUMA.
Patch 19:
call register_page_bootmem_info_node() from mm core

This patch is based on patchset at
http://marc.info/?l=linux-mm&m=136525931917910&w=2

Jiang Liu (19):
  mm: introduce accessor function set_max_mapnr()
  mm/AVR32: prepare for killing free_all_bootmem_node()
  mm/IA64: prepare for killing free_all_bootmem_node()
  mm/m32r: prepare for killing free_all_bootmem_node()
  mm/m68k: prepare for killing free_all_bootmem_node()
  mm/metag: prepare for killing free_all_bootmem_node()
  mm/MIPS: prepare for killing free_all_bootmem_node()
  mm/PARISC: prepare for killing free_all_bootmem_node()
  mm/PPC: prepare for killing free_all_bootmem_node()
  mm/SH: prepare for killing free_all_bootmem_node()
  mm: kill free_all_bootmem_node()
  mm/ALPHA: clean up unused VALID_PAGE()
  mm/CRIS: clean up unused VALID_PAGE()
  mm/microblaze: clean up unused VALID_PAGE()
  mm/ARM: fix stale comment about VALID_PAGE()
  mm/unicore32: fix stale comment about VALID_PAGE()
  mm/m68k: fix build warning of unused variable
  mm/alpha: unify mem_init() for both UMA and NUMA architectures
  mm: call register_page_bootmem_info_node() from mm core

 arch/alpha/include/asm/mmzone.h |2 --
 arch/alpha/mm/init.c|7 ++-
 arch/alpha/mm/numa.c|   10 --
 arch/arm/include/asm/memory.h   |6 --
 arch/avr32/mm/init.c|   21 +
 arch/cris/include/asm/page.h|1 -
 arch/ia64/mm/init.c |9 ++---
 arch/m32r/mm/init.c |   17 -
 arch/m68k/mm/init.c |   15 ---
 arch/metag/mm/init.c|   14 ++
 arch/microblaze/include/asm/page.h  |1 -
 arch/mips/sgi-ip27/ip27-memory.c|   12 +---
 arch/parisc/mm/init.c   |   12 +---
 arch/powerpc/mm/mem.c   |   16 +---
 arch/sh/mm/init.c   |   16 
 arch/sparc/mm/init_64.c |   12 
 arch/unicore32/include/asm/memory.h |6 --
 arch/x86/mm/init_64.c   |   12 
 include/linux/bootmem.h |1 -
 include/linux/mm.h  |9 -
 mm/bootmem.c|   24 ++--
 mm/nobootmem.c  |6 ++
 22 files changed, 50 insertions(+), 179 deletions(-)

-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH v1 02/19] mm/AVR32: prepare for killing free_all_bootmem_node()

2013-04-13 Thread Jiang Liu
Prepare for killing free_all_bootmem_node() by using
free_all_bootmem() instead.

Signed-off-by: Jiang Liu 
Cc: Haavard Skinnemoen 
Cc: Hans-Christian Egtvedt 
Cc: linux-kernel@vger.kernel.org
---
 arch/avr32/mm/init.c |   21 +
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
index c1706a0..b25aba3 100644
--- a/arch/avr32/mm/init.c
+++ b/arch/avr32/mm/init.c
@@ -103,23 +103,12 @@ void __init mem_init(void)
pg_data_t *pgdat;
 
high_memory = NULL;
+   for_each_online_pgdat(pgdat)
+   high_memory = max_t(void *, high_memory,
+   (void *)__va(pgdat_end_pfn(pgdat) << PAGE_SHIFT));
 
-   /* this will put all low memory onto the freelists */
-   for_each_online_pgdat(pgdat) {
-   void *node_high_memory;
-
-   if (pgdat->node_spanned_pages != 0)
-   free_all_bootmem_node(pgdat);
-
-   node_high_memory = (void *)((pgdat->node_start_pfn
-+ pgdat->node_spanned_pages)
-   << PAGE_SHIFT);
-   if (node_high_memory > high_memory)
-   high_memory = node_high_memory;
-   }
-
-   max_mapnr = MAP_NR(high_memory);
-
+   set_max_mapnr(MAP_NR(high_memory));
+   free_all_bootmem();
mem_init_print_info(NULL);
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3.8-stable] gpio: fix wrong checking condition for gpio range

2013-04-13 Thread Haojian Zhuang
On 13 April 2013 22:46, Jonghwan Choi  wrote:
> From: Haojian Zhuang 
>
> This patch looks like it should be in the 3.8-stable tree, should we apply
> it?
>

It could be merged into 3.8-stable tree.

Regards
Haojian
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/9] AMD IOMMU cleanups, fixes and IVRS bug workarounds

2013-04-13 Thread Joerg Roedel
On Sat, Apr 13, 2013 at 11:06:22PM +0800, Andrew Cooks wrote:
> On Fri, Apr 12, 2013 at 4:06 PM, Joerg Roedel  wrote:

> > Oh, that's sad. You were the only one having a machine which actually has
> > unity-mapped ranges defined in the BIOS table. The code for those
> > mappings was basically untested before you ran it on that machine.
> >
> What is the machine in question? Maybe someone else has access to one,
> if it's not too exotic.

Shuah had access to a HP server machine (don't know which one) that
defined unity-map ranges in the BIOS table. Shuah certainly knows the
details about that machine.


Joerg


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH tip/core/rcu 3/7] rcu: Kick adaptive-ticks CPUs that are holding up RCU grace periods

2013-04-13 Thread Paul E. McKenney
On Sat, Apr 13, 2013 at 04:06:58PM +0200, Frederic Weisbecker wrote:
> 2013/4/13 Paul E. McKenney :
> > From: "Paul E. McKenney" 
> >
> > Adaptive-ticks CPUs inform RCU when they enter kernel mode, but they do
> > not necessarily turn the scheduler-clock tick back on.  This state of
> > affairs could result in RCU waiting on an adaptive-ticks CPU running
> > for an extended period in kernel mode.  Such a CPU will never run the
> > RCU state machine, and could therefore indefinitely extend the RCU state
> > machine, sooner or later resulting in an OOM condition.
> >
> > This patch, inspired by an earlier patch by Frederic Weisbecker, therefore
> > causes RCU's force-quiescent-state processing to check for this condition
> > and to send an IPI to CPUs that remain in that state for too long.
> > "Too long" currently means about three jiffies by default, which is
> > quite some time for a CPU to remain in the kernel without blocking.
> > > The rcutree.jiffies_till_first_fqs and rcutree.jiffies_till_next_fqs
> > sysfs variables may be used to tune "too long" if needed.
> >
> > Reported-by: Frederic Weisbecker 
> > Signed-off-by: Paul E. McKenney 
> 
> It might be better if I take this patch to get it through
> tip:timers/nohz so that I can keep it in sync with the rest. What do
> you think?

Works for me!  Let me know when you have picked it up and I will drop
it from my tree.

Thanx, Paul

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/9] AMD IOMMU cleanups, fixes and IVRS bug workarounds

2013-04-13 Thread Andrew Cooks
On Fri, Apr 12, 2013 at 4:06 PM, Joerg Roedel  wrote:
> Hi Shuah,
>
> On Wed, Apr 10, 2013 at 10:06:02AM -0600, Shuah Khan wrote:
>> On Tue, Apr 9, 2013 at 2:12 PM, Joerg Roedel  wrote:
>> >  Documentation/kernel-parameters.txt |   14 
>> >  drivers/iommu/amd_iommu.c   |   79 +++---
>> >  drivers/iommu/amd_iommu_init.c  |  151 
>> > +++
>> >  drivers/iommu/amd_iommu_types.h |1 +
>> >  4 files changed, 182 insertions(+), 63 deletions(-)
>
>> Reviewed all the patches in this set. No longer have access to test machine. 
>> :(
>
> Oh, that's sad. You were the only one having a machine which actually has
> unity-mapped ranges defined in the BIOS table. The code for those
> mappings was basically untested before you ran it on that machine.
>
What is the machine in question? Maybe someone else has access to one,
if it's not too exotic.

--
a.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:sched/core] sched: Lower chances of cputime scaling overflow

2013-04-13 Thread Stanislaw Gruszka
On Thu, Apr 11, 2013 at 08:38:37AM -0700, Linus Torvalds wrote:
> /* We know one of the values has a bit set in the high 32 bits */
> for (;;) {
> /* Make sure "stime" is the bigger of stime/rtime */
> if (rtime > stime) {
> u64 tmp = stime; stime = rtime; rtime = tmp;
> }

For most workloads rtime is bigger than stime, so swapping those would
save some cycles on common cases. Otherwise this algorithm looks great.

Thanks
Stanislaw

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [tip:sched/core] sched: Lower chances of cputime scaling overflow

2013-04-13 Thread Stanislaw Gruszka
On Fri, Apr 12, 2013 at 09:55:56AM +0200, Peter Zijlstra wrote:
> > The above is totally untested, but each step is pretty damn simple and
> > fairly cheap. Sure, it's a loop, but it's bounded to 32 (cheap)
> > iterations, and the normal case is that it's not done at all, or done
> > only a few times.
> 
> Right it gets gradually heavier the bigger the numbers get; which is
> more and more unlikely.
> 
> > And the advantage is that the end result is always that simple
> > 32x32/32 case that we started out with as the common case.
> > 
> > I dunno. Maybe I'm overlooking something, and the above is horrible,
> > but the above seems reasonably efficient if not optimal, and
> > *understandable*.
> 
> I suppose that entirely matters on what one is used to ;-) I had to
> stare rather hard at it for a little while.
> 
> But yes, you take it one step further and are willing to ditch rtime
> bits too and I suppose that's fine.
> 
> Should work,.. Stanislaw could you stick this into your userspace
> thingy and verify the numbers are sane enough? 

It works fine - gives relative error less than 0.1% for very big
numbers.

For the record I'm attaching test program and script.

Thanks
Stanislaw

#include 
#include 
#include 
#include 
#include 

typedef uint64_t u64;
typedef uint32_t u32;

/* Divide a 64-bit dividend by a 32-bit divisor; the quotient is 64 bits wide. */
static u64 div_u64_u32(u64 a, u32 b)
{
	u64 quotient = a / b;

	return quotient;
}

/*
 * Compute stime * rtime / total using only a 32x32->64 multiply and a
 * 64/32->64 divide.
 *
 * The operands are first reduced until the two factors and the divisor
 * each fit in 32 bits.  Where possible a bit is moved from the larger
 * factor to the smaller one (keeping the product the same up to the bit
 * shifted out); otherwise the larger factor and the divisor are both
 * halved, trading precision for range.
 *
 * Returns the smaller of the two (possibly reduced) factors when the
 * divisor is, or has been reduced to, zero.
 */
static u64 scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 product;

	while (1) {
		/* Keep the larger factor in rtime. */
		if (stime > rtime) {
			u64 larger = stime;

			stime = rtime;
			rtime = larger;
		}

		if ((rtime >> 32) != 0) {
			/* rtime is too wide: rebalance if stime has headroom. */
			if ((stime >> 31) == 0) {
				stime <<= 1;
				rtime >>= 1;
				continue;
			}
			/* No headroom left in stime: drop precision below. */
		} else if ((total >> 32) == 0) {
			/* Both factors and the divisor fit in 32 bits. */
			break;
		}

		/* Drop one bit from rtime (the wider factor) and from total. */
		rtime >>= 1;
		total >>= 1;
	}

	if (total == 0)
		return stime;

	/* The casts make the 32x32->64 multiply and 64/32 divide explicit. */
	product = (u64)(u32)stime * (u64)(u32)rtime;
	return div_u64_u32(product, (u32)total);
}
 
/*
 * Command-line driver for scale_stime().
 *
 * Expects exactly three decimal arguments, in this order: rtime, total,
 * stime.  Prints scale_stime(stime, rtime, total) followed by a newline
 * on stdout.
 *
 * Returns 0 on success and 1 (after a usage message on stderr) when the
 * argument count is wrong.  Aborts via assert() if stime exceeds total.
 */
int main(int argc, char *argv[])
{
	u64 rtime, total, stime, scaled;

	if (argc != 4) {
		/* A bare "return;" is not valid in a function returning int. */
		fprintf(stderr, "usage: %s <rtime> <total> <stime>\n",
			argc > 0 ? argv[0] : "scale_stime");
		return 1;
	}

	/* The values are unsigned 64-bit, so parse them as unsigned. */
	rtime = strtoull(argv[1], NULL, 10);
	total = strtoull(argv[2], NULL, 10);
	stime = strtoull(argv[3], NULL, 10);

	assert (total >= stime);

	scaled = scale_stime(stime, rtime, total);
	/* uint64_t need not be unsigned long long; cast to match %llu. */
	printf("%llu\n", (unsigned long long)scaled);

	return 0;
}
#!/usr/bin/python

import subprocess
import random
import math

def kernel_scale(rtime, total, stime):
    """Run ./scale_stime5 with (rtime, total, stime) and return its output as an int.

    The three values are passed as decimal command-line arguments in that
    order; the program's standard output is parsed as a single integer.
    Presumably ./scale_stime5 is the compiled C test program attached
    alongside this script -- confirm before relying on it.
    """
    # Pass an argument list instead of a shell string: no shell is spawned
    # and no quoting is involved.
    p = subprocess.Popen(
        ["./scale_stime5", str(rtime), str(total), str(stime)],
        stdout=subprocess.PIPE)
    # communicate() reads all output and waits for the child to exit, so
    # no unreaped processes pile up over many iterations.
    out, _ = p.communicate()
    return int(out)

def python_scale(rtime, total, stime):
    """Return the exact integer reference value floor(stime * rtime / total).

    Uses arbitrary-precision integer arithmetic.  Floor division is written
    explicitly so the result is an exact int under both Python 2 and
    Python 3 (a plain "/" yields a rounded float on Python 3).
    Raises ZeroDivisionError when total is 0.
    """
    return (stime * rtime) // total

# Upper bound for the random rtime values: 10 years for 4096 threads.
# NOTE(review): the factor is 364 days per year as written -- confirm
# whether 365 was intended.
max_rtime = 10*4096*364*24*60*60*1000

# Compare the C implementation against the exact Python reference for K
# random (rtime, total) pairs, sweeping stime from 1% to 99% of total.
# Stop at the first result whose relative error exceeds 0.1%.
fail = False
K = 1
for i in range(0, K):
    rtime = random.randrange(max_rtime)
    total = int(random.uniform(0.1, 1.9) * rtime)

    for n in range(1, 100):
        # Floor division keeps stime an int on Python 3 as well, so it is
        # passed to the C program as a plain decimal number.
        stime = n * total // 100
        r1 = kernel_scale(rtime, total, stime)
        r2 = python_scale(rtime, total, stime)
        # Relative-error check written without dividing by r2, so a
        # reference value of 0 cannot raise ZeroDivisionError.
        if float(abs(r1 - r2)) > 0.001 * float(r2):
            print("FAIL!")
            print("rtime: " + str(rtime))
            print("total: " + str(total))
            print("stime: " + str(stime))
            print("kernel: " + str(r1))
            print("python: " + str(r2))

            fail = True
            break
    if fail:
        break
    # Progress report after every 100 random pairs.
    if (i % 100) == 99:
        print(str(i // 100) + "/" + str(K // 100) + " OK")



  1   2   3   4   >