date:20070302

Re: [RFC] Heads up on sys_fallocate()

2007-03-02 Thread Eric Sandeen


Badari Pulavarty wrote:


Amit K. Arora wrote:


This is to give a heads up on few patches that we will be soon coming up
with. These patches implement a new system call sys_fallocate() and a
new inode operation fallocate, for persistent preallocation. The new
system call, as Andrew suggested, will look like:

 asmlinkage long sys_fallocate(int fd, loff_t offset, loff_t len);

I am wondering about return values from this syscall. Is it supposed to
return the number of bytes allocated? What about partial allocations?


If you don't have enough blocks to cover the request, you should 
probably just return -ENOSPC, not a partial allocation.


What about if the blocks already exist? What would be the return values
in those cases?


0 on success, other normal errors otherwise.

If asked for a range that includes already-allocated blocks, you just 
allocate any non-allocated blocks in the range, I think.


-Eric

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 5/9] mtd: Allow dynamic major/minor number for mtd block devices

2007-03-02 Thread Richard Purdie

Allow mtd block devices to have dynamically allocated major/minor
numbers if tr->major == 0.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 drivers/mtd/mtd_blkdevs.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux/drivers/mtd/mtd_blkdevs.c
===
--- linux.orig/drivers/mtd/mtd_blkdevs.c2007-02-28 18:16:52.0 
+
+++ linux/drivers/mtd/mtd_blkdevs.c 2007-03-02 14:46:29.0 +
@@ -380,13 +380,16 @@ int register_mtd_blktrans(struct mtd_blk
mutex_lock(mtd_table_mutex);
 
ret = register_blkdev(tr-major, tr-name);
-   if (ret) {
+   if (ret  0) {
printk(KERN_WARNING Unable to register %s block device on 
major %d: %d\n,
   tr-name, tr-major, ret);
kfree(tr-blkcore_priv);
mutex_unlock(mtd_table_mutex);
return ret;
}
+   if (!tr-major)
+   tr-major = ret;
+
spin_lock_init(tr-blkcore_priv-queue_lock);
init_completion(tr-blkcore_priv-thread_dead);
init_waitqueue_head(tr-blkcore_priv-thread_wq);


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/9] swap: Add try_to_unuse_page_entry()

2007-03-02 Thread Richard Purdie

Add try_to_unuse_page_entry() which can be used to unuse page entries.

This needs try_to_unuse_anon() which is also added, similar to
try_to_unmap_anon().

Originally based on a patch by Nick Piggin from LKML with changes of my
own after hints from Hugh Dickins.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 include/linux/rmap.h |6 +
 include/linux/swap.h |1 
 mm/rmap.c|2 -
 mm/swapfile.c|   55 +++
 4 files changed, 63 insertions(+), 1 deletion(-)

Index: linux/mm/swapfile.c
===
--- linux.orig/mm/swapfile.c2007-02-28 18:12:34.0 +
+++ linux/mm/swapfile.c 2007-02-28 18:13:07.0 +
@@ -646,6 +646,61 @@ static int unuse_mm(struct mm_struct *mm
return 0;
 }
 
+static int try_to_unuse_anon(swp_entry_t entry, struct page *page)
+{
+   struct anon_vma *anon_vma;
+   struct vm_area_struct *vma;
+
+   anon_vma = page_lock_anon_vma(page);
+   if (!anon_vma)
+   return 0;
+
+   list_for_each_entry(vma, anon_vma-head, anon_vma_node) {
+   if (unuse_vma(vma, entry, page))
+   break;
+   }
+   spin_unlock(anon_vma-lock);
+   return 0;
+}
+
+
+void try_to_unuse_page_entry(struct page *page)
+{
+   struct swap_info_struct *si;
+   unsigned short *swap_map;
+   swp_entry_t entry;
+
+   BUG_ON(!PageLocked(page));
+   BUG_ON(!PageSwapCache(page));
+   BUG_ON(PageWriteback(page));
+   BUG_ON(PagePrivate(page));
+
+   entry.val = page_private(page);
+   si = swap_info_get(entry);
+   if (!si) {
+   WARN_ON(1);
+   return;
+   }
+   swap_map = si-swap_map[swp_offset(entry)];
+   spin_unlock(swap_lock);
+
+   BUG_ON(*swap_map == SWAP_MAP_BAD);
+
+   if (!shmem_unuse(entry, page)) {
+   try_to_unuse_anon(entry, page);
+   delete_from_swap_cache(page);
+   } else if (PageSwapCache(page)) {
+   /*
+* shmem_unuse deleted a swappage from the swap cache, but the
+* move to filepage failed so it left swappage in cache and
+* lowered its swap count to pass quickly through the loops in
+* try_to_unuse(). We must reincrement the count to try again
+* later (ick).
+*/
+   swap_duplicate(entry);
+   }
+}
+
 /*
  * Scan swap_map from current position to next entry still in use.
  * Recycle to start on reaching the end, returning 0 when empty.
Index: linux/include/linux/rmap.h
===
--- linux.orig/include/linux/rmap.h 2007-02-28 18:12:16.0 +
+++ linux/include/linux/rmap.h  2007-02-28 18:12:41.0 +
@@ -104,6 +104,11 @@ pte_t *page_check_address(struct page *,
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
 /*
+ * Used by try_to_unuse_anon() and try_to_unmap_anon()
+ */
+struct anon_vma *page_lock_anon_vma(struct page *page);
+
+/*
  * Cleans the PTEs of shared mappings.
  * (and since clean PTEs should also be readonly, write protects them too)
  *
@@ -125,6 +130,7 @@ static inline int page_mkclean(struct pa
return 0;
 }
 
+#define page_lock_anon_vma(page)   (0)
 
 #endif /* CONFIG_MMU */
 
Index: linux/mm/rmap.c
===
--- linux.orig/mm/rmap.c2007-02-28 18:12:16.0 +
+++ linux/mm/rmap.c 2007-02-28 18:12:41.0 +
@@ -181,7 +181,7 @@ void __init anon_vma_init(void)
  * Getting a lock on a stable anon_vma from a page off the LRU is
  * tricky: page_lock_anon_vma rely on RCU to guard against the races.
  */
-static struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma(struct page *page)
 {
struct anon_vma *anon_vma = NULL;
unsigned long anon_mapping;
Index: linux/include/linux/swap.h
===
--- linux.orig/include/linux/swap.h 2007-02-28 18:12:34.0 +
+++ linux/include/linux/swap.h  2007-02-28 18:13:04.0 +
@@ -254,6 +254,7 @@ extern sector_t swapdev_block(int, pgoff
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+extern void try_to_unuse_page_entry(struct page *page);
 struct backing_dev_info;
 
 extern spinlock_t swap_lock;


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/9] swap: Simplify shmem_unuse() usage [optional]

2007-03-02 Thread Richard Purdie

Simplify shmem_unuse_inode() removing a confusing optimisation which
requires the caller to call swap_duplicate if the shmem_unuse() call
doesn't succeed.

Based on a patch by Nick Piggin and some of my own changes as discussed
on LKML. 

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 mm/shmem.c|   12 +---
 mm/swapfile.c |   23 ++-
 2 files changed, 7 insertions(+), 28 deletions(-)

Index: linux/mm/shmem.c
===
--- linux.orig/mm/shmem.c   2007-02-28 18:12:34.0 +
+++ linux/mm/shmem.c2007-02-28 18:12:46.0 +
@@ -734,7 +734,7 @@ static int shmem_unuse_inode(struct shme
struct page **dir;
struct page *subdir;
swp_entry_t *ptr;
-   int offset;
+   int offset, moved;
 
idx = 0;
ptr = info-i_direct;
@@ -792,17 +792,15 @@ lost2:
 found:
idx += offset;
inode = info-vfs_inode;
-   if (move_from_swap_cache(page, idx, inode-i_mapping) == 0) {
+   moved = (move_from_swap_cache(page, idx, inode-i_mapping) == 0);
+   if (moved) {
info-flags |= SHMEM_PAGEIN;
shmem_swp_set(info, ptr + offset, 0);
}
shmem_swp_unmap(ptr);
spin_unlock(info-lock);
-   /*
-* Decrement swap count even when the entry is left behind:
-* try_to_unuse will skip over mms, then reincrement count.
-*/
-   swap_free(entry, page);
+   if (moved)
+   swap_free(entry, page);
return 1;
 }
 
Index: linux/mm/swapfile.c
===
--- linux.orig/mm/swapfile.c2007-02-28 18:12:41.0 +
+++ linux/mm/swapfile.c 2007-02-28 18:13:04.0 +
@@ -689,15 +689,6 @@ void try_to_unuse_page_entry(struct page
if (!shmem_unuse(entry, page)) {
try_to_unuse_anon(entry, page);
delete_from_swap_cache(page);
-   } else if (PageSwapCache(page)) {
-   /*
-* shmem_unuse deleted a swappage from the swap cache, but the
-* move to filepage failed so it left swappage in cache and
-* lowered its swap count to pass quickly through the loops in
-* try_to_unuse(). We must reincrement the count to try again
-* later (ick).
-*/
-   swap_duplicate(entry);
}
 }
 
@@ -922,12 +913,6 @@ static int try_to_unuse(unsigned int typ
 * read from disk into another page.  Splitting into two
 * pages would be incorrect if swap supported shared
 * private pages, but they are handled by tmpfs files.
-*
-* Note shmem_unuse already deleted a swappage from
-* the swap cache, unless the move to filepage failed:
-* in which case it left swappage in cache, lowered its
-* swap count to pass quickly through the loops above,
-* and now we must reincrement count to try again later.
 */
if ((*swap_map  1)  PageDirty(page)  PageSwapCache(page)) {
struct writeback_control wbc = {
@@ -938,12 +923,8 @@ static int try_to_unuse(unsigned int typ
lock_page(page);
wait_on_page_writeback(page);
}
-   if (PageSwapCache(page)) {
-   if (shmem)
-   swap_duplicate(entry);
-   else
-   delete_from_swap_cache(page);
-   }
+   if (PageSwapCache(page)  !shmem)
+   delete_from_swap_cache(page);
 
/*
 * So we could skip searching mms once swap count went


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 7/9] mtd: Fix number of free oob spaces in onenand driver

2007-03-02 Thread Richard Purdie

Correct the number of free OOB data positions in the onenand
driver.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 drivers/mtd/onenand/onenand_base.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

Index: linux/drivers/mtd/onenand/onenand_base.c
===
--- linux.orig/drivers/mtd/onenand/onenand_base.c   2007-03-02 
15:02:31.0 +
+++ linux/drivers/mtd/onenand/onenand_base.c2007-03-02 15:37:05.0 
+
@@ -33,8 +33,8 @@ static struct nand_ecclayout onenand_oob
56, 57, 58, 59, 60,
},
.oobfree= {
-   {2, 3}, {14, 2}, {18, 3}, {30, 2},
-   {34, 3}, {46, 2}, {50, 3}, {62, 2}
+   {2, 6}, {14, 4}, {18, 6}, {30, 4},
+   {34, 6}, {46, 4}, {50, 6}, {62, 2}
}
 };
 
@@ -47,7 +47,7 @@ static struct nand_ecclayout onenand_oob
8, 9, 10, 11, 12,
24, 25, 26, 27, 28,
},
-   .oobfree= { {2, 3}, {14, 2}, {18, 3}, {30, 2} }
+   .oobfree= { {2, 6}, {14, 4}, {18, 6}, {30, 2} }
 };
 
 static const unsigned char ffchars[] = {


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 8/9] mtd: Allow mtd block device drivers to have a custom ioctl function

2007-03-02 Thread Richard Purdie

Allow mtd block drivers to customise their ioctl functions. Also
allow the drivers to obtain the gendisk struct since ioctl 
functions can need this.

This also moves the mtd ioctl functions from locked to unlocked.
As far as I can see, nothing in the mtd code has locking problems.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 drivers/mtd/mtd_blkdevs.c|   14 +++---
 include/linux/mtd/blktrans.h |5 +
 2 files changed, 16 insertions(+), 3 deletions(-)

Index: linux/drivers/mtd/mtd_blkdevs.c
===
--- linux.orig/drivers/mtd/mtd_blkdevs.c2007-03-02 14:46:29.0 
+
+++ linux/drivers/mtd/mtd_blkdevs.c 2007-03-02 14:46:47.0 +
@@ -204,9 +204,10 @@ static int blktrans_getgeo(struct block_
return -ENOTTY;
 }
 
-static int blktrans_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
+static long blktrans_unlocked_ioctl(struct file *file, unsigned cmd,
+   unsigned long arg)
 {
+   struct inode *inode = file-f_dentry-d_inode;
struct mtd_blktrans_dev *dev = inode-i_bdev-bd_disk-private_data;
struct mtd_blktrans_ops *tr = dev-tr;
 
@@ -217,6 +218,8 @@ static int blktrans_ioctl(struct inode *
/* The core code did the work, we had nothing to do. */
return 0;
default:
+   if (tr-ioctl)
+   return tr-ioctl(dev, cmd, arg);
return -ENOTTY;
}
 }
@@ -225,7 +228,7 @@ struct block_device_operations mtd_blktr
.owner  = THIS_MODULE,
.open   = blktrans_open,
.release= blktrans_release,
-   .ioctl  = blktrans_ioctl,
+   .unlocked_ioctl = blktrans_unlocked_ioctl,
.getgeo = blktrans_getgeo,
 };
 
@@ -312,6 +315,11 @@ int add_mtd_blktrans_dev(struct mtd_blkt
return 0;
 }
 
+struct gendisk *get_mtd_blktrans_gendisk(struct mtd_blktrans_dev *dev)
+{
+   return dev-blkcore_priv;
+}
+
 int del_mtd_blktrans_dev(struct mtd_blktrans_dev *old)
 {
if (!!mutex_trylock(mtd_table_mutex)) {
Index: linux/include/linux/mtd/blktrans.h
===
--- linux.orig/include/linux/mtd/blktrans.h 2007-02-28 18:16:52.0 
+
+++ linux/include/linux/mtd/blktrans.h  2007-03-02 14:46:47.0 +
@@ -11,6 +11,7 @@
 #define __MTD_TRANS_H__
 
 #include linux/mutex.h
+#include linux/genhd.h
 
 struct hd_geometry;
 struct mtd_info;
@@ -48,6 +49,9 @@ struct mtd_blktrans_ops {
int (*getgeo)(struct mtd_blktrans_dev *dev, struct hd_geometry *geo);
int (*flush)(struct mtd_blktrans_dev *dev);
 
+   /* Optional ioctl passthrough */
+   int (*ioctl)(struct mtd_blktrans_dev *dev, unsigned int cmd, unsigned 
long arg);
+
/* Called with mtd_table_mutex held; no race with add/remove */
int (*open)(struct mtd_blktrans_dev *dev);
int (*release)(struct mtd_blktrans_dev *dev);
@@ -68,6 +72,7 @@ extern int register_mtd_blktrans(struct 
 extern int deregister_mtd_blktrans(struct mtd_blktrans_ops *tr);
 extern int add_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
 extern int del_mtd_blktrans_dev(struct mtd_blktrans_dev *dev);
+extern struct gendisk *get_mtd_blktrans_gendisk(struct mtd_blktrans_dev *dev);
 
 
 #endif /* __MTD_TRANS_H__ */


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 6/9] mtd: Add a 'block unused' ioctl call to provide hints to block drivers

2007-03-02 Thread Richard Purdie

Knowing that the data in a given block is now unused is a useful
feature that some block drivers can take advantage of, especially 
when dealing with devices like flash.

This adds an ioctl which allows such hints to be passed to the
block driver. It shouldn't provide false positives but doesn't
have to provide the signal in all cases - it's intended as a hint.

Support for the ioctl is added to the swap subsystem when the swap
header indicates such notification would be useful through an added
flags bitfield. The swaponflash driver takes advantage of this
functionality.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

---
 block/ioctl.c|4 
 include/linux/fs.h   |1 +
 include/linux/swap.h |5 -
 mm/swapfile.c|7 ++-
 4 files changed, 15 insertions(+), 2 deletions(-)

Index: linux/include/linux/fs.h
===
--- linux.orig/include/linux/fs.h   2007-02-28 18:16:52.0 +
+++ linux/include/linux/fs.h2007-03-02 14:46:34.0 +
@@ -214,6 +214,7 @@ extern int dir_notify_enable;
 #define BLKTRACESTART _IO(0x12,116)
 #define BLKTRACESTOP _IO(0x12,117)
 #define BLKTRACETEARDOWN _IO(0x12,118)
+#define BLKSWAPMARKUNUSED _IOW(0x12, 1197, int)
 
 #define BMAP_IOCTL 1   /* obsolete - kept for compatibility */
 #define FIBMAP_IO(0x00,1)  /* bmap access */
Index: linux/include/linux/swap.h
===
--- linux.orig/include/linux/swap.h 2007-03-02 14:35:02.0 +
+++ linux/include/linux/swap.h  2007-03-02 14:46:34.0 +
@@ -65,7 +65,9 @@ union swap_header {
__u32   nr_badpages;
unsigned char   sws_uuid[16];
unsigned char   sws_volume[16];
-   __u32   padding[117];
+   __u32   flags;
+#define SWAPFLAG_UNUSED_IOCTL (1  0)
+   __u32   padding[113];
__u32   badpages[1];
} info;
 };
@@ -119,6 +121,7 @@ enum {
SWP_USED= (1  0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1  1), /* ok to write to this swap?*/
SWP_ACTIVE  = (SWP_USED | SWP_WRITEOK),
+   SWP_UNUSED_IOCTL = (1  2),
/* add others here before... */
SWP_SCANNING= (1  8), /* refcount in scan_swap_map */
 };
Index: linux/mm/swapfile.c
===
--- linux.orig/mm/swapfile.c2007-03-02 14:35:04.0 +
+++ linux/mm/swapfile.c 2007-03-02 14:46:34.0 +
@@ -284,6 +284,8 @@ static int swap_entry_free(struct swap_i
swap_list.next = p - swap_info;
nr_swap_pages++;
p-inuse_pages--;
+   if (p-flags  SWP_UNUSED_IOCTL)
+   blkdev_ioctl(p-swap_file-f_mapping-host, 
p-swap_file, BLKSWAPMARKUNUSED, offset);
}
}
return count;
@@ -1649,6 +1651,9 @@ asmlinkage long sys_swapon(const char __
goto bad_swap;
}
 
+   if (swap_header-info.flags  SWAPFLAG_UNUSED_IOCTL)
+   p-flags |= SWP_UNUSED_IOCTL;
+
error = 0;
memset(p-swap_map, 0, maxpages * sizeof(short));
for (i = 0; i  swap_header-info.nr_badpages; i++) {
@@ -1684,7 +1689,7 @@ asmlinkage long sys_swapon(const char __
 
mutex_lock(swapon_mutex);
spin_lock(swap_lock);
-   p-flags = SWP_ACTIVE;
+   p-flags |= SWP_ACTIVE;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
 
Index: linux/block/ioctl.c
===
--- linux.orig/block/ioctl.c2007-02-28 18:16:52.0 +
+++ linux/block/ioctl.c 2007-03-02 14:46:34.0 +
@@ -274,6 +274,10 @@ int blkdev_ioctl(struct inode *inode, st
return -EFAULT;
return 0;
}
+   case BLKSWAPMARKUNUSED:
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+   return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
}
 
lock_kernel();


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 9/9] mtd: Add swaponflash block driver

2007-03-02 Thread Richard Purdie

Add a driver for allowing an mtd device to be used as a swap block device.

Signed-off-by: Richard Purdie [EMAIL PROTECTED]

 drivers/mtd/Kconfig   |7 
 drivers/mtd/Makefile  |1 
 drivers/mtd/mtdswap.c | 1187 ++
 3 files changed, 1195 insertions(+)

Index: linux/drivers/mtd/mtdswap.c
===
--- /dev/null   1970-01-01 00:00:00.0 +
+++ linux/drivers/mtd/mtdswap.c 2007-02-28 18:12:48.0 +
@@ -0,0 +1,1187 @@
+/*
+ * Swap block device support for MTDs
+ * Turns an MTD device into a swap device with block wear leveling
+ *
+ * Copyright (C) 2007 Nokia Corporation. All rights reserved.
+ *
+ * Author: Richard Purdie [EMAIL PROTECTED]
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ *
+ * Features:
+ *  - The mtd partitions to turn into swap devices are listed in the module
+ *parameters in the form 'partitions=1,3,5'.
+ *  - A dummy swap header is added to the start of the device so mkswap isn't
+ *needed.
+ *  - Since block erase counts are kept approximately equal, there is no need
+ *to keep track of erase counts over driver restarts.
+ *  - Two threads are used, one for the bio queue, the other to handle garbage
+ *collection such as erase block leveling, block erasing and
+ *defragmentation. The bio thread can need to wait on the gc thread for
+ *free blocks.
+ *  - For wear leveling there are two possible approaches, an in memory buffer
+ *or keeping at least one erase block reserved. The latter approach is
+ *taken. The driver becomes faster but less space efficient if given more
+ *reserved blocks.
+ *  - The driver tracks erase blocks by placing them in one of five trees,
+ *clean, used, low_frag, high_frag and dirty. A block will move between the
+ *trees roughly in that order as it's used. For convenience, each tree is
+ *sorted by erase count.
+ */
+
+#include linux/fs.h
+#include linux/init.h
+#include linux/kernel.h
+#include linux/module.h
+#include linux/sched.h
+#include linux/slab.h
+#include linux/types.h
+#include linux/vmalloc.h
+#include linux/swap.h
+#include linux/rbtree.h
+#include linux/mm.h
+#include linux/genhd.h
+#include linux/mutex.h
+#include linux/kthread.h
+#include linux/crc32.h
+#include linux/mtd/mtd.h
+#include linux/mtd/blktrans.h
+
+
+#define MAX_PAGES_PER_EB 64
+/*
+ * The maximum difference in erase counts before we move blocks
+ */
+#define MAX_ERASECOUNT_DIFFERENCE 15
+/*
+ * How many blocks to reserve for garbage collection
+ * Must be >= 1
+ */
+#define NUMBER_SPARE_BLOCKS 1
+/*
+ * When changes are made to the number of active blocks in erase blocks,
+ * how long to wait before triggering garbage collection (in msec).
+ */
+#define GC_THREAD_DELAY 5000
+/*
+ * How many clean blocks should be available above which
+ * the garbage collection stops processing blocks.
+ */
+#define CLEAN_BLOCK_THRESHOLD 20
+/*
+ * Double check data from mtd with an extra crc?
+ */
+#define USE_CRC 1
+
+#undef DEBUG
+#ifdef DEBUG
+#define TRACE(fmt,a...) printk(KERN_DEBUG fmt, ##a)
+#else
+#define TRACE(fmt,a...)
+#endif
+
+struct swp_blk {
+   int mapno;
+#ifdef USE_CRC
+   unsigned long crc;
+#endif
+};
+
+struct swp_eblk {
+   struct rb_node rb;
+   struct rb_root *root;
+
+   int erase_count;
+   int bad_count;
+
+   DECLARE_BITMAP(active, MAX_PAGES_PER_EB);
+};
+
+#define EBLKADDR_TO_NUM(swpdev, addr) (addr - swpdev-eblk_data[0])
+#define ERASE_COUNT_MIN(rbroot) (rb_entry(rb_first(rbroot), struct swp_eblk, 
rb)-erase_count)
+#define ERASE_COUNT_MAX(rbroot) (rb_entry(rb_last(rbroot), struct swp_eblk, 
rb)-erase_count)
+
+struct mtdswp_dev {
+   struct mtd_blktrans_dev mbd_dev;
+   struct mtd_info *mtd;
+   int fsdata_pos;
+
+   int pages;
+   struct swp_blk *blk_data;
+   int eblks;
+   struct swp_eblk *eblk_data;
+   int pages_per_blk;
+   int max_erase_count;
+
+/* spinlock protects the fields in this block */
+   spinlock_t trees_lock;
+   struct rb_root clean;
+   struct rb_root used;
+   struct rb_root low_frag;
+   struct rb_root high_frag;
+   struct rb_root dirty;
+   int clean_count;
+   int used_count;
+   int low_frag_count;
+   int high_frag_count;
+   int dirty_count;

Re: [PATCH 2/5] lumpy: isolate_lru_pages wants to specifically take active or inactive pages

2007-03-02 Thread Andy Whitcroft

Christoph Lameter wrote:
 On Tue, 27 Feb 2007, Andy Whitcroft wrote:
 
 The caller of isolate_lru_pages specifically knows whether it wants
 to take either inactive or active pages.  Currently we take the
 state of the LRU page at hand and use that to scan for matching
 pages in the order sized block.  If that page is transiting we
 can scan for the wrong type.  The caller knows what they want and
 should be telling us.  Pass in the required active/inactive state
 and match against that.
 
 The page cannot be transiting since we hold the lru lock?

As you say it should be gated by lru_lock and we should not expect to
see pages with the wrong type on the list.  I would swear that I was
seeing pages on the wrong list there for a bit in testing and mistakenly
thought they were in transition.  A quick review at least says that's
false.  So I'll reinstate the BUG() and retest to see if I am smoking
crack or there is a bigger bug out there.

-apw
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1

2007-03-02 Thread Michal Piotrowski

Hi,

Andrew Morton napisał(a):
 Temporarily at
 
   http://userweb.kernel.org/~akpm/2.6.21-rc2-mm1/
 

Possible fix for

nvidiafb-bring-back-generic-ddc-reading.patch

drivers/built-in.o: In function `nvidia_probe_i2c_connector':
/mnt/md0/devel/linux-mm/drivers/video/nvidia/nv_i2c.c:166: undefined reference 
to `fb_ddc_read'
make[1]: *** [.tmp_vmlinux1] Error 1
make: *** [_all] Error 2

Regards,
Michal

-- 
Michal K. K. Piotrowski
LTG - Linux Testers Group (PL)
(http://www.stardust.webpages.pl/ltg/)
LTG - Linux Testers Group (EN)
(http://www.stardust.webpages.pl/linux_testers_group_en/)

Signed-off-by: Michal Piotrowski [EMAIL PROTECTED]

--- linux-work/drivers/video/Makefile   2007-03-02 16:38:17.0 +0100
+++ linux-mm/drivers/video/Makefile 2007-03-02 16:49:23.0 +0100
@@ -33,6 +33,7 @@ obj-$(CONFIG_FB_PM3)+= pm3fb.o
 obj-$(CONFIG_FB_MATROX)  += matrox/
 obj-$(CONFIG_FB_RIVA)+= riva/ vgastate.o
 obj-$(CONFIG_FB_NVIDIA)  += nvidia/
+obj-$(CONFIG_FB_NVIDIA_I2C)  += fb_ddc.o
 obj-$(CONFIG_FB_ATY) += aty/ macmodes.o
 obj-$(CONFIG_FB_ATY128)  += aty/ macmodes.o
 obj-$(CONFIG_FB_RADEON)  += aty/
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Is the clockevent resolution fine-grained enough?

2007-03-02 Thread Thomas Gleixner

On Thu, 2007-03-01 at 18:34 -0800, Marko Rauhamaa wrote:
 It would appear the new clockevent API has a one-nanosecond resolution.
 It certainly looks sufficiently fine-grained, but I'm afraid it's too
 coarse for some applications.

That's an academic exercise, or are you talking about some real world
hardware which runs Linux ?

 In our application, we need periodic clock interrupts at about 100 kHz.

With a stock kernel ?

 If the (programmable) frequency must be rounded to the nearest
 nanosecond, we have a cumulative error of
 
100,000 * 0.5 ns/s = 50 µs/s
 
 We need to maintain the cumulative error within, say, 1 ms/day, or
 11 ns/s. (The error is not measured against real time, but between
 different parts of our hardware that are run off of the same clock.)

clockevents is based on the monotonic system clock and depends on the
accuracy of that and the device which delivers the interrupts.

 For our needs, we have built our own clockevent system that has a
 nominal one-femtosecond precision. The nanosecond resolution would be
 sufficient if there was a way to nudge the next interrupt by a
 nanosecond from the interrupt handler.

There is nothing to nudge. The clockevent subsystem operates on absolute
time, so there is no cumulative error, unless you set up your timers
relative per event.

tglx


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [-mm patch] cpu_idle: fix build break

2007-03-02 Thread Venkatesh Pallipadi

On Fri, Mar 02, 2007 at 02:40:07PM +, Frederik Deweerdt wrote:
 On Fri, Mar 02, 2007 at 03:00:26AM -0800, Andrew Morton wrote:
  +git-acpi-fix-cpuidle-borkage.patch
 This attached patch might be needed too, the build breaks if
 !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE
  
   CC  drivers/cpuidle/cpuidle.o
 drivers/cpuidle/cpuidle.c: In function 'cpuidle_init':
 drivers/cpuidle/cpuidle.c:272: erreur: 'cpuidle_cpu_notifier' undeclared 
 (first use in this function)
 drivers/cpuidle/cpuidle.c:272: erreur: (Each undeclared identifier is 
 reported only once
 drivers/cpuidle/cpuidle.c:272: erreur: for each function it appears in.)
 make[2]: *** [drivers/cpuidle/cpuidle.o] Erreur 1
 make[1]: *** [drivers/cpuidle] Erreur 2
 make: *** [drivers] Erreur 2


Thanks for catching this breakage. Patch below should be the proper fix.

Thanks,
Venki

Signed-off-by: Venkatesh Pallipadi [EMAIL PROTECTED]
 
Index: linux-2.6.21-rc-mm/drivers/cpuidle/cpuidle.c
===
--- linux-2.6.21-rc-mm.orig/drivers/cpuidle/cpuidle.c
+++ linux-2.6.21-rc-mm/drivers/cpuidle/cpuidle.c
@@ -189,10 +189,6 @@ static struct sysdev_driver cpuidle_sysd
.remove = cpuidle_remove_device,
 };
 
-#ifdef CONFIG_SMP
-
-#ifdef CONFIG_HOTPLUG_CPU
-
 static int cpuidle_cpu_callback(struct notifier_block *nfb,
unsigned long action, void *hcpu)
 {
@@ -224,7 +220,7 @@ static struct notifier_block __cpuinitda
 .notifier_call = cpuidle_cpu_callback,
 };
 
-#endif /* CONFIG_HOTPLUG_CPU */
+#ifdef CONFIG_SMP
 
 static void smp_callback(void *v)
 {
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1 - fb_ddc_read() not defined

2007-03-02 Thread Valdis . Kletnieks

On Fri, 02 Mar 2007 03:00:26 PST, Andrew Morton said:

   
 ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc2/2.6.21-rc2-mm1/

 nvidiafb-bring-back-generic-ddc-reading.patch

Building with FB_DDC=N results in:

Kernel: arch/x86_64/boot/bzImage is ready  (#1)
  Building modules, stage 2.
  MODPOST 229 modules
WARNING: fb_ddc_read [drivers/video/nvidia/nvidiafb.ko] undefined!
make[1]: *** [__modpost] Error 1

makes-it-build patch (not sure if it should be select FB_DDC if MUMBLE):

--- linux-2.6.21-rc2-mm1/drivers/video/Kconfig.nvidia   2007-03-02 
09:27:48.0 -0500
+++ linux-2.6.21-rc2-mm1/drivers/video/Kconfig  2007-03-02 10:56:54.0 
-0500
@@ -710,6 +710,7 @@ config FB_NVIDIA
select I2C_ALGOBIT if FB_NVIDIA_I2C
select I2C if FB_NVIDIA_I2C
select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT
+   select FB_DDC
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
select FB_CFB_COPYAREA




pgp8bqQUCBhQR.pgp
Description: PGP signature

Re: [RFC][PATCH 0/3] VM throttling: avoid blocking occasional writers

2007-03-02 Thread Brice Figureau

Hi,

On Fri, 2007-03-02 at 13:06 +, Leroy van Logchem wrote:
  I'm sorry to piggy-back this thread.
  
  Could it be what I'm experiencing in the following bugzilla report:
  http://bugzilla.kernel.org/show_bug.cgi?id=7372
  
  As I explained in the report, I see this issue only since 2.6.18.
  So if your concern is related to mine, what could have changed between
  2.6.17 and 2.6.18 related to this?
 
 I don't think it's 2.6.x related, it's been under the sheets from start.

Maybe. Still the issue has been aggravated between 2.6.17 and 2.6.18.
Right now (running 2.6.17.13) I can backup and do whatever I want, even
with high memory pressure because of mysql (which is consuming right
now about 3.8GB of 4GB).

Under 2.6.18 and later it is simply impossible to perform that (except
with the dd directIO trick).
This makes me think that something else has joined the party on
2.6.18...

 Related to your problem in the 7372 bug:
 
 Pages are kept in memory for re-use, which is fast and fine except for:
 1) data without re-use value or even single use
 2) applications _do not_ advise the kernel how to cache pages related
to their self-generated i/o. POSIX does provide mechanisms to
properly do so. But the kernel should help these poor apps.

 To minimize your MySQL backup cp(1) problem, try this workaround:
 
 cat ./cp_direct.sh 
 #!/bin/sh
 dd if=$1 of=$2 bs=1M iflag=direct oflag=direct

Yes, I already did this, and it helped, see:
http://bugzilla.kernel.org/show_bug.cgi?id=7372#c16

 Combine this with [/etc/sysctl.conf]:
 
 vm.vfs_cache_pressure = 1
 vm.dirty_ratio = 2
 vm.dirty_background_ratio = 1
 
 This should reduce both the stress on the vm and response latency during
 interactive work.

I'll test the vfs_cache_pressure (I was already using those dirty_*
settings) whenever I'll reboot the server to 2.6.20.

Please CC: me on list replies as I'm not subscribed to the list.

Thanks for the tips.
Regards
-- 
Brice Figureau [EMAIL PROTECTED]

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] Fixes and cleanups for earlyprintk aka boot console.

2007-03-02 Thread Jeremy Fitzhardinge

Gerd Hoffmann wrote:
 Hmm, I think this is just a chunk being lost due to the clash with the
 older version of the patch submitted as part of the xen series.
   

But I thought the old version I had posted was well and truly dropped. 
Isn't this problematic patch the one you posted?

 Last patch queue mailed by Jeremy doesn't contain this one any more, so
 the conflict should be gone now.
   
Yeah, this isn't absolutely required to make Xen work, so I didn't want
to confuse things by reposting it.

J
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2.6.21-rc1] Extend print_symbol capability

2007-03-02 Thread Paulo Marques


Robert Peterson wrote:

[...]
#define KSYM_NAME_LEN 127
+#define KSYM_SYMBOL_LEN (sizeof(%s+%#lx/%#lx [%s]) + KSYM_NAME_LEN + \
+2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1)

#ifdef CONFIG_KALLSYMS
/* Lookup the address for a symbol. Returns 0 if not found. */
@@ -22,6 +24,9 @@ const char *kallsyms_lookup(unsigned long addr,
   unsigned long *offset,
   char **modname, char *namebuf);

+/* Look up a kernel symbol and return it in a text buffer. */
+extern void lookup_symbol(unsigned long addr, char *buffer);


I don't like this name much :(

We already have kallsyms_lookup and kallsyms_lookup_name. The name of 
this function should imply that it will print the formatted result into 
the buffer, not just lookup a symbol.


Maybe __sprint_symbol, and change the interface to 
__sprint_symbol(char *buffer, unsigned long addr)?



+
/* Replace %s in format with address, if found */
extern void __print_symbol(const char *fmt, unsigned long address);

@@ -47,6 +52,11 @@ static inline const char *kallsyms_lookup(unsigned 
long addr,

   return NULL;
}

+static inline void lookup_symbol(unsigned long addr, char *buffer)
+{
+   return NULL;
+}


Returning NULL in a function returning void doesn't seem right :P

Maybe it should be something like this instead:
{
*buffer = '\0';
}


[...]


Anyway, the change looks useful, so thanks for the patch :)

--
Paulo Marques - www.grupopie.com

Very funny Scotty. Now beam up my clothes.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: PATCH 2.6.21-rc1 aoe: handle zero _count pages in bios

2007-03-02 Thread Sam Hopkins

 Well, given that bi_end_io() is called after the io has completed, I'm
 assuming that networking has completely finished with the memory by the
 time bi_end_io() gets called.
 
 I guess one can envisage situations where that might not happen, but they'd
 be terribly buggy ones, surely.

This is actually quite common when using broadcom chipsets that take a
long time to clean out the tx ring.  We send a command skb out to
write some data, get the response some tens of ms later and the
command skb (with the pages) still sits in the tx ring.  I've gone to
some lengths to limit the skb memory used in aoe to help with the
OOM/swap issue and this has given me headaches.

Sam

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] longhaul pci_find_device - pci_get_device conversion (was: Re: 2.6.21-rc2-mm1)

2007-03-02 Thread Michal Piotrowski

Hi Dave,

Andrew Morton napisał(a):
 Temporarily at
 
   http://userweb.kernel.org/~akpm/2.6.21-rc2-mm1/
 

  CC [M]  arch/i386/kernel/cpu/cpufreq/longhaul.o
arch/i386/kernel/cpu/cpufreq/longhaul.c: In function 'enable_arbiter_disable':
arch/i386/kernel/cpu/cpufreq/longhaul.c:598: warning: 'pci_find_device' is 
deprecated (declared at include/linux/pci.h:485)
arch/i386/kernel/cpu/cpufreq/longhaul.c:602: warning: 'pci_find_device' is 
deprecated (declared at include/linux/pci.h:485)
arch/i386/kernel/cpu/cpufreq/longhaul.c:605: warning: 'pci_find_device' is 
deprecated (declared at include/linux/pci.h:485)
arch/i386/kernel/cpu/cpufreq/longhaul.c: In function 'longhaul_setup_vt8235':
arch/i386/kernel/cpu/cpufreq/longhaul.c:632: warning: 'pci_find_device' is 
deprecated (declared at include/linux/pci.h:485)

Regards,
Michal

-- 
Michal K. K. Piotrowski
LTG - Linux Testers Group (PL)
(http://www.stardust.webpages.pl/ltg/)
LTG - Linux Testers Group (EN)
(http://www.stardust.webpages.pl/linux_testers_group_en/)

Signed-off-by: Michal Piotrowski [EMAIL PROTECTED]

--- linux-mm/arch/i386/kernel/cpu/cpufreq/longhaul.c2007-03-02 
15:15:51.0 +0100
+++ linux-work/arch/i386/kernel/cpu/cpufreq/longhaul.c  2007-03-02 
17:12:46.0 +0100
@@ -595,14 +595,14 @@ static int enable_arbiter_disable(void)

/* Find PLE133 host bridge */
reg = 0x78;
-   dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, 
NULL);
+   dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, NULL);
/* Find CLE266 host bridge */
if (dev == NULL) {
reg = 0x76;
-   dev = pci_find_device(PCI_VENDOR_ID_VIA, 
PCI_DEVICE_ID_VIA_862X_0, NULL);
+   dev = pci_get_device(PCI_VENDOR_ID_VIA, 
PCI_DEVICE_ID_VIA_862X_0, NULL);
/* Find CN400 V-Link host bridge */
if (dev == NULL)
-   dev = pci_find_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
+   dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);

}
if (dev != NULL) {
@@ -629,7 +629,7 @@ static int longhaul_setup_vt8235(void)
u8 pci_cmd;

/* Find VT8235 southbridge */
-   dev = pci_find_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
+   dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
if (dev != NULL) {
/* Set transition time to max */
pci_read_config_byte(dev, 0xec, pci_cmd);

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mark Gross

On Thu, Mar 01, 2007 at 09:11:58PM -0800, Linus Torvalds wrote:
 
 On Thu, 1 Mar 2007, Andrew Morton wrote:
 
  On Thu, 1 Mar 2007 19:44:27 -0800 (PST) Linus Torvalds [EMAIL PROTECTED] 
  wrote:
  
   In other words, I really don't see a huge upside. I see *lots* of 
   downsides, but upsides? Not so much. Almost everybody who wants unplug 
   wants virtualization, and right now none of the big virtualization 
   people would want to have kernel-level anti-fragmentation anyway since 
   they'd need to do it on their own.
  
  Agree with all that, but you're missing the other application: power
  saving.  FBDIMMs take eight watts a pop.
 
 This is a hardware problem. Let's see how long it takes for Intel to 
 realize that FBDIMM's were a hugely bad idea from a power perspective.
 
 Yes, the same issues exist for other DRAM forms too, but to a *much* 
 smaller degree.

DDR3-1333 may be better than FBDIMM's but don't count on it being much
better.

 
 Also, IN PRACTICE you're never ever going to see this anyway. Almost 
 everybody wants bank interleaving, because it's a huge performance win on 
 many loads. That, in turn, means that your memory will be spread out over 
 multiple DIMM's even for a single page, much less any bigger area.

4-way interleave across banks on systems may not be as common as you may
think for future chip sets.  2-way interleave across DIMMs within a bank
will stay.

Also the performance gains between 2 and 4 way interleave have been
shown to be hard to measure.  It may be counterintuitive but it's not
the huge performance win you may expect.  At least some of the test
cases I've seen reported showed it to be under the noise floor of the
lmbench test cases.  


 
 In other words - forget about DRAM power savings. It's not realistic. And 
 if you want low-power, don't use FBDIMM's. It really *is* that simple.


DDR3-1333 won't be much better.  

 (And yes, maybe FBDIMM controllers in a few years won't use 8 W per 
 buffer. I kind of doubt that, since FBDIMM fairly fundamentally is highish 
 voltage swings at high frequencies.)
 
 Also, on a *truly* idle system, we'll see the power savings whatever we 
 do, because the working set will fit in D$, and to get those DRAM power 
 savings in reality you need to have the DRAM controller shut down on its 
 own anyway (ie sw would only help a bit).
 
 The whole DRAM power story is a bedtime story for gullible children. Don't 
 fall for it. It's not realistic. The hardware support for it DOES NOT 
 EXIST today, and probably won't for several years. And the real fix is 
 elsewhere anyway (ie people will have to do a FBDIMM-2 interface, which 
 is against the whole point of FBDIMM in the first place, but that's what 
 you get when you ignore power in the first version!).


Hardware support for some of this is coming this year in the ATCA space
on the MPCBL0050.  The feature is a bit experimental, and
power/performance benefits will be workload and configuration
dependent.  It's not a bedtime story.

--mgross
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [-mm patch] cpu_idle: fix build break

2007-03-02 Thread Frederik Deweerdt

On Fri, Mar 02, 2007 at 07:24:28AM -0800, Venkatesh Pallipadi wrote:
 On Fri, Mar 02, 2007 at 02:40:07PM +, Frederik Deweerdt wrote:
  On Fri, Mar 02, 2007 at 03:00:26AM -0800, Andrew Morton wrote:
   +git-acpi-fix-cpuidle-borkage.patch
  This attached patch might be needed too, the build breaks if
  !CONFIG_HOTPLUG_CPU and CONFIG_CPU_IDLE
   
CC  drivers/cpuidle/cpuidle.o
  drivers/cpuidle/cpuidle.c: In function 'cpuidle_init':
  drivers/cpuidle/cpuidle.c:272: erreur: 'cpuidle_cpu_notifier' undeclared 
  (first use in this function)
  drivers/cpuidle/cpuidle.c:272: erreur: (Each undeclared identifier is 
  reported only once
  drivers/cpuidle/cpuidle.c:272: erreur: for each function it appears in.)
  make[2]: *** [drivers/cpuidle/cpuidle.o] Erreur 1
  make[1]: *** [drivers/cpuidle] Erreur 2
  make: *** [drivers] Erreur 2
 
 
 Thanks for catching this breakage. Patch below should be the proper fix.
Yep, works for me.

Regards,
Frederik
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: PREEMPT_RCU breaks anon_vma locking ?

2007-03-02 Thread Hugh Dickins

On Sat, 24 Feb 2007, Paul E. McKenney wrote:
 On Sat, Feb 24, 2007 at 10:04:04PM +, Hugh Dickins wrote:
 
  Have you checked through the SLAB_DESTROY_BY_RCU end in slab.c?
  Is what that's doing still valid?
 
 The only thing I see needed due to PREEMPT_RCU is the following comment
 change.
 
 For a terrified few minutes, I thought that the code assumed that struct
 rcu_head was the same size as struct list_head, but it turns out to only
 assume that struct slab is at least as large as struct slab_rcu.
 
   Thanx, Paul

Thanks for enduring the terror, checking it out, and arriving at
such a reassuring conclusion.  Andrew, please add this to your -mm
collection after (or folded into) Paul's rcu-preemptible-rcu.patch.


PREEMPT_RCU has stricter needs: updated comment on SLAB_DESTROY_BY_RCU.

Signed-off-by: Paul E. McKenney [EMAIL PROTECTED]
Acked-by: Hugh Dickins [EMAIL PROTECTED]
---

diff -urpNa -X dontdiff linux-2.6.20/mm/slab.c linux-2.6.20-slabrcufix/mm/slab.c
--- linux-2.6.20/mm/slab.c  2007-02-04 10:44:54.0 -0800
+++ linux-2.6.20-slabrcufix/mm/slab.c   2007-02-24 14:50:39.0 -0800
@@ -238,7 +238,7 @@ struct slab {
  * other kind of object (which our subsystem's lock might corrupt).
  *
  * rcu_read_lock before reading the address, then rcu_read_unlock after
- * taking the spinlock within the structure expected at that address.
+ * releasing the spinlock within the structure expected at that address.
  *
  * We assume struct slab_rcu can overlay struct slab when destroying.
  */
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Kernel Oops with shm namespace cleanups

2007-03-02 Thread Adam Litke

On Thu, 2007-03-01 at 16:08 -0800, Bill Irwin wrote:
 On Wed, Feb 28, 2007 at 02:13:29PM -0600, Adam Litke wrote:
  Hey.  While testing 2.6.21-rc2 with libhugetlbfs, the shm-fork test case
  causes the kernel to oops.  To reproduce:  Execute 'make check' in the
  latest libhugetlbfs source on a 2.6.21-rc2 kernel with 100 huge pages
  allocated.  Using fewer huge pages will likely also trigger the oops.
  Libhugetlbfs can be downloaded from:
  http://libhugetlbfs.ozlabs.org/snapshots/libhugetlbfs-dev-20070228.tar.gz
 
 Looks like I should grab these testcases for the sake of due diligence
 (not to say I intend to alter maintenance style from primarily review,
 approval, and bugfixing, not that I've been doing as much of any of those
 as I should). To which architectures and/or distributions have the
 userspace bits been ported, or otherwise run/tested on? A quick sniff
 test on an Altix suggests SLES and/or ia64 may trip up the scripts:

Right now we support x86, powerpc, and x86_64.  Segment remapping and
hugetlb malloc won't work on ia64 until long format vhpt is supported (I
suspect).  But the test framework should be adaptable to other
architectures.

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1 - fb_ddc_read() not defined

2007-03-02 Thread James Simmons


 On Fri, 02 Mar 2007 03:00:26 PST, Andrew Morton said:
 

  ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc2/2.6.21-rc2-mm1/
 
  nvidiafb-bring-back-generic-ddc-reading.patch
 
 Building with FB_DDC=N results in:
 
 Kernel: arch/x86_64/boot/bzImage is ready  (#1)
   Building modules, stage 2.
   MODPOST 229 modules
 WARNING: fb_ddc_read [drivers/video/nvidia/nvidiafb.ko] undefined!
 make[1]: *** [__modpost] Error 1
 
 makes-it-build patch (not sure if it should be select FB_DDC if MUMBLE):
 
 --- linux-2.6.21-rc2-mm1/drivers/video/Kconfig.nvidia 2007-03-02 
 09:27:48.0 -0500
 +++ linux-2.6.21-rc2-mm1/drivers/video/Kconfig2007-03-02 
 10:56:54.0 -0500
 @@ -710,6 +710,7 @@ config FB_NVIDIA
   select I2C_ALGOBIT if FB_NVIDIA_I2C
   select I2C if FB_NVIDIA_I2C
   select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT
 + select FB_DDC
   select FB_MODE_HELPERS
   select FB_CFB_FILLRECT
   select FB_CFB_COPYAREA

I have a patch that cleans things up. Give it a try.

diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
index b8f0a11..855a09e 100644
--- a/drivers/video/Kconfig
+++ b/drivers/video/Kconfig
@@ -677,8 +678,6 @@ config FB_S1D13XXX
 config FB_NVIDIA
tristate nVidia Framebuffer Support
depends on FB  PCI
-   select I2C_ALGOBIT if FB_NVIDIA_I2C
-   select I2C if FB_NVIDIA_I2C
select FB_BACKLIGHT if FB_NVIDIA_BACKLIGHT
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
@@ -697,6 +696,7 @@ config FB_NVIDIA
 config FB_NVIDIA_I2C
bool Enable DDC Support
depends on FB_NVIDIA
+   select FB_DDC
help
  This enables I2C support for nVidia Chipsets.  This is used
  only for getting EDID information from the attached display
@@ -716,7 +716,6 @@ config FB_NVIDIA_BACKLIGHT
 config FB_RIVA
tristate nVidia Riva support
depends on FB  PCI
-   select FB_DDC if FB_RIVA_I2C
select FB_BACKLIGHT if FB_RIVA_BACKLIGHT
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
@@ -734,6 +733,7 @@ config FB_RIVA
 config FB_RIVA_I2C
bool Enable DDC Support
depends on FB_RIVA
+   select FB_DDC
help
  This enables I2C support for nVidia Chipsets.  This is used
  only for getting EDID information from the attached display
@@ -812,8 +812,6 @@ config FB_INTEL
depends on FB  EXPERIMENTAL  PCI  X86
select AGP
select AGP_INTEL
-   select I2C_ALGOBIT if FB_INTEL_I2C
-   select I2C if FB_INTEL_I2C
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
select FB_CFB_COPYAREA
@@ -846,6 +844,7 @@ config FB_INTEL_DEBUG
 config FB_INTEL_I2C
bool DDC/I2C for Intel framebuffer support
depends on FB_INTEL
+   select FB_DDC
default y
help
  Say Y here if you want DDC/I2C support for your on-board Intel 
graphics.
@@ -924,8 +923,8 @@ config FB_MATROX_G
 
 config FB_MATROX_I2C
tristate Matrox I2C support
-   depends on FB_MATROX  I2C
-   select I2C_ALGOBIT
+   depends on FB_MATROX
+   select FB_DDC
---help---
  This drivers creates I2C buses which are needed for accessing the
  DDC (I2C) bus present on all Matroxes, an I2C bus which
@@ -993,7 +992,6 @@ config FB_MATROX_MULTIHEAD
 config FB_RADEON
tristate ATI Radeon display support
depends on FB  PCI
-   select FB_DDC if FB_RADEON_I2C
select FB_BACKLIGHT if FB_RADEON_BACKLIGHT
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
@@ -1018,6 +1016,7 @@ config FB_RADEON
 config FB_RADEON_I2C
bool DDC/I2C for ATI Radeon support
depends on FB_RADEON
+   select FB_DDC
default y
help
  Say Y here if you want DDC/I2C support for your Radeon board. 
@@ -1125,7 +1124,6 @@ config FB_S3
 config FB_SAVAGE
tristate S3 Savage support
depends on FB  PCI  EXPERIMENTAL
-   select FB_DDC if FB_SAVAGE_I2C
select FB_MODE_HELPERS
select FB_CFB_FILLRECT
select FB_CFB_COPYAREA
@@ -1142,6 +1140,7 @@ config FB_SAVAGE
 config FB_SAVAGE_I2C
bool Enable DDC2 Support
depends on FB_SAVAGE
+   select FB_DDC
help
  This enables I2C support for S3 Savage Chipsets.  This is used
  only for getting EDID information from the attached display
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Joel Schopp


Exhibiting a workload where the list patch breaks down and the zone
patch rescues it might help if it's felt that the combination isn't as
good as lists in isolation. I'm sure one can be dredged up somewhere.


I can't think of a workload that totally makes a mess out of list-based. 
However, list-based makes no guarantees on availability. If a system 
administrator knows they need between 10,000 and 100,000 huge pages and 
doesn't want to waste memory pinning too many huge pages at boot-time, 
the zone-based mechanism would be what he wanted.


From our testing with earlier versions of list based for memory hot-unplug on 
pSeries machines we were able to hot-unplug huge amounts of memory after running the 
nastiest workloads we could find for over a week.  Without the patches we were unable 
to hot-unplug anything within minutes of running the same workloads.


If something works for 99.999% of people (list based) and there is an easy way to 
configure it for the other 0.001% of the people (zone based) I call that a great 
solution.  I really don't understand what the resistance is to these patches.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman


On (01/03/07 16:09), Andrew Morton didst pronounce:
 On Thu, 1 Mar 2007 10:12:50 +
 [EMAIL PROTECTED] (Mel Gorman) wrote:
 
  Any opinion on merging these patches into -mm
  for wider testing?
 
 I'm a little reluctant to make changes to -mm's core mm unless those
 changes are reasonably certain to be on track for mainline, so let's talk
 about that.
 

Sounds reasonable.

 What worries me is memory hot-unplug and per-container RSS limits.  We
 don't know how we're going to do either of these yet, and it could well be
 that the anti-frag work significantly complexicates whatever we end up
 doing there.
 

Ok. I am going to assume as well that all these issues are not mutually
exclusive. To start with, anti-fragmentation is now really two things -
anti-fragmentation and memory partitioning

o Memory partitioning creates an additional zone with hard limits on usage
o Anti-fragmentation groups free pages based on mobility in MAX_ORDER blocks

They both help different things in different ways so it is important not
to conflate them as being the same thing. I would like them both in because
they complement each other nicely from a hugepages perspective.

 For prioritisation purposes I'd judge that memory hot-unplug is of similar
 value to the antifrag work (because memory hot-unplug permits DIMM
 poweroff).
 
 And I'd judge that per-container RSS limits are of considerably more value
 than antifrag (in fact per-container RSS might be a superset of antifrag,
 in the sense that per-container RSS and containers could be abused to fix
 the i-cant-get-any-hugepages problem, dunno).
 

It would be serious abuse and would be too easy to trigger OOM-like conditions
because of the constraints containers must work under to be useful. I'll
come back to this briefly later.

 So some urgent questions are: how are we going to do mem hotunplug and
 per-container RSS?
 

The zone-based patches for memory partitioning should be providing what is
required for memory hot-remove of an entire DIMM or bank of memory (PPC64
also cares about removing smaller blocks of memory but zones are overkill
there and anti-fragmentation on its own is good enough).  Pages hot-added
to ZONE_MOVABLE will always be reclaimable or migratable in the case of
mlock(). Kamezawa Hiroyuki has indicated that his hot-remove patches also
do something like ZONE_MOVABLE. I would hope that his patches could be
easily based on top of my memory partitioning set of patches. The markup
of pages has been tested and the zone definitely works. I've added the
[EMAIL PROTECTED] to the cc list so he can comment :)

What I do not do in my patchset is hot-add to ZONE_MOVABLE because I couldn't
be certain it's what the hotplug people wanted. They will of course need to
hot-add to that zone if they want to be able to remove it later.

For node-based memory hot-add and hot-remove, the node would consist of just
one populated zone - ZONE_MOVABLE.

For the removal of DIMMs, anti-fragmentation has something additional
to offer. The later patches in the anti-fragmentation patchset bias the
placement of unmovable pages towards the lower PFNs. It's not very strict
about this because being strict would cost. A mechanism could be put in place
that enforced the placement of unmovable pages at low PFNs. Due to the cost,
it would need to be disabled by default and enabled on request. On the plus
side, the cost would only be incurred when splitting a MAX_ORDER block of
pages which is a rare event.

One of the reasons why anti-frag doesn't negatively impact kernbench figures
in the majority of cases is because it's actually rare it kicks in to do
anything. Once pages are on the appropriate free lists, everything progresses
as normal.

 Our basic unit of memory management is the zone.  Right now, a zone maps
 onto some hardware-imposed thing.  But the zone-based MM works *well*.  I
 suspect that a good way to solve both per-container RSS and mem hotunplug
 is to split the zone concept away from its hardware limitations: create a
 software zone and a hardware zone.

Ok, let's explore this notion a bit. I am thinking about this both in
terms of making RSS limits work properly and seeing if it collides with
anti-fragmentation or not.

Let's assume that a hardware zone is a management structure of pages that
have some addressing limitation. It might be 16MB for ISA devices, 4GB for
32 bit devices etc.

A software zone is a collection of pages belonging to a subsystem or
a process.  Containers are an example of a software zone.

That gives us a set of structures like

Node
 |
  
 /\
hardwarehardware
zone  zone
  |   \--
  |\ \
   main container container
   software zone   software zone  software zone

i.e. Each node has one hardware zone per zone today like ZONE_DMA, NORMAL
etc Each hardware zone consists of at least one software zone and an additional
software zone per

Re: 2.6.21-rc2-mm1

2007-03-02 Thread Badari Pulavarty

On Fri, 2007-03-02 at 03:00 -0800, Andrew Morton wrote:
 Temporarily at
 
   http://userweb.kernel.org/~akpm/2.6.21-rc2-mm1/
 
 Will appear later at
 
   
 ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc2/2.6.21-rc2-mm1/
 
 
 - Quite a lot of less-popular architectures still aren't compiling due
   to utrace.  x86, x86_64, powerpc, ia64 and s390 should be OK.


arch/x86_64/mm/numa.c: In function ‘numa_initmem_init’:
arch/x86_64/mm/numa.c:530: error: ‘cmdline’ undeclared (first use in
this function)
arch/x86_64/mm/numa.c:530: error: (Each undeclared identifier is
reported only once
arch/x86_64/mm/numa.c:530: error: for each function it appears in.)
make[1]: *** [arch/x86_64/mm/numa.o] Error 1
make[1]: *** Waiting for unfinished jobs

Thanks,
Badari

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

panic on 2.6.20

2007-03-02 Thread Marco Berizzi

Hi. Sorry for posting to this list,
but I got this panic with linux
2.6.20
I have also changed the motherboard
of this server and memtest has not
found any error (ram tested for 10
hours)
May anyone tell me if this could
be a hardware problem?

TIA

Linux version 2.6.20 ([EMAIL PROTECTED]) (gcc version 3.4.6) #1 SMP Mon Feb 5
09:12:13 CET 2007
BIOS-provided physical RAM map:
sanitize start
sanitize end
copy_e820_map() start:  size: 0009ac00 end:
0009ac00 type: 1
copy_e820_map() type is E820_RAM
copy_e820_map() start: 0009ac00 size: 5400 end:
000a type: 2
copy_e820_map() start: 000ce000 size: 2000 end:
000d type: 2
copy_e820_map() start: 000e size: 0002 end:
0010 type: 2
copy_e820_map() start: 0010 size: 3fdf end:
3fef type: 1
copy_e820_map() type is E820_RAM
copy_e820_map() start: 3fef size: b000 end:
3fefb000 type: 3
copy_e820_map() start: 3fefb000 size: 5000 end:
3ff0 type: 4
copy_e820_map() start: 3ff0 size: 0008 end:
3ff8 type: 1
copy_e820_map() type is E820_RAM
copy_e820_map() start: 3ff8 size: 0008 end:
4000 type: 2
copy_e820_map() start: e000 size: 1000 end:
f000 type: 2
copy_e820_map() start: fec0 size: 00100400 end:
fed00400 type: 2
copy_e820_map() start: fee0 size: 0010 end:
fef0 type: 2
copy_e820_map() start: ffb0 size: 0010 end:
ffc0 type: 2
copy_e820_map() start: fff0 size: 0010 end:
0001 type: 2
 BIOS-e820:  - 0009ac00 (usable)
 BIOS-e820: 0009ac00 - 000a (reserved)
 BIOS-e820: 000ce000 - 000d (reserved)
 BIOS-e820: 000e - 0010 (reserved)
 BIOS-e820: 0010 - 3fef (usable)
 BIOS-e820: 3fef - 3fefb000 (ACPI data)
 BIOS-e820: 3fefb000 - 3ff0 (ACPI NVS)
 BIOS-e820: 3ff0 - 3ff8 (usable)
 BIOS-e820: 3ff8 - 4000 (reserved)
 BIOS-e820: e000 - f000 (reserved)
 BIOS-e820: fec0 - fed00400 (reserved)
 BIOS-e820: fee0 - fef0 (reserved)
 BIOS-e820: ffb0 - ffc0 (reserved)
 BIOS-e820: fff0 - 0001 (reserved)
127MB HIGHMEM available.
896MB LOWMEM available.
found SMP MP-table at 000f6c10
Entering add_active_range(0, 0, 262016) 0 entries of 256 used
Zone PFN ranges:
  DMA 0 - 4096
  Normal   4096 -   229376
  HighMem229376 -   262016
early_node_map[1] active PFN ranges
0:0 -   262016
On node 0 totalpages: 262016
  DMA zone: 32 pages used for memmap
  DMA zone: 0 pages reserved
  DMA zone: 4064 pages, LIFO batch:0
  Normal zone: 1760 pages used for memmap
  Normal zone: 223520 pages, LIFO batch:31
  HighMem zone: 255 pages used for memmap
  HighMem zone: 32385 pages, LIFO batch:7
DMI present.
ACPI: RSDP (v002 PTLTD ) @ 0x000f6ba0
ACPI: XSDT (v001 PTLTDXSDT   0x06040001  LTP 0x) @
0x3fef5381
ACPI: FADT (v003 FSC 0x06040001  0x000f4240) @
0x3fef5441
ACPI: SPCR (v001 PTLTD  $UCRTBL$ 0x06040001 PTL  0x0001) @
0x3fefaeb0
ACPI: MCFG (v001 PTLTDMCFG   0x06040001  LTP 0x) @
0x3fefaf00
ACPI: MADT (v001 PTLTDAPIC   0x06040001  LTP 0x) @
0x3fefaf40
ACPI: BOOT (v001 PTLTD  $SBFTBL$ 0x06040001  LTP 0x0001) @
0x3fefafd8
ACPI: DSDT (v001 FSCD16490x06040001 MSFT 0x0202) @
0x
ACPI: PM-Timer IO Port: 0xf008
ACPI: Local APIC address 0xfee0
ACPI: LAPIC (acpi_id[0x00] lapic_id[0x00] enabled)
Processor #0 15:4 APIC version 20
ACPI: LAPIC (acpi_id[0x01] lapic_id[0x01] enabled)
Processor #1 15:4 APIC version 20
ACPI: LAPIC_NMI (acpi_id[0x00] high edge lint[0x1])
ACPI: LAPIC_NMI (acpi_id[0x01] high edge lint[0x1])
ACPI: IOAPIC (id[0x02] address[0xfec0] gsi_base[0])
IOAPIC[0]: apic_id 2, version 32, address 0xfec0, GSI 0-23
ACPI: IOAPIC (id[0x03] address[0xfec8] gsi_base[24])
IOAPIC[1]: apic_id 3, version 32, address 0xfec8, GSI 24-47
ACPI: IOAPIC (id[0x04] address[0xfec80800] gsi_base[48])
IOAPIC[2]: apic_id 4, version 32, address 0xfec80800, GSI 48-71
ACPI: IOAPIC (id[0x05] address[0xfec84000] gsi_base[72])
IOAPIC[3]: apic_id 5, version 32, address 0xfec84000, GSI 72-95
ACPI: IOAPIC (id[0x06] address[0xfec84800] gsi_base[96])
IOAPIC[4]: apic_id 6, version 32, address 0xfec84800, GSI 96-119
ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 2 high edge)
ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 high level)
ACPI: IRQ0 used by override.
ACPI: IRQ2 used by override.
ACPI: IRQ9 used by override.

Re: [PATCH (update 3)] timer: Run calc_load halfway through each round_jiffies second

2007-03-02 Thread Eric Dumazet

On Friday 02 March 2007 16:15, Simon Arlott wrote:
 Whenever jiffies is started at a multiple of 5*HZ or wraps, calc_load is
 run exactly on the second which is when tasks using round_jiffies will
 be scheduled to run. This has a bad effect on the load average, making
 it tend towards 1.00 if a task happens to run every time the load is
 being calculated.

 This changes calc_load so that it updates load half a second after any
 tasks scheduled using round_jiffies.


Simon

I believe this patch is too complex/hazardous and may break exp decay 
computation.
(Even if nobody cares about avenrun[] these days :), do you ? )

You could just change LOAD_FREQ from (5*HZ) to (5*HZ+1)


#define LOAD_FREQ   (5*HZ+1)


Mathematical proof (well... sort of)

$ cat prog.c
/*
 * prog.c - compare the fixed-point exponential decay factors used for the
 * 1, 5 and 15 minute load averages when the sampling interval is 5.00 s
 * versus 5.01 s (one extra tick on top of 5*HZ when HZ is 100).
 */
#define FSHIFT  11  /* nr of bits of precision */
#define FIXED_1 ((double)(1<<FSHIFT))   /* 1.0 as fixed-point */

#include <math.h>
#include <stdio.h>

/*
 * Print FIXED_1 / exp(interval / period) for each averaging period, first
 * with the current 5.00 s interval and then with the proposed 5.01 s one.
 * Always returns 0.
 */
int main(void)
{
	/* Factors for the existing 5.00 s sampling interval. */
	printf("Old values :\n");
	printf("#define EXP_1  %g\n", FIXED_1/exp(5.0/60.0));
	printf("#define EXP_5  %g\n", FIXED_1/exp(5.0/(5*60.0)));
	printf("#define EXP_15 %g\n", FIXED_1/exp(5.0/(15*60.0)));

	/* Same factors with the interval lengthened to 5.01 s. */
	printf("New values :\n");
	printf("%g\n", FIXED_1/exp(5.01/60.0));
	printf("%g\n", FIXED_1/exp(5.01/(5*60.0)));
	printf("%g\n", FIXED_1/exp(5.01/(15*60.0)));
	return 0;
}

# gcc -o prog prog.c -lm
# ./prog
Old values :
#define EXP_1  1884.25
#define EXP_5  2014.15
#define EXP_15 2036.65
New values :
1883.94
2014.08
2036.63

You can see that 5.01 instead of 5.00 second gives the same EXP_xx values.

So (5*HZ + 1) is safe. (because HZ = 100)

Eric
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] Fixes and cleanups for earlyprintk aka boot console.

2007-03-02 Thread Gerd Hoffmann

Jeremy Fitzhardinge wrote:
 Gerd Hoffmann wrote:
 Hmm, I think this is just a chunk being lost due to the clash with the
 older version of the patch submitted as part of the xen series.
 
 But I thought the old version I had posted was well and truly dropped. 
 Isn't this problematic patch the one you posted?

I think what happened is (the patch attached to the removed from -mm
mail looks like this):

  - patch-v2 merged into -mm
  - patch-v1 merged into -mm as part of xen series
  - merge conflict, the bits which are in patch-v1 got removed
from patch-v2 to solve that, both patches in -mm
  - patch-v1 got removed with other xen bits due to the conflict
with zachs patches.
  - mm kernel has an incomplete patch-v2 now => boom.

So it isn't a problem with the original patch, it just got corrupted
during conflict resolving.

cheers,
  Gerd

-- 
Gerd Hoffmann [EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 4/9] swap: Simplify shmem_unuse() usage [optional]

2007-03-02 Thread Hugh Dickins

On Fri, 2 Mar 2007, Richard Purdie wrote:

 Simplify shmem_unuse_inode() removing a confusing optimisation which
 requires the caller to call swap_duplicate if the shmem_unuse() call
 doesn't succeed.
 
 Based on a patch by Nick Piggin and some of my own changes as discussed
 on LKML. 
 
 Signed-off-by: Richard Purdie [EMAIL PROTECTED]

Definite NAK to this one from me: I'm sorry the optimization confuses
you, but it's well commented at both ends, and speeds up shmem swapoff
very significantly e.g. minutes down to seconds.  There may well be a
less confusing way of achieving the same effect, with another return
code from shmem_unuse, and some gotos, but I'm not all that keen.

Your other patches, well, as ever I hope I'll get to look at them,
but there are so many people, all much quicker than me, playing in
mm these days...

Hugh

 
 ---
  mm/shmem.c|   12 +---
  mm/swapfile.c |   23 ++-
  2 files changed, 7 insertions(+), 28 deletions(-)
 
 Index: linux/mm/shmem.c
 ===
 --- linux.orig/mm/shmem.c 2007-02-28 18:12:34.0 +
 +++ linux/mm/shmem.c  2007-02-28 18:12:46.0 +
 @@ -734,7 +734,7 @@ static int shmem_unuse_inode(struct shme
   struct page **dir;
   struct page *subdir;
   swp_entry_t *ptr;
 - int offset;
 + int offset, moved;
  
   idx = 0;
   ptr = info-i_direct;
 @@ -792,17 +792,15 @@ lost2:
  found:
   idx += offset;
   inode = info-vfs_inode;
 - if (move_from_swap_cache(page, idx, inode-i_mapping) == 0) {
 + moved = (move_from_swap_cache(page, idx, inode-i_mapping) == 0);
 + if (moved) {
   info-flags |= SHMEM_PAGEIN;
   shmem_swp_set(info, ptr + offset, 0);
   }
   shmem_swp_unmap(ptr);
   spin_unlock(info-lock);
 - /*
 -  * Decrement swap count even when the entry is left behind:
 -  * try_to_unuse will skip over mms, then reincrement count.
 -  */
 - swap_free(entry, page);
 + if (moved)
 + swap_free(entry, page);
   return 1;
  }
  
 Index: linux/mm/swapfile.c
 ===
 --- linux.orig/mm/swapfile.c  2007-02-28 18:12:41.0 +
 +++ linux/mm/swapfile.c   2007-02-28 18:13:04.0 +
 @@ -689,15 +689,6 @@ void try_to_unuse_page_entry(struct page
   if (!shmem_unuse(entry, page)) {
   try_to_unuse_anon(entry, page);
   delete_from_swap_cache(page);
 - } else if (PageSwapCache(page)) {
 - /*
 -  * shmem_unuse deleted a swappage from the swap cache, but the
 -  * move to filepage failed so it left swappage in cache and
 -  * lowered its swap count to pass quickly through the loops in
 -  * try_to_unuse(). We must reincrement the count to try again
 -  * later (ick).
 -  */
 - swap_duplicate(entry);
   }
  }
  
 @@ -922,12 +913,6 @@ static int try_to_unuse(unsigned int typ
* read from disk into another page.  Splitting into two
* pages would be incorrect if swap supported shared
* private pages, but they are handled by tmpfs files.
 -  *
 -  * Note shmem_unuse already deleted a swappage from
 -  * the swap cache, unless the move to filepage failed:
 -  * in which case it left swappage in cache, lowered its
 -  * swap count to pass quickly through the loops above,
 -  * and now we must reincrement count to try again later.
*/
   if ((*swap_map  1)  PageDirty(page)  PageSwapCache(page)) {
   struct writeback_control wbc = {
 @@ -938,12 +923,8 @@ static int try_to_unuse(unsigned int typ
   lock_page(page);
   wait_on_page_writeback(page);
   }
 - if (PageSwapCache(page)) {
 - if (shmem)
 - swap_duplicate(entry);
 - else
 - delete_from_swap_cache(page);
 - }
 + if (PageSwapCache(page)  !shmem)
 + delete_from_swap_cache(page);
  
   /*
* So we could skip searching mms once swap count went
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1 - fb_ddc_read() not defined

2007-03-02 Thread Valdis . Kletnieks

On Fri, 02 Mar 2007 16:31:02 GMT, James Simmons said:

   nvidiafb-bring-back-generic-ddc-reading.patch

 To have a patch to cleans things up. Give it a try
 
 diff --git a/drivers/video/Kconfig b/drivers/video/Kconfig
 index b8f0a11..855a09e 100644
 --- a/drivers/video/Kconfig
 +++ b/drivers/video/Kconfig

Queued up for the next time I build a -mm kernel.  This build of 21-rc2-mm1
has been up for 33 minutes already, so it may be later tonight (when I try
to get my Intel 3945 wireless card working with the new 80211 stack).


pgpotHLlyUCfe.pgp
Description: PGP signature

Re: [PATCH 4/9] swap: Simplify shmem_unuse() usage [optional]

2007-03-02 Thread Richard Purdie

On Fri, 2007-03-02 at 16:44 +, Hugh Dickins wrote:
 Definite NAK to this one from me: I'm sorry the optimization confuses
 you, but it's well commented at both ends, and speeds up shmem swapoff
 very significantly e.g. minutes down to seconds.  There may well be a
 less confusing way of achieving the same effect, with another return
 code from shmem_unuse, and some gotos, but I'm not all that keen.

Currently there is only one site it's used in but with the changes, you
end up with two. My concern is that the behaviour of that function is
not obvious to anyone new to the code and I suspect something will get
broken at some point due to that, even if comments are there.

I'd have no problem with a different return code and some gotos and/or
improved logic. The changes these patches make might even make that
easier to implement. I'll take another look at it and see if I can find
a nicer patch.

 Your other patches, well, as ever I hope I'll get to look at them,
 but there are so many people, all much quicker than me, playing in
 mm these days...

I'm open to offers... :)

Cheers,

Richard


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Loop device - Tracking page writes made to a loop device through mmap

2007-03-02 Thread Randy Dunlap

On Fri, 2 Mar 2007 22:13:46 +1100 (EST) Kandan Venkataraman wrote:

 I am resending the message. The first few lines  in the diff of the
 original message seemed to have an extra space added by the time it got to
 the mailing list. Hopefully this does not happen the second time around.
 
 Also, I missed out on putting a tab space on one line.
 
 
 I have incorporated all the changes you mentioned, except for one. Thank you 
 very much for taking the time to 
 review the code. I still retrieve def_blk_fops as I did before, but I have 
 put this in a separate function for now.

OK, that's a good compromise, at least for now.


 I have included a test program. Before you run the test program, please 
 create the backing storage file
 for the loop device as follows
 
 dd if=/dev/zero of=/root/file bs=4K count=10
 
 Set bs to be whatever pagesize is in your machine. In my machine it was 4K.

Thanks.

[snipped]

 Now I will explain what kind of software would find the new ioctls useful.

That's a good explanation, thanks.  And your original patch description
was also thorough (IMO).

 diff -uprN linux-2.6.19.2/drivers/block/loop.c 
 linux-2.6.19.2-new/drivers/block/loop.c
 --- linux-2.6.19.2/drivers/block/loop.c   2007-03-02 22:05:06.0 
 +1100
 +++ linux-2.6.19.2-new/drivers/block/loop.c   2007-03-02 22:03:49.0 
 +1100
 @@ -74,12 +74,16 @@
   #include linux/highmem.h
   #include linux/gfp.h
   #include linux/kthread.h
 +#include linux/mm.h

That looks fishy.  Have you tried to apply the patch from this email?
I'm not finding patch (program) happy with the patch file.
(too much leading whitespace on non-patched lines)

 
   #include asm/uaccess.h
 
   static int max_loop = 8;
   static struct loop_device *loop_dev;
   static struct gendisk **disks;
 +static kmem_cache_t *pgoff_elem_cache;
 +static char* cache_name = loop_pgoff_elem_cache;

static char *cache_name

 +static struct file_operations loop_fops;
 
   /*
* Transfer functions
 +static int loop_get_pgwrites(struct loop_device *lo, struct loop_pgoff_array 
 __user *arg)
 +{
 + struct file *filp = lo-lo_backing_file;
 + struct loop_pgoff_array array;
 + loff_t i = 0;
 + struct rb_node *rb_node  = rb_first(lo-pgoff_tree);
 +
 + if (lo-lo_state != Lo_bound)
 + return -ENXIO;
 +
 + if (filp == NULL || !lo-lo_track_pgwrite)
 + return -EINVAL;
 +
 + if (copy_from_user(array, arg, sizeof (struct loop_pgoff_array)))
 + return -EFAULT;
 +
 + while (i  array.max  rb_node != NULL) {
 +
 + if (put_user(rb_entry(rb_node, struct pgoff_elem, 
 node)-offset, array.pgoff + i))

Still need to break (split) several long lines.

 + return -EFAULT;
 +
 + ++i;
 + rb_node = rb_next(rb_node);
 + }
 + array.num = i;
 +
 + if (copy_to_user(arg, array, sizeof(array)))
 + return -EFAULT;
 +
 + return 0;
 +}
 
   /*
* loop_change_fd switched the backing store of a loopback device to
 @@ -1322,10 +1414,67 @@ static long lo_compat_ioctl(struct file
   }
   #endif
 
 +static int loop_file_mmap(struct file * file, struct vm_area_struct * vma)
 +{
 + /* This is used for a general mmap of a disk file */
 + int err = generic_file_mmap(file, vma);
 +
 + if (err)
 + return err;

indentation

 +
 + vma-vm_ops = loop_file_vm_ops;
 + return 0;
 +}
 +


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/4] coredump: ELF-FDPIC: enable to omit anonymous shared memory

2007-03-02 Thread Hugh Dickins

On Fri, 16 Feb 2007, David Howells wrote:
 Robin Holt [EMAIL PROTECTED] wrote:
 
  How about:
  if (vma-vm_mm-coredump_omit_anon_shared) {
  
  Then the calls to maydump() would be unchanged:
 
 VMAs are a shared resource under NOMMU conditions.

That's a disturbing remark.  Under precisely what NOMMU conditions?

I had thought Robin's suggestion very sensible; and throughout mm/
it has seemed pretty random whether we pass an mm argument down
in addition to vma, or just take vma-vm_mm at whatever level needs.

You seem to be suggesting vma-vm_mm is dangerous when CONFIG_NOMMU,
but we MMU people are scarily unaware of that.  Perhaps you need to
put #ifndef CONFIG_NOMMU around vm_mm in struct vm_area_struct?

Or am I totally misunderstanding?

Hugh
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] timer/hrtimer: take per cpu locks in sane order

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 15:23:08 +0100 Heiko Carstens [EMAIL PROTECTED] wrote:

 On Fri, Mar 02, 2007 at 02:04:33PM +0100, Ingo Molnar wrote:
  
  * Heiko Carstens [EMAIL PROTECTED] wrote:
  
   - spin_lock(new_base-lock);
   - spin_lock(old_base-lock);
   + /*
   +  * If we take a lock from a different cpu, make sure we have always
   +  * the same locking order. That is the lock that belongs to the cpu
   +  * with the lowest number is taken first.
   +  */
   + lock1 = smp_processor_id()  cpu ? new_base-lock : old_base-lock;
   + lock2 = smp_processor_id()  cpu ? old_base-lock : new_base-lock;
   + spin_lock(lock1);
   + spin_lock(lock2);
  
  looks good to me. Wouldnt this be cleaner via double_lock_timer() - 
  similar to how double_rq_lock() works in kernel/sched.c - instead of 
  open-coding it?
 
 Something like the stuff below? Exploits the knowledge that the two
 tvec_base_t's are in a per_cpu array. Otherwise I would end up passing
 a lot of redundant stuff. But still I think that isn't a good solution
 but rather a hack...?
 I'd go for the patch above.

Yeah, it'd be nicer to pass in the CPU number(s), use that to make the
ordering decision.  Perhaps (smp_processor_id() - cpu).

 ---
 Index: linux-2.6/kernel/timer.c
 ===
 --- linux-2.6.orig/kernel/timer.c
 +++ linux-2.6/kernel/timer.c
 @@ -1640,6 +1640,28 @@ static void migrate_timer_list(tvec_base
   }
  }
  
 +static void __devinit double_tvec_lock(tvec_base_t *base1, tvec_base_t 
 *base2)
 +{
 + if (base1  base2) {
 + spin_lock(base1-lock);
 + spin_lock(base2-lock);
 + } else {
 + spin_lock(base2-lock);
 + spin_lock(base1-lock);
 + }
 +}
 +
 +static void __devinit double_tvec_unlock(tvec_base_t *base1, tvec_base_t 
 *base2)
 +{
 + if (base1  base2) {
 + spin_unlock(base1-lock);
 + spin_unlock(base2-lock);
 + } else {
 + spin_unlock(base2-lock);
 + spin_unlock(base1-lock);
 + }
 +}

And to undo the locks in the reverse order from that in which they were
taken.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman

On (01/03/07 16:44), Linus Torvalds didst pronounce:
 
 
 On Thu, 1 Mar 2007, Andrew Morton wrote:
  
  So some urgent questions are: how are we going to do mem hotunplug and
  per-container RSS?
 
 Also: how are we going to do this in virtualized environments? Usually the 
 people who care about memory hotunplug are exactly the same people who 
 also care (or claim to care, or _will_ care) about virtualization.
 

I sent a mail out with a fairly detailed treatment of how RSS could be done.
Essentially, I feel that containers should simply limit the number of
pages used by the container, and not try and do anything magic with a
poorly defined concept like RSS. It would do this by creating a
software zone and taking pages from a hardware zone at creation
time. It has a similar effect to RSS limits except it's better defined.

In that setup, a virtualized environment would create its own software
zone. It would hand that over to the guest OS and the guest OS could do
whatever it liked. It would be responsible for its own reclaim and so on
and not have to worry about other containers (or virtualized environments
for that matter) or kswapd interfering with it.

 My personal opinion is that while I'm not a huge fan of virtualization, 
 these kinds of things really _can_ be handled more cleanly at that layer, 
 and not in the kernel at all. Afaik, it's what IBM already does, and has 
 been doing for a while. There's no shame in looking at what already works, 
 especially if it's simpler.
 
   Linus

-- 
-- 
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 02 Mar 2007 10:29:58 -0500 Rik van Riel [EMAIL PROTECTED] wrote:

 Andrew Morton wrote:
 
  And I'd judge that per-container RSS limits are of considerably more value
  than antifrag (in fact per-container RSS might be a superset of antifrag,
  in the sense that per-container RSS and containers could be abused to fix
  the i-cant-get-any-hugepages problem, dunno).
 
 The RSS bits really worry me, since it looks like they could
 exacerbate the scalability problems that we are already running
 into on very large memory systems.

Using a zone-per-container or N-64MB-zones-per-container should actually
move us in the direction of *fixing* any such problems.  Because, to a
first-order, the scanning of such a zone has the same behaviour as a 64MB
machine.

(We'd run into a few other problems, some related to the globalness of the
dirty-memory management, but that's fixable).

 Linux is *not* happy on 256GB systems.  Even on some 32GB systems
 the swappiness setting *needs* to be tweaked before Linux will even
 run in a reasonable way.

Please send testcases.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman

On (02/03/07 15:15), Paul Mundt didst pronounce:
 On Fri, Mar 02, 2007 at 02:50:29PM +0900, KAMEZAWA Hiroyuki wrote:
  On Thu, 1 Mar 2007 21:11:58 -0800 (PST)
  Linus Torvalds [EMAIL PROTECTED] wrote:
  
   The whole DRAM power story is a bedtime story for gullible children. 
   Don't 
   fall for it. It's not realistic. The hardware support for it DOES NOT 
   EXIST today, and probably won't for several years. And the real fix is 
   elsewhere anyway (ie people will have to do a FBDIMM-2 interface, which 
   is against the whole point of FBDIMM in the first place, but that's what 
   you get when you ignore power in the first version!).
   
  
  Note:
  I heard embedded people often design their own memory-power-off control on
  embedded Linux. (but it never seems to be posted to the list.) But I don't 
  know
  they are interested in generic memory hotremove or not.
  
 Yes, this is not that uncommon of a thing. People tend to do this in a
 couple of different ways, in some cases the system is too loaded to ever
 make doing such a thing at run-time worthwhile, and in those cases these
 sorts of things tend to be munged in with the suspend code. Unfortunately
 it tends to be quite difficult in practice to keep pages in one place,
 so people rely on lame chip-select hacks and limiting the amount of
 memory that the kernel treats as RAM instead so it never ends up being an
 issue. Having some sort of a balance would certainly be nice, though.

If the range of memory you want to offline is MAX_ORDER_NR_PAGES,
anti-fragmentation should group pages you can reclaim into chunks of that
size. It might reduce the number of hacks you have to perform to
limit where the kernel uses memory.

-- 
-- 
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch -mm] x86_64: fake numa cmdline flag fix

2007-03-02 Thread David Rientjes

Make sure we only reference 'cmdline' on CONFIG_NUMA_EMU.

Signed-off-by: David Rientjes [EMAIL PROTECTED]
---
 arch/x86_64/mm/numa.c |   16 +++-
 1 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -524,22 +524,28 @@ void __init numa_initmem_init(unsigned long start_pfn, 
unsigned long end_pfn)
 { 
unsigned long start_addr = start_pfn  PAGE_SHIFT;
unsigned long end_addr = end_pfn  PAGE_SHIFT;
+   int numa_fake = 0;
int i;
 
+#ifdef CONFIG_NUMA_EMU
+   /* Determine if we have a numa=fake command line */
+   if (cmdline != 0)
+   numa_fake = 1;
+#endif
+
 #ifdef CONFIG_ACPI_NUMA
-   if (!numa_off  !cmdline  !acpi_scan_nodes(start_addr, end_addr))
+   if (!numa_off  !numa_fake  !acpi_scan_nodes(start_addr, end_addr))
return;
 #endif
 
 #ifdef CONFIG_K8_NUMA
-   if (!numa_off  !k8_scan_nodes(start_addr, end_addr, cmdline != 0))
-   if (cmdline == 0)
+   if (!numa_off  !k8_scan_nodes(start_addr, end_addr, numa_fake))
+   if (!numa_fake)
return;
 #endif
 
 #ifdef CONFIG_NUMA_EMU
-   if (cmdline)
-   {
+   if (numa_fake) {
numa_emu = !numa_emulation(start_pfn, end_pfn);
if (numa_emu)
return;
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Joel Schopp


Linus Torvalds wrote:


On Thu, 1 Mar 2007, Andrew Morton wrote:

So some urgent questions are: how are we going to do mem hotunplug and
per-container RSS?


The people who were trying to do memory hot-unplug basically all stopped waiting for 
these patches, or something similar, to solve the fragmentation problem.  Our last 
working set of patches built on top of an earlier version of Mel's list based solution.




Also: how are we going to do this in virtualized environments? Usually the 
people who care about memory hotunplug are exactly the same people who 
also care (or claim to care, or _will_ care) about virtualization.


Yes, we are.  And we are very much in favor of these patches.  At last year's OLS 
developers from IBM, HP, Xen coauthored a paper titled Resizing Memory with Balloons 
and Hotplug.  http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf  Our 
conclusion was that ballooning is simply not good enough and we need memory 
hot-unplug.  Here is a quote from the article I find relevant to today's discussion:


Memory Hotplug remove is not in mainline.
Patches exist, released under the GPL, but are
only occasionally rebased. To be worthwhile
the existing patches would need either a remappable
kernel, which remains highly doubtful, or
a fragmentation avoidance strategy to keep migrateable
and non-migrateable pages clumped
together nicely.

At IBM all of our Power4, Power5, and future hardware supports a lot of 
virtualization features.  This hardware took Best Virtualization Solution at 
LinuxWorld Expo, so we aren't talking research projects here. 
http://www-03.ibm.com/press/us/en/pressrelease/20138.wss


My personal opinion is that while I'm not a huge fan of virtualization, 
these kinds of things really _can_ be handled more cleanly at that layer, 
and not in the kernel at all. Afaik, it's what IBM already does, and has 
been doing for a while. There's no shame in looking at what already works, 
especially if it's simpler.


I believe you are talking about the zSeries (aka mainframe) because the rest of IBM 
needs these patches.  zSeries built their whole processor instruction set, memory 
model, etc around their form of virtualization, and I doubt the rest of us are going 
to change our processor instruction set that drastically.  I've had a lot of talks 
with Martin Schwidefsky (the maintainer of Linux on zSeries) about how we could do 
more of what they do and the basic answer is we can't because what they do is so 
fundamentally incompatible.


While I appreciate that we should all dump our current hardware and buy mainframes it 
seems to me that an easier solution is to take a few patches from Mel and work with 
the hardware we already have.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Nick Piggin wrote:

  Oh just run a 32GB SMP system with sparsely freeable pages and lots of 
  allocs and frees and you will see it too. F.e try Linus tree and mlock 
  a large portion of the memory and then see the fun starting. See also 
  Rik's list of pathological cases on this.
 
 Ah, so your problem is lots of unreclaimable pages. There are heaps
 of things we can try to reduce the rate at which we scan those.

Well this is one possible symptom of the basic issue of having too many 
page structs. I wonder how long we can patch things up.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman

On (02/03/07 08:58), Andrew Morton didst pronounce:
 On Fri, 02 Mar 2007 10:29:58 -0500 Rik van Riel [EMAIL PROTECTED] wrote:
 
  Andrew Morton wrote:
  
   And I'd judge that per-container RSS limits are of considerably more value
   than antifrag (in fact per-container RSS might be a superset of antifrag,
   in the sense that per-container RSS and containers could be abused to fix
   the i-cant-get-any-hugepages problem, dunno).
  
  The RSS bits really worry me, since it looks like they could
  exacerbate the scalability problems that we are already running
  into on very large memory systems.
 
 Using a zone-per-container or N-64MB-zones-per-container should actually
 move us in the direction of *fixing* any such problems.  Because, to a
 first-order, the scanning of such a zone has the same behaviour as a 64MB
 machine.
 

Quite possibly. Taking software zones from the other large mail I sent,
one could get the 64MB effect by increasing MAX_ORDER_NR_PAGES to be 64MB
in pages. To avoid external fragmentation issues, I'd prefer of course
if these container zones consisted of mainly contiguous memory but with
anti-fragmentation, that would be possible.

 (We'd run into a few other problems, some related to the globalness of the
 dirty-memory management, but that's fixable).
 

It would be fixable, especially if containers do their own reclaim on their
container zones and not kswapd. Writing dirty data back periodically would
still need to be global in nature but that's no different to today.

  Linux is *not* happy on 256GB systems.  Even on some 32GB systems
  the swappiness setting *needs* to be tweaked before Linux will even
  run in a reasonable way.
 
 Please send testcases.

-- 
-- 
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1

2007-03-02 Thread Andrew Morton

On Fri, 02 Mar 2007 08:32:28 -0800 Badari Pulavarty [EMAIL PROTECTED] wrote:

 On Fri, 2007-03-02 at 03:00 -0800, Andrew Morton wrote:
  Temporarily at
  
http://userweb.kernel.org/~akpm/2.6.21-rc2-mm1/
  
  Will appear later at
  

  ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc2/2.6.21-rc2-mm1/
  
  
  - Quite a lot of less-popular architectures still aren't compiling due
to utrace.  x86, x86_64, powerpc, ia64 and s390 should be OK.
 
 
 arch/x86_64/mm/numa.c: In function ‘numa_initmem_init’:
 arch/x86_64/mm/numa.c:530: error: ‘cmdline’ undeclared (first use in
 this function)
 arch/x86_64/mm/numa.c:530: error: (Each undeclared identifier is
 reported only once
 arch/x86_64/mm/numa.c:530: error: for each function it appears in.)
 make[1]: *** [arch/x86_64/mm/numa.o] Error 1
 make[1]: *** Waiting for unfinished jobs
 

oop.  Can we have the .config please?
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 08:20:23 -0800 Mark Gross [EMAIL PROTECTED] wrote:

  The whole DRAM power story is a bedtime story for gullible children. Don't 
  fall for it. It's not realistic. The hardware support for it DOES NOT 
  EXIST today, and probably won't for several years. And the real fix is 
  elsewhere anyway (ie people will have to do a FBDIMM-2 interface, which 
  is against the whole point of FBDIMM in the first place, but that's what 
  you get when you ignore power in the first version!).
 
 
 Hardware support for some of this is coming this year in the ATCA space
 on the MPCBL0050.  The feature is a bit experimental, and
 power/performance benefits will be workload and configuration
 dependent.  Its not a bed time story.

What is the plan for software support?

Will it be possible to just power the DIMMs off?  I don't see much point in
some half-power non-destructive mode.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Davide Libenzi

On Fri, 2 Mar 2007, Evgeniy Polyakov wrote:

 On Thu, Mar 01, 2007 at 11:31:14AM -0800, Davide Libenzi 
 (davidel@xmailserver.org) wrote:
  On Thu, 1 Mar 2007, Evgeniy Polyakov wrote:
  
   Ingo, do you really think I will send mails with faked benchmarks? :))
  
  I don't think he ever implied that. He was only suggesting that when you 
  post benchmarks, and even more when you make claims based on benchmarks, 
  you need to be extra careful about what you measure. Otherwise the 
  external view that you give to others does not look good.
  Kevent can be really faster than epoll, but if you post broken benchmarks 
  (that can be, unreliable HTTP loaders, broken server implementations, 
  etc..) and make claims based on that, the only effect that you have is to 
  lose your point.
  
 So, I only talked that kevent is superior compared to epoll because (and
 it is _main_ issue) of its ability to handle essentially any kind of
 events with very small overhead (the same as epoll has in struct file -
 list and spinlock) and without significant price of struct file binding
 to event.

You'll have to excuse me if my memory is bad, but IIRC the whole discussion 
and loong benchmark feast was born with you throwing a benchmark at Ingo 
(with kevent showing a 1.9x performance boost WRT epoll), not with you 
making any other point.
As far as epoll not being able to handle other events. Said who? Of 
course, with zero modifications, you can handle zero additional events. 
With modifications, you can handle other events. But lets talk about those 
other events. The *only* kind of event that ppl (and being the epoll 
maintainer I tend to receive those requests) missed in epoll, was AIO 
events. That's the *only* thing that was missed by real life application 
developers. And if something like threadlets/syslets will prove effective, 
the gap is closed WRT that requirement.
Epoll already handles the whole class of pollable devices inside the 
kernel, and if you exclude block AIO, that's a pretty wide class already. 
The *existing* f_op-poll subsystem can be used to deliver events at the 
poll-head wakeup time (by using the key member of the poll callback), so 
that you don't even need the extra f_op-poll call to fetch events.
And if you really feel raw about the single O(nready) loop that epoll 
currently does, a new epoll_wait2 (or whatever) API could be used to 
deliver the event directly into a userspace buffer [1], directly from the 
poll callback, w/out extra delivery loops 
(IRQ/event-epoll_callback-event_buffer).


[1] From the epoll callback, we cannot sleep, so it's gonna be either an 
mlocked userspace buffer, or some kernel pages mapped to userspace.


- Davide


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] [PATCH] Kconfig: enlarge printk buffer size limit

2007-03-02 Thread Randy Dunlap

On Fri, 02 Mar 2007 15:27:48 +0200 Artem Bityutskiy wrote:

 From: Artem Bityutskiy [EMAIL PROTECTED]
 Subject: [RFC] [PATCH] Kconfig: enlarge printk buffer size limit

 This patch makes the upper kernel ring buffer size limit larger. It
 is often very handy to have huge ring-buffer for debugging purposes,
 when the subsystem which is being debugged produces large amount of
 useful output.

 Index: ubi-2.6.git/lib/Kconfig.debug
 ===
 --- ubi-2.6.git.orig/lib/Kconfig.debug
 +++ ubi-2.6.git/lib/Kconfig.debug
 @@ -79,7 +79,7 @@ config DEBUG_KERNEL

  config LOG_BUF_SHIFT
   int Kernel log buffer size (16 = 64KB, 17 = 128KB) if DEBUG_KERNEL
 - range 12 21
 + range 12 26
   default 17 if S390 || LOCKDEP
   default 16 if X86_NUMAQ || IA64
   default 15 if SMP

That's simple enough, but you could also just add
log_buf_len=huge_number
or
log_buf_len=500M
or
log_buf_len=1G
(for megabytes or gigabytes) to the kernel boot command line.

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

module builds need improvement / top Makefile not good enough

2007-03-02 Thread FN

Hello list,

I am unhappy with the direction the 2.6 kernel builds have taken.
Very much like Micro$loth DDKs we (linux users) are being forced to
build
modules by plugging into a framework that doesn't respect the fine
aspects
of dependency generation and analysis.

Two problems I've identified
1. module builds are forcing me to use a particular make program (gnu
make)
   Well, what if someone uses a different tool to express the DAG (dep.
   graph)?

2. gnu make is a somewhat dated program and can't do profound dependency
   generation and analysis like some newer tools. All it can do is
   produce
   .d from .c with the -MM option using an idiom like this
 -include f1.d f2.d
  %.d: %.c
 $(CC) -MM whatever

   But that's not good enough for 2 reasons.
   a) version rollback that causes timestamp rollback in time does NOT
  trigger regeneration of dependencies (e.g. clearcase based
  builds).
   b) dependencies on order of things can't be expressed in gnu make,
   for
  example -Iinc1 -Iinc2 causes different results from -Iinc2 -Iinc1
  if you have 2 different header files that have the same name in
  both
  directories. Same goes for ld -r -o mod.o f1.o f2.o vs 
  ld -r -o mod.o f2.o f1.o if order mattered (which it doesn't in
  this case).

   Bottom line - there exist free tools that are vastly superior to gnu
   make,
   one such example is omake, and I don't want you to force me to switch
   to
   inferior dependency analysis with gnu make.

My suggestion how to solve this problem is the following.
Instead of
gnumake -C /lib/modules/`uname -r`/build M=`pwd` modules
it's better to be able to do 
gnumake -C /lib/modules/`uname -r`/build M=`pwd` MYMAKE=mymake modules 
and then inside your gnu Makefile you'd call mymake like so

chdir $(M)
mymake MODFLAGS=whatever modflags INCFLAGS=whatever incflags modules
and pass on whatever flags are necessary.

You can set MYMAKE to gmake if unspecified thus MYMAKE ?= make

That would make the callback into the user's build environment clean and
unbind it from gnu make.

Any replies, critique -- cc me, as I am not on this list.
-- 
  FN
  [EMAIL PROTECTED]

-- 
http://www.fastmail.fm - And now for something completely different

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Linus Torvalds



On Fri, 2 Mar 2007, Mark Gross wrote:
  
  Yes, the same issues exist for other DRAM forms too, but to a *much* 
  smaller degree.
 
 DDR3-1333 may be better than FBDIMM's but don't count on it being much
 better.

Hey, fair enough. But it's not a problem (and it doesn't have a solution) 
today. I'm not sure it's going to have a solution tomorrow either.

  Also, IN PRACTICE you're never ever going to see this anyway. Almost 
  everybody wants bank interleaving, because it's a huge performance win on 
  many loads. That, in turn, means that your memory will be spread out over 
  multiple DIMM's even for a single page, much less any bigger area.
 
 4-way interleave across banks on systems may not be as common as you may
 think for future chip sets.  2-way interleave across DIMMs within a bank
 will stay.

.. and think about a realistic future.

EVERYBODY will do on-die memory controllers. Yes, Intel doesn't do it 
today, but in the one- to two-year timeframe even Intel will.

What does that mean? It means that in bigger systems, you will no longer 
even *have* 8 or 16 banks where turning off a few banks makes sense. 
You'll quite often have just a few DIMM's per die, because that's what you 
want for latency. Then you'll have CSI or HT or another interconnect.

And with a few DIMM's per die, you're back where even just 2-way 
interleaving basically means that in order to turn off your DIMM, you 
probably need to remove HALF the memory for that CPU.

In other words: TURNING OFF DIMM's IS A BEDTIME STORY FOR DIMWITTED 
CHILDREN.

There are maybe a couple machines IN EXISTENCE TODAY that can do it. But 
nobody actually does it in practice, and nobody even knows if it's going 
to be viable (yes, DRAM takes energy, but trying to keep memory free will 
likely waste power *too*, and I doubt anybody has any real idea of how 
much any of this would actually help in practice).

And I don't think that will change. See above. The future is *not* moving 
towards more and more DIMMS. Quite the reverse. On workstations, we are 
currently in the one or two DIMM's per die. Do you really think that 
will change? Hell no. And in big servers, pretty much everybody agrees 
that we will move towards that, rather than away from it.

So:
 - forget about turning DIMM's off. There is *no* actual data supporting 
   the notion that it's a good idea today, and I seriously doubt you can 
   really argue that it will be a good idea in five or ten years. It's a 
   hardware hack for a hardware problem, and the problems are way too 
   complex for us to solve in time for the solution to be relevant.

 - aim for NUMA memory allocation and turning off whole *nodes*. That's 
   much more likely to be productive in the longer timeframe. And yes, we 
   may well want to do memory compaction for that too, but I suspect that 
   the issues are going to be different (ie the way to do it is to simply 
   prefer certain nodes for certain allocations, and then try to keep the 
   jobs that you know can be idle on other nodes)

Do you actually have real data supporting the notion that turning DIMM's 
off will be reasonable and worthwhile? 

Linus
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Mel Gorman wrote:

 However, if that is objectionable, I'd at least like to see zone-based patches
 go into -mm on the expectation that the memory hot-remove patches will be
 able to use the infrastructure. It's not ideal for hugepages and it is not my
 first preference, but it's a step in the right direction. Is this reasonable?

I still think that the list based approach is sufficient for memory 
hotplug if one restricts  the location of the unmovable MAX_ORDER chunks 
to not overlap the memory area where we would like to be able to remove 
memory. In very pressing memory situations where we have too much 
unmovable memory we could dynamically disable  memory hotplug. There 
would be no need for this partitioning and additional zones.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC] hwbkpt: Hardware breakpoints (was Kwatch)

2007-03-02 Thread Alan Stern

Roland and Prasanna:

Here's my first attempt, lightly tested, at an hwbkpt implementation.  It
includes copious comments, so it shouldn't be too hard to figure out (if
you read the files in the right order).  The patch below is meant for
2.6.21-rc2; porting it to -mm shouldn't be very hard.

There are still several loose ends and unanswered questions.

I pretty much copied the existing code for handling vm86 mode
and single-step exceptions, without fully understanding it.

The code doesn't virtualize the BS (single-step) flag in DR6
for userspace.  It could be added, but I wonder whether it is
really needed.

Unlike the existing code, DR7 is re-enabled upon returning from
a debug interrupt.  That means it doesn't have to be enabled
when delivering a SIGTRAP.

Setting user breakpoints on I/O ports should require permissions
checking.  I haven't tried to figure out how that works or
how to implement it yet.

It seems likely that some of the new routines should be marked
__kprobes, but I don't know which, or even what that annotation
is supposed to mean.

When CPUs go on- or off-line, their debug registers need to be
initialized or cleared.  I did a little bit of that, but more is
needed.  In particular, CPU hotplugging and kexec have to take
this into account.

The parts relating to kernel breakpoints could be made conditional
on a Kconfig option.  The amount of code space saved would be
relatively small; I'm not sure that it would be worthwhile.

Probably there are some more issues I haven't thought of.  Anyway, let me 
know what you think.

Alan Stern



Index: 2.6.21-rc2/include/asm-i386/hwbkpt.h
===
--- /dev/null
+++ 2.6.21-rc2/include/asm-i386/hwbkpt.h
@@ -0,0 +1,185 @@
+#ifndef _I386_HWBKPT_H
+#define _I386_HWBKPT_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+/**
+ * struct hwbkpt - unified kernel/user-space hardware breakpoint
+ * @node: internal linked-list management
+ * @triggered: callback invoked when the breakpoint is hit
+ * @installed: callback invoked when the breakpoint is installed
+ * @uninstalled: callback invoked when the breakpoint is uninstalled
+ * @data: private data for use by the breakpoint owner
+ * @address: location (virtual address) of the breakpoint
+ * @len: extent of the breakpoint address (1, 2, or 4 bytes)
+ * @type: breakpoint type (write-only, read/write, execute, or I/O)
+ * @priority: requested priority level
+ * @status: current registration/installation status
+ *
+ * %hwbkpt structures are the kernel's way of representing hardware
+ * breakpoints.  These can be either execution breakpoints (triggered
+ * on instruction execution) or data breakpoints (also known as
+ * watchpoints, triggered on data access), and the breakpoint's
+ * target address can be located in either kernel space or user space.
+ *
+ * The @address, @len, and @type fields are standard, indicating the
+ * location of the breakpoint, its extent in bytes, and the type of
+ * access that will trigger the breakpoint.  Possible values for @len
+ * are 1, 2, and 4.  Possible values for @type are %HWBKPT_WRITE
+ * (triggered on write access), %HWBKPT_RW (triggered on read or
+ * write access), %HWBKPT_IO (triggered on I/O-space access), and
+ * %HWBKPT_EXECUTE (triggered on instruction execution).  Certain
+ * restrictions apply: %HWBKPT_EXECUTE requires that @len be 1, and
+ * %HWBKPT_IO is available only on processors with Debugging Extensions.
+ *
+ * In register_user_hwbkpt() and modify_user_hwbkpt(), @address must
+ * refer to a location in user space (unless @type is %HWBKPT_IO).
+ * The breakpoint will be active only while the requested task is
+ * running.  Conversely, in register_kernel_hwbkpt() @address must
+ * refer to a location in kernel space, and the breakpoint will be
+ * active on all CPUs regardless of the task being run.
+ *
+ * When a breakpoint gets hit, the @triggered callback is invoked
+ * in_interrupt with a pointer to the %hwbkpt structure and the
+ * processor registers.  %HWBKPT_EXECUTE traps occur before the
+ * breakpointed instruction executes; all other types of trap occur
+ * after the memory or I/O access has taken place.  All breakpoints
+ * are disabled while @triggered runs, to avoid recursive traps and
+ * allow unhindered access to breakpointed memory.
+ *
+ * Hardware breakpoints are implemented using the CPU's debug registers,
+ * which are a limited hardware resource.  Requests to register a
+ * breakpoint will always succeed (provided the member entries are
+ * valid), but the breakpoint may not be installed in a debug register
+ * right away.  Physical debug registers are allocated based on the
+ * priority level stored in @priority (higher values indicate higher
+ * priority).  User-space breakpoints within

Re: [patch -mm] x86_64: fake numa cmdline flag fix

2007-03-02 Thread Badari Pulavarty

On Fri, 2007-03-02 at 09:03 -0800, David Rientjes wrote:
 Make sure we only reference 'cmdline' on CONFIG_NUMA_EMU.
 
 Signed-off-by: David Rientjes [EMAIL PROTECTED]

Fixes the compile problem.

So does the moving 

char *cmdline __initdata;

out of CONFIG_NUMA_EMU. But I guess yours is a cleaner fix.

Thanks,
Badari

 ---
  arch/x86_64/mm/numa.c |   16 +++-
  1 files changed, 11 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
 --- a/arch/x86_64/mm/numa.c
 +++ b/arch/x86_64/mm/numa.c
 @@ -524,22 +524,28 @@ void __init numa_initmem_init(unsigned long start_pfn, 
 unsigned long end_pfn)
  { 
   unsigned long start_addr = start_pfn << PAGE_SHIFT;
   unsigned long end_addr = end_pfn << PAGE_SHIFT;
 + int numa_fake = 0;
   int i;
  
 +#ifdef CONFIG_NUMA_EMU
 + /* Determine if we have a numa=fake command line */
 + if (cmdline != 0)
 + numa_fake = 1;
 +#endif
 +
  #ifdef CONFIG_ACPI_NUMA
 - if (!numa_off && !cmdline && !acpi_scan_nodes(start_addr, end_addr))
 + if (!numa_off && !numa_fake && !acpi_scan_nodes(start_addr, end_addr))
   return;
  #endif
  
  #ifdef CONFIG_K8_NUMA
 - if (!numa_off && !k8_scan_nodes(start_addr, end_addr, cmdline != 0))
 - if (cmdline == 0)
 + if (!numa_off && !k8_scan_nodes(start_addr, end_addr, numa_fake))
 + if (!numa_fake)
   return;
  #endif
  
  #ifdef CONFIG_NUMA_EMU
 - if (cmdline)
 - {
 + if (numa_fake) {
   numa_emu = !numa_emulation(start_pfn, end_pfn);
   if (numa_emu)
   return;

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Andrew Morton wrote:

  Linux is *not* happy on 256GB systems.  Even on some 32GB systems
  the swappiness setting *needs* to be tweaked before Linux will even
  run in a reasonable way.
 
 Please send testcases.

It is not happy if you put 256GB into one zone. We are fine with 1k nodes 
with 8GB each and a 16k page size (which reduces the number of 
page_structs to manage by a fourth). So the total memory is 8TB which is 
significantly larger than 256GB.

If we do this node/zone merging and reassign MAX_ORDER blocks to virtual 
node/zones for containers (with their own LRU etc) then this would also 
reduce the number of page_structs on the list and may make things a bit 
easier.

We would then produce the same effect as the partitioning via NUMA nodes 
on our 8TB boxes. However, then you still have a bandwidth issue since 
your 256 likely only has a single bus and all memory traffic for the 
node/zones has to go through this single bottleneck. That bottleneck does 
not exist on NUMA machines.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Is the clockevent resolution fine-grained enough?

2007-03-02 Thread Marko Rauhamaa

Thomas Gleixner [EMAIL PROTECTED]:

 On Thu, 2007-03-01 at 18:34 -0800, Marko Rauhamaa wrote:
  It would appear the new clockevent API has a one-nanosecond
  resolution. It certainly looks sufficiently fine-grained, but I'm
  afraid it's too coarse for some applications.
 
 That's an academic exercise, or are you talking about some real world
 hardware which runs Linux ?

Real hardware running linux.

  In our application, we need periodic clock interrupts at about 100
  kHz.
 
 With a stock kernel ?

Well, with a clockevent patch of our own. We'd like to use a stock
kernel, though.

  If the (programmable) frequency must be rounded to the nearest
  nanosecond, we have a cumulative error of
  
 100,000 * 0.5 ns/s = 50 µs/s
 
 clockevents is based on the monotonic system clock and depends on the
 accuracy of that and the device which delivers the interrupts.
 [...]
 There is nothing to nudge. The clockevent subsystem operates on
 absolute time, so there is no cumulative error, unless you set up your
 timers relative per event.

I'm afraid you didn't quite understand what I was getting at. Say the
user programs the frequency to be 109,000 Hz. That means a nominal clock
interval of ~9174.3119 ns. Now the clockevent interface forces me to
round it down to 9174 ns. That means the clock interrupts fall behind
with respect to the other parts in the system that implement 109,000 Hz
much more to the letter. The error grows by 34 µs every second so that
after 8 hours, we are lagging by a whole second.


Marko

-- 
Marko Rauhamaa  mailto:[EMAIL PROTECTED] http://pacujo.net/marko/
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2.6.21-rc1] Extend print_symbol capability

2007-03-02 Thread Robert Peterson


Paulo Marques wrote:

I don't like this name much :(

We already have kallsyms_lookup and kallsyms_lookup_name. The name of 
this function should imply that it will print the formatted result 
into the buffer, not just lookup a symbol.


Maybe __sprint_symbol, and change the interface to 
__sprint_symbol(char *buffer, unsigned long addr)?
I'm not sure I like the leading __.  In the print_symbol case, I think 
the function

was given a leading __ so that code referencing print_symbol would use the
macro which formulates the call into __print_symbol.  I don't mind 
sprint_symbol

though.  Since Andrew Morton included the patch, I'll defer to his judgment.

+static inline void lookup_symbol(unsigned long addr, char *buffer)
+{
+   return NULL;
+}

Returning NULL in a function returning void doesn't seem right :P

You're right.  This should just be a simple return;.  My bad.  Good catch.
Since Andrew Morton has already included this patch, I'll let him
make this change if he sees fit.

Regards,

Bob Peterson
Red Hat Cluster Suite

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2 radeon backlight

2007-03-02 Thread Alex Romosan

Andrew Morton [EMAIL PROTECTED] writes:

 On Wed, 28 Feb 2007 08:32:43 -0800
 Alex Romosan [EMAIL PROTECTED] wrote:

 the backlight on my thinkpad still (2.6.20 worked fine) doesn't come
 on if i have the radeon backlight enabled. without it, i guess it's
 the ibm acpi modules that controls the backlight and it seems to work
 fine.
 

 Unclear. Are you saying that the backlight comes on OK if you use
 the IBM acpi module?

yes, if i disable the radeon backlight and use the ibm acpi module,
then the backlight works. if i enable the radeon backlight, the screen
stays dark and i can't turn it on (i tried using radeontool to control
it but nothing happened).

--alex--

-- 
| I believe the moment is at hand when, by a paranoiac and active |
|  advance of the mind, it will be possible (simultaneously with  |
|  automatism and other passive states) to systematize confusion  |
|  and thus to help to discredit completely the world of reality. |
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: module builds need improvement / top Makefile not good enough

2007-03-02 Thread Jeremy Fitzhardinge

FN wrote:
a) version rollback that causes timestamp rollback
Ugh. Broken.

 it's better to be able to do 
 gnumake -C /lib/modules/`uname -r`/build M=`pwd` MYMAKE=mymake modules 
   

Patches accepted.

J
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Fix ACPI documentation in kernel-parameters.txt

2007-03-02 Thread Bernhard Walle

Hello,

* Roland Dreier [EMAIL PROTECTED] [2007-03-02 00:23]:
   Add hint that acpi=off doesn't work on IA64.
 
 Would it make sense to add code to detect this and print a kernel
 message like
 
 warning: ACPI is always enabled on IA64; ignoring acpi=off
 
 No one reads documentation :)

The idea is not bad, but it would not make sense to have it only for
ACPI and not for other arch-specific options ...


Regards
Bernhard


pgpD4ETEgVs47.pgp
Description: PGP signature

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Davide Libenzi

On Fri, 2 Mar 2007, Evgeniy Polyakov wrote:

 do we really want to have per process signalfs, timerfs and so on - each 
 simple structure must be bound to a file, which becomes too costly.

I may be old school, but if you ask me, and if you *really* want those 
events, yes. Reason? Unix's everything-is-a-file rule, and being able to 
use them with *existing* POSIX poll/select. Remember, not every app 
requires huge scalability efforts, so working with simpler and familiar 
APIs is always welcome.
The *only* thing that was not practical to have as fd, was block requests. 
But maybe threadlets/syslets will handle those just fine, and close the gap.



- Davide


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman

On (02/03/07 09:19), Christoph Lameter didst pronounce:
 On Fri, 2 Mar 2007, Mel Gorman wrote:
 
  However, if that is objectionable, I'd at least like to see zone-based 
  patches
  go into -mm on the expectation that the memory hot-remove patches will be
  able to use the infrastructure. It's not ideal for hugepages and it is not 
  my
  first preference, but it's a step in the right direction. Is this 
  reasonable?
 
 I still think that the list based approach is sufficient for memory 
 hotplug if one restricts  the location of the unmovable MAX_ORDER chunks 
 to not overlap the memory area where we would like to be able to remove 
 memory.

Yes, true. In the part where I bias placements of unmovable pages at
lower PFNs, additional steps would need to be taken. Specifically, the
lowest block MAX_ORDER_NR_PAGES used for movable pages would need to be
reclaimed for unmovable allocations.

 In very pressing memory situations where we have too much 
 unmovable memory we could dynamically disable  memory hotplug. There 
 would be no need for this partitioning and additional zones.
 

It's simply more complex. I believe it's doable. The main plus going for
the zone is that it is a clearly understood concept and it gives hard
guarantees.

-- 
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Fix ACPI documentation in kernel-parameters.txt

2007-03-02 Thread Bernhard Walle

Add hint that acpi=off doesn't work on IA64.

Signed-off-by: Bernhard Walle [EMAIL PROTECTED]

---
 Documentation/kernel-parameters.txt |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Index: linux-2.6.21-rc2-mm1/Documentation/kernel-parameters.txt
===
--- linux-2.6.21-rc2-mm1.orig/Documentation/kernel-parameters.txt
+++ linux-2.6.21-rc2-mm1/Documentation/kernel-parameters.txt
@@ -126,7 +126,8 @@ and is between 256 and 4096 characters. 
See header of drivers/scsi/53c7xx.c.
See also Documentation/scsi/ncr53c7xx.txt.
 
-   acpi=   [HW,ACPI] Advanced Configuration and Power Interface
+   acpi=   [HW,ACPI,X86-64,i386]
+   Advanced Configuration and Power Interface
Format: { force | off | ht | strict | noirq }
force -- enable ACPI if default was off
off -- disable ACPI if default was on
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH (update 3)] timer: Run calc_load halfway through each round_jiffies second

2007-03-02 Thread Simon Arlott


On 02/03/07 16:35, Eric Dumazet wrote:

On Friday 02 March 2007 16:15, Simon Arlott wrote:

Whenever jiffies is started at a multiple of 5*HZ or wraps, calc_load is
run exactly on the second which is when tasks using round_jiffies will
be scheduled to run. This has a bad effect on the load average, making
it tend towards 1.00 if a task happens to run every time the load is
being calculated.

This changes calc_load so that it updates load half a second after any
tasks scheduled using round_jiffies.


I believe this patch is too complex/hazardous and may break exp decay 
computation.


Only for a single calculation whenever it has to adjust, which should only 
happen every 49.7 days (on 32-bit archs). (Or 5 minutes after booting... 
I always wondered why that happened and now I see it's initialised so it 
always wraps early). Whilst it is in sync with jiffies it will not affect 
the process - count is just set to the current value every time. Even with 
NO_HZ because jiffies will be correct when calc_load is called.



(Even if nobody cares about avenrun[] these days :), do you ? )

You could just change LOAD_FREQ from (5*HZ) to (5*HZ+1)
You can see that 5.01 instead of 5.00 second gives the same EXP_xx values.

So (5*HZ + 1) is safe. (because HZ = 100)


On HZ=1000, this would cause the load average to be pushed towards +1.00 
for up to 2 minutes every ~83 minutes with no obvious cause. (If a task 
takes ~10-20ms to run, so 20 runs are needed at HZ=1000 before it passes 
it again).


On HZ=100 it would happen every ~8 minutes for up to 10 seconds and never 
be noticed.


Using 5*HZ+2 would move this to ~167 and ~17 minutes which would mitigate 
the effect further still without changing the exp values.


1884.25 -> 1883.62
2014.15 -> 2014.02
2036.65 -> 2036.61

Will anyone notice if the load is adjusted slightly less frequently?


If this is considered preferable to adjusting calc_load to avoid almost all 
round_jiffies scheduled tasks (some of which may take longer than ~15ms to 
run), then I have no problems with it - I just needed something to stop my 
driver changes doing odd things to the load average for other people. I'll 
continue to run with this version, is it possible to add a Kconfig option 
for it somewhere?


--
Simon Arlott
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 2.6.21-rc2-mm1

2007-03-02 Thread Randy Dunlap

On Fri, 2 Mar 2007 03:04:05 -0800 Andrew Morton wrote:

 On Fri, 2 Mar 2007 03:00:26 -0800 Andrew Morton [EMAIL PROTECTED] wrote:
 
  Temporarily at
  
http://userweb.kernel.org/~akpm/2.6.21-rc2-mm1/
  
  Will appear later at
  

  ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.21-rc2/2.6.21-rc2-mm1/
 
 Forgot to mention:  there's a huge wireless update in there:
 
Changes include the new wireless stack, the associated drivers, and
the new wireless configuration mechanisms, as well as some b44 changes
related to hardware in-common with bcm43xx.  There are currently over
700 individual patches in the entire series.  I have the individual
patches extracted and available here:
 
   
 http://www.kernel.org/pub/linux/kernel/people/linville/wireless-dev/mm-master
 
 please be sure to cc netdev@vger.kernel.org if anything goes wrong with it.

s/netdev/linux-wireless/ please:

NETWORKING [WIRELESS]
P:  John W. Linville
M:  [EMAIL PROTECTED]
L:  [EMAIL PROTECTED]
T:  git kernel.org:/pub/scm/linux/kernel/git/linville/wireless-2.6.git
S:  Maintained


---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 09:23:49 -0800 (PST) Christoph Lameter [EMAIL PROTECTED] 
wrote:

 On Fri, 2 Mar 2007, Andrew Morton wrote:
 
   Linux is *not* happy on 256GB systems.  Even on some 32GB systems
   the swappiness setting *needs* to be tweaked before Linux will even
   run in a reasonable way.
  
  Please send testcases.
 
 It is not happy if you put 256GB into one zone.

Oh come on.  What's the workload?  What happens?  system time?  user time?
kernel profiles?
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Davide Libenzi

On Fri, 2 Mar 2007, Ingo Molnar wrote:

  After your changes epoll increased to 5k.
 
 Can we please stop this pointless episode of benchmarketing, where every 
 mail of yours shows different results and you even deny having said 
 something which you clearly said just a few days ago? At this point i 
 simply cannot trust the numbers you are posting, nor is the discussion 
 style you are following productive in any way in my opinion.

Agreed. Can we focus on the topic here? We're still missing proper FPU 
context switch in the move_user_context(). In v6?


- Davide


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mark Gross

On Fri, Mar 02, 2007 at 09:07:53AM -0800, Andrew Morton wrote:
 On Fri, 2 Mar 2007 08:20:23 -0800 Mark Gross [EMAIL PROTECTED] wrote:
 
   The whole DRAM power story is a bedtime story for gullible children. 
   Don't 
   fall for it. It's not realistic. The hardware support for it DOES NOT 
   EXIST today, and probably won't for several years. And the real fix is 
   elsewhere anyway (ie people will have to do a FBDIMM-2 interface, which 
   is against the whole point of FBDIMM in the first place, but that's what 
   you get when you ignore power in the first version!).
  
  
  Hardware support for some of this is coming this year in the ATCA space
  on the MPCBL0050.  The feature is a bit experimental, and
  power/performance benefits will be workload and configuration
  dependent.  It's not a bedtime story.
 
 What is the plan for software support?

The plan is the typical layered approach to enabling.  Post the basic
enabling patch, followed by a patch or software to actually exercise the
feature.

The code to exercise the feature is complicated by the fact that the
memory will need re-training as it comes out of low power state.  The
code doing this is still a bit confidential.

I have the base enabling patch ready for RFC review.
I'm working on the RFC now.

 
 Will it be possible to just power the DIMMs off?  I don't see much point in
 some half-power non-destructive mode.

I think so, but need to double check with the HW folks.

Technically, the DIMMs could be powered off, and put into 2 different low
power non-destructive states.  (standby and suspend), but putting them
in a low power non-destructive mode has much less latency and provides
good bang for the buck or LOC change needed to make work.

Which lower power mode an application chooses will depend on latency
tolerances of the app.  For the POC activities we are looking at we are
targeting the lower latency option, but that doesn't lock out folks from
trying to do something with the other options.

--mgross

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Rik van Riel


Andrew Morton wrote:

On Fri, 2 Mar 2007 09:23:49 -0800 (PST) Christoph Lameter [EMAIL PROTECTED] 
wrote:


On Fri, 2 Mar 2007, Andrew Morton wrote:


Linux is *not* happy on 256GB systems.  Even on some 32GB systems
the swappiness setting *needs* to be tweaked before Linux will even
run in a reasonable way.

Please send testcases.

It is not happy if you put 256GB into one zone.


Oh come on.  What's the workload?  What happens?  system time?  user time?
kernel profiles?


I can't share all the details, since a lot of the problems are customer
workloads.

One particular case is a 32GB system with a database that takes most
of memory.  The amount of actually freeable page cache memory is in
the hundreds of MB.   With swappiness at the default level of 60, kswapd
ends up eating most of a CPU, and other tasks also dive into the pageout
code.  Even with swappiness as high as 98, that system still has
problems with the CPU use in the pageout code!

Another typical problem is that people want to back up their database
servers.  During the backup, parts of the working set get evicted from
the VM and performance is horrible.

A third scenario is where a system has way more RAM than swap, and not
a whole lot of freeable page cache.  In this case, the VM ends up
spending WAY too much CPU time scanning and shuffling around essentially
unswappable anonymous memory and tmpfs files.

I have briefly characterized some of these working sets on:

http://linux-mm.org/ProblemWorkloads

One thing I do not yet have are easily runnable test cases.  I know
the problems that happen because customers run into them, but it is
not as easy to reproduce on test systems...

--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Mel Gorman wrote:

  I still think that the list based approach is sufficient for memory 
  hotplug if one restricts  the location of the unmovable MAX_ORDER chunks 
  to not overlap the memory area where we would like to be able to remove 
  memory.
 
 Yes, true. In the part where I bias placements of unmovable pages at
 lower PFNs, additional steps would need to be taken. Specifically, the
 lowest block MAX_ORDER_NR_PAGES used for movable pages would need to be
 reclaimed for unmovable allocations.

I think sparsemem can provide some memory maps that show where there are 
sections of memory that are hot pluggable. So the MAX_ORDER blocks need
to be categorized as to whether they are in such a section or not. If you 
need another MAX_ORDER block for an unmovable type of allocation then make 
sure that it is not marked as hotpluggable by sparsemem. If we are in an 
emergency situation where we must use a MAX_ORDER block that is currently 
hotpluggable for unmovable allocations then we need to trigger something 
in sparsemem that disables hotplug for that memory section.

 It's simply more complex. I believe it's doable. The main plus going for
 the zone is that it is a clearly understood concept and it gives hard
 guarantees.

And it gives the sysadmin headaches and increases VM management 
overhead because we now have more bits in the page struct that tell us 
about the zone that the page belongs to. Another distinction to worry 
about in the VM. If the limit is set too high then we have memory that is 
actually movable but since it's on the wrong side of the limit we cannot 
use it. If the limit is set too low then the system will crash.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mel Gorman


On Fri, 2 Mar 2007, Christoph Lameter wrote:


On Fri, 2 Mar 2007, Mel Gorman wrote:


I still think that the list based approach is sufficient for memory
hotplug if one restricts  the location of the unmovable MAX_ORDER chunks
to not overlap the memory area where we would like to be able to remove
memory.


Yes, true. In the part where I bias placements of unmovable pages at
lower PFNs, additional steps would need to be taken. Specifically, the
lowest block MAX_ORDER_NR_PAGES used for movable pages would need to be
reclaimed for unmovable allocations.


I think sparsemem can provide some memory maps that show where there are
section of memory that are hot pluggable. So the MAX_ORDER blocks need
to be categorized as to whether they are in such a section or not.


That makes the problem slightly easier. If sparsemem sections are aware of 
whether they are hotpluggable or not, __rmqueue_fallback() (from the 
list-based anti-frag patches) can be taught to never use those sections 
for unmovable allocations.



If you
need another MAX_ORDER block for an unmovable type of allocation then make
sure that it is not marked as hotpluggable by sparsemem. If we are in an
emergency situation where we must use a MAX_ORDER block that is currently
hotpluggable for unmovable allocations then we need to trigger something
in sparsemem that disables hotplug for that memory section.



Which should be doable.


It's simply more complex. I believe it's doable. The main plus going for
the zone is that it is a clearly understood concept and it gives hard
guarantees.


And it gives the sysadmin headaches and increases VM management
overhead because we now have more bits in the page struct that tell us
about the zone that the page belongs to. Another distinction to worry
about in the VM. If the limit is set too high then we have memory that is
actually movable but since it's on the wrong side of the limit we cannot
use it. If the limit is set too low then the system will crash.



I'm aware of this. I believe it could all be done in the context of 
list-based - just that it requires more code. Zones are easier to 
understand for most people and their behavior is better understood. If a 
workload is discovered that list-based doesn't handle, the zone can be 
used until the problem is solved.


This is why the anti-fragmentation and zone-based approaches are no longer 
mutually exclusive as they were in earlier versions.


--
Mel Gorman
Part-time Phd Student  Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH (update 3)] timer: Run calc_load halfway through each round_jiffies second

2007-03-02 Thread Eric Dumazet

On Friday 02 March 2007 18:32, Simon Arlott wrote:
 On 02/03/07 16:35, Eric Dumazet wrote:

  You could just change LOAD_FREQ from (5*HZ) to (5*HZ+1)
  You can see that 5.01 instead of 5.00 second gives the same EXP_xx
  values.
 
  So (5*HZ + 1) is safe. (because HZ >= 100)

 On HZ=1000, this would cause the load average to be pushed towards +1.00
 for up to 2 minutes every ~83 minutes with no obvious cause. (If a task
 takes ~10-20ms to run, so 20 runs are needed at HZ=1000 before it passes
 it again).

Nope, you don't quite understand how load (avenrun[]) is computed.

Every 5 seconds, three values are adjusted, based on their previous value and 
the actual value. Let's focus on the first value (mean load average over one 
minute)

exp = 1.0 / exp(5.0/60.0);
avenrun[0] = (avenrun[0] * exp) + (active * (1.0 - exp));

If previous value is 0.0, and current active count 1, then next value for 
avenrun[0] will be : 0.0799556

Not exactly 1.0 as you think !

Then in the next intervals (if active count is 0), it will decrease 'slowly' : 
0.0735627
0.0676809
0.0622695
0.0572907

On average, your load factor is close to reality.

Just try my suggestion, it should work. I even proved it in my previous 
mail :)

Eric
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 02 Mar 2007 12:43:42 -0500
Rik van Riel [EMAIL PROTECTED] wrote:

 Andrew Morton wrote:
  On Fri, 2 Mar 2007 09:23:49 -0800 (PST) Christoph Lameter [EMAIL 
  PROTECTED] wrote:
  
  On Fri, 2 Mar 2007, Andrew Morton wrote:
 
  Linux is *not* happy on 256GB systems.  Even on some 32GB systems
  the swappiness setting *needs* to be tweaked before Linux will even
  run in a reasonable way.
  Please send testcases.
  It is not happy if you put 256GB into one zone.
  
  Oh come on.  What's the workload?  What happens?  system time?  user time?
  kernel profiles?
 
 I can't share all the details, since a lot of the problems are customer
 workloads.
 
 One particular case is a 32GB system with a database that takes most
 of memory.  The amount of actually freeable page cache memory is in
 the hundreds of MB.

Where's the rest of the memory? tmpfs?  mlocked?  hugetlb?

   With swappiness at the default level of 60, kswapd
 ends up eating most of a CPU, and other tasks also dive into the pageout
 code.  Even with swappiness as high as 98, that system still has
 problems with the CPU use in the pageout code!
 
 Another typical problem is that people want to back up their database
 servers.  During the backup, parts of the working set get evicted from
 the VM and performance is horrible.

userspace fixes for this are far, far better than any magic goo the kernel
can implement.  We really need to get off our butts and start educating
people.

 A third scenario is where a system has way more RAM than swap, and not
 a whole lot of freeable page cache.  In this case, the VM ends up
 spending WAY too much CPU time scanning and shuffling around essentially
 unswappable anonymous memory and tmpfs files.

Well we've allegedly fixed that, but it isn't going anywhere without
testing.


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] longhaul pci_find_device -> pci_get_device conversion (was: Re: 2.6.21-rc2-mm1)

2007-03-02 Thread Michal Piotrowski


On 02/03/07, Jiri Slaby [EMAIL PROTECTED] wrote:

missing pci_dev_put()s


Yes.

Please ignore this patch.

Regards,
Michal

--
Michal K. K. Piotrowski
LTG - Linux Testers Group (PL)
(http://www.stardust.webpages.pl/ltg/)
LTG - Linux Testers Group (EN)
(http://www.stardust.webpages.pl/linux_testers_group_en/)
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 09:35:27 -0800
Mark Gross [EMAIL PROTECTED] wrote:

  
  Will it be possible to just power the DIMMs off?  I don't see much point in
  some half-power non-destructive mode.
 
 I think so, but need to double check with the HW folks.
 
 Technically, the DIMMs could be powered off, and put into 2 different low
 power non-destructive states.  (standby and suspend), but putting them
 in a low power non-destructive mode has much less latency and provides
 good bang for the buck or LOC change needed to make work.
 
 Which lower power mode an application chooses will depend on latency
 tolerances of the app.  For the POC activities we are looking at we are
 targeting the lower latency option, but that doesn't lock out folks from
 trying to do something with the other options.
 

If we don't evacuate all live data from all of the DIMM, we'll never be
able to power the thing down in many situations.

Given that we _have_ emptied the DIMM, we can just turn it off.  And
refilling it will be slow - often just disk speed.

So I don't see a useful use-case for non-destructive states.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Andrew Morton wrote:

  One particular case is a 32GB system with a database that takes most
  of memory.  The amount of actually freeable page cache memory is in
  the hundreds of MB.
 
 Where's the rest of the memory? tmpfs?  mlocked?  hugetlb?

The memory is likely in use but there is enough memory free in unmapped 
clean pagecache pages so that we occasionally are able to free pages. Then 
the app is reading more from disk replenishing that ...
Thus we are forever cycling through the LRU lists moving pages between 
the lists aging etc etc. Can lead to a livelock.

  A third scenario is where a system has way more RAM than swap, and not
  a whole lot of freeable page cache.  In this case, the VM ends up
  spending WAY too much CPU time scanning and shuffling around essentially
  unswappable anonymous memory and tmpfs files.
 
 Well we've allegedly fixed that, but it isn't going anywhere without
 testing.

We have fixed the case in which we compile the kernel without swap. Then 
anonymous pages behave like mlocked pages. Did we do more than that?

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 10:15:36 -0800 (PST)
Christoph Lameter [EMAIL PROTECTED] wrote:

 On Fri, 2 Mar 2007, Andrew Morton wrote:
 
   One particular case is a 32GB system with a database that takes most
   of memory.  The amount of actually freeable page cache memory is in
   the hundreds of MB.
  
  Where's the rest of the memory? tmpfs?  mlocked?  hugetlb?
 
 The memory is likely in use but there is enough memory free in unmapped 
 clean pagecache pages so that we occasionally are able to free pages. Then 
 the app is reading more from disk replenishing that ...
 Thus we are forever cycling through the LRU lists moving pages between 
 the lists aging etc etc. Can lead to a livelock.

Guys, with this level of detail these problems will never be fixed.

   A third scenario is where a system has way more RAM than swap, and not
   a whole lot of freeable page cache.  In this case, the VM ends up
   spending WAY too much CPU time scanning and shuffling around essentially
   unswappable anonymous memory and tmpfs files.
  
  Well we've allegedly fixed that, but it isn't going anywhere without
  testing.
 
 We have fixed the case in which we compile the kernel without swap. Then 
 anonymous pages behave like mlocked pages. Did we do more than that?

oh yeah, we took the ran-out-of-swapcache code out.  But if we're going to
do this thing, we should find some way to bring it back.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Rik van Riel


Christoph Lameter wrote:

On Fri, 2 Mar 2007, Andrew Morton wrote:


One particular case is a 32GB system with a database that takes most
of memory.  The amount of actually freeable page cache memory is in
the hundreds of MB.

Where's the rest of the memory? tmpfs?  mlocked?  hugetlb?


The memory is likely in use but there is enough memory free in unmapped 
clean pagecache pages so that we occasionally are able to free pages. Then 
the app is reading more from disk replenishing that ...
Thus we are forever cycling through the LRU lists moving pages between 
the lists aging etc etc. Can lead to a livelock.


In this particular case, the system even has swap free.

The kernel just chooses not to use it until it has scanned
some memory, due to the way the swappiness algorithm works.

With 32 CPUs diving into the page reclaim simultaneously,
each trying to scan a fraction of memory, this is disastrous
for performance.  A 256GB system should be even worse.


A third scenario is where a system has way more RAM than swap, and not
a whole lot of freeable page cache.  In this case, the VM ends up
spending WAY too much CPU time scanning and shuffling around essentially
unswappable anonymous memory and tmpfs files.

Well we've allegedly fixed that, but it isn't going anywhere without
testing.


We have fixed the case in which we compile the kernel without swap. Then 
anonymous pages behave like mlocked pages. Did we do more than that?


Not AFAIK.

I would like to see separate pageout selection queues
for anonymous/tmpfs and page cache backed pages.  That
way we can simply scan only that what we want to scan.

There are several ways available to balance pressure
between both sets of lists.

Splitting them out will also make it possible to do
proper use-once replacement for the page cache pages.
Ie. leaving the really active page cache pages on the
page cache active list, instead of deactivating them
because they're lower priority than anonymous pages.

That way we can do a backup without losing the page
cache working set.

--
Politics is the struggle between those who want to make their country
the best in the world, and those who believe it already is.  Each group
calls the other unpatriotic.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

kernel BUG at arch/x86_64/mm/../../i386/mm/hugetlbpage.c:140!

2007-03-02 Thread Alexander Y. Fomichev

G'day

I've hit a bug on 2.6.21-rc1 at startup of mysql with the 'large-pages' flag set
(at this point mysql is trying to allocate pages from the hugetlb pool via sysv
shm syscalls). It seems like it could be triggered by previous badness,
and hugetlb itself is probably not related. Anyway, I couldn't reproduce
it so far with 2.6.21-rc2 git commit 562aa1d4c6a874373f9a48ac184f662fbbb06a04.
Very likely it has been fixed somewhere between 2.6.21-rc1 and -rc2,
but I couldn't find anything related in git log, so any comments are welcome.

[422123.288864] Bad page state in process 'mysqld'
[422123.288867] page:810004d09800 flags:0x00f00810 
mapping: mapcount:1 count:0
[422123.288869] Trying to fix it up, but a reboot is needed
[422123.288871] Backtrace:
[422123.311745]
[422123.311746] Call Trace:
[422123.311756]  [8025d84b] _spin_unlock_irq+0x9/0xc
[422123.311762]  [8029a33a] bad_page+0x5b/0x89
[422123.311768]  [8020b94d] free_hot_cold_page+0x8c/0x11e
[422123.311773]  [8024c0a0] free_hot_page+0xb/0xd
[422123.311777]  [8020cba7] put_page+0xc8/0xd0
[422123.311784]  [802c4af8] dio_bio_complete+0x7c/0xa4
[422123.311789]  [802c59f9] __blockdev_direct_IO+0x928/0xa4d
[422123.311795]  [80322f58] xfs_vm_direct_IO+0xf0/0x125
[422123.311799]  [80323246] xfs_get_blocks_direct+0x0/0x17
[422123.311803]  [803229b3] xfs_end_io_direct+0x0/0x49
[422123.311808]  [80298f45] generic_file_direct_IO+0xa5/0xeb
[422123.311814]  [802163da] generic_file_aio_read+0xc7/0x194
[422123.311819]  [8032a38c] xfs_read+0x270/0x2e6
[422123.311824]  [80326e96] xfs_file_aio_read+0x5c/0x5e
[422123.311828]  [8020cf2e] do_sync_read+0xe2/0x126
[422123.311834]  [80289833] autoremove_wake_function+0x0/0x38
[422123.311839]  [8020b7af] vfs_read+0xcc/0x155
[422123.311843]  [80212929] sys_pread64+0x55/0x76
[422123.311848]  [802583de] system_call+0x7e/0x83
[422123.311851]
[422123.312248] Bad page state in process 'mysqld'
[422123.312249] page:810004d09840 flags:0x00f00810 
mapping: mapcount:1 count:0
[422123.312251] Trying to fix it up, but a reboot is needed
[422123.312253] Backtrace:
[422123.335123]
[422123.335124] Call Trace:
[422123.335128]  [8029a33a] bad_page+0x5b/0x89
[422123.335132]  [8020b94d] free_hot_cold_page+0x8c/0x11e
[422123.335137]  [8024c0a0] free_hot_page+0xb/0xd
[422123.335140]  [8020cba7] put_page+0xc8/0xd0
[422123.335145]  [802c4af8] dio_bio_complete+0x7c/0xa4
[422123.335150]  [802c59f9] __blockdev_direct_IO+0x928/0xa4d
[422123.335155]  [80322f58] xfs_vm_direct_IO+0xf0/0x125
[422123.335159]  [80323246] xfs_get_blocks_direct+0x0/0x17
[422123.335163]  [803229b3] xfs_end_io_direct+0x0/0x49
[422123.335167]  [80298f45] generic_file_direct_IO+0xa5/0xeb
[422123.335172]  [802163da] generic_file_aio_read+0xc7/0x194
[422123.335177]  [8032a38c] xfs_read+0x270/0x2e6
[422123.335182]  [80326e96] xfs_file_aio_read+0x5c/0x5e
[422123.335186]  [8020cf2e] do_sync_read+0xe2/0x126
[422123.335191]  [80289833] autoremove_wake_function+0x0/0x38
[422123.335195]  [8020b7af] vfs_read+0xcc/0x155
[422123.335200]  [80212929] sys_pread64+0x55/0x76
[422123.335204]  [802583de] system_call+0x7e/0x83
[422123.335206]
[422123.335277] Bad page state in process 'mysqld'
[422123.335278] page:810004d09880 flags:0x00f00810 
mapping: mapcount:1 count:0
[422123.335280] Trying to fix it up, but a reboot is needed
[422123.335282] Backtrace:
[422123.358152]
[422123.358153] Call Trace:
[422123.358157]  [8029a33a] bad_page+0x5b/0x89
[422123.358161]  [8020b94d] free_hot_cold_page+0x8c/0x11e
[422123.358166]  [8024c0a0] free_hot_page+0xb/0xd
[422123.358170]  [8020cba7] put_page+0xc8/0xd0
[422123.358174]  [802c4af8] dio_bio_complete+0x7c/0xa4
[422123.358179]  [802c59f9] __blockdev_direct_IO+0x928/0xa4d
[422123.358184]  [80322f58] xfs_vm_direct_IO+0xf0/0x125
[422123.358188]  [80323246] xfs_get_blocks_direct+0x0/0x17
[422123.358192]  [803229b3] xfs_end_io_direct+0x0/0x49
[422123.358196]  [80298f45] generic_file_direct_IO+0xa5/0xeb
[422123.358201]  [802163da] generic_file_aio_read+0xc7/0x194
[422123.358206]  [8032a38c] xfs_read+0x270/0x2e6
[422123.358211]  [80326e96] xfs_file_aio_read+0x5c/0x5e
[422123.358215]  [8020cf2e] do_sync_read+0xe2/0x126
[422123.358220]  [80289833] autoremove_wake_function+0x0/0x38
[422123.358224]  [8020b7af] vfs_read+0xcc/0x155
[422123.358228]  [80212929] sys_pread64+0x55/0x76
[422123.358233]  [802583de] system_call+0x7e/0x83
[422123.358235]
[422123.358305] Bad page state in process 'mysqld'
[422123.358307] page:810004d098c0 flags:0x00f00810

Re: [patch 2.6.20-rc2] gpio_direction_output() needs an initial value

2007-03-02 Thread David Brownell

On Thursday 01 March 2007 9:48 pm, Andrew Victor wrote:
 hi David,
 
  It's been pointed out that output GPIOs should have an initial value, to
  avoid signal glitching ... among other things, it can be some time before
  a driver is ready.  This patch corrects that oversight, fixing
 
 For the AT91 changes:
   Acked-by: Andrew Victor [EMAIL PROTECTED]
 
 
  --- g26.orig/drivers/spi/atmel_spi.c2007-02-28 12:47:43.0 
  -0800
  +++ g26/drivers/spi/atmel_spi.c 2007-03-01 15:29:30.0 -0800
 
  -   gpio_direction_output(npcs_pin);
  +   gpio_direction_output(npcs_pin, !(spi->mode & SPI_CS_HIGH));
  }
 
 As mentioned previously (by Walter Tuppa), wouldn't it be better to just
 change this to:
  cs_deactivate(spi);

That *does* deactivate the chipselect, as the initial value.
Given this function signature change, some initial value must
be specified, and that expression uses the inactive value...
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mark Gross

On Fri, Mar 02, 2007 at 09:16:17AM -0800, Linus Torvalds wrote:
 
 
 On Fri, 2 Mar 2007, Mark Gross wrote:
   
   Yes, the same issues exist for other DRAM forms too, but to a *much* 
   smaller degree.
  
  DDR3-1333 may be better than FBDIMM's but don't count on it being much
  better.
 
 Hey, fair enough. But it's not a problem (and it doesn't have a solution) 
 today. I'm not sure it's going to have a solution tomorrow either.
 
   Also, IN PRACTICE you're never ever going to see this anyway. Almost 
   everybody wants bank interleaving, because it's a huge performance win on 
   many loads. That, in turn, means that your memory will be spread out over 
   multiple DIMM's even for a single page, much less any bigger area.
  
  4-way interleave across banks on systems may not be as common as you may
  think for future chip sets.  2-way interleave across DIMMs within a bank
  will stay.
 
 .. and think about a realistic future.
 
 EVERYBODY will do on-die memory controllers. Yes, Intel doesn't do it 
 today, but in the one- to two-year timeframe even Intel will.

True.

 
 What does that mean? It means that in bigger systems, you will no longer 
 even *have* 8 or 16 banks where turning off a few banks makes sense. 
 You'll quite often have just a few DIMM's per die, because that's what you 
 want for latency. Then you'll have CSI or HT or another interconnect.
 
 And with a few DIMM's per die, you're back where even just 2-way 
 interleaving basically means that in order to turn off your DIMM, you 
 probably need to remove HALF the memory for that CPU.

I think there will be more than just 2 DIMMs per CPU socket on systems
that care about this type of capability.

 
 In other words: TURNING OFF DIMM's IS A BEDTIME STORY FOR DIMWITTED 
 CHILDREN.


It's very true that taking advantage of the first incarnations of this
type of thing will be limited to specific workloads you personally don't
care about, but it's got applications and customers.

BTW I hope we aren't talking past each other; there are low power states
where the RAM contents are preserved.

 
 There are maybe a couple machines IN EXISTENCE TODAY that can do it. But 
 nobody actually does it in practice, and nobody even knows if it's going 
 to be viable (yes, DRAM takes energy, but trying to keep memory free will 
 likely waste power *too*, and I doubt anybody has any real idea of how 
 much any of this would actually help in practice).
 
 And I don't think that will change. See above. The future is *not* moving 
 towards more and more DIMMS. Quite the reverse. On workstations, we are 
 currently in the one or two DIMM's per die. Do you really think that 
 will change? Hell no. And in big servers, pretty much everybody agrees 
 that we will move towards that, rather than away from it.
 
 So:
  - forget about turning DIMM's off. There is *no* actual data supporting 
the notion that it's a good idea today, and I seriously doubt you can 
really argue that it will be a good idea in five or ten years. It's a 
hardware hack for a hardware problem, and the problems are way too 
complex for us to solve in time for the solution to be relevant.
 
  - aim for NUMA memory allocation and turning off whole *nodes*. That's 
much more likely to be productive in the longer timeframe. And yes, we 
may well want to do memory compaction for that too, but I suspect that 
the issues are going to be different (ie the way to do it is to simply 
prefer certain nodes for certain allocations, and then try to keep the 
jobs that you know can be idle on other nodes)

We are doing the NUMA approach.

 
 Do you actually have real data supporting the notion that turning DIMM's 
 off will be reasonable and worthwhile? 
 

Yes, we have data from our internal and external customers showing that
this stuff is worthwhile for specific workloads that some people care
about.  However, you need to understand that it's by definition marketing data.

--mgross
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: module builds need improvement / top Makefile not good enough

2007-03-02 Thread Sam Ravnborg

On Fri, Mar 02, 2007 at 09:14:22AM -0800, FN wrote:
 Hello list,
 
 I am unhappy with the direction the 2.6 kernel builds have taken.
 Very much like Micro$loth DDKs we (linux users) are being forced to
 build
 modules by plugging into a framework that doesn't respect the fine
 aspects
 of dependency generation and analysis.

The build system for the kernel (kbuild) has a number of goals to fulfill.
The most important ones were to be reliable and easy to use.
The easy-to-use part has mandated a very simple syntax in the
more than 1000 Makefiles in the kernel - and kbuild has been extended
to support external modules too.

With the 2.6 kernel it is mandatory to use kbuild for external
modules for a number of reasons:
- compiler and to some extent linker options now much more rely on
   the actual configuration. So a gcc commandline for a module may look
   different depending on the actual configuration.

- The module support code is well integrated with kbuild, and when
   it changes (and it does so now and then) external modules
   do not need to apply the same changes.

- Same syntax for in-kernel and external modules makes it simpler.


The infrastructure used to achieve the goals of simplicity and reliability
relies on a great deal of the features supported by GNU make that
clearmake and others do not support.
So faced with the two possibilities, which are
1) to rely on fewer GNU make features and support other make tools
2) full feature set but rely on GNU make
the choice was easy.

The point here is that even if kbuild was changed slightly to allow a user
to specify an alternative make program when building external modules,
that would not work because kbuild relies too much on the GNU make features.

You can try this today.
MAKE=/usr/bin/my_make_utility make M=`pwd`

And see how it fails due to use of GNU make features.

Sam
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Linus Torvalds



On Fri, 2 Mar 2007, Mark Gross wrote:
 
 I think there will be more than just 2 dims per cpu socket on systems
 that care about this type of capability.

I agree. I think you'll have a nice mix of 2 and 4, although not likely a 
lot more. You want to have independent channels, and then within a channel 
you want to have as close to point-to-point as possible. 

But the reason that I think you're better off looking at a node level is 
that 

 (a) describing the DIMM setup is a total disaster. The interleaving is 
 part of it, but even in the absence of interleaving, we have so far 
 seen that describing DIMM mapping simply isn't a realistic thing to 
 be widely deployed, judging by the fact that we cannot even get a 
 first-order approximate mapping for the ECC error events.

 Going node-level means that we just piggy-back on the existing node 
 mapping, which is a lot more likely to actually be correct and 
 available (ie you may not know which bank is bank0 and how the 
 interleaving works, but you usually *do* know which bank is connected 
 to which CPU package)

 (Btw, I shouldn't have used the word die, since it's really about 
 package - Intel obviously has a penchant for putting two dies per 
 package)

 (b) especially if you can actually shut down the memory, going node-wide 
 may mean that you can shut down the CPU's too (ie per-package sleep). 
 I bet the people who care enough to care about DIMM's would want to 
 have that *anyway*, so tying them together simplifies the problem.

 BTW I hope we aren't talking past each other, there are low power states
 where the ram contents are preserved.

Yes. They are almost as hard to handle, but the advantage is that if we 
get things wrong, it can still work most of the time (ie we don't have to 
migrate everything off, we just need to try to migrate the stuff that gets 
*used* off a DIMM, and hardware will hopefully end up quiescing the right 
memory controller channel totally automatically, without us having to know 
the exact mapping or even having to 100% always get it 100% right).

With FBDIMM in particular, I guess the biggest power cost isn't actually 
the DRAM content, but just the controllers.

Of course, I wonder how much actual point there is to FBDIMM's once you 
have on-die memory controllers and thus the reason for deep queueing is 
basically gone (since you'd spread out the memory rather than having it 
behind a few central controllers).

Linus
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Mark Gross

On Fri, Mar 02, 2007 at 10:02:57AM -0800, Andrew Morton wrote:
 On Fri, 2 Mar 2007 09:35:27 -0800
 Mark Gross [EMAIL PROTECTED] wrote:
 
   
   Will it be possible to just power the DIMMs off?  I don't see much point 
   in
   some half-power non-destructive mode.
  
  I think so, but need to double check with the HW folks.
  
  Technically, the dims could be powered off, and put into 2 different low
  power non-destructive states.  (standby and suspend), but putting them
  in a low power non-destructive mode has much less latency and provides
  good bang for the buck or LOC change needed to make work.
  
  Which lower power mode an application chooses will depend on latency
  tolerances of the app.  For the POC activities we are looking at we are
  targeting the lower latency option, but that doesn't lock out folks from
  trying to do something with the other options.
  
 
 If we don't evacuate all live data from all of the DIMM, we'll never be
 able to power the thing down in many situations.
 
 Given that we _have_ emptied the DIMM, we can just turn it off.  And
 refilling it will be slow - often just disk speed.
 
 So I don't see a useful use-case for non-destructive states.

I'll post the RFC very soon to provide a better thread context for this
line of discussion, but to answer your question:

There are 2 power management policies we are looking at.  The first one
is allocation based PM, and the other is access base PM.  The access
based PM needs chip set support which is coming at a TBD date.

--mgross
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[patch] timer/hrtimer: take per cpu locks in sane order

2007-03-02 Thread Heiko Carstens

From: Heiko Carstens [EMAIL PROTECTED]

Doing something like this on a two cpu system

# echo 0 > /sys/devices/system/cpu/cpu0/online
# echo 1 > /sys/devices/system/cpu/cpu0/online
# echo 0 > /sys/devices/system/cpu/cpu1/online

will give me this:

===
[ INFO: possible circular locking dependency detected ]
2.6.21-rc2-g562aa1d4-dirty #7
---
bash/1282 is trying to acquire lock:
 (cpu_base-lock_key){.+..}, at: [0005f17e] 
hrtimer_cpu_notify+0xc6/0x240

but task is already holding lock:
 (cpu_base-lock_key#2){.+..}, at: [0005f174] 
hrtimer_cpu_notify+0xbc/0x240

which lock already depends on the new lock.

This happens because we have the following code in kernel/hrtimer.c:

migrate_hrtimers(int cpu)
[...]
old_base = &per_cpu(hrtimer_bases, cpu);
new_base = &get_cpu_var(hrtimer_bases);
[...]
spin_lock(&new_base->lock);
spin_lock(&old_base->lock);

Which means the spinlocks are taken in an order which depends on which cpu
gets shut down from which other cpu. Therefore lockdep complains that there
might be an ABBA deadlock. Since migrate_hrtimers() gets only called on
cpu hotplug it's safe to assume that it isn't executed concurrently on a
different cpu and therefore the locking should be ok.

The same problem exists in kernel/timer.c: migrate_timers().

As pointed out by Christian Borntraeger, one possible solution to avoid
the locking order complaints would be to make sure that the locks are
always taken in the same order, e.g. by taking the lock of the cpu with
the lower number first. AFAICS this should be safe and that is what this
patch does.

Cc: Ingo Molnar [EMAIL PROTECTED]
Cc: Thomas Gleixner [EMAIL PROTECTED]
Cc: Roman Zippel [EMAIL PROTECTED]
Cc: John Stultz [EMAIL PROTECTED]
Cc: Christian Borntraeger [EMAIL PROTECTED]
Cc: Martin Schwidefsky [EMAIL PROTECTED]
Signed-off-by: Heiko Carstens [EMAIL PROTECTED]
---
 kernel/hrtimer.c |   39 ++-
 kernel/timer.c   |   38 ++
 2 files changed, 68 insertions(+), 9 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1343,6 +1343,38 @@ static void migrate_hrtimer_list(struct 
}
 }
 
+/*
+ * double_hrtimer_lock/unlock are used to ensure that on cpu hotplug the
+ * per cpu timer locks are always taken in the same order.
+ */
+static void double_hrtimer_lock(struct hrtimer_cpu_base *base1,
+   struct hrtimer_cpu_base *base2, int ind)
+   __acquires(base1-lock)
+   __acquires(base2-lock)
+{
+   if (ind  0) {
+   spin_lock(base1-lock);
+   spin_lock(base2-lock);
+   } else {
+   spin_lock(base2-lock);
+   spin_lock(base1-lock);
+   }
+}
+
+static void double_hrtimer_unlock(struct hrtimer_cpu_base *base1,
+ struct hrtimer_cpu_base *base2, int ind)
+   __releases(base1-lock)
+   __releases(base2-lock)
+{
+   if (ind  0) {
+   spin_unlock(base2-lock);
+   spin_unlock(base1-lock);
+   } else {
+   spin_unlock(base1-lock);
+   spin_unlock(base2-lock);
+   }
+}
+
 static void migrate_hrtimers(int cpu)
 {
struct hrtimer_cpu_base *old_base, *new_base;
@@ -1355,17 +1387,14 @@ static void migrate_hrtimers(int cpu)
tick_cancel_sched_timer(cpu);
 
local_irq_disable();
-
-   spin_lock(new_base-lock);
-   spin_lock(old_base-lock);
+   double_hrtimer_lock(new_base, old_base, smp_processor_id() - cpu);
 
for (i = 0; i  HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(old_base-clock_base[i],
 new_base-clock_base[i]);
}
-   spin_unlock(old_base-lock);
-   spin_unlock(new_base-lock);
 
+   double_hrtimer_unlock(new_base, old_base, smp_processor_id() - cpu);
local_irq_enable();
put_cpu_var(hrtimer_bases);
 }
Index: linux-2.6/kernel/timer.c
===
--- linux-2.6.orig/kernel/timer.c
+++ linux-2.6/kernel/timer.c
@@ -1640,6 +1640,38 @@ static void migrate_timer_list(tvec_base
}
 }
 
+/*
+ * double_timer_lock/unlock are used to ensure that on cpu hotplug the
+ * per cpu timer locks are always taken in the same order.
+ */
+static void __devinit double_timer_lock(tvec_base_t *base1,
+   tvec_base_t *base2, int ind)
+   __acquires(base1-lock)
+   __acquires(base2-lock)
+{
+   if (ind  0) {
+   spin_lock(base1-lock);
+   spin_lock(base2-lock);
+   } else {
+   spin_lock(base2-lock);
+   spin_lock(base1-lock);
+   }
+}
+
+static void __devinit double_timer_unlock(tvec_base_t

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Davide Libenzi

On Fri, 2 Mar 2007, Davide Libenzi wrote:

 And if you really feel raw about the single O(nready) loop that epoll 
 currently does, a new epoll_wait2 (or whatever) API could be used to 
 deliver the event directly into a userspace buffer [1], directly from the 
 poll callback, w/out extra delivery loops 
 (IRQ/event-epoll_callback-event_buffer).

And if you ever wonder where the epoll name came from, it came from the 
old /dev/epoll. The epoll predecessor, /dev/epoll, was adding plugs 
everywhere events were needed and was delivering those events in O(1) 
*directly* on a user visible (mmap'd) buffer, in a zero-copy fashion.
The old /dev/epoll was faster than the current epoll, but the latter was 
chosen because despite being slightly slower, it had support for every 
pollable device, *without* adding more plugs into the existing code.
Performance and code maintenance are not to be taken disjointly whenever 
you evaluate a solution. That's the reason I got excited about this new 
generic AIO solution.



- Davide


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Very slow routing table modification if RTA_FLOW is set

2007-03-02 Thread David Miller

From: NetArt - Grzegorz Nosek [EMAIL PROTECTED]
Date: Thu, 1 Mar 2007 15:29:11 +0100

 I have noticed that using realm patch for quagga
 http://vcalinus.gemenii.ro/quaggarealms.html causes the kernel to
 spend a lot more time processing rtnetlink messages.

For the second time, I am going to ask you very nicely to
please post this instead to the Linux networking development
mailing list, located at netdev@vger.kernel.org

That is where you will reach the most people knowledgeable in
the area of your problem report; networking developers mostly
do not read linux-kernel.

Thank you.
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: The performance and behaviour of the anti-fragmentation related patches

2007-03-02 Thread Christoph Lameter

On Fri, 2 Mar 2007, Rik van Riel wrote:

 I would like to see separate pageout selection queues
 for anonymous/tmpfs and page cache backed pages.  That
 way we can simply scan only that what we want to scan.
 
 There are several ways available to balance pressure
 between both sets of lists.
 
 Splitting them out will also make it possible to do
 proper use-once replacement for the page cache pages.
 Ie. leaving the really active page cache pages on the
 page cache active list, instead of deactivating them
 because they're lower priority than anonymous pages.

Well I would expect this to have marginal improvements and delay the 
inevitable for awhile until we have even bigger memory. If the app uses 
mmapped data areas then the problem is still there. And such tinkering 
does not solve the issue of large scale I/O requiring the handling of 
gazillions of page structs. I do not think that there is a way around 
somehow handling larger chunks of memory in an easier way. We already do 
handle larger page sizes for some limited purposes and with huge pages we 
already have a larger page size. Mel's defrag/anti-frag patches are 
necessary to allow us to deal with the resulting fragmentation problems.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: + fully-honor-vdso_enabled.patch added to -mm tree

2007-03-02 Thread Oleg Nesterov

On 03/02, Paul Mundt wrote:

 On Thu, Mar 01, 2007 at 08:52:07PM +0300, Oleg Nesterov wrote:
  
   @@ -105,10 +107,25 @@ int arch_setup_additional_pages(struct l
{
 struct mm_struct *mm = current-mm;
 unsigned long addr;
   + unsigned long flags;
 int ret;
  
   + switch (vdso_enabled) {
   + case 0:  /* none */
   + return 0;
  
  This means we don't initialize mm-context.vdso and -sysenter_return.
  
  Is it ok? For example, setup_rt_frame() uses 
  VDSO_SYM(__kernel_rt_sigreturn),
  sysenter_past_esp pushes -sysenter_return on stack.

 The setup_rt_frame() case is fairly straightforward, both PPC and SH
 already check to make sure there's a valid context before trying to use
 VDSO_SYM(), I'm not sure why x86 doesn't.

 Though I wonder if there's any point in checking binfmt-hasvdso here?
 There shouldn't be a valid mm-context.vdso in the !hasvdso case..

setup_rt_frame() is obviously wrong? Surely it must check -hasvdso like
setup_frame() does! Otherwise, we will have SIGSEGV on SA_SIGINFO if
-load_binary() does not call arch_setup_additional_pages(), no?

If no, what -hasvdso is?

 Someone else will have to comment on -sysenter_return.

It is needed for sysexit. If we don't use sysenter (and we shouldn't, because
syscall_page is not mapped), we don't need to initialize it. Note also that
sys_execve() sets TIF_IRET, so we are safe even if sys_execve() was called
using __kernel_vsyscall.

Still, I don't understand why we don't pass NEW_AUX_ENT(AT_SYSINFO) when
vdso_enabled == 0. We don't need linux-gate.so to use __kernel_vsyscall,
we have FIX_VDSO. In that case we should s/PAGE_KERNEL_RO/PAGE_READONLY/
of course. I guess the reason is some magic in glibc.

Oleg.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch] timer/hrtimer: take per cpu locks in sane order

2007-03-02 Thread Andrew Morton

On Fri, 2 Mar 2007 20:08:36 +0100
Heiko Carstens [EMAIL PROTECTED] wrote:

 +/*
 + * double_hrtimer_lock/unlock are used to ensure that on cpu hotplug the
 + * per cpu timer locks are always taken in the same order.
 + */
 +static void double_hrtimer_lock(struct hrtimer_cpu_base *base1,
 + struct hrtimer_cpu_base *base2, int ind)
 + __acquires(base1-lock)
 + __acquires(base2-lock)
 +{

 ...

 +/*
 + * double_timer_lock/unlock are used to ensure that on cpu hotplug the
 + * per cpu timer locks are always taken in the same order.
 + */
 +static void __devinit double_timer_lock(tvec_base_t *base1,
 + tvec_base_t *base2, int ind)
 + __acquires(base1-lock)
 + __acquires(base2-lock)

hm.  Can we not just pass in the spinlock_t*'s and use a common function?

void double_spin_lock(spinlock_t *l1, spinlock_t *l2, int ind);

that way it has nothing to do with timers and can potentially be used
elsewhere in the kernel, too.

(what does ind mean?)
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Ingo Molnar


* Davide Libenzi davidel@xmailserver.org wrote:

 [...] We're still missing proper FPU context switch in the 
 move_user_context(). [...]

yeah - i'm starting to be of the opinion that the FPU context should 
stay with the threadlet, exclusively. I.e. when calling a threadlet, the 
'outer loop' (the event loop) should not leak FPU context into the 
threadlet and then expect it to be replicated from whatever random point 
the threadlet ended up sleeping at. It would be possible, but it just 
makes no sense. What makes most sense is to just keep the FPU context 
with the threadlet, and to let the 'new head' use an initial (unused) 
FPU context. And it's in fact the threadlet that will most likely have 
an active FPU context across a system call, not the outer loop. In other 
words: no special FPU support needed at all for threadlets (i.e. no 
flipping needed even) - this behavior just naturally happens in the 
current implementation. Hm?

Ingo
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH -mm] clocksource init adjustments (fix bug #7426)

2007-03-02 Thread john stultz

On Fri, 2007-03-02 at 02:18 -0800, Andrew Morton wrote:
 On Thu, 22 Feb 2007 16:13:02 -0800 john stultz [EMAIL PROTECTED] wrote:
  Thus the solution here is to register clocksources earlier (ideally when
  the hardware is being initialized), and then we enable clocksource
  selection at fs_initcall (before device_initcall).
  
  This patch should probably get some testing time in -mm, since
  clocksource selection is one of the most important issues for correct
  timekeeping, and I've only been able to test this on a few of my own
  boxes.
 
 This doornails my Nocona box early in boot:
 http://userweb.kernel.org/~akpm/s5000431.jpg
 
 Slab isn't ready yet - time_init()-hpet_arch_init() is called before
 start_kernel() has run kmem_cache_init().

Oh! Sorry! Yea, looking at it more the ioremap isn't actually necessary,
as we can use hpet_readl() instead of re-calculating the hpet base
address pointer.

I'll fix this up (and find an HPET enabled x86_64 box to test it on) and
get a patch to you shortly.

Sorry again!
-john


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RTS/CTS handshaking for embedded devices

2007-03-02 Thread Oleksiy Kebkal


Hi,

I use an arm-linux platform to program a peripheral where peculiar
serial flow control is required:

- There is no flow control for the arm-linux device -> control unit
- For the control unit -> arm-linux device
* RTS must be kept low, the device keeps CTS low as well.
* when the device wants to send data, it raises CTS. RTS must
  be raised as well. Data can then pass, CTS and RTS are lowered.

Actually it's pretty similar to the problem described here:
http://groups.google.de/group/linux.kernel/browse_thread/thread/b5a059bfba931188/1d8cfeef17431180?lnk=gstq=crtsctsrnum=5hl=ru#1d8cfeef17431180
and I suppose, here:
http://groups.google.de/group/linux.kernel/browse_thread/thread/9479f1a2336e6aaa/b2d0cccb7cdb1f2a?lnk=gstq=crtsctsrnum=2hl=ru#b2d0cccb7cdb1f2a

It looked obvious to me that the solution is to switch off the flow
control (CRTSCTS) and to assert/deassert RTS via
ioctl. But I have found that:
* even if flow control is switched off, serial_core.c asserts RTS
during opening the serial device.
* there is no possibility (at least I didn't find it) to preconfigure
serial interface in the user space before opening
 serial device (looks logically), but there is a time gap between
opening the serial device and configuring it, there
 control device thinks that arm is ready to get data (RTS is
asserted), but the configuration parameters are
 potentially wrong.

So the first question is: is there any kernel-provided way to solve
the task described above?

If not, maybe it would be more correct simply not to touch RTS if
flow control is switched off, like here:

Index: drivers/serial/serial_core.c
===
--- a/drivers/serial/serial_core.c  (mode:100644)
+++ b/drivers/serial/serial_core.c  (mode:100644)
@@ -212,7 +212,7 @@
   * is open and ready to respond.
   */
  if (info-tty-termios-c_cflag  CBAUD)
-   info-mctrl |= TIOCM_RTS | TIOCM_DTR;
+   info-mctrl |= TIOCM_DTR | (info-flags 
ASYNC_CTS_FLOW ? 0 : TIOCM_RTS );
  info-ops-set_mctrl(info-port, info-mctrl);

  info-flags |= ASYNC_INITIALIZED;
@@ -1001,8 +1001,8 @@
  /* Handle transition away from B0 status */
  if (!(old_termios-c_cflag  CBAUD)  (cflag  CBAUD)) {
  info-mctrl |= TIOCM_DTR;
-   if (!(cflag  CRTSCTS) ||
-   !test_bit(TTY_THROTTLED, tty-flags))
+   if (/*!(cflag  CRTSCTS) ||*/
+   !test_bit(TTY_THROTTLED, tty-flags))
  info-mctrl |= TIOCM_RTS;
  info-ops-set_mctrl(info-port, info-mctrl);
  }
@@ -1301,7 +1301,7 @@
  spin_lock_irqsave(info-lock, flags);
  if (!(info-flags  ASYNC_CALLOUT_ACTIVE) 
  (tty-termios-c_cflag  CBAUD)) {
-   info-mctrl |= TIOCM_DTR | TIOCM_RTS;
+   info-mctrl |= TIOCM_DTR | (info-flags 
ASYNC_CTS_FLOW ? 0 : TIOCM_RTS );
  info-ops-set_mctrl(info-port, info-mctrl);
  }
  spin_unlock_irqrestore(info-lock, flags);

---
Oleksiy
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH (update 3)] timer: Run calc_load halfway through each round_jiffies second

2007-03-02 Thread Simon Arlott


On 02/03/07 18:03, Eric Dumazet wrote:

On Friday 02 March 2007 18:32, Simon Arlott wrote:

On 02/03/07 16:35, Eric Dumazet wrote:



You could just change LOAD_FREQ from (5*HZ) to (5*HZ+1)
You can see that 5.01 instead of 5.00 second gives the same EXP_xx
values.

So (5*HZ + 1) is safe. (because HZ = 100)

On HZ=1000, this would cause the load average to be pushed towards +1.00
for up to 2 minutes every ~83 minutes with no obvious cause. (If a task
takes ~10-20ms to run, so 20 runs are needed at HZ=1000 before it passes
it again).


Nope, you don't quite understand how load (avenrun[]) is computed.
Not exactly 1.0 as you think !
Then in the next intervals (if active count is 0), it will decrease 'slowly' : 
0.0735627

0.0676809
0.0622695
0.0572907

On average, your load factor is close to reality.


I knew that; but the task runs for more than 1 tick and it takes until the 
next calc_load run before it moves on even 1 tick.


Just try my suggestion, it should work. I even proved it in my previous 
mail :)


With HZ=1000, the active count will be 1 up to 20 times in a row before it 
becomes out of sync with when the task is run again. This is ample time for 
the load value itself to get closer to 1:

$ uptime; (yes >/dev/null &); sleep 100; uptime
20:00:29 up  4:35,  7 users,  load average: 0.33, 0.51, 0.78
20:02:09 up  4:37,  7 users,  load average: 0.97, 0.67, 0.81
(not very useful results since the load isn't at 0.00 very often)


On 02/03/07 16:35, Eric Dumazet wrote:
I believe this patch is too complex/hazardous and may break exp decay 
computation.


I still don't know why you think it may change the computation of load (aside 
from at boot or jiffies wrapping), and it's not really complex at all. It is 
possible that someone will change the value of LOAD_FREQ to something other 
than a multiple of HZ and this won't work because it'll get rounded up to a 
whole second. That and the negligible extra processing time of doing 
round_jiffies every 5 seconds is the only problem I can see.


I accidentally left LOAD_FREQ at 5 instead of 5*HZ and had a printk in there, 
it still worked fine aside from the load average going up and down every tick.


--
Simon Arlott
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [patch 00/13] Syslets, Threadlets, generic AIO support, v3

2007-03-02 Thread Davide Libenzi

On Fri, 2 Mar 2007, Ingo Molnar wrote:

 
 * Davide Libenzi davidel@xmailserver.org wrote:
 
  [...] We're still missing proper FPU context switch in the 
  move_user_context(). [...]
 
 yeah - i'm starting to be of the opinion that the FPU context should 
 stay with the threadlet, exclusively. I.e. when calling a threadlet, the 
 'outer loop' (the event loop) should not leak FPU context into the 
 threadlet and then expect it to be replicated from whatever random point 
 the threadlet ended up sleeping at. It would be possible, but it just 
 makes no sense. What makes most sense is to just keep the FPU context 
 with the threadlet, and to let the 'new head' use an initial (unused) 
 FPU context. And it's in fact the threadlet that will most likely have 
 an active FPU context across a system call, not the outer loop. In other 
 words: no special FPU support needed at all for threadlets (i.e. no 
 flipping needed even) - this behavior just naturally happens in the 
 current implementation. Hm?

I think that the dirty FPU context must, at least, follow the new head. 
That's what the userspace sees, and you don't want an async_exec to 
re-emerge with a different FPU context.
I think it should also follow the async thread (old, going-to-sleep, 
thread), since a threadlet might have that dirtied, and as a consequence 
it'll want to find it back when it's re-scheduled.
So, IMO, if the USEDFPU bit is set, we need to sync the dirty  FPU context 
with an early unlazy_fpu(), *and* copy the sync'd FPU context to the new head.
This should really be a fork of the dirty FPU context IMO, and should only 
happen if the USEDFPU bit is set.



- Davide


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mv643xx ethernet driver

2007-03-02 Thread Stephen Hemminger

On Thu, 01 Mar 2007 23:52:06 -0500
Giridhar Pemmasani [EMAIL PROTECTED] wrote:

 During initialization, mv643xx driver registers IRQ before setting up tx/rx
 rings. This causes kernel oops because mv643xx_poll, which gets called
 right after registering IRQ, calls netif_rx_complete, which accesses the rx
 ring (I don't have the oops message anymore; I just remember this sequence
 of calls). Attached (tested) patch first initializes the rx/tx rings and
 then registers the IRQ.
 
 Giri

Wrong mailing list; network device patches should be sent to:

NETWORK DEVICE DRIVERS
P:  Andrew Morton
M:  [EMAIL PROTECTED]
P:  Jeff Garzik
M:  [EMAIL PROTECTED]
L:  netdev@vger.kernel.org
T:  git kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
S:  Maintained

-- 
Stephen Hemminger [EMAIL PROTECTED]
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] Heads up on sys_fallocate()

2007-03-02 Thread Badari Pulavarty

On Fri, 2007-03-02 at 09:16 -0600, Eric Sandeen wrote:
 Badari Pulavarty wrote:
  
  Amit K. Arora wrote:
  
  This is to give a heads up on few patches that we will be soon coming up
  with. These patches implement a new system call sys_fallocate() and a
  new inode operation fallocate, for persistent preallocation. The new
  system call, as Andrew suggested, will look like:
 
   asmlinkage long sys_fallocate(int fd, loff_t offset, loff_t len);
 
  I am wondering about return values from this syscall ? Is it supposed to 
  return the
  number of bytes allocated ? What about partial allocations ? 
 
 If you don't have enough blocks to cover the request, you should 
 probably just return -ENOSPC, not a partial allocation.

That could be challenging when multiple writers are working in
parallel. You may not be able to return -ENOSPC until you fail the
allocation (for filesystems which allocate a block at a time).

 
  What about 
  if the
  blocks already exists ? What would be return values in those cases ?
 
 0 on success, other normal errors otherwise..
 
 If asked for a range that includes already-allocated blocks, you just 
 allocate any non-allocated blocks in the range, I think.

Yes. What I was trying to figure out is whether there is a requirement that
the interface needs to return the exact number of bytes it *really* allocated
(like write() or read()). I can't think of any, but just wanted to
throw it out..

BTW, what is the interface for finding out what is the size of the
pre-allocated file ? 

Thanks,
Badari

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2 -stable] libata: add missing PM callbacks

2007-03-02 Thread Tejun Heo

Some LLDs were missing scsi device PM callbacks while having host/port
suspend support.  Add missing ones.

Signed-off-by: Tejun Heo [EMAIL PROTECTED]
---
This should fix the problem you're seeing on sil680.  These patches
are against 2.6.20.1.  Patches for libata-dev#upstream are posted
separately to linux-ide in the following thread.

  http://thread.gmane.org/gmane.linux.ide/16475

 drivers/ata/pata_jmicron.c |4 
 drivers/ata/pata_sil680.c  |4 
 2 files changed, 8 insertions(+)

Index: work1/drivers/ata/pata_sil680.c
===
--- work1.orig/drivers/ata/pata_sil680.c
+++ work1/drivers/ata/pata_sil680.c
@@ -226,6 +226,10 @@ static struct scsi_host_template sil680_
.slave_configure= ata_scsi_slave_config,
.slave_destroy  = ata_scsi_slave_destroy,
.bios_param = ata_std_bios_param,
+#ifdef CONFIG_PM
+   .suspend= ata_scsi_device_suspend,
+   .resume = ata_scsi_device_resume,
+#endif
 };
 
 static struct ata_port_operations sil680_port_ops = {
Index: work1/drivers/ata/pata_jmicron.c
===
--- work1.orig/drivers/ata/pata_jmicron.c
+++ work1/drivers/ata/pata_jmicron.c
@@ -137,6 +137,10 @@ static struct scsi_host_template jmicron
.slave_destroy  = ata_scsi_slave_destroy,
/* Use standard CHS mapping rules */
.bios_param = ata_std_bios_param,
+#ifdef CONFIG_PM
+   .suspend= ata_scsi_device_suspend,
+   .resume = ata_scsi_device_resume,
+#endif
 };
 
 static const struct ata_port_operations jmicron_ops = {
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2 -stable] libata: add missing CONFIG_PM in LLDs

2007-03-02 Thread Tejun Heo

Add missing #ifdef CONFIG_PM conditionals around all PM related parts
in libata LLDs.

Signed-off-by: Tejun Heo [EMAIL PROTECTED]
---
 drivers/ata/ahci.c  |   14 ++
 drivers/ata/ata_generic.c   |4 
 drivers/ata/ata_piix.c  |4 
 drivers/ata/pata_ali.c  |6 ++
 drivers/ata/pata_amd.c  |6 ++
 drivers/ata/pata_atiixp.c   |4 
 drivers/ata/pata_cmd64x.c   |6 ++
 drivers/ata/pata_cs5520.c   |7 +++
 drivers/ata/pata_cs5530.c   |6 ++
 drivers/ata/pata_cs5535.c   |4 
 drivers/ata/pata_cypress.c  |4 
 drivers/ata/pata_efar.c |4 
 drivers/ata/pata_hpt366.c   |7 ++-
 drivers/ata/pata_hpt3x3.c   |6 ++
 drivers/ata/pata_it821x.c   |6 ++
 drivers/ata/pata_jmicron.c  |4 
 drivers/ata/pata_marvell.c  |4 
 drivers/ata/pata_mpiix.c|4 
 drivers/ata/pata_netcell.c  |4 
 drivers/ata/pata_ns87410.c  |4 
 drivers/ata/pata_oldpiix.c  |4 
 drivers/ata/pata_opti.c |4 
 drivers/ata/pata_optidma.c  |4 
 drivers/ata/pata_pdc202xx_old.c |4 
 drivers/ata/pata_radisys.c  |4 
 drivers/ata/pata_rz1000.c   |6 ++
 drivers/ata/pata_sc1200.c   |4 
 drivers/ata/pata_serverworks.c  |6 ++
 drivers/ata/pata_sil680.c   |4 
 drivers/ata/pata_sis.c  |4 
 drivers/ata/pata_triflex.c  |4 
 drivers/ata/pata_via.c  |6 ++
 drivers/ata/sata_sil.c  |2 ++
 drivers/ata/sata_sil24.c|2 ++
 34 files changed, 165 insertions(+), 1 deletion(-)

Index: work1/drivers/ata/ahci.c
===
--- work1.orig/drivers/ata/ahci.c
+++ work1/drivers/ata/ahci.c
@@ -225,10 +225,12 @@ static void ahci_thaw(struct ata_port *a
 static void ahci_error_handler(struct ata_port *ap);
 static void ahci_vt8251_error_handler(struct ata_port *ap);
 static void ahci_post_internal_cmd(struct ata_queued_cmd *qc);
+#ifdef CONFIG_PM
 static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg);
 static int ahci_port_resume(struct ata_port *ap);
 static int ahci_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg);
 static int ahci_pci_device_resume(struct pci_dev *pdev);
+#endif
 static void ahci_remove_one (struct pci_dev *pdev);
 
 static struct scsi_host_template ahci_sht = {
@@ -248,8 +250,10 @@ static struct scsi_host_template ahci_sh
.slave_configure= ata_scsi_slave_config,
.slave_destroy  = ata_scsi_slave_destroy,
.bios_param = ata_std_bios_param,
+#ifdef CONFIG_PM
.suspend= ata_scsi_device_suspend,
.resume = ata_scsi_device_resume,
+#endif
 };
 
 static const struct ata_port_operations ahci_ops = {
@@ -276,8 +280,10 @@ static const struct ata_port_operations 
.error_handler  = ahci_error_handler,
.post_internal_cmd  = ahci_post_internal_cmd,
 
+#ifdef CONFIG_PM
.port_suspend   = ahci_port_suspend,
.port_resume= ahci_port_resume,
+#endif
 
.port_start = ahci_port_start,
.port_stop  = ahci_port_stop,
@@ -307,8 +313,10 @@ static const struct ata_port_operations 
.error_handler  = ahci_vt8251_error_handler,
.post_internal_cmd  = ahci_post_internal_cmd,
 
+#ifdef CONFIG_PM
.port_suspend   = ahci_port_suspend,
.port_resume= ahci_port_resume,
+#endif
 
.port_start = ahci_port_start,
.port_stop  = ahci_port_stop,
@@ -441,8 +449,10 @@ static struct pci_driver ahci_pci_driver
.name   = DRV_NAME,
.id_table   = ahci_pci_tbl,
.probe  = ahci_init_one,
+#ifdef CONFIG_PM
.suspend= ahci_pci_device_suspend,
.resume = ahci_pci_device_resume,
+#endif
.remove = ahci_remove_one,
 };
 
@@ -587,6 +597,7 @@ static void ahci_power_up(void __iomem *
writel(cmd | PORT_CMD_ICC_ACTIVE, port_mmio + PORT_CMD);
 }
 
+#ifdef CONFIG_PM
 static void ahci_power_down(void __iomem *port_mmio, u32 cap)
 {
u32 cmd, scontrol;
@@ -604,6 +615,7 @@ static void ahci_power_down(void __iomem
cmd = ~PORT_CMD_SPIN_UP;
writel(cmd, port_mmio + PORT_CMD);
 }
+#endif
 
 static void ahci_init_port(void __iomem *port_mmio, u32 cap,
   dma_addr_t cmd_slot_dma, dma_addr_t rx_fis_dma)
@@ -1336,6 +1348,7 @@ static void ahci_post_internal_cmd(struc
}
 }
 
+#ifdef CONFIG_PM
 static int ahci_port_suspend(struct ata_port *ap, pm_message_t mesg)
 {
struct ahci_host_priv *hpriv = ap-host-private_data;
@@ -1412,6 +1425,7 @@ static int

Re: [PATCH] libata: Cable detection fixes

2007-03-02 Thread Alan Cox

 However, given that we are in -rc cycle, and the wide impact of this 
 change, this patch wants splitting.  The -cable_detect stuff should be 
 in a separate patch from the IDENTIFY DEVICE ordering stuff.  This 
 ensures sanity when git-bisecting changes, and allows fast-tracking of 
 the identify-ordering change.

Fine by me - I carefully sent Linus the hook but no changes using it and
sent those changes just to the list/you.

Alan
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] libata: Cable detection fixes

2007-03-02 Thread Alan Cox

 Hm, I got recently hands on a hardware where 2.6.21-rc1 based
 kernels from Fedora rawhide simply do not boot as there is no
 way to get to disks.  I would not mind some change in behavior
 although so far I can boot at least some earlier kernels.

Doesn't look related at all. Looks like the box you have has chronic IRQ
routing problems.

Alan
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] pata_qdi: Fix initialisation

2007-03-02 Thread Alan Cox

The QDI init code contains some bugs which mean it only works if you have
a test setup that causes both a successful and a failed probe. Fix this.

Found by Philip Guo

(Who found it while working on code analysis tools, not by running VLB IDE
controllers)

Signed-off-by: Alan Cox [EMAIL PROTECTED]

diff -u --new-file --recursive 
--exclude-from /usr/src/exclude linux.vanilla-2.6.21-rc2/drivers/ata/pata_qdi.c 
linux-2.6.21-rc2/drivers/ata/pata_qdi.c
--- linux.vanilla-2.6.21-rc2/drivers/ata/pata_qdi.c 2007-03-01 
13:36:03.0 +
+++ linux-2.6.21-rc2/drivers/ata/pata_qdi.c 2007-03-02 13:20:27.0 
+
@@ -363,7 +365,8 @@
release_region(port, 2);
continue;
}
-   ct += qdi_init_one(port, 6500, ide_port[r  
0x01], ide_irq[r  0x01], r  0x04);
+   if (qdi_init_one(port, 6500, ide_port[r  
0x01], ide_irq[r  0x01], r  0x04) == 0)
+   ct++;
}
if (((r  0xF0) == 0xA0) || (r  0xF0) == 0x50) {
/* QD6580: dual channel */
@@ -375,11 +378,14 @@
res = inb(port + 3);
if (res  1) {
/* Single channel mode */
-   ct += qdi_init_one(port, 6580, 
ide_port[r  0x01], ide_irq[r  0x01], r  0x04);
+   if (qdi_init_one(port, 6580, ide_port[r 
 0x01], ide_irq[r  0x01], r  0x04))
+   ct++;
} else {
/* Dual channel mode */
-   ct += qdi_init_one(port, 6580, 0x1F0, 
14, r  0x04);
-   ct += qdi_init_one(port + 2, 6580, 
0x170, 15, r  0x04);
+   if (qdi_init_one(port, 6580, 0x1F0, 14, 
r  0x04) == 0)
+   ct++;
+   if (qdi_init_one(port + 2, 6580, 0x170, 
15, r  0x04) == 0)
+   ct++;
}
}
}


-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] pata_cmd640: CMD640 PCI support

2007-03-02 Thread Alan Cox

Support for the PCI CMD640 (not VLB)

Signed-off-by: Alan Cox [EMAIL PROTECTED]

diff -u --new-file --recursive --exclude-from /usr/src/exclude 
linux.vanilla-2.6.21-rc2/drivers/ata/Kconfig 
linux-2.6.21-rc2/drivers/ata/Kconfig
--- linux.vanilla-2.6.21-rc2/drivers/ata/Kconfig2007-03-01 
13:36:03.0 +
+++ linux-2.6.21-rc2/drivers/ata/Kconfig2007-03-02 13:30:50.535164824 
+
@@ -209,6 +209,16 @@
 
  If unsure, say N.
 
+config PATA_CMD640_PCI
+   tristate CMD640 PCI PATA support (Very Experimental)
+   depends on PCI  EXPERIMENTAL
+   help
+ This option enables support for the CMD640 PCI IDE
+ interface chip. Only the primary channel is currently
+ supported.
+
+ If unsure, say N.
+
 config PATA_CMD64X
tristate CMD64x PATA support (Very Experimental)
depends on PCI EXPERIMENTAL
diff -u --new-file --recursive --exclude-from /usr/src/exclude 
linux.vanilla-2.6.21-rc2/drivers/ata/Makefile 
linux-2.6.21-rc2/drivers/ata/Makefile
--- linux.vanilla-2.6.21-rc2/drivers/ata/Makefile   2007-03-01 
13:36:03.0 +
+++ linux-2.6.21-rc2/drivers/ata/Makefile   2007-03-02 13:12:26.0 
+
@@ -22,6 +22,7 @@
 obj-$(CONFIG_PATA_AMD) += pata_amd.o
 obj-$(CONFIG_PATA_ARTOP)   += pata_artop.o
 obj-$(CONFIG_PATA_ATIIXP)  += pata_atiixp.o
+obj-$(CONFIG_PATA_CMD640_PCI)  += pata_cmd640.o
 obj-$(CONFIG_PATA_CMD64X)  += pata_cmd64x.o
 obj-$(CONFIG_PATA_CS5520)  += pata_cs5520.o
 obj-$(CONFIG_PATA_CS5530)  += pata_cs5530.o
diff -u --new-file --recursive --exclude-from /usr/src/exclude 
linux.vanilla-2.6.21-rc2/drivers/ata/pata_cmd640.c 
linux-2.6.21-rc2/drivers/ata/pata_cmd640.c
--- linux.vanilla-2.6.21-rc2/drivers/ata/pata_cmd640.c  1970-01-01 
01:00:00.0 +0100
+++ linux-2.6.21-rc2/drivers/ata/pata_cmd640.c  2007-03-02 13:25:32.041583208 
+
@@ -0,0 +1,298 @@
+/*
+ * pata_cmd640.c   - CMD640 PCI PATA for new ATA layer
+ *   (C) 2007 Red Hat Inc
+ *   Alan Cox [EMAIL PROTECTED]
+ *
+ * Based upon
+ *  linux/drivers/ide/pci/cmd640.c Version 1.02  Sep 01, 1996
+ *
+ *  Copyright (C) 1995-1996  Linus Torvalds  authors (see driver)
+ *
+ * This drives only the PCI version of the controller. If you have a
+ * VLB one then we have enough docs to support it but you can write
+ * your own code.
+ */
+
+#include linux/kernel.h
+#include linux/module.h
+#include linux/pci.h
+#include linux/init.h
+#include linux/blkdev.h
+#include linux/delay.h
+#include scsi/scsi_host.h
+#include linux/libata.h
+
+#define DRV_NAME pata_cmd640
+#define DRV_VERSION 0.0.3
+
+struct cmd640_reg {
+   int last;
+   u8 reg58[ATA_MAX_DEVICES];
+};
+
+enum {
+   CFR = 0x50,
+   CNTRL = 0x51,
+   CMDTIM = 0x52,
+   ARTIM0 = 0x53,
+   DRWTIM0 = 0x54,
+   ARTIM23 = 0x57,
+   DRWTIM23 = 0x58,
+   BRST = 0x59
+};
+
+/**
+ * cmd640_set_piomode  -   set initial PIO mode data
+ * @adev: ATA device
+ *
+ * Called to do the PIO mode setup.
+ */
+
+static void cmd640_set_piomode(struct ata_port *ap, struct ata_device *adev)
+{
+   struct cmd640_reg *timing = ap-private_data;
+   struct pci_dev *pdev = to_pci_dev(ap-host-dev);
+   struct ata_timing t;
+   const unsigned long T = 100 / 33;
+   const u8 setup_data[] = { 0x40, 0x40, 0x40, 0x80, 0x00 };
+   u8 reg;
+   int arttim = ARTIM0 + 2 * adev-devno;
+   struct ata_device *pair = ata_dev_pair(adev);
+
+   if (ata_timing_compute(adev, adev-pio_mode, t, T, 0)  0) {
+   printk(KERN_ERR DRV_NAME : mode computation failed.\n);
+   return;
+   }
+   
+   /* The second channel has shared timings and the setup timing is
+  messy to switch to merge it for worst case */
+   if (ap-port_no  pair) {
+   struct ata_timing p;
+   ata_timing_compute(pair, pair-pio_mode, p, T, 1);
+   ata_timing_merge(p, t, t, ATA_TIMING_SETUP);
+   }
+
+   /* Make the timings fit */
+   if (t.recover  16) {
+   t.active += t.recover - 16;
+   t.recover = 16;
+   }
+   if (t.active  16)
+   t.active = 16;
+
+   /* Now convert the clocks into values we can actually stuff into
+  the chip */
+
+   if (t.recover  1)
+   t.recover--;/* 640B only */
+   else
+   t.recover = 15;
+
+   if (t.setup  4)
+   t.setup = 0xC0;
+   else
+   t.setup = setup_data[t.setup];
+
+   if (ap-port_no == 0) {
+   t.active = 0x0F;   /* 0 = 16 */
+
+   /* Load setup timing */
+   pci_read_config_byte(pdev, arttim, reg);
+   reg = 0x3F;
+   reg |= t.setup;
+   pci_write_config_byte(pdev, arttim, reg);
+
+   /* Load active/recovery */
+

< 1 2 3 4 5 6 7 8 9 >

101 - 200 of 825 matches

Mail list logo