Re: Linux 2.6.20.16

2007-08-15 Thread Willy Tarreau
diff --git a/Makefile b/Makefile
index 947ff3c..b3806cb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 20
-EXTRAVERSION = .15
+EXTRAVERSION = .16
 NAME = Homicidal Dwarf Hamster
 
 # *DOCUMENTATION*
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683..9bf056e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -367,10 +367,6 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
-   testl $TF_MASK,PT_EFLAGS(%esp)
-   jz no_singlestep
-   orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
	# system call tracing in operation / emulation
	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -385,6 +381,10 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
+   testl $TF_MASK,PT_EFLAGS(%esp)  # If tracing set singlestep flag on exit
+   jz no_singlestep
+   orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx   # current->work
jne syscall_exit_work
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 3700eef..be4a9a8 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -131,7 +131,6 @@ static void nmi_save_registers(void * dummy)
 {
int cpu = smp_processor_id();
	struct op_msrs * msrs = &cpu_msrs[cpu];
-   model->fill_in_addresses(msrs);
nmi_cpu_save_registers(msrs);
 }
 
@@ -195,6 +194,7 @@ static struct notifier_block profile_exceptions_nb = {
 static int nmi_setup(void)
 {
int err=0;
+   int cpu;
 
if (!allocate_msrs())
return -ENOMEM;
@@ -207,6 +207,13 @@ static int nmi_setup(void)
/* We need to serialize save and setup for HT because the subset
 * of msrs are distinct for save and setup operations
 */
+
+   /* Assume saved/restored counters are the same on all CPUs */
+   model->fill_in_addresses(&cpu_msrs[0]);
+   for_each_possible_cpu (cpu) {
+   if (cpu != 0)
+   cpu_msrs[cpu] = cpu_msrs[0];
+   }
on_each_cpu(nmi_save_registers, NULL, 0, 1);
on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
nmi_enabled = 1;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f72e8e8..a84304e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 */
discard_lazy_cpu_state();
 
+   /*
+* Force reload of FP/VEC.
+* This has to be done before copying stuff into current->thread.fpr/vr
+* for the reasons explained in the previous comment.
+*/
+   regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
	err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);
 
 #ifdef CONFIG_ALTIVEC
@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
current->thread.vrsave = 0;
 #endif /* CONFIG_ALTIVEC */
 
-   /* Force reload of FP/VEC */
-   regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
-
return err;
 }
 
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90..e67cc4f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,6 +72,8 @@ void show_mem(void)
 
for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+   if (!pfn_valid(pgdat->node_start_pfn + i))
+   continue;
page = pfn_to_page(pgdat->node_start_pfn + i);
total++;
if (PageReserved(page))
@@ -766,3 +768,9 @@ int in_gate_area_no_task(unsigned long addr)
 {
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+   return __alloc_bootmem_core(pgdat->bdata, size,
+   SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 3ffa080..e4e0ccb 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1102,6 +1102,7 @@ static void cyy_intr_chip(struct cyclades_card *cinfo, int chip,
 
if (data & info->ignore_status_mask) {
info->icount.rx++;
+   spin_unlock(&cinfo->card_lock);
return;
}
 

Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma


On Thu, 16 Aug 2007, Satyam Sharma wrote:

> Hi Bill,
> 
> 
> On Wed, 15 Aug 2007, Bill Fink wrote:
> 
> > On Wed, 15 Aug 2007, Satyam Sharma wrote:
> > 
> > > (C)
> > > $ cat tp3.c
> > > int a;
> > > 
> > > void func(void)
> > > {
> > >   *(volatile int *)&a = 10;
> > >   *(volatile int *)&a = 20;
> > > }
> > > $ gcc -Os -S tp3.c
> > > $ cat tp3.s
> > > ...
> > > movl    $10, a
> > > movl    $20, a
> > > ...
> > 
> > I'm curious about one minor tangential point.  Why, instead of:
> > 
> > b = *(volatile int *)&a;
> > 
> > why can't this just be expressed as:
> > 
> > b = (volatile int)a;
> > 
> > Isn't it the contents of a that's volatile, i.e. its value can change
> > invisibly to the compiler, and that's why you want to force a read from
> > memory?  Why do you need the "*(volatile int *)&" construct?
> 
> "b = (volatile int)a;" doesn't help us because a cast to a qualified type
> has the same effect as a cast to an unqualified version of that type, as
> mentioned in 6.5.4:4 (footnote 86) of the standard. Note that "volatile"
> is a type-qualifier, not a type itself, so a cast of the _object_ itself
> to a qualified-type i.e. (volatile int) would not make the access itself
> volatile-qualified.
> 
> To serve our purposes, it is necessary for us to take the address of this
> (non-volatile) object, cast the resulting _pointer_ to the corresponding
> volatile-qualified pointer-type, and then dereference it. This makes that
> particular _access_ be volatile-qualified, without the object itself being
> such. Also note that the (dereferenced) result is also a valid lvalue and
> hence can be used in "*(volatile int *)&a = b;" kind of construction
> (which we use for the atomic_set case).

Here, I should obviously admit that the semantics of *(volatile int *)&
aren't any neater or well-defined in the _language standard_ at all. The
standard does say (verbatim) "precisely what constitutes an access to an
object of volatile-qualified type is implementation-defined", but GCC
does help us out here by doing the right thing. Accessing the non-volatile
object there using the volatile-qualified pointer-type cast makes GCC
treat the object stored at that memory address itself as if it were a
volatile object, thus making the access end up having what we're calling
"volatility" semantics here.

Honestly, given such confusion, and the propensity of the "volatile"
type-qualifier keyword to be ill-defined (or at least poorly understood,
often inconsistently implemented), I'd (again) express my opinion that it
would be best to avoid its usage, given other alternatives do exist.
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Linux 2.6.20.16

2007-08-15 Thread Willy Tarreau

I've just released Linux 2.6.20.16. This version catches up with 2.6.21.7.
I hope to issue newer releases soon with the next batches of pending patches.

I'll also be replying to this message with a copy of the patch between
2.6.20.15 and 2.6.20.16.

The patch and changelog will appear soon at the following locations:
  ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/
  ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/patch-2.6.20.16.bz2
  ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.20.16

Git repository:
   git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.20.y.git
  http://www.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.20.y.git

Git repository through the gitweb interface:
  http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.20.y.git

Willy

---

 Makefile  |2 
 arch/i386/kernel/entry.S  |8 -
 arch/i386/oprofile/nmi_int.c  |9 +
 arch/powerpc/kernel/signal_64.c   |   10 -
 arch/x86_64/mm/init.c |8 +
 drivers/char/cyclades.c   |1 
 drivers/md/bitmap.c   |   17 +-
 drivers/md/dm-crypt.c |   50 ++
 drivers/md/raid0.c|3 
 drivers/md/raid1.c|   21 +-
 drivers/md/raid10.c   |6 
 drivers/media/video/saa7134/saa7134-tvaudio.c |2 
 drivers/net/e1000/e1000_main.c|   11 -
 drivers/net/sky2.c|9 +
 drivers/serial/mpsc.c |3 
 include/linux/bootmem.h   |1 
 include/linux/pci_ids.h   |4 
 include/linux/raid/bitmap.h   |1 
 include/linux/sched.h |1 
 include/linux/workqueue.h |2 
 kernel/auditfilter.c  |2 
 kernel/exit.c |   22 ++
 kernel/futex.c|  205 --
 kernel/rtmutex.c  |   24 ++-
 kernel/sched.c|   22 +-
 mm/rmap.c |   24 ---
 mm/sparse.c   |   11 +
 27 files changed, 307 insertions(+), 172 deletions(-)

Summary of changes from 2.6.20.15 to 2.6.20.16


Alexey Kuznetsov (1):
  pi-futex: Fix exit races and locking problems

Andi Kleen (1):
  i386: Fix K8/core2 oprofile on multiple CPUs

Auke Kok (1):
  e1000: disable polling before registering netdevice

Bob Picco (1):
  sparsemem: fix oops in x86_64 show_mem

Christoph Lameter (1):
  sched: fix next_interval determination in idle_balance()

Hugh Dickins (1):
  mm: kill validate_anon_vma to avoid mapcount BUG

Jason Gaston (1):
  pci_ids: update patch for Intel ICH9M

Jason Wessel (1):
  i386: fix infinite loop with singlestep int80 syscalls

Jay Lubomirski (1):
  serial: clear proper MPSC interrupt cause bits

Jeff Mahoney (1):
  saa7134: fix thread shutdown handling

Jiri Slaby (1):
  Char: cyclades, fix deadlock

Mike Accetta (1):
  md: Fix bug in error handling during raid1 repair.

Milan Broz (1):
  dm crypt: disable barriers

NeilBrown (3):
  md: Avoid overflow in raid0 calculation with large components.
  md: Don't write more than is required of the last page of a bitmap
  md: Fix two raid10 bugs.

Olaf Kirch (3):
  dm crypt: fix call to clone_init
  dm crypt: fix avoid cloned bio ref after free
  dm crypt: fix remove first_clone

Oleg Nesterov (1):
  make freezeable workqueues singlethread

Paul Mackerras (1):
  POWERPC: Fix subtle FP state corruption bug in signal return on SMP

Stephen Hemminger (1):
  sky2: workaround for lost IRQ

Thomas Gleixner (3):
  rt-mutex: Fix stale return value
  rt-mutex: Fix chain walk early wakeup bug
  FUTEX: Restore the dropped ERSCH fix

Tony Jones (1):
  audit: fix oops removing watch if audit disabled

Willy Tarreau (1):
  Linux 2.6.20.16

Zou Nan hai (1):
  x86_64: allocate sparsemem memmap above 4G



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 02:11:43PM +1000, Paul Mackerras wrote:
>
> The uses of atomic_read where one might want it to allow caching of
> the result seem to me to fall into 3 categories:
> 
> 1. Places that are buggy because of a race arising from the way it's
>used.
> 
> 2. Places where there is a race but it doesn't matter because we're
>doing some clever trick.
> 
> 3. Places where there is some locking in place that eliminates any
>potential race.

Agreed.

> In case 1, adding volatile won't solve the race, of course, but it's
> hard to argue that we shouldn't do something because it will slow down
> buggy code.  Case 2 is hopefully pretty rare and accompanied by large
> comment blocks, and in those cases caching the result of atomic_read
> explicitly in a local variable would probably make the code clearer.
> And in case 3 there is no reason to use atomic_t at all; we might as
> well just use an int.

Since adding volatile doesn't help any of the 3 cases, and
takes away optimisations from both 2 and 3, I wonder what
is the point of the addition after all?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 02:34:25PM +1000, Paul Mackerras wrote:
>
> I'm talking about this situation:
> 
> CPU 0 comes into __sk_stream_mem_reclaim, reads memory_allocated, but
> then before it can do the store to *memory_pressure, CPUs 1-1023 all
> go through sk_stream_mem_schedule, collectively increase
> memory_allocated to more than sysctl_mem[2] and set *memory_pressure.
> Finally CPU 0 gets to do its store and it sets *memory_pressure back
> to 0, but by this stage memory_allocated is way larger than
> sysctl_mem[2].

It doesn't matter.  The memory pressure flag is an *advisory*
flag.  If we get it wrong the worst that'll happen is that we'd
waste some time doing work that'll be thrown away.

Please look at the places where it's used before jumping to
conclusions.

> Now, maybe it's the case that it doesn't really matter whether
> *->memory_pressure is 0 or 1.  But if so, why bother computing it at
> all?

As long as we get it right most of the time (and I think you
would agree that we do get it right most of the time), then
this flag has achieved its purpose.

> People seem to think that using atomic_t means they don't need to use
> a spinlock.  That's fine if there is only one variable involved, but
> as soon as there's more than one, there's the possibility of a race,
> whether or not you use atomic_t, and whether or not atomic_read has
> "volatile" behaviour.

In any case, this actually illustrates why the addition of
volatile is completely pointless.  Even if this code was
broken, which it definitely is not, having the volatile
there wouldn't have helped at all.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [Documentation] Page Table Layout diagrams

2007-08-15 Thread David Gibson
On Wed, Aug 08, 2007 at 01:47:45PM -0500, Adam Litke wrote:
> Hello all.  In an effort to understand how the page tables are laid out
> across various architectures I put together some diagrams.  I have
> posted them on the linux-mm wiki: http://linux-mm.org/PageTableStructure
> and I hope they will be useful to others.  
> 
> Just to make sure I am not spreading misinformation, could a few of you
> experts take a quick look at the three diagrams I've got finished so far
> and point out any errors I have made?  Thanks.

Nice.  Didn't spot any inaccuracies.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


[PATCH 005 of 5] Merge blk_recount_segments into blk_recalc_rq_segments

2007-08-15 Thread NeilBrown

blk_recalc_rq_segments calls blk_recount_segments on each bio,
then does some extra calculations to handle segments that overlap
two bios.

If we merge the code from blk_recount_segments into
blk_recalc_rq_segments, we can process the whole request one bio_vec
at a time, and not need the messy cross-bio calculations.

Then blk_recount_segments can be implemented by calling
blk_recalc_rq_segments, passing it a simple on-stack request which
stores just the bio.  This function is only temporary and will go away
completely by the end of this patch series.

This allows us to remove rq_for_each_bio.


Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./block/ll_rw_blk.c  |  125 ---
 ./include/linux/blkdev.h |3 -
 2 files changed, 55 insertions(+), 73 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000
+++ ./block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000
@@ -42,6 +42,7 @@ static void drive_stat_acct(struct reque
 static void init_request_from_bio(struct request *req, struct bio *bio);
 static int __make_request(struct request_queue *q, struct bio *bio);
 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
+static void blk_recalc_rq_segments(struct request *rq);
 
 /*
  * For the allocated request tables
@@ -1209,65 +1210,91 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   struct bio_vec *bv, *bvprv = NULL;
-   int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster;
+   struct request rq;
+   struct bio *nxt = bio->bi_next;
+   rq.q = q;
+   rq.bio = rq.biotail = bio;
+   bio->bi_next = NULL;
+   blk_recalc_rq_segments(&rq);
+   bio->bi_next = nxt;
+   bio->bi_phys_segments = rq.nr_phys_segments;
+   bio->bi_hw_segments = rq.nr_hw_segments;
+   bio->bi_flags |= (1 << BIO_SEG_VALID);
+}
+EXPORT_SYMBOL(blk_recount_segments);
+
+static void blk_recalc_rq_segments(struct request *rq)
+{
+   int nr_phys_segs;
+   int nr_hw_segs;
+   unsigned int phys_size;
+   unsigned int hw_size;
+   struct bio_vec bv;
+   struct bio_vec bvprv = {0};
+   int seg_size;
+   int hw_seg_size;
+   int cluster;
+   struct req_iterator i;
int high, highprv = 1;
+   struct request_queue *q = rq->q;
 
-   if (unlikely(!bio->bi_io_vec))
+   if (!rq->bio)
return;
 
cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
-   hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0;
-   bio_for_each_segment(bv, bio, i) {
+   hw_seg_size = seg_size = 0;
+   phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
+   rq_for_each_segment(rq, i, bv) {
/*
 * the trick here is making sure that a high page is never
 * considered part of another segment, since that might
 * change with the bounce page.
 */
-   high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+   high = page_to_pfn(bv.bv_page) > q->bounce_pfn;
if (high || highprv)
goto new_hw_segment;
if (cluster) {
-   if (seg_size + bv->bv_len > q->max_segment_size)
+   if (seg_size + bv.bv_len > q->max_segment_size)
goto new_segment;
-   if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
+   if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
goto new_segment;
-   if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
+   if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
goto new_segment;
-   if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
+   if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv.bv_len))
goto new_hw_segment;
 
-   seg_size += bv->bv_len;
-   hw_seg_size += bv->bv_len;
+   seg_size += bv.bv_len;
+   hw_seg_size += bv.bv_len;
bvprv = bv;
continue;
}
 new_segment:
-   if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
-   !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) {
-   hw_seg_size += bv->bv_len;
-   } else {
+   if (BIOVEC_VIRT_MERGEABLE(&bvprv, &bv) &&
+   !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv.bv_len))
+   hw_seg_size += bv.bv_len;
+   else {
 new_hw_segment:
-   if (hw_seg_size > bio->bi_hw_front_size)
-   bio->bi_hw_front_size = hw_seg_size;
-   hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
+   

[PATCH 004 of 5] Introduce rq_for_each_segment replacing rq_for_each_bio

2007-08-15 Thread NeilBrown

(almost) every usage of rq_for_each_bio wraps a usage of
bio_for_each_segment, so these can be combined into
rq_for_each_segment.

We get it to fill in a bio_vec structure rather than provide a
pointer, as future changes to make bi_io_vec immutable will require
that.

The one place where rq_for_each_bio remains will be changed to use
rq_for_each_segment in a subsequent patch.
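
Roughly, the combined iterator nests the two old loops and hands back
each bio_vec by value -- a sketch of the shape (the real macro would
also have to honour bi_idx), not the patch's exact text:

	struct req_iterator {
		int i;
		struct bio *bio;
	};

	#define rq_for_each_segment(rq, iter, bvec)			\
		for (iter.bio = (rq)->bio; iter.bio;			\
		     iter.bio = iter.bio->bi_next)			\
			for (iter.i = 0;				\
			     iter.i < iter.bio->bi_vcnt &&		\
			       (bvec = iter.bio->bi_io_vec[iter.i], 1);	\
			     iter.i++)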

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./block/ll_rw_blk.c  |   54 +++--
 ./drivers/block/floppy.c |   84 ++-
 ./drivers/block/lguest_blk.c |   25 +--
 ./drivers/block/nbd.c|   67 +++
 ./drivers/block/xen-blkfront.c   |   57 --
 ./drivers/ide/ide-floppy.c   |   62 +---
 ./drivers/s390/block/dasd_diag.c |   40 --
 ./drivers/s390/block/dasd_eckd.c |   47 ++---
 ./drivers/s390/block/dasd_fba.c  |   47 ++---
 ./drivers/s390/char/tape_34xx.c  |   33 ++-
 ./drivers/s390/char/tape_3590.c  |   41 ---
 ./include/linux/blkdev.h |   18 
 12 files changed, 280 insertions(+), 295 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000
+++ ./block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000
@@ -1313,9 +1313,11 @@ static int blk_hw_contig_segment(struct 
 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
  struct scatterlist *sg)
 {
-   struct bio_vec *bvec, *bvprv;
-   struct bio *bio;
-   int nsegs, i, cluster;
+   struct bio_vec bvec;
+   struct bio_vec bvprv = { 0 };
+   struct req_iterator i;
+   int nsegs;
+   int cluster;
 
nsegs = 0;
cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
@@ -1323,36 +1325,30 @@ int blk_rq_map_sg(struct request_queue *
/*
 * for each bio in rq
 */
-   bvprv = NULL;
-   rq_for_each_bio(bio, rq) {
-   /*
-* for each segment in bio
-*/
-   bio_for_each_segment(bvec, bio, i) {
-   int nbytes = bvec->bv_len;
+   rq_for_each_segment(rq, i, bvec) {
+   int nbytes = bvec.bv_len;
 
-   if (bvprv && cluster) {
-   if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
-   goto new_segment;
-
-   if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
-   goto new_segment;
-   if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
-   goto new_segment;
+   if (bvprv.bv_page && cluster) {
+   if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
+   goto new_segment;
+
+   if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec))
+   goto new_segment;
+   if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bvec))
+   goto new_segment;
 
-   sg[nsegs - 1].length += nbytes;
-   } else {
+   sg[nsegs - 1].length += nbytes;
+   } else {
 new_segment:
-   memset(&sg[nsegs],0,sizeof(struct scatterlist));
-   sg[nsegs].page = bvec->bv_page;
-   sg[nsegs].length = nbytes;
-   sg[nsegs].offset = bvec->bv_offset;
+   memset(&sg[nsegs], 0, sizeof(struct scatterlist));
+   sg[nsegs].page = bvec.bv_page;
+   sg[nsegs].length = nbytes;
+   sg[nsegs].offset = bvec.bv_offset;
 
-   nsegs++;
-   }
-   bvprv = bvec;
-   } /* segments in bio */
-   } /* bios in rq */
+   nsegs++;
+   }
+   bvprv = bvec;
+   }
 
return nsegs;
 }

diff .prev/drivers/block/floppy.c ./drivers/block/floppy.c
--- .prev/drivers/block/floppy.c2007-08-16 15:02:30.0 +1000
+++ ./drivers/block/floppy.c2007-08-16 15:02:31.0 +1000
@@ -2450,23 +2450,20 @@ static void rw_interrupt(void)
 /* Compute maximal contiguous buffer size. */
 static int buffer_chain_size(void)
 {
-   struct bio *bio;
-   struct bio_vec *bv;
+   struct bio_vec bv;
int size;
-   int i;
+   struct req_iterator i;
char *base;
 
base = blk_rq_data(current_req);
size = 0;
 
-   rq_for_each_bio(bio, current_req) {
-   bio_for_each_segment(bv, bio, i) {
-   if (page_address(bv->bv_page) + bv->bv_offset !=
-   

[PATCH 002 of 5] Replace bio_data with blk_rq_data

2007-08-15 Thread NeilBrown

Almost every call to bio_data is for the first bio
in a request.  A future patch will add some accounting
information to 'struct request' which will need to be
used to find the start of the request in the bio.
So replace bio_data with blk_rq_data which takes a 'struct request *'.

The one exception is in dm-emc where using
   page_address(bio->bi_io_vec[0].bv_page);
is appropriate.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./block/ll_rw_blk.c  |   16 
 ./drivers/block/floppy.c |2 +-
 ./drivers/ide/ide-cd.c   |   11 ++-
 ./drivers/ide/ide-io.c   |2 +-
 ./drivers/md/dm-emc.c|2 +-
 ./drivers/message/fusion/mptsas.c|4 ++--
 ./drivers/scsi/libsas/sas_expander.c |4 ++--
 ./include/linux/bio.h|4 +---
 ./include/linux/blkdev.h |1 +
 9 files changed, 27 insertions(+), 19 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c 2007-08-16 15:02:29.0 +1000
+++ ./block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000
@@ -2907,8 +2907,8 @@ static void init_request_from_bio(struct
req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
req->nr_phys_segments = bio_phys_segments(req->q, bio);
req->nr_hw_segments = bio_hw_segments(req->q, bio);
-   req->buffer = bio_data(bio);/* see ->buffer comment above */
req->bio = req->biotail = bio;
+   req->buffer = blk_rq_data(req); /* see ->buffer comment above */
req->ioprio = bio_prio(bio);
req->rq_disk = bio->bi_bdev->bd_disk;
req->start_time = jiffies;
@@ -2977,7 +2977,7 @@ static int __make_request(struct request
 * it didn't need a bounce buffer then it better
 * not touch req->buffer either...
 */
-   req->buffer = bio_data(bio);
+   req->buffer = blk_rq_data(req);
req->current_nr_sectors = bio_cur_sectors(bio);
req->hard_cur_sectors = req->current_nr_sectors;
req->sector = req->hard_sector = bio->bi_sector;
@@ -3373,7 +3373,7 @@ static void blk_recalc_rq_sectors(struct
rq->nr_sectors = rq->hard_nr_sectors;
rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
rq->current_nr_sectors = rq->hard_cur_sectors;
-   rq->buffer = bio_data(rq->bio);
+   rq->buffer = blk_rq_data(rq);
}
 
/*
@@ -3678,14 +3678,22 @@ void blk_rq_bio_prep(struct request_queu
rq->current_nr_sectors = bio_cur_sectors(bio);
rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
-   rq->buffer = bio_data(bio);
rq->data_len = bio->bi_size;
 
rq->bio = rq->biotail = bio;
+   rq->buffer = blk_rq_data(rq);
 }
 
 EXPORT_SYMBOL(blk_rq_bio_prep);
 
+void *blk_rq_data(struct request *rq)
+{
+   return page_address(bio_page(rq->bio)) +
+   bio_offset(rq->bio);
+}
+EXPORT_SYMBOL(blk_rq_data);
+
+
 int kblockd_schedule_work(struct work_struct *work)
 {
return queue_work(kblockd_workqueue, work);

diff .prev/drivers/block/floppy.c ./drivers/block/floppy.c
--- .prev/drivers/block/floppy.c2007-08-16 14:57:31.0 +1000
+++ ./drivers/block/floppy.c2007-08-16 15:02:30.0 +1000
@@ -2456,7 +2456,7 @@ static int buffer_chain_size(void)
int i;
char *base;
 
-   base = bio_data(current_req->bio);
+   base = blk_rq_data(current_req);
size = 0;
 
rq_for_each_bio(bio, current_req) {

diff .prev/drivers/ide/ide-cd.c ./drivers/ide/ide-cd.c
--- .prev/drivers/ide/ide-cd.c  2007-08-16 14:57:31.0 +1000
+++ ./drivers/ide/ide-cd.c  2007-08-16 15:02:30.0 +1000
@@ -1381,10 +1381,11 @@ static ide_startstop_t cdrom_start_seek 
start it over entirely, or even put it back on the request queue. */
 static void restore_request (struct request *rq)
 {
-   if (rq->buffer != bio_data(rq->bio)) {
-   sector_t n = (rq->buffer - (char *) bio_data(rq->bio)) / SECTOR_SIZE;
+   if (rq->buffer != blk_rq_data(rq)) {
+   sector_t n = (rq->buffer - (char *)blk_rq_data(rq))
+   / SECTOR_SIZE;
 
-   rq->buffer = bio_data(rq->bio);
+   rq->buffer = blk_rq_data(rq);
rq->nr_sectors += n;
rq->sector -= n;
}
@@ -1659,7 +1660,7 @@ static void post_transform_command(struc
return;
 
if (req->bio)
-   ibuf = bio_data(req->bio);
+   ibuf = blk_rq_data(req);
else
ibuf = req->data;
 
@@ -1768,7 +1769,7 @@ static ide_startstop_t cdrom_newpc_intr(
 * 

[PATCH 003 of 5] Replace bio_cur_sectors with blk_rq_cur_sectors.

2007-08-15 Thread NeilBrown

All calls to bio_cur_sectors are for the first bio in a 'struct request'.
A future patch will make the discovery of this number dependent on
information in the request.  So change the function to take a
'struct request *' instead of a 'struct bio *', and make it a real
function as more code will need to be added.

One place wants the current bytes rather than sectors, so the
'real function' we create is blk_rq_cur_bytes, and 
blk_rq_cur_sectors divides this value by 512.
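
The sector variant then presumably reduces to a one-line wrapper
(illustrative sketch, not necessarily the patch's exact text):

	static inline int blk_rq_cur_sectors(struct request *rq)
	{
		return blk_rq_cur_bytes(rq) >> 9;	/* bytes / 512 */
	}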

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./block/ll_rw_blk.c  |   18 --
 ./drivers/ide/ide-cd.c   |   11 ++-
 ./drivers/ide/ide-io.c   |2 +-
 ./include/linux/bio.h|1 -
 ./include/linux/blkdev.h |5 +
 5 files changed, 24 insertions(+), 13 deletions(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000
+++ ./block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000
@@ -2904,10 +2904,11 @@ static void init_request_from_bio(struct
req->errors = 0;
req->hard_sector = req->sector = bio->bi_sector;
req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio);
-   req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio);
+   req->bio = req->biotail = bio;
+   req->current_nr_sectors = req->hard_cur_sectors =
+   blk_rq_cur_sectors(req);
req->nr_phys_segments = bio_phys_segments(req->q, bio);
req->nr_hw_segments = bio_hw_segments(req->q, bio);
-   req->bio = req->biotail = bio;
req->buffer = blk_rq_data(req); /* see ->buffer comment above */
req->ioprio = bio_prio(bio);
req->rq_disk = bio->bi_bdev->bd_disk;
@@ -2978,7 +2979,7 @@ static int __make_request(struct request
 * not touch req->buffer either...
 */
req->buffer = blk_rq_data(req);
-   req->current_nr_sectors = bio_cur_sectors(bio);
+   req->current_nr_sectors = blk_rq_cur_sectors(req);
req->hard_cur_sectors = req->current_nr_sectors;
req->sector = req->hard_sector = bio->bi_sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -3371,7 +3372,7 @@ static void blk_recalc_rq_sectors(struct
(rq->sector <= rq->hard_sector)) {
rq->sector = rq->hard_sector;
rq->nr_sectors = rq->hard_nr_sectors;
-   rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
+   rq->hard_cur_sectors = blk_rq_cur_sectors(rq);
rq->current_nr_sectors = rq->hard_cur_sectors;
rq->buffer = blk_rq_data(rq);
}
@@ -3675,13 +3676,13 @@ void blk_rq_bio_prep(struct request_queu
 
rq->nr_phys_segments = bio_phys_segments(q, bio);
rq->nr_hw_segments = bio_hw_segments(q, bio);
-   rq->current_nr_sectors = bio_cur_sectors(bio);
-   rq->hard_cur_sectors = rq->current_nr_sectors;
rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
rq->data_len = bio->bi_size;
 
rq->bio = rq->biotail = bio;
rq->buffer = blk_rq_data(rq);
+   rq->current_nr_sectors = blk_rq_cur_sectors(rq);
+   rq->hard_cur_sectors = rq->current_nr_sectors;
 }
 
 EXPORT_SYMBOL(blk_rq_bio_prep);
@@ -3693,6 +3694,11 @@ void *blk_rq_data(struct request *rq)
 }
 EXPORT_SYMBOL(blk_rq_data);
 
+int blk_rq_cur_bytes(struct request *rq)
+{
+   return bio_iovec(rq->bio)->bv_len;
+}
+EXPORT_SYMBOL(blk_rq_cur_bytes);
 
 int kblockd_schedule_work(struct work_struct *work)
 {

diff .prev/drivers/ide/ide-cd.c ./drivers/ide/ide-cd.c
--- .prev/drivers/ide/ide-cd.c  2007-08-16 15:02:30.0 +1000
+++ ./drivers/ide/ide-cd.c  2007-08-16 15:02:30.0 +1000
@@ -1173,7 +1173,8 @@ static ide_startstop_t cdrom_read_intr (
 
/* First, figure out if we need to bit-bucket
   any of the leading sectors. */
-   nskip = min_t(int, rq->current_nr_sectors - bio_cur_sectors(rq->bio), sectors_to_transfer);
+   nskip = min_t(int, rq->current_nr_sectors - blk_rq_cur_sectors(rq),
+ sectors_to_transfer);
 
while (nskip > 0) {
/* We need to throw away a sector. */
@@ -1273,7 +1274,7 @@ static int cdrom_read_from_buffer (ide_d
   represent the number of sectors to skip at the start of a transfer
   will fail.  I think that this will never happen, but let's be
   paranoid and check. */
-   if (rq->current_nr_sectors < bio_cur_sectors(rq->bio) &&
+   if (rq->current_nr_sectors < blk_rq_cur_sectors(rq) &&
(rq->sector & (sectors_per_frame - 1))) {
printk(KERN_ERR "%s: cdrom_read_from_buffer: buffer botch 
(%ld)\n",
drive->name, (long)rq->sector);
@@ -1308,7 +1309,7 @@ static 

[PATCH 001 of 5] Don't update bi_hw_*_size if we aren't going to merge.

2007-08-15 Thread NeilBrown

ll_merge_requests_fn can update bi_hw_*_size in one case where we end
up not merging.  This is wrong.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./block/ll_rw_blk.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c
--- .prev/block/ll_rw_blk.c 2007-08-16 14:57:39.0 +1000
+++ ./block/ll_rw_blk.c 2007-08-16 15:02:29.0 +1000
@@ -1518,11 +1518,13 @@ static int ll_merge_requests_fn(struct r
/*
 * propagate the combined length to the end of the requests
 */
+   total_hw_segments--;
+   if (total_hw_segments > q->max_hw_segments)
+   return 0;
if (req->nr_hw_segments == 1)
req->bio->bi_hw_front_size = len;
if (next->nr_hw_segments == 1)
next->biotail->bi_hw_back_size = len;
-   total_hw_segments--;
}
 
if (total_hw_segments > q->max_hw_segments)


[PATCH 000 of 5] Introductory patches for bio refactor.

2007-08-15 Thread NeilBrown

Hi Jens,
 I wonder if you would accept these patches into the block layer.
 They are, as far as I can tell, quite uncontroversial and provide
 good cleanups.

 The first is a minor bug-fix.
 The next two replace helper functions that take a bio (always the first
 bio of a request), to instead take a request.

 The last two combine to replace rq_for_each_bio with
 rq_for_each_segment which makes blk_recalc_rq_segments a lot
 cleaner, and improves (to a lesser degree) every other place that
 used rq_for_each_bio.

 The net result is a decrease in object code size (for my x86-64 compile)
 for block/built-in.o of 96 bytes, though there is some growth out in
 drivers, making an over-all decrease of only 48 bytes.

Thanks,
NeilBrown



 [PATCH 001 of 5] Don't update bi_hw_*_size if we aren't going to merge.
 [PATCH 002 of 5] Replace bio_data with blk_rq_data
 [PATCH 003 of 5] Replace bio_cur_sectors with blk_rq_cur_sectors.
 [PATCH 004 of 5] Introduce rq_for_each_segment replacing rq_for_each_bio
 [PATCH 005 of 5] Merge blk_recount_segments into blk_recalc_rq_segments


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma
Hi Bill,


On Wed, 15 Aug 2007, Bill Fink wrote:

> On Wed, 15 Aug 2007, Satyam Sharma wrote:
> 
> > (C)
> > $ cat tp3.c
> > int a;
> > 
> > void func(void)
> > {
> > *(volatile int *)&a = 10;
> > *(volatile int *)&a = 20;
> > }
> > $ gcc -Os -S tp3.c
> > $ cat tp3.s
> > ...
> > movl    $10, a
> > movl    $20, a
> > ...
> 
> I'm curious about one minor tangential point.  Why, instead of:
> 
>   b = *(volatile int *)&a;
> 
> why can't this just be expressed as:
> 
>   b = (volatile int)a;
> 
> Isn't it the contents of a that's volatile, i.e. its value can change
> invisibly to the compiler, and that's why you want to force a read from
> memory?  Why do you need the "*(volatile int *)&" construct?

"b = (volatile int)a;" doesn't help us because a cast to a qualified type
has the same effect as a cast to an unqualified version of that type, as
mentioned in 6.5.4:4 (footnote 86) of the standard. Note that "volatile"
is a type-qualifier, not a type itself, so a cast of the _object_ itself
to a qualified-type i.e. (volatile int) would not make the access itself
volatile-qualified.

To serve our purposes, it is necessary for us to take the address of this
(non-volatile) object, cast the resulting _pointer_ to the corresponding
volatile-qualified pointer-type, and then dereference it. This makes that
particular _access_ be volatile-qualified, without the object itself being
such. Also note that the (dereferenced) result is also a valid lvalue and
hence can be used in "*(volatile int *)&a = b;" kind of construction
(which we use for the atomic_set case).


Satyam


Re: [ckrm-tech] Regression in 2.6.23-rc2-mm2, mounting cpusets causes a hang

2007-08-15 Thread Paul Menage
On 8/15/07, Paul Jackson <[EMAIL PROTECTED]> wrote:
> Lee wrote:
> > [altho' methinks CPUSET should select CONTAINERS rather than
> > depend on it...]
>
> Good point -- what do you think, Paul Menage?

That's how I made the configs originally; akpm asked me to invert the
dependencies to use "depends" rather than "select" due to some general
problems with using "select". I agree that "select" seems more
natural.

Paul


Re: Thinking outside the box on file systems

2007-08-15 Thread Kyle Moffett

Mmm, slow-as-dirt hotel wireless.  What fun...

On Aug 15, 2007, at 18:14:44, Phillip Susi wrote:
> Kyle Moffett wrote:
> I am well aware of that, I'm simply saying that sucks.  Doing a
> recursive chmod or setfacl on a large directory tree is slow as
> all hell.
>
>> Doing it in the kernel won't make it any faster.
>
> Right... I'm talking about getting rid of it entirely.


Let me repeat myself here:  Algorithmically you fundamentally CANNOT  
implement inheritance-based ACLs without one of the following  
(although if you have some other algorithm in mind, I'm listening):
  (A) Some kind of recursive operation *every* time you change an  
inheritable permission
  (B) A unified "starting point" from which you begin *every* access- 
control lookup (or one "starting point" per useful semantic grouping,  
like a namespace).


The "(A)" is presently done in userspace and that's what you want to  
avoid.  As to (B), I will attempt to prove below that you cannot  
implement "(B)" without breaking existing assumptions and restricting  
a very nice VFS model.


>> Not necessarily.  When I do "vim some-file-in-current-directory",
>> for example, the kernel does *NOT* look up the path of my current
>> directory.  It does (in pseudocode):
>>
>> if (starts_with_slash(filename)) {
>>         entry = task->root;
>> } else {
>>         entry = task->cwd;
>> }
>> while (have_components_left(filename))
>>         entry = lookup_next_component(filename);
>> return entry;


> Right and task->cwd would have the effective acl in memory,
> ready to be combined with any acl set on the file.


What ACL would "task->cwd" use?

Options:
(1.a) Use the one calculated during the original chdir() call.
(1.b) Navigate "up" task->cwd building an ACL backwards.
(1.c) $CAN_YOU_THINK_OF_SOMETHING_ELSE_HERE

Unsolvable problems with each option:

(1.a.I)
You just broke all sorts of chrooted daemons.  When I start bind in  
its chroot jail, it does the following:

  chdir("/private/bind9");
  chroot(".");
  setgid(...);
  setuid(...);
The "/private" directory is readable only by root, since root is the  
only one who will be navigating you into these chroots for any  
reason.  You only switch UID/GID after the chroot() call, at which  
point you are inside of a sub-context and your cwd is fully  
accessible.  If you stick an inheritable ACL on "/private", then the  
"cwd" ACL will not allow access by anybody but root and my bind won't  
be able to read any config files.


You also break relative paths and directory-moving.  Say a process  
does chdir("/foo/bar").  Now the ACL data in "cwd" is appropriate  
for /foo/bar.  If you later chdir("../quux"), how do you unapply the  
changes made when you switched into that directory?  For inheritable  
ACLs, you can't "unapply" such an ACL state change unless you save  
state for all the parent directories, except...  What happens when  
you are in "/foo/bar" and another process does "mv /foo/bar /foobar/ 
quux"?  Suddenly any "cwd" ACL data you have is completely invalid  
and you have to rebuild your ACLs from scratch.  Moreover, if the  
directory you are in was moved to a portion of the filesystem not  
accessible from your current namespace then how do you deal with it?


For example:
NS1 has the / root dir of /dev/sdb1 mounted on /mnt
NS2 has the /bar subdir of /dev/sdb1 mounted on /mnt

Your process is in NS2 and does chdir("/mnt/quux").  A user in NS1  
does: "mv /mnt/bar/quux /mnt/quux".  Now your "cwd" is in a directory  
on a filesystem you have mounted, but it does not correspond *AT ALL*  
to any path available from your namespace.


Another example:
Your process has done dirfd=open("/media/cdrom/somestuff") when the  
admin does "umount -l /media/cdrom".  You still have the CD-ROM open  
and accessible but IT HAS NO PATH.  It isn't even mounted in *any*  
namespace, it's just kind of dangling waiting for its last users to  
go away.  You can still do fchdir(dirfd), openat(dirfd, "foo/bar", ...),
open("./foo"), etc.


In Linux the ONLY distinction between "relative" and "absolute" paths  
is that the "absolute" path begins with a magic slash which implies  
that you start at the hidden "root" fd the kernel manages.


More detail on problems with the "building the ACL from scratch" part:
That's not even paying attention to functions like "fchdir" or  
their interactions with "chroot" and namespaces.  I can probably  
have an open directory handle to a volume in a completely  
different namespace, a volume which isn't even *MOUNTED* in my  
current fs namespace.  Using that file-handle I believe I can  
"fchdir", "openat", etc, in a completely different namespace.  I  
can do the same thing with a chroot, except there I can even  
"escape":

  /* Switch into chroot.  Doesn't drop root privs */
  chdir("/some/dir/somewhere");
  chroot(".");
  /* Malicious code later on */
  chdir("/");
  chroot("another_dir");
  chdir("../../../../../../../../..");
  chroot(".");
  /* Now I'm back in the real root filesystem */


I don't see what 

Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Herbert Xu writes:

> > You mean it's intended that *sk->sk_prot->memory_pressure can end up
> > as 1 when sk->sk_prot->memory_allocated is small (less than
> > ->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater
> > than ->sysctl_mem[2])?  Because that's the effect of the current code.
> > If so I wonder why you bother computing it.
> 
> You need to remember that there are three different limits:
> minimum, pressure, and maximum.  By default we should never
> be in a situation where what you say can occur.
> 
> If you set all three limits to the same thing, then yes it
> won't work as intended but it's still well-behaved.

I'm not talking about setting all three limits to the same thing.

I'm talking about this situation:

CPU 0 comes into __sk_stream_mem_reclaim, reads memory_allocated, but
then before it can do the store to *memory_pressure, CPUs 1-1023 all
go through sk_stream_mem_schedule, collectively increase
memory_allocated to more than sysctl_mem[2] and set *memory_pressure.
Finally CPU 0 gets to do its store and it sets *memory_pressure back
to 0, but by this stage memory_allocated is way larger than
sysctl_mem[2].

Yes, it's unlikely, but that is the nature of race conditions - they
are unlikely, and only show up at inconvenient times, never when
someone who could fix the bug is watching. :)

Similarly it would be possible for other CPUs to decrease
memory_allocated from greater than sysctl_mem[2] to less than
sysctl_mem[0] in the interval between when we read memory_allocated
and set *memory_pressure to 1.  And it's quite possible for their
setting of *memory_pressure to 0 to happen before our setting of it to
1, so that it ends up at 1 when it should be 0.
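
Spelled out as a sketch ('mem' abbreviates sk->sk_prot->memory_allocated
and 'pressure' abbreviates sk->sk_prot->memory_pressure):

	/* CPU 0 */
	if (atomic_read(mem) < sysctl_mem[0]) {
		/*
		 * window: CPUs 1-1023 atomic_add() mem to beyond
		 * sysctl_mem[2] here, each storing *pressure = 1
		 */
		*pressure = 0;	/* CPU 0's stale store lands last */
	}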

Now, maybe it's the case that it doesn't really matter whether
*->memory_pressure is 0 or 1.  But if so, why bother computing it at
all?

People seem to think that using atomic_t means they don't need to use
a spinlock.  That's fine if there is only one variable involved, but
as soon as there's more than one, there's the possibility of a race,
whether or not you use atomic_t, and whether or not atomic_read has
"volatile" behaviour.

Paul.


Re: drivers/infiniband/mlx/mad.c misplaced ;

2007-08-15 Thread Kok, Auke

Joe Perches wrote:

On Wed, 2007-08-15 at 19:19 -0700, Kok, Auke wrote:

Joe Perches wrote:

On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote:
There's more than a few of these (not inspected).
$ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * 
arch/sh/boards/se/7343/io.c:if (0) ;

drivers/atm/iphase.c: if (!desc1) ;
drivers/infiniband/hw/mlx4/mad.c:   if (!err);
drivers/isdn/capi/capiutil.c:   else if (c <= 0x0f);
drivers/net/tokenring/ibmtr.c:  else if (ti->shared_ram_paging == 0xf);  /* No paging in adapter */
drivers/s390/scsi/zfcp_erp.c:   if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */
fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4);
sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ;

sounds like an excellent candidate check for checkpatch.pl !!!


I think it's poor style and all of these should go away.

the netfilter one appears to be a real error too.


I was more thinking of making sure that none of these slip back in...

Auke


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Satyam Sharma writes:

> Anyway, the problem, of course, is that this conversion to a stronger /
> safer-by-default behaviour doesn't happen with zero cost to performance.
> Converting atomic ops to "volatile" behaviour did add ~2K to kernel text
> for archs such as i386 (possibly to important codepaths) that didn't have
> those semantics already so it would be constructive to actually look at
> those differences and see if there were really any heisenbugs that got
> rectified. Or if there were legitimate optimizations that got wrongly
> disabled. Onus lies on those proposing the modifications, I'd say ;-)

The uses of atomic_read where one might want it to allow caching of
the result seem to me to fall into 3 categories:

1. Places that are buggy because of a race arising from the way it's
   used.

2. Places where there is a race but it doesn't matter because we're
   doing some clever trick.

3. Places where there is some locking in place that eliminates any
   potential race.

In case 1, adding volatile won't solve the race, of course, but it's
hard to argue that we shouldn't do something because it will slow down
buggy code.  Case 2 is hopefully pretty rare and accompanied by large
comment blocks, and in those cases caching the result of atomic_read
explicitly in a local variable would probably make the code clearer.
And in case 3 there is no reason to use atomic_t at all; we might as
well just use an int.
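
For case 2, the explicit version would look something like this (a
hypothetical sketch, not code from any existing call site):

	int seen = atomic_read(&v);	/* one deliberate snapshot */

	if (seen > threshold)
		scale_back();
	/* later tests reuse 'seen': the single read, and the race we
	 * are knowingly tolerating, are visible to the reader */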

So I don't see any good reason to make the atomic API more complex by
having "volatile" and "non-volatile" versions of atomic_read.  It
should just have the "volatile" behaviour.

Paul.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 01:48:32PM +1000, Paul Mackerras wrote:
> Herbert Xu writes:
> 
> > If you're referring to the code in sk_stream_mem_schedule
> > then it's working as intended.  The atomicity guarantees
> 
> You mean it's intended that *sk->sk_prot->memory_pressure can end up
> as 1 when sk->sk_prot->memory_allocated is small (less than
> ->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater
> than ->sysctl_mem[2])?  Because that's the effect of the current code.
> If so I wonder why you bother computing it.

You need to remember that there are three different limits:
minimum, pressure, and maximum.  By default we should never
be in a situation where what you say can occur.

If you set all three limits to the same thing, then yes it
won't work as intended but it's still well-behaved.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Herbert Xu writes:

> If you're referring to the code in sk_stream_mem_schedule
> then it's working as intended.  The atomicity guarantees

You mean it's intended that *sk->sk_prot->memory_pressure can end up
as 1 when sk->sk_prot->memory_allocated is small (less than
->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater
than ->sysctl_mem[2])?  Because that's the effect of the current code.
If so I wonder why you bother computing it.

> that the atomic_add/atomic_sub won't be seen in parts by
> other readers.
> 
> We certainly do not need to see other atomic_add/atomic_sub
> operations immediately.
> 
> If you're referring to another code snippet please cite.
> 
> > I'd go so far as to say that anywhere where you want a non-"volatile"
> > atomic_read, either your code is buggy, or else an int would work just
> > as well.
> 
> An int won't work here because += and -= do not have the
> atomicity guarantees that atomic_add/atomic_sub do.  In
> particular, this may cause an atomic_read on another CPU
> to give a bogus reading.

The point is that guaranteeing the atomicity of the increment or
decrement does not suffice to make the code race-free.  In this case
the race arises from the fact that reading ->memory_allocated and
setting *->memory_pressure are separate operations.  To make that code
work properly you need a lock.  And once you have the lock an ordinary
int would suffice for ->memory_allocated.
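
In sketch form (hypothetical names, not the sk_stream code itself):

	spin_lock(&mem_lock);
	memory_allocated += size;	/* plain int/long is fine here */
	*memory_pressure = (memory_allocated > sysctl_mem[2]);
	spin_unlock(&mem_lock);

With the lock held, the update of memory_allocated and the store to
*memory_pressure happen as one unit, which no amount of atomicity on
the individual operations can provide.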

Paul.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Bill Fink
On Wed, 15 Aug 2007, Satyam Sharma wrote:

> (C)
> $ cat tp3.c
> int a;
> 
> void func(void)
> {
>   *(volatile int *)&a = 10;
>   *(volatile int *)&a = 20;
> }
> $ gcc -Os -S tp3.c
> $ cat tp3.s
> ...
> movl    $10, a
> movl    $20, a
> ...

I'm curious about one minor tangential point.  Why, instead of:

b = *(volatile int *)&a;

why can't this just be expressed as:

b = (volatile int)a;

Isn't it the contents of a that's volatile, i.e. its value can change
invisibly to the compiler, and that's why you want to force a read from
memory?  Why do you need the "*(volatile int *)&" construct?

-Bill


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 01:15:05PM +1000, Paul Mackerras wrote:
> 
> But others can also reduce the reservation.  Also, the code sets and
> clears *sk->sk_prot->memory_pressure nonatomically with respect to the
> reads of sk->sk_prot->memory_allocated, so in fact the code doesn't
> guarantee any particular relationship between the two.

Yes others can reduce the reservation, but the point of this
is that the code doesn't care.  We'll either see the value
before or after the reduction and in either case we'll do
something sensible.

The worst that can happen is when we're just below the hard
limit and multiple CPUs fail to allocate but that's not really
a problem because if the machine is making progress at all
then we will eventually scale back and allow these allocations
to succeed.

As to the non-atomic operation on memory_pressue, that's OK
because we only ever assign values to it and never do other
operations such as += or -=.  Remember that int/long assignments
must be atomic or Linux won't run on your architecture.
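
A sketch of the pattern being defended (illustrative names only):

	if (allocated > sysctl_mem[1])
		*memory_pressure = 1;	/* single word-sized store */
	else if (allocated < sysctl_mem[0])
		*memory_pressure = 0;

Each store is one plain assignment, so a reader sees either the old or
the new value, never a torn one; the worst case is acting on a stale
hint, which the surrounding code tolerates.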

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: Thinking outside the box on file systems

2007-08-15 Thread Valdis . Kletnieks
On Wed, 15 Aug 2007 15:48:15 PDT, Marc Perkel said:
> > Consider the rules:
> > 
> > peter '*a*' can create
> > peter '*b*' cannot create
> > 
> > Peter tries to create 'foo-ab-bar' - is he allowed
> > to or not?
> > 
> 
> First - I'm proposing a concept, not writing the
> implementation of the concept. You are asking what
> happens when someone write conflicting rules. That
> depends on how you implement it. Conflicting rules can
> cause unpredictable results.

Good. Go work out what the rules have to be in order for the system to
behave sanely.  "Hand-waving concept" doesn't get anywhere.  Fully fleshed-out
concepts sometimes do - once they sprout code to actually  implement them.

> The point is that one can choose any rule system they
> want and the rules applies to the names of the files
> and the permissions of the users.

No, you *can't* choose any rule system you want - some rule systems are
unworkable because they create security exposures (usually of the
"ln /etc/passwd /tmp/foo" variety, but sometimes race conditions as well).

> > For an exersize, either write a program or do by
> > hand:

> All you would have to do is create a set of rules that
> emulates the current rules and you would have the same
> results.

Good. Go create it. Let us know when you're done.  Remember - not only do
you need to have it generate the same results, you need to find a way to
implement it so that it's somewhere near the same speed as the current code.
If it's 10 times slower, it's not going to fly no matter *how* "cool" it is.




Re: drivers/infiniband/mlx/mad.c misplaced ;

2007-08-15 Thread Roland Dreier
Thanks!  Seems like checking for this is in the air, I just applied an
identical patch from Ilpo Järvinen.


[GIT PULL] please pull infiniband.git for-linus branch

2007-08-15 Thread Roland Dreier
Linus, please pull from

master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus

This tree is also available from kernel.org mirrors at:

git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git 
for-linus

This is basically a resend of the small fixes for 2.6.23 that I've
already asked you to pull, with the addition of one patch that deletes
a stray semicolon.

Dotan Barak (3):
  IB: Include  from 
  IB: Include  and  from 
  IB: Move the macro IB_UMEM_MAX_PAGE_CHUNK() to umem.c

Eli Cohen (1):
  mlx4_core: Wait 1 second after reset before accessing device

Hal Rosenstock (3):
  IB/mad: Fix error path if response alloc fails in 
ib_mad_recv_done_handler()
  IB/mad: Fix memory leak in switch handling in ib_mad_recv_done_handler()
  IB/mad: agent_send_response() should be void

Ilpo Järvinen (1):
  IB/mlx4: Incorrect semicolon after if statement

Jack Morgenstein (1):
  IPoIB: Fix leak in ipoib_transport_dev_init() error path

Moni Shoua (1):
  IB/core: Ignore membership bit in ib_find_pkey()

Raghava Kondapalli (1):
  IB/srp: Add OUI for new Cisco targets

Roland Dreier (2):
  IB/sa: Don't need to check for default P_Key twice
  IB/srp: Wrap OUI checking for workarounds in helper functions

Sean Hefty (1):
  IB/mad: Fix address handle leak in mad_rmpp

Steve Wise (1):
  RDMA/cxgb3: Always call low level send function via cxgb3_ofld_send()

Vu Pham (1):
  IB/mlx4: Fix opcode returned in RDMA read completion

 drivers/infiniband/core/agent.c|   24 +
 drivers/infiniband/core/agent.h|6 ++--
 drivers/infiniband/core/device.c   |2 +-
 drivers/infiniband/core/mad.c  |   25 +++--
 drivers/infiniband/core/mad_rmpp.c |8 +++---
 drivers/infiniband/core/sa_query.c |4 +--
 drivers/infiniband/core/umem.c |5 
 drivers/infiniband/hw/cxgb3/iwch_cm.c  |   16 +++---
 drivers/infiniband/hw/mlx4/cq.c|2 +-
 drivers/infiniband/hw/mlx4/mad.c   |2 +-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |1 +
 drivers/infiniband/ulp/srp/ib_srp.c|   31 +++
 drivers/net/mlx4/reset.c   |3 ++
 include/rdma/ib_mad.h  |2 +
 include/rdma/ib_verbs.h|7 +
 15 files changed, 77 insertions(+), 61 deletions(-)


diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index db2633e..ae7c288 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -78,15 +78,14 @@ ib_get_agent_port(struct ib_device *device, int port_num)
return entry;
 }
 
-int agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
-   struct ib_wc *wc, struct ib_device *device,
-   int port_num, int qpn)
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+struct ib_wc *wc, struct ib_device *device,
+int port_num, int qpn)
 {
struct ib_agent_port_private *port_priv;
struct ib_mad_agent *agent;
struct ib_mad_send_buf *send_buf;
struct ib_ah *ah;
-   int ret;
struct ib_mad_send_wr_private *mad_send_wr;
 
if (device->node_type == RDMA_NODE_IB_SWITCH)
@@ -96,23 +95,21 @@ int agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
 
if (!port_priv) {
printk(KERN_ERR SPFX "Unable to find port agent\n");
-   return -ENODEV;
+   return;
}
 
agent = port_priv->agent[qpn];
ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
if (IS_ERR(ah)) {
-   ret = PTR_ERR(ah);
-   printk(KERN_ERR SPFX "ib_create_ah_from_wc error:%d\n", ret);
-   return ret;
+   printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n");
+   return;
}
 
send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
  IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
  GFP_KERNEL);
if (IS_ERR(send_buf)) {
-   ret = PTR_ERR(send_buf);
-   printk(KERN_ERR SPFX "ib_create_send_mad error:%d\n", ret);
+   printk(KERN_ERR SPFX "ib_create_send_mad error\n");
goto err1;
}
 
@@ -126,16 +123,15 @@ int agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
mad_send_wr->send_wr.wr.ud.port_num = port_num;
}
 
-   if ((ret = ib_post_send_mad(send_buf, NULL))) {
-   printk(KERN_ERR SPFX "ib_post_send_mad error:%d\n", ret);
+   if (ib_post_send_mad(send_buf, NULL)) {
+   printk(KERN_ERR SPFX "ib_post_send_mad error\n");
goto err2;
}
-   return 0;
+   return;
 err2:
   

Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 01:23:06PM +1000, Paul Mackerras wrote:
>
> In particular, atomic_read seems to lend itself to buggy uses.  People
> seem to do things like:
> 
>   atomic_add(, something);
>   if (atomic_read() > something_else) ...

If you're referring to the code in sk_stream_mem_schedule
then it's working as intended.  The atomicity guarantees
that the atomic_add/atomic_sub won't be seen in parts by
other readers.

We certainly do not need to see other atomic_add/atomic_sub
operations immediately.

If you're referring to another code snippet please cite.

> I'd go so far as to say that anywhere where you want a non-"volatile"
> atomic_read, either your code is buggy, or else an int would work just
> as well.

An int won't work here because += and -= do not have the
atomicity guarantees that atomic_add/atomic_sub do.  In
particular, this may cause an atomic_read on another CPU
to give a bogus reading.
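A minimal sketch of that distinction, using C11 <stdatomic.h> as a stand-in for the kernel's atomic_t API (assumes a toolchain that ships the optional <threads.h>; the plain-int increment is formally a data race, shown deliberately):

#include <stdatomic.h>
#include <stdio.h>
#include <threads.h>

static int plain_counter;
static atomic_int atomic_counter;

static int worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++) {
		plain_counter += 1;			/* racy load-modify-store: updates can be lost */
		atomic_fetch_add(&atomic_counter, 1);	/* indivisible RMW: never seen in parts */
	}
	return 0;
}

int main(void)
{
	thrd_t t1, t2;

	thrd_create(&t1, worker, NULL);
	thrd_create(&t2, worker, NULL);
	thrd_join(t1, NULL);
	thrd_join(t2, NULL);
	printf("plain=%d (often < 200000), atomic=%d (always 200000)\n",
	       plain_counter, atomic_load(&atomic_counter));
	return 0;
}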

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC 0/3] Recursive reclaim (on __PF_MEMALLOC)

2007-08-15 Thread Nick Piggin
On Wed, Aug 15, 2007 at 03:12:06PM +0200, Peter Zijlstra wrote:
> On Wed, 2007-08-15 at 14:22 +0200, Nick Piggin wrote:
> > On Tue, Aug 14, 2007 at 07:21:03AM -0700, Christoph Lameter wrote:
> > > The following patchset implements recursive reclaim. Recursive reclaim
> > > is necessary if we run out of memory in the writeout path from reclaim.
> > > 
> > > This is f.e. important for stacked filesystems or anything that does
> > > complicated processing in the writeout path.
> > 
> > Filesystems (most of them) that require complicated allocations at
> > writeout time suck. That said, especially with network ones, it
> > seems like making them preallocate or reserve required memory isn't
> > progressing very smoothly.
> 
> Mainly because we seem to go in circles :-(
> 
> >  I think these patchsets are definitely
> > worth considering as an alternative. 
> 
> Honestly, I don't. They very much do not solve the problem, they just
> displace it.

Well perhaps it doesn't work for networked swap, because dirty accounting
doesn't work the same way with anonymous memory... but for _filesystems_,
right?

I mean, it intuitively seems like a good idea to terminate the recursive
allocation problem with an attempt to reclaim clean pages rather than
immediately let them have-at our memory reserve that is used for other
things as well. Any and all writepage() via reclaim is allowed to eat
into all of memory (I hate that writepage() ever has to use any memory,
and have prototyped how to fix that for simple block based filesystems
in fsblock, but others will require it).


> Christoph's suggestion to set min_free_kbytes to 20% is ridiculous - nor
> does it solve all deadlocks :-(

Well of course it doesn't, but it is a pragmatic way to reduce some
memory depletion cases. I don't see too much harm in it (although I didn't
see the 20% suggestion?)


> > No substantial comments though. 
> 
> Please do ponder the problem and its proposed solutions, because I'm
> going crazy here.
 
Well yeah I think you simply have to reserve a minimum amount of memory in
order to reclaim a page, and I don't see any other way to do it other than
what you describe to be _technically_ deadlock free.

But firstly, you don't _want_ to start dropping packets when you hit a tough
patch in reclaim -- even if you are strictly deadlock free. And secondly,
I think recursive reclaim could reduce the deadlocks in practice which is
not a bad thing as your patches aren't merged.

How are your deadlock patches going anyway? AFAIK they are mostly a network
issue and I haven't been keeping up with them for a while. Do you really need
networked swap and actually encounter the deadlock, or is it just a question of
wanting to fix the bugs? If the former, what for, may I ask?


> What Christoph is proposing is doing recursive reclaim and not
> initiating writeout. This will only work _IFF_ there are clean pages
> about. Which in the general case need not be true (memory might be
> packed with anonymous pages - consider an MPI cluster doing computation
> stuff). So this gets us a workload dependant solution - which IMHO is
> bad!

Although you will quite likely have at least a couple of MB worth of
clean program text. The important part of recursive reclaim is that it
doesn't so easily allow reclaim to blow all memory reserves (including
interrupt context). Sure you still have theoretical deadlocks, but if
I understand correctly, they are going to be lessened. I would be
really interested to see if even just these recursive reclaim patches
eliminate the problem in practice.


> > I've been sick all week.
> 
> Do get well.

Thanks!



RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP portsfrom the host TCP port space.

2007-08-15 Thread Sean Hefty
>It's not about being a niche.  It's about creating a maintainable
>software net stack that has predictable behavior.
>
>Needing to reach out of the RDMA sandbox and reserve net stack resources
>away from itself travels a path we've consistently avoided.

We need to ensure that we're also creating a maintainable kernel.  RDMA doesn't
use sockets, but that doesn't mean it's not part of the networking support
provided by the Linux kernel.  Making blanket statements that RDMA should stay
within a sandbox is equivalent to saying that RDMA should duplicate any network
related functionality that it might need.

>>> I will NACK any patch that opens up sockets to eat up ports or
>>> anything stupid like that.
>
>Ditto for me as well.

I agree that using a socket is the wrong approach, but my guess is that it was
suggested as a possibility because of the attempt to keep RDMA in its 'sandbox'.
The iWarp architecture implements RDMA over TCP; it just doesn't use sockets.
The Linux network stack doesn't easily support this possibility.  Are there any
reasonable ways to enable this to the degree necessary for iWarp?

- Sean


[PATCH] vfs: use list_for_each_entry instead

2007-08-15 Thread Denis Cheng
Signed-off-by: Denis Cheng <[EMAIL PROTECTED]>
---
 fs/super.c |7 ++-
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index fc8ebed..f287c15 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -334,15 +334,12 @@ struct super_block *sget(struct file_system_type *type,
int (*set)(struct super_block *,void *),
void *data)
 {
-   struct super_block *s = NULL;
-   struct list_head *p;
+   struct super_block *s = NULL, *old;
int err;
 
 retry:
spin_lock(&sb_lock);
-   if (test) list_for_each(p, &type->fs_supers) {
-   struct super_block *old;
-   old = list_entry(p, struct super_block, s_instances);
+   if (test) list_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
if (!grab_super(old))
-- 
1.5.3.rc4
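For readers unfamiliar with the two macros, here is a minimal user-space re-implementation showing what the patch buys (simplified from include/linux/list.h -- no prefetch, no poisoning; list_for_each_entry relies on the GCC __typeof__ extension, as the kernel's does):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)

/* Open-coded form: the caller juggles a bare list_head cursor... */
#define list_for_each(pos, head) \
	for (pos = (head)->next; pos != (head); pos = pos->next)

/* ...entry form: the cursor is already the containing object. */
#define list_for_each_entry(pos, head, member)				\
	for (pos = list_entry((head)->next, __typeof__(*pos), member);	\
	     &pos->member != (head);					\
	     pos = list_entry(pos->member.next, __typeof__(*pos), member))

struct super_block { int id; struct list_head s_instances; };

int main(void)
{
	struct list_head fs_supers = LIST_HEAD_INIT(fs_supers);
	struct super_block a = { .id = 1 }, b = { .id = 2 };
	struct super_block *old;
	struct list_head *p;

	list_add_tail(&a.s_instances, &fs_supers);
	list_add_tail(&b.s_instances, &fs_supers);

	list_for_each(p, &fs_supers) {		/* before: two steps per iteration */
		old = list_entry(p, struct super_block, s_instances);
		printf("sb %d\n", old->id);
	}
	list_for_each_entry(old, &fs_supers, s_instances)	/* after: one */
		printf("sb %d\n", old->id);
	return 0;
}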



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Herbert Xu writes:

> > Are you sure?  How do you know some other CPU hasn't changed the value
> > in between?
> 
> Yes I'm sure, because we don't care if others have increased
> the reservation.

But others can also reduce the reservation.  Also, the code sets and
clears *sk->sk_prot->memory_pressure nonatomically with respect to the
reads of sk->sk_prot->memory_allocated, so in fact the code doesn't
guarantee any particular relationship between the two.

That code looks like a beautiful example of buggy, racy code where
someone has sprinkled magic fix-the-races dust (otherwise known as
atomic_t) around in a vain attempt to fix the races.

That's assuming that all that stuff actually performs any useful
purpose, of course, and that there isn't some lock held by the
callers.  In the latter case it is pointless using atomic_t.

Paul.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Christoph Lameter writes:

> > But I have to say that I still don't know of a single place
> > where one would actually use the volatile variant.
> 
> I suspect that what you say is true after we have looked at all callers.

It seems that there could be a lot of places where atomic_t is used in
a non-atomic fashion, and that those uses are either buggy, or there
is some lock held at the time which guarantees that other CPUs aren't
changing the value.  In both cases there is no point in using
atomic_t; we might as well just use an ordinary int.

In particular, atomic_read seems to lend itself to buggy uses.  People
seem to do things like:

atomic_add(, something);
if (atomic_read() > something_else) ...

and expect that there is some relationship between the value that the
atomic_add stored and the value that the atomic_read will return,
which there isn't.  People seem to think that using atomic_t magically
gets rid of races.  It doesn't.
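The archive has mangled the arguments in the example above, but the shape of the fix is worth spelling out. A sketch in C11 terms (the in-kernel analogue would be atomic_add_return(); enter_memory_pressure() here is a hypothetical stub, not a real API):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long allocated;

static void enter_memory_pressure(void)	/* hypothetical stand-in */
{
	puts("over limit");
}

void racy(long amt, long limit)
{
	atomic_fetch_add(&allocated, amt);
	/* another thread may add or subtract right here... */
	if (atomic_load(&allocated) > limit)	/* unrelated to the add above */
		enter_memory_pressure();
}

void not_racy(long amt, long limit)
{
	/* fetch_add returns the pre-add value atomically, so the sum below
	 * is exactly the value this thread's own update produced */
	if (atomic_fetch_add(&allocated, amt) + amt > limit)
		enter_memory_pressure();
}

int main(void)
{
	racy(10, 100);
	not_racy(10, 100);
	return 0;
}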

I'd go so far as to say that anywhere where you want a non-"volatile"
atomic_read, either your code is buggy, or else an int would work just
as well.

Paul.



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Satyam Sharma writes:

> I can't speak for this particular case, but there could be similar code
> examples elsewhere, where we do the atomic ops on an atomic_t object
> inside a higher-level locking scheme that would take care of the kind of
> problem you're referring to here. It would be useful for such or similar
> code if the compiler kept the value of that atomic object in a register.

If there is a higher-level locking scheme then there is no point to
using atomic_t variables.  Atomic_t is specifically for the situation
where multiple CPUs are updating a variable without locking.

Paul.


Re: [2.6 patch] remove Documentation/networking/net-modules.txt

2007-08-15 Thread Paul Gortmaker
On 8/14/07, Adrian Bunk <[EMAIL PROTECTED]> wrote:
> According to git, the only one who touched this file during the last
> 5 years was me when removing drivers...
>
> modinfo offers less ancient information.
>
> Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]>
>

Fine by me for any of the old stuff I was responsible for...

Acked-by: Paul Gortmaker <[EMAIL PROTECTED]>



> -
> -   8390 based Network Modules  (Paul Gortmaker, Nov 12, 1995)
> -   --
> -
> -(Includes: smc-ultra, ne, wd, 3c503, hp, hp-plus, e2100 and ac3200)
> -
> -The 8390 series of network drivers now support multiple card systems without
> -reloading the same module multiple times (memory efficient!) This is done by
> -specifying multiple comma separated values, such as:
> -
> -   insmod 3c503.o io=0x280,0x300,0x330,0x350  xcvr=0,1,0,1
> -


Re: drivers/infiniband/mlx/mad.c misplaced ;

2007-08-15 Thread Joe Perches
On Wed, 2007-08-15 at 19:19 -0700, Kok, Auke wrote:
> Joe Perches wrote:
> > On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote:
> > There's more than a few of these (not inspected).
> > $ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * 
> > arch/sh/boards/se/7343/io.c:if (0) ;
> > drivers/atm/iphase.c: if (!desc1) ;
> > drivers/infiniband/hw/mlx4/mad.c:   if (!err);
> > drivers/isdn/capi/capiutil.c:   else if (c <= 0x0f);
> > drivers/net/tokenring/ibmtr.c:  else if (ti->shared_ram_paging == 0xf);  /* No paging in adapter */
> > drivers/s390/scsi/zfcp_erp.c:   if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */
> > fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
> > net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4);
> > sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ;
> 
> sounds like an excellent candidate check for checkpatch.pl !!!

I think it's poor style and all of these should go away.

the netfilter one appears to be a real error too.

Signed-off-by: Joe Perches <[EMAIL PROTECTED]>

diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c
index 74f9b14..bec4279 100644
--- a/net/netfilter/xt_u32.c
+++ b/net/netfilter/xt_u32.c
@@ -36,7 +36,7 @@ static bool u32_match_it(const struct xt_u32 *data,
at  = 0;
pos = ct->location[0].number;
 
-   if (skb->len < 4 || pos > skb->len - 4);
+   if (skb->len < 4 || pos > skb->len - 4)
return false;
 
ret   = skb_copy_bits(skb, pos, &n, sizeof(n));




Re: [RFC][PATCH] /proc//pmaps - memory maps in granularity of pages

2007-08-15 Thread Fengguang Wu
On Thu, Aug 16, 2007 at 09:37:01AM +0800, Fengguang Wu wrote:
> 3) A 64bit number can be encoded in hex in 8 bytes, no more than its
 ~~ sorry, I'm a fool!
> binary presentation. And often the text string will be much shorter
> because of the common small values.



Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

2007-08-15 Thread Roland Dreier
 > Needing to reach out of the RDMA sandbox and reserve net stack
 > resources away from itself travels a path we've consistently avoided.

Where did the idea of an "RDMA sandbox" come from?  Obviously no one
disagrees with keeping things clean and maintainable, but the idea
that RDMA is a second-class citizen that doesn't get any input into
the evolution of the networking code seems kind of offensive to me.

 - R.


Re: how can I get an account on master.kernel.org machine

2007-08-15 Thread Randy Dunlap
On Thu, 16 Aug 2007 10:58:41 +0800 ye janboe wrote:

> I want to down linux-arm git repos, but I do not have an account on
> master machine.

see http://www.kernel.org/faq/#account

---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***


how can I get an account on master.kernel.org machine

2007-08-15 Thread ye janboe
I want to down linux-arm git repos, but I do not have an account on
master machine.

thanks

Yuanbo


Re: [PATCH] calculation of pgoff in do_linear_fault() uses mixed units

2007-08-15 Thread Nick Piggin
On Wed, Aug 15, 2007 at 12:43:26PM -0500, Dean Nelson wrote:
> The calculation of pgoff in do_linear_fault() should use PAGE_SHIFT and not
> PAGE_CACHE_SHIFT since vma->vm_pgoff is in units of PAGE_SIZE and not
> PAGE_CACHE_SIZE. At the moment linux/pagemap.h has PAGE_CACHE_SHIFT defined
> as PAGE_SHIFT, but should that ever change this calculation would break.
> 
> Signed-off-by: Dean Nelson <[EMAIL PROTECTED]>
> 

Acked-by: Nick Piggin <[EMAIL PROTECTED]>

> 
> Index: linux-2.6/mm/memory.c
> ===
> --- linux-2.6.orig/mm/memory.c	2007-08-14 06:42:18.322378070 -0500
> +++ linux-2.6/mm/memory.c 2007-08-15 12:30:51.621604739 -0500
> @@ -2465,7 +2465,7 @@
>   int write_access, pte_t orig_pte)
>  {
>   pgoff_t pgoff = (((address & PAGE_MASK)
> - - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
> + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
>   unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
>  
>   return __do_fault(mm, vma, address, page_table, pmd, pgoff,


Re: [RFC 0/9] Reclaim during GFP_ATOMIC allocs

2007-08-15 Thread Nick Piggin
On Tue, Aug 14, 2007 at 08:30:21AM -0700, Christoph Lameter wrote:
> This is the extended version of the reclaim patchset. It enables reclaim from
> clean file backed pages during GFP_ATOMIC allocs. A bit invasive since
> many locks must now be taken with saving flags. But it works.
> 
> Tested by repeatedly allocating 12MB of memory from the timer interrupt.
> 
> -- 

Just to clarify... I can see how recursive reclaim can prevent memory getting
eaten up by reclaim (which thus causes allocations from interrupt handlers to
fail)...

But I don't see how this patchset will do anything to prevent reclaim
deadlocks, right? (because if there is reclaimable memory at hand, then
kswapd should eventually reclaim it).


Re: [PATCH] Fix VM_FAULT flags conversion for hugetlb

2007-08-15 Thread Nick Piggin
On Tue, Aug 14, 2007 at 02:41:21PM -0500, Adam Litke wrote:
> It seems a simple mistake was made when converting follow_hugetlb_page()
> over to the VM_FAULT flags bitmask stuff:
>   (commit  83c54070ee1a2d05c89793884bea1a03f2851ed4).
> 
> By using the wrong bitmask, hugetlb_fault() failures are not being
> recognized.  This results in an infinite loop whenever
> follow_hugetlb_page is involved in a failed fault.
> 
> Signed-off-by: Adam Litke <[EMAIL PROTECTED]>

Thanks Adam. Looks good.

> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d7ca59d..de4cf45 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -643,7 +643,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   spin_unlock(&mm->page_table_lock);
>   ret = hugetlb_fault(mm, vma, vaddr, 0);
>   spin_lock(&mm->page_table_lock);
> - if (!(ret & VM_FAULT_MAJOR))
> + if (!(ret & VM_FAULT_ERROR))
>   continue;
>  
>   remainder = 0;
> 
> -- 
> Adam Litke - (agl at us.ibm.com)
> IBM Linux Technology Center


[PATCH] Make rcutorture RNG use temporal entropy

2007-08-15 Thread Paul E. McKenney
Repost of http://lkml.org/lkml/2007/8/10/472 made available by request.

The locking used by get_random_bytes() can conflict with the
preempt_disable() and synchronize_sched() form of RCU.  This patch changes
rcutorture's RNG to gather entropy from the new cpu_clock() interface
(relying on interrupts, preemption, daemons, and rcutorture's reader
thread's rock-bottom scheduling priority to provide useful entropy),
and also adds and EXPORT_SYMBOL_GPL() to make that interface available
to GPLed kernel modules such as rcutorture.

Passes several hours of rcutorture.

Signed-off-by: Paul E. McKenney <[EMAIL PROTECTED]>
---

 rcutorture.c |8 ++--
 sched.c  |2 ++
 2 files changed, 4 insertions(+), 6 deletions(-)

diff -urpNa -X dontdiff linux-2.6.23-rc2/kernel/rcutorture.c linux-2.6.23-rc2-rcutorturesched/kernel/rcutorture.c
--- linux-2.6.23-rc2/kernel/rcutorture.c	2007-08-03 19:49:55.0 -0700
+++ linux-2.6.23-rc2-rcutorturesched/kernel/rcutorture.c	2007-08-10 17:15:22.0 -0700
@@ -42,7 +42,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -166,16 +165,13 @@ struct rcu_random_state {
 
 /*
  * Crude but fast random-number generator.  Uses a linear congruential
- * generator, with occasional help from get_random_bytes().
+ * generator, with occasional help from cpu_clock().
  */
 static unsigned long
 rcu_random(struct rcu_random_state *rrsp)
 {
-   long refresh;
-
if (--rrsp->rrs_count < 0) {
-   get_random_bytes(&refresh, sizeof(refresh));
-   rrsp->rrs_state += refresh;
+   rrsp->rrs_state += (unsigned long)cpu_clock(smp_processor_id());
rrsp->rrs_count = RCU_RANDOM_REFRESH;
}
rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff -urpNa -X dontdiff linux-2.6.23-rc2/kernel/sched.c linux-2.6.23-rc2-rcutorturesched/kernel/sched.c
--- linux-2.6.23-rc2/kernel/sched.c	2007-08-03 19:49:55.0 -0700
+++ linux-2.6.23-rc2-rcutorturesched/kernel/sched.c	2007-08-10 17:22:57.0 -0700
@@ -394,6 +394,8 @@ unsigned long long cpu_clock(int cpu)
return now;
 }
 
+EXPORT_SYMBOL_GPL(cpu_clock);
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /* Change a task's ->cfs_rq if it moves across CPUs */
 static inline void set_task_cfs_rq(struct task_struct *p)


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma


On Thu, 16 Aug 2007, Satyam Sharma wrote:

> On Thu, 16 Aug 2007, Paul Mackerras wrote:
> > Herbert Xu writes:
> > 
> > > See sk_stream_mem_schedule in net/core/stream.c:
> > > 
> > > /* Under limit. */
> > > if (atomic_read(sk->sk_prot->memory_allocated) < 
> > > sk->sk_prot->sysctl_mem[0]) {
> > > if (*sk->sk_prot->memory_pressure)
> > > *sk->sk_prot->memory_pressure = 0;
> > > return 1;
> > > }
> > > 
> > > /* Over hard limit. */
> > > if (atomic_read(sk->sk_prot->memory_allocated) > 
> > > sk->sk_prot->sysctl_mem[2]) {
> > > sk->sk_prot->enter_memory_pressure();
> > > goto suppress_allocation;
> > > }
> > > 
> > > We don't need to reload sk->sk_prot->memory_allocated here.
> > 
> > Are you sure?  How do you know some other CPU hasn't changed the value
> > in between?
> 
> I can't speak for this particular case, but there could be similar code
> examples elsewhere, where we do the atomic ops on an atomic_t object
> inside a higher-level locking scheme that would take care of the kind of
> problem you're referring to here. It would be useful for such or similar
> code if the compiler kept the value of that atomic object in a register.

We might not be using atomic_t (and ops) if we already have a higher-level
locking scheme, actually. So as Herbert mentioned, such cases might just
not care. [ Too much of this thread, too little sleep, sorry! ]

Anyway, the problem, of course, is that this conversion to a stronger /
safer-by-default behaviour doesn't happen with zero cost to performance.
Converting atomic ops to "volatile" behaviour did add ~2K to kernel text
for archs such as i386 (possibly to important codepaths) that didn't have
those semantics already so it would be constructive to actually look at
those differences and see if there were really any heisenbugs that got
rectified. Or if there were legitimate optimizations that got wrongly
disabled. Onus lies on those proposing the modifications, I'd say ;-)


Re: [ckrm-tech] Regression in 2.6.23-rc2-mm2, mounting cpusets causes a hang

2007-08-15 Thread Paul Jackson
Christoph wrote (and Lee quoted):
> Looks like we need to fix cpuset nodemasks for the !NUMA case then?
> It cannot expect to find valid nodemasks if !NUMA.

The code in kernel/cpuset.c should be written as if there are always
valid nodemasks.  In the case of !NUMA, it will happen that there is
only one node, but kernel/cpuset.c should not manifest that detail.

-- 
  I won't rest till it's the best ...
  Programmer, Linux Scalability
  Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 10:11:05AM +0800, Herbert Xu wrote:
> On Thu, Aug 16, 2007 at 12:05:56PM +1000, Paul Mackerras wrote:
> > Herbert Xu writes:
> > 
> > > See sk_stream_mem_schedule in net/core/stream.c:
> > > 
> > > /* Under limit. */
> > > if (atomic_read(sk->sk_prot->memory_allocated) < 
> > > sk->sk_prot->sysctl_mem[0]) {
> > > if (*sk->sk_prot->memory_pressure)
> > > *sk->sk_prot->memory_pressure = 0;
> > > return 1;
> > > }
> > > 
> > > /* Over hard limit. */
> > > if (atomic_read(sk->sk_prot->memory_allocated) > 
> > > sk->sk_prot->sysctl_mem[2]) {
> > > sk->sk_prot->enter_memory_pressure();
> > > goto suppress_allocation;
> > > }
> > > 
> > > We don't need to reload sk->sk_prot->memory_allocated here.
> > 
> > Are you sure?  How do you know some other CPU hasn't changed the value
> > in between?
> 
> Yes I'm sure, because we don't care if others have increased
> the reservation.
> 
> Note that even if we did we'd be using barriers so volatile
> won't do us any good here.

If the load-coalescing is important to performance, why not load into
a local variable?

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Wed, Aug 15, 2007 at 06:41:40PM -0700, Christoph Lameter wrote:
> On Wed, 15 Aug 2007, Paul E. McKenney wrote:
> 
> > Understood.  My point is not that the impact is precisely zero, but
> > rather that the impact on optimization is much less hurtful than the
> > problems that could arise otherwise, particularly as compilers become
> > more aggressive in their optimizations.
> 
> The problems arise because barriers are not used as required. Volatile 
> has wishy washy semantics and somehow marries memory barriers with data 
> access. It is clearer to separate the two. Conceptual cleanness usually 
> translates into better code. If one really wants the volatile then lets 
> make it explicit and use
> 
>   atomic_read_volatile()

There are indeed architectures where you can cause gcc to emit memory
barriers in response to volatile.  I am assuming that we are -not-
making gcc do this.  Given this, then volatiles and memory barrier
instructions are orthogonal -- one controls the compiler, the other
controls the CPU.
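A compact way to see that orthogonality, expressed with C11 fences (the mapping to the kernel's barrier() and smp_mb() is a rough analogy, not an exact equivalence):

#include <stdatomic.h>

/* Constrains the compiler only; typically emits no instruction. */
void compiler_only_fence(void)
{
	atomic_signal_fence(memory_order_seq_cst);	/* roughly the kernel's barrier() */
}

/* Constrains compiler *and* CPU; may emit a hardware barrier. */
void compiler_and_cpu_fence(void)
{
	atomic_thread_fence(memory_order_seq_cst);	/* roughly the kernel's smp_mb() */
}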

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 03:30:44AM +0200, Segher Boessenkool wrote:
> >>>Part of the motivation here is to fix heisenbugs.  If I knew where 
> >>>they
> >>
> >>By the same token we should probably disable optimisations
> >>altogether since that too can create heisenbugs.
> >
> >Precisely the point -- use of volatile (whether in casts or on asms)
> >in these cases are intended to disable those optimizations likely to
> >result in heisenbugs.
> 
> The only thing volatile on an asm does is create a side effect
> on the asm statement; in effect, it tells the compiler "do not
> remove this asm even if you don't need any of its outputs".
> 
> It's not disabling optimisation likely to result in bugs,
> heisen- or otherwise; _not_ putting the volatile on an asm
> that needs it simply _is_ a bug :-)

Yep.  And the reason it is a bug is that it fails to disable
the relevant compiler optimizations.  So I suspect that we might
actually be saying the same thing here.

Thanx, Paul


Early printk behaviour

2007-08-15 Thread Robin Getz
I was putting together an early printk implementation for the Blackfin, and 
was wondering what the expected behaviour was in this situation.

When I set up my bootargs earlyprintk=serial,ttyBF0,57600 and have no console 
defined (no graphical console, no serial console).

based on the patch:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=69331af79cf29e26d1231152a172a1a10c2df511

which no longer calls disable_early_printk, the earlyprintk console never gets 
turned off (because nothing else ever calls register_console). I get 
everything out the early console, until the init section is released (where 
the console structure is sitting), and it starts printing out garbage.

Is this expected behaviour?

-Robin


Re: [PATCH 6/24] make atomic_read() behave consistently on frv

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 11:09:25AM +1000, Nick Piggin wrote:
> Paul E. McKenney wrote:
> >On Wed, Aug 15, 2007 at 11:30:05PM +1000, Nick Piggin wrote:
> 
> >>Especially since several big architectures don't have volatile in their
> >>atomic_get and _set, I think it would be a step backwards to add them in
> >>as a "just in case" thin now (unless there is a better reason).
> >
> >Good point, except that I would expect gcc's optimization to continue
> >to improve.  I would like the kernel to be able to take advantage of
> >improved optimization, which means that we are going to have to make
> >a few changes.  Adding volatile to atomic_get() and atomic_set() is
> >IMHO one of those changes.
> 
> What optimisations? gcc already does most of the things you need a
> barrier/volatile for, like reordering non-dependant loads and stores,
> and eliminating mem ops completely by caching in registers.

Yep, most of the currently practiced optimizations.  Given that CPU clock
frequencies are not rising as quickly as they once did, I would expect
that there will be added effort on implementing the known more-aggressive
optimization techniques, and on coming up with new ones as well.

Some of these new optimizations will likely be inappropriate for kernel
code, but I expect that some will be things that we want.

> >>As to your followup question of why to use it over ACCESS_ONCE. I
> >>guess, aside from consistency with the rest of the barrier APIs, you
> >>can use it in other primitives when you don't actually know what the
> >>caller is going to do or if it even will make an access. You could
> >>also use it between calls to _other_ primitives, etc... it just
> >>seems more flexible to me, but I haven't actually used such a thing
> >>in real code...
> >>
> >>ACCESS_ONCE doesn't seem as descriptive. What it results in is the
> >>memory location being loaded or stored (presumably once exactly),
> >>but I think the more general underlying idea is a barrier point.
> >
> >OK, first, I am not arguing that ACCESS_ONCE() can replace all current
> >uses of barrier().
> 
> OK. Well I also wasn't saying that ACCESS_ONCE should not be
> implemented. But if we want something like it, then it would make
> sense to have an equivalent barrier statement as well (ie. order()).

And I am not arguing against use of asms to implement the volatility
in atomic_read() and atomic_set(), and in fact it appears that one
of the architectures will be taking this approach.

Sounds like we might be in violent agreement.  ;-)

Thanx, Paul


Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.

2007-08-15 Thread Jeff Garzik

Steve Wise wrote:



David Miller wrote:

From: Sean Hefty <[EMAIL PROTECTED]>
Date: Thu, 09 Aug 2007 14:40:16 -0700


Steve Wise wrote:

Any more comments?
Does anyone have ideas on how to reserve the port space without using 
a struct socket?


How about we just remove the RDMA stack altogether?  I am not at all
kidding.  If you guys can't stay in your sand box and need to cause
problems for the normal network stack, it's unacceptable.  We were
told all along the if RDMA went into the tree none of this kind of
stuff would be an issue.


I think removing the RDMA stack is the wrong thing to do, and you 
shouldn't just threaten to yank entire subsystems because you don't like 
the technology.  Lets keep this constructive, can we?  RDMA should get 
the respect of any other technology in Linux.  Maybe its a niche in your 
opinion, but come on, there's more RDMA users than say, the sparc64 
port.  Eh?


It's not about being a niche.  It's about creating a maintainable 
software net stack that has predictable behavior.


Needing to reach out of the RDMA sandbox and reserve net stack resources 
away from itself travels a path we've consistently avoided.




I will NACK any patch that opens up sockets to eat up ports or
anything stupid like that.


Got it.


Ditto for me as well.

Jeff




Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Nick Piggin

Segher Boessenkool wrote:

Part of the motivation here is to fix heisenbugs.  If I knew where they



By the same token we should probably disable optimisations
altogether since that too can create heisenbugs.



Almost everything is a tradeoff; and so is this.  I don't
believe most people would find disabling all compiler
optimisations an acceptable price to pay for some peace
of mind.


So why is this a good tradeoff?

I also think that just adding things to APIs in the hope it might fix
up some bugs isn't really a good road to go down. Where do you stop?

On the actual proposal to make atomic_* operators volatile: I think the
better approach in the long term, for both maintainability of the
code and education of coders, is to make the use of barriers _more_
explicit rather than sprinkling these "just in case" ones around.

You may get rid of a few atomic_read heisenbugs (in noise when
compared to all bugs), but if the coder was using a regular atomic
load, or a test_bit (which is also atomic), etc. then they're going
to have problems.

It would be better for Linux if everyone was to have better awareness
of barriers than to hide some of the cases where they're required.
A pretty large number of bugs I see in lock free code in the VM is
due to memory ordering problems. It's hard to find those bugs, or
even be aware when you're writing buggy code if you don't have some
feel for barriers.
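A sketch of the classic ordering bug being described, in C11 terms (release/acquire used here as rough analogues of the kernel's smp_wmb()/smp_rmb() pairing):

#include <stdatomic.h>

static int data;
static atomic_int ready;

void producer(void)
{
	data = 42;
	/* With a relaxed store the compiler/CPU could order the flag update
	 * before "data = 42"; release forbids that:
	 * atomic_store_explicit(&ready, 1, memory_order_relaxed);  -- buggy */
	atomic_store_explicit(&ready, 1, memory_order_release);
}

int consumer(void)
{
	/* Acquire pairs with the release above: once ready==1 is observed,
	 * the write to data is observed too. */
	while (!atomic_load_explicit(&ready, memory_order_acquire))
		;
	return data;	/* guaranteed to be 42 */
}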

--
SUSE Labs, Novell Inc.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 03:23:28AM +0200, Segher Boessenkool wrote:
> No; compilation units have nothing to do with it, GCC can optimise
> across compilation unit boundaries just fine, if you tell it to
> compile more than one compilation unit at once.
> >>>
> >>>Last I checked, the Linux kernel build system did compile each .c 
> >>>file
> >>>as a separate compilation unit.
> >>
> >>I have some patches to use -combine -fwhole-program for Linux.
> >>Highly experimental, you need a patched bleeding edge toolchain.
> >>If there's interest I'll clean it up and put it online.
> >>
> >>David Woodhouse had some similar patches about a year ago.
> >
> >Sounds exciting...  ;-)
> 
> Yeah, the breakage is *quite* spectacular :-)

I bet!!!  ;-)

> >In many cases, the compiler also has to assume that
> >msleep_interruptible()
> >might call back into a function in the current compilation unit, 
> >thus
> >possibly modifying global static variables.
> 
> It most often is smart enough to see what compilation-unit-local
> variables might be modified that way, though :-)
> >>>
> >>>Yep.  For example, if it knows the current value of a given such 
> >>>local
> >>>variable, and if all code paths that would change some other variable
> >>>cannot be reached given that current value of the first variable.
> >>
> >>Or the most common thing: if neither the address of the translation-
> >>unit local variable nor the address of any function writing to that
> >>variable can "escape" from that translation unit, nothing outside
> >>the translation unit can write to the variable.
> >
> >But there is usually at least one externally callable function in
> >a .c file.
> 
> Of course, but often none of those will (indirectly) write a certain
> static variable.

But there has to be some path to the static functions, assuming that
they are not dead code.  Yes, there can be cases where the compiler
knows enough about the state of the variables to rule out some of code
paths to them, but I have no idea how often this happens in kernel
code.

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma


On Thu, 16 Aug 2007, Paul Mackerras wrote:

> Herbert Xu writes:
> 
> > See sk_stream_mem_schedule in net/core/stream.c:
> > 
> > /* Under limit. */
> > if (atomic_read(sk->sk_prot->memory_allocated) < 
> > sk->sk_prot->sysctl_mem[0]) {
> > if (*sk->sk_prot->memory_pressure)
> > *sk->sk_prot->memory_pressure = 0;
> > return 1;
> > }
> > 
> > /* Over hard limit. */
> > if (atomic_read(sk->sk_prot->memory_allocated) > 
> > sk->sk_prot->sysctl_mem[2]) {
> > sk->sk_prot->enter_memory_pressure();
> > goto suppress_allocation;
> > }
> > 
> > We don't need to reload sk->sk_prot->memory_allocated here.
> 
> Are you sure?  How do you know some other CPU hasn't changed the value
> in between?

I can't speak for this particular case, but there could be similar code
examples elsewhere, where we do the atomic ops on an atomic_t object
inside a higher-level locking scheme that would take care of the kind of
problem you're referring to here. It would be useful for such or similar
code if the compiler kept the value of that atomic object in a register.


Re: drivers/infiniband/mlx/mad.c misplaced ;

2007-08-15 Thread Kok, Auke

Joe Perches wrote:

On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote:

Signed-off-by: Dave Jones <[EMAIL PROTECTED]>

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 3330917..0ed02b7 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -109,7 +109,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
   in_modifier, op_modifier,
   MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
 
-	if (!err);
+	if (!err)


There's more than a few of these (not inspected).

$ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" *
arch/sh/boards/se/7343/io.c:if (0) ;
drivers/atm/iphase.c: if (!desc1) ;
drivers/infiniband/hw/mlx4/mad.c:   if (!err);
drivers/isdn/capi/capiutil.c:   else if (c <= 0x0f);
drivers/net/tokenring/ibmtr.c:  else if (ti->shared_ram_paging == 0xf);  /* No paging in adapter */
drivers/s390/scsi/zfcp_erp.c:   if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */
fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4);
sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ;


sounds like an excellent candidate check for checkpatch.pl !!!

Cheers,

Auke


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Chris Friesen

Herbert Xu wrote:


But I have to say that I still don't know of a single place
where one would actually use the volatile variant.


Given that many of the existing users do currently have "volatile", are 
you comfortable simply removing that behaviour from them?  Are you sure 
that you will not introduce any issues?


Forcing a re-read is only a performance penalty.  Removing it can cause 
behavioural changes.


I would be more comfortable making the default match the majority of the 
current implementations (ie: volatile semantics).  Then, if someone 
cares about performance they can explicitly validate the call path and 
convert it over to the non-volatile version.


Correctness before speed...

Chris


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Thu, 16 Aug 2007, Herbert Xu wrote:

> > Do we have a consensus here? (hoping against hope, probably :-)
> 
> I can certainly agree with this.

I agree too.

> But I have to say that I still don't know of a single place
> where one would actually use the volatile variant.

I suspect that what you say is true after we have looked at all callers.




Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Wed, 15 Aug 2007, Christoph Lameter wrote:

> On Thu, 16 Aug 2007, Paul Mackerras wrote:
> 
> > > We don't need to reload sk->sk_prot->memory_allocated here.
> > 
> > Are you sure?  How do you know some other CPU hasn't changed the value
> > in between?
> 
> The cpu knows because the cacheline was not invalidated.

Crap my statement above is wrong. We do not care that the 
value was changed otherwise we would have put a barrier in there.



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Thu, 16 Aug 2007, Paul Mackerras wrote:

> > We don't need to reload sk->sk_prot->memory_allocated here.
> 
> Are you sure?  How do you know some other CPU hasn't changed the value
> in between?

The cpu knows because the cacheline was not invalidated.



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 12:05:56PM +1000, Paul Mackerras wrote:
> Herbert Xu writes:
> 
> > See sk_stream_mem_schedule in net/core/stream.c:
> > 
> > /* Under limit. */
> > if (atomic_read(sk->sk_prot->memory_allocated) < 
> > sk->sk_prot->sysctl_mem[0]) {
> > if (*sk->sk_prot->memory_pressure)
> > *sk->sk_prot->memory_pressure = 0;
> > return 1;
> > }
> > 
> > /* Over hard limit. */
> > if (atomic_read(sk->sk_prot->memory_allocated) > 
> > sk->sk_prot->sysctl_mem[2]) {
> > sk->sk_prot->enter_memory_pressure();
> > goto suppress_allocation;
> > }
> > 
> > We don't need to reload sk->sk_prot->memory_allocated here.
> 
> Are you sure?  How do you know some other CPU hasn't changed the value
> in between?

Yes I'm sure, because we don't care if others have increased
the reservation.

Note that even if we did we'd be using barriers so volatile
won't do us any good here.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 07:45:44AM +0530, Satyam Sharma wrote:
>
> Completely agreed, again. To summarize again (had done so about ~100 mails
> earlier in this thread too :-) ...
> 
> atomic_{read,set}_volatile() -- guarantees volatility also along with
> atomicity (the two _are_ different concepts after all, irrespective of
> whether callsites normally want one with the other or not)
> 
> atomic_{read,set}_nonvolatile() -- only guarantees atomicity, compiler
> free to elide / coalesce / optimize such accesses, can keep the object
> in question cached in a local register, leads to smaller text, etc.
> 
> As to which one should be the default atomic_read() is a question of
> whether majority of callsites (more weightage to important / hot
> codepaths, lesser to obscure callsites) want a particular behaviour.
> 
> Do we have a consensus here? (hoping against hope, probably :-)

I can certainly agree with this.

But I have to say that I still don't know of a single place
where one would actually use the volatile variant.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Segher Boessenkool

A volatile default would disable optimizations for atomic_read.
atomic_read without volatile would allow for full optimization by the
compiler. Seems that this is what one wants in many cases.


Name one such case.

An atomic_read should do a load from memory.  If the programmer puts
an atomic_read() in the code then the compiler should emit a load for
it, not re-use a value returned by a previous atomic_read.  I do not
believe it would ever be useful for the compiler to collapse two
atomic_read statements into a single load.


An atomic_read() implemented as a "normal" C variable read would
allow that read to be combined with another "normal" read from
that variable.  This could perhaps be marginally useful, although
I'd bet you cannot see it unless counting cycles on a simulator
or counting bits in the binary size.

With an asm() implementation, the compiler can not do this; with
a "volatile" implementation (either volatile variable or volatile-cast),
this invokes undefined behaviour (in both C and GCC).
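A minimal illustration of that combining behaviour (plain C; what the compiler actually emits is of course target- and optimizer-dependent):

/* The compiler may load *p once and reuse the value for both reads. */
int plain_twice(const int *p)
{
	return *p + *p;
}

/* Volatile-qualified accesses may not be merged: two loads are emitted. */
int volatile_twice(const int *p)
{
	const volatile int *q = p;

	return *q + *q;
}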


Segher



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Herbert Xu writes:

> See sk_stream_mem_schedule in net/core/stream.c:
> 
> /* Under limit. */
> if (atomic_read(sk->sk_prot->memory_allocated) < 
> sk->sk_prot->sysctl_mem[0]) {
> if (*sk->sk_prot->memory_pressure)
> *sk->sk_prot->memory_pressure = 0;
> return 1;
> }
> 
> /* Over hard limit. */
> if (atomic_read(sk->sk_prot->memory_allocated) > 
> sk->sk_prot->sysctl_mem[2]) {
> sk->sk_prot->enter_memory_pressure();
> goto suppress_allocation;
> }
> 
> We don't need to reload sk->sk_prot->memory_allocated here.

Are you sure?  How do you know some other CPU hasn't changed the value
in between?

Paul.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma


On Wed, 15 Aug 2007, Christoph Lameter wrote:

> On Wed, 15 Aug 2007, Paul E. McKenney wrote:
> 
> > Understood.  My point is not that the impact is precisely zero, but
> > rather that the impact on optimization is much less hurtful than the
> > problems that could arise otherwise, particularly as compilers become
> > more aggressive in their optimizations.
> 
> The problems arise because barriers are not used as required. Volatile 
> has wishy washy semantics and somehow marries memory barriers with data 
> access. It is clearer to separate the two. Conceptual cleanness usually 
> translates into better code. If one really wants the volatile then lets 
> make it explicit and use
> 
>   atomic_read_volatile()

Completely agreed, again. To summarize again (had done so about ~100 mails
earlier in this thread too :-) ...

atomic_{read,set}_volatile() -- guarantees volatility also along with
atomicity (the two _are_ different concepts after all, irrespective of
whether callsites normally want one with the other or not)

atomic_{read,set}_nonvolatile() -- only guarantees atomicity, compiler
free to elide / coalesce / optimize such accesses, can keep the object
in question cached in a local register, leads to smaller text, etc.
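A sketch of what the two variants could look like, following the volatile-cast idiom some architectures already use for atomic_read(); the names follow the proposal above and are not any merged API:

/* Illustrative only; mirrors the kernel's atomic_t layout. */
typedef struct { int counter; } atomic_t;

/* Forces a load from memory on every use. */
#define atomic_read_volatile(v)     (*(volatile int *)&(v)->counter)

/* Plain access: compiler may cache the value in a register. */
#define atomic_read_nonvolatile(v)  ((v)->counter)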

As to which one should be the default atomic_read() is a question of
whether majority of callsites (more weightage to important / hot
codepaths, lesser to obscure callsites) want a particular behaviour.

Do we have a consensus here? (hoping against hope, probably :-)


[ This thread has gotten completely out of hand ... for my mail client
  alpine as well, it now seems. Reminds of that 1000+ GPLv3 fest :-) ]


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 11:51:42AM +1000, Paul Mackerras wrote:
> 
> Name one such case.

See sk_stream_mem_schedule in net/core/stream.c:

/* Under limit. */
if (atomic_read(sk->sk_prot->memory_allocated) < 
sk->sk_prot->sysctl_mem[0]) {
if (*sk->sk_prot->memory_pressure)
*sk->sk_prot->memory_pressure = 0;
return 1;
}

/* Over hard limit. */
if (atomic_read(sk->sk_prot->memory_allocated) > 
sk->sk_prot->sysctl_mem[2]) {
sk->sk_prot->enter_memory_pressure();
goto suppress_allocation;
}

We don't need to reload sk->sk_prot->memory_allocated here.

Now where is your example again?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Christoph Lameter writes:

> A volatile default would disable optimizations for atomic_read. 
> atomic_read without volatile would allow for full optimization by the 
> compiler. Seems that this is what one wants in many cases.

Name one such case.

An atomic_read should do a load from memory.  If the programmer puts
an atomic_read() in the code then the compiler should emit a load for
it, not re-use a value returned by a previous atomic_read.  I do not
believe it would ever be useful for the compiler to collapse two
atomic_read statements into a single load.

Paul.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Wed, 15 Aug 2007, Paul E. McKenney wrote:

> Understood.  My point is not that the impact is precisely zero, but
> rather that the impact on optimization is much less hurtful than the
> problems that could arise otherwise, particularly as compilers become
> more aggressive in their optimizations.

The problems arise because barriers are not used as required. Volatile 
has wishy-washy semantics and somehow marries memory barriers with data 
access. It is clearer to separate the two. Conceptual cleanness usually 
translates into better code. If one really wants the volatile then let's 
make it explicit and use

atomic_read_volatile()


Re:

2007-08-15 Thread Segher Boessenkool

"compilation unit" is a C standard term.  It typically boils down
to "single .c file".


As you mentioned later, "single .c file with all the other files
(headers or other .c files) that it pulls in via #include" is actually
"translation unit", both in the C standard as well as gcc docs.


Yeah.  "single .c file after preprocessing".  Same thing :-)


"Compilation unit"
doesn't seem to be nearly as standard a term, though in most places it
is indeed meant to be same as "translation unit", but with the new gcc
inter-module-analysis stuff that you referred to above, I suspect one 
may
reasonably want to call a "compilation unit" as all that the compiler 
sees

at a given instant.


That would be a bit confusing, would it not?  They'd better find
some better name for that if they want to name it at all (remember,
none of these optimisations should have any effect on the semantics
of the program, you just get fewer .o files etc.).


Segher



[BUG -rt] circular locking deadlock

2007-08-15 Thread john stultz
Hey Ingo, Thomas,

I was playing with the latency tracer on 2.6.23-rc2-rt2 while a "make
-j8" was going on in the background and the box hung with this on the
console:



[ BUG: circular locking deadlock detected! ]

cc1/7161 is deadlocking current task cc1/7164


1) cc1/7164 is trying to acquire this lock:
 [8101fa375250] {>lock}
.. ->owner: 8101ef7f7162
.. held by:   cc1: 7161 [8101ef7f7160, 120]

2) cc1/7161 is blocked on this lock:
 [8102118e5900] {>lock}
.. ->owner: 8100ddb3ace2
.. held by:   cc1: 7164 [8100ddb3ace0, 120]

cc1/7161's [blocked] stackdump:

 8101f9423c68 0086 0001 0002
 82205d00 0202 82189280 8101ef7f7160
 8100ddb3ace0 8101f9423c10 0202 
Call Trace:
 [] mcount+0x2c/0x3a
 [] schedule+0x3b/0x130
 [] rt_spin_lock_slowlock+0xdb/0x210
 [] __rt_spin_lock+0x3d/0x50
 [] rt_spin_lock+0xe/0x10
 [] __lock_list+0x42/0x70
 [] lock_list_splice_init+0x33/0xb0
 [] file_move+0xc8/0xd0
 [] __dentry_open+0xae/0x1d0
 [] nameidata_to_filp+0x44/0x60
 [] do_filp_open+0x4d/0x60
 [] __lock_text_start+0xe/0x10
 [] get_unused_fd_flags+0x103/0x130
 [] do_sys_open+0x61/0x100
 [] sys_open+0x20/0x30
 [] tracesys+0x151/0x1be

---
| preempt count: 0002 ]
| 2-level deep critical section nesting:

.. []  __schedule+0x23/0x265
.[] ..   ( <= schedule+0x3b/0x130)
.. []  __spin_lock_irq+0x21/0x40
.[] ..   ( <= __schedule+0xb4/0x265)


cc1/7164's [current] stackdump:


Call Trace:
 [] show_stack+0x13/0x20
 [] debug_rt_mutex_print_deadlock+0x163/0x180
 [] rt_spin_lock_slowlock+0xd6/0x210
 [] __rt_spin_lock+0x3d/0x50
 [] rt_spin_lock+0xe/0x10
 [] __lock_list+0x42/0x70
 [] lock_list_del_init+0x26/0x70
 [] file_kill+0x4a/0x60
 [] __fput+0x12e/0x1d0
 [] fput+0x25/0x30
 [] filp_close+0x61/0x90
 [] sys_close+0xa9/0x100
 [] tracesys+0x151/0x1be

---
| preempt count:  ]
| 0-level deep critical section nesting:


[ turning off deadlock detection. Please report this trace. ]




Re: [patch 1/2] i386: use asm() like the other atomic operations already do.

2007-08-15 Thread Nick Piggin

Herbert Xu wrote:

On Wed, Aug 15, 2007 at 01:02:23PM -0400, Chris Snook wrote:


Herbert Xu wrote:



I'm still unconvinced why we need this because nobody has
brought up any examples of kernel code that legitimately
needs this.


There's plenty of kernel code that *wants* this though.  If we can 



You keep saying this yet every time I ask for an example I
get nothing.


Agreed. The simplest thing to do will be to post those patches which
eliminate the register-clobbering barriers, shrink binaries, etc.
for all that code that wants this.

--
SUSE Labs, Novell Inc.


Re: [RFC][PATCH] /proc//pmaps - memory maps in granularity of pages

2007-08-15 Thread Fengguang Wu
On Tue, Aug 14, 2007 at 11:26:18AM -0500, Matt Mackall wrote:
> On Tue, Aug 14, 2007 at 04:52:04PM +0800, Fengguang Wu wrote:
> > Hello,
> > 
> > Matt Mackall brings us many memory-footprint-optimization
> > opportunities with his pagemap/kpagemap patches. However I wonder if
> > the binary interface can be improved.
> > 
> > This seq_file based implementation iterates in the unit of vmas.  So
> > super large vmas can be a problem. The next step is to do
> > address-based iteration. That should not be hard.
> > 
> > Andrew, please stop me early if it is at all a wrong interface :)
> > 
> > Thank you,
> > Fengguang
> > ===
> > 
> > Show a process's memory maps page-by-page in /proc//pmaps.
> > It helps to analyze application's memory footprint in a comprehensive way.
> > 
> > Pages sharing the same state are grouped into a page range.
> > For each page range, the following fields are exported:
> > - first page index
> > - number of pages in the range
> > - well known page/pte flags
> > - number of mmap users
> > 
> > Only page flags not expected to disappear in the near future are exported:
> > 
> > Y:young R:referenced A:active U:uptodate P:ptedirty D:dirty W:writeback
> > 
> > Here is a sample output:
> > 
> > # cat /proc/$$/pmaps
> > 08048000-080c9000 r-xp 08048000 00:00 0
> > 32840   129 Y_A_P__ 1
> > 080c9000-080f6000 rwxp 080c9000 00:00 0          [heap]
> > 32969   45  Y_A_P__ 1
> > f7dba000-f7dc3000 r-xp 00000000 03:00 176633     /lib/libnss_files-2.3.6.so
> > 0   1   Y_AU___ 1
> > 1   1   YR_U___ 1
> > 5   1   YR_U___ 1
> > 8   1   Y_AU___ 1
> > f7dc3000-f7dc5000 rwxp 00008000 03:00 176633     /lib/libnss_files-2.3.6.so
> > 8   2   Y_A_P__ 1
> > f7dc5000-f7dcd000 r-xp 00000000 03:00 176635     /lib/libnss_nis-2.3.6.so
> > 0   1   Y_AU___ 1
> > 1   1   YR_U___ 1
> > 4   1   YR_U___ 1
> > 7   1   Y_AU___ 1
> > f7dcd000-f7dcf000 rwxp 00007000 03:00 176635     /lib/libnss_nis-2.3.6.so
> > 7   2   Y_A_P__ 1
> > f7dcf000-f7de1000 r-xp 00000000 03:00 176630     /lib/libnsl-2.3.6.so
> > 0   1   Y_AU___ 1
> > 1   3   YR_U___ 1
> > 16  1   YR_U___ 1
> > f7de1000-f7de3000 rwxp 00011000 03:00 176630     /lib/libnsl-2.3.6.so
> > 17  2   Y_A_P__ 1
> > [...]
> 
> That's a _lot_ of data to generate and parse. I doubt we can watch a

Yes, but it won't be too bad because it works in a sparse way.

1) It will only generate output for resident pages, which normally is
much smaller than the mapped size. Take my shell for example: the
(size:rss) ratio is (7:1)!

wfg ~% cat /proc/$$/smaps |grep Size|sum
sum  50552.000 
avg777.723 

wfg ~% cat /proc/$$/smaps |grep Rss|sum 
sum   7604.000 
avg116.985 

2) The page range trick suppresses more output. Look at these lines:

> > 08048000-080c9000 r-xp 08048000 00:00 0
> > 32840   129 Y_A_P__ 1
> > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap]
> > 32969   45  Y_A_P__ 1

It's interesting that the seq_file interface demands some more
programming effort, yet provides this flexibility as well.

3) A 64-bit number can usually be encoded in 8 hex characters -- no more
than its 8-byte binary representation -- and the text string will often
be even shorter, because small values are common.

4) String formatting is cheap. Much cheaper than pte / struct page
walks and context switches. Not to mention its user friendliness.

> larger app in realtime with this interface. And it doesn't give us
> physical page numbers either.

This could be a problem. A binary interface is discouraging, which is
good and bad. The good thing about it is that we can safely export
the raw PFN/page flag numbers without upsetting normal users. In
this way a possibly useful kernel debugging interface can be shipped
everywhere.

If kpagemap is a reasonable kernel debug interface, I do not see the
reason why kpagemap and pmaps cannot coexist :)



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Segher Boessenkool
Part of the motivation here is to fix heisenbugs.  If I knew where 
they


By the same token we should probably disable optimisations
altogether since that too can create heisenbugs.


Precisely the point -- use of volatile (whether in casts or on asms)
in these cases are intended to disable those optimizations likely to
result in heisenbugs.


The only thing volatile on an asm does is create a side effect
on the asm statement; in effect, it tells the compiler "do not
remove this asm even if you don't need any of its outputs".

It's not disabling optimisation likely to result in bugs,
heisen- or otherwise; _not_ putting the volatile on an asm
that needs it simply _is_ a bug :-)
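
A small sketch of that rule (hypothetical code, not from the kernel):

int flag;

static inline int load_flag(void)
{
	int val;

	/* The "volatile" marks this asm as having a side effect:
	 * GCC must emit it every time it is reached, even if val
	 * is never used, and may not merge two executions into one. */
	asm volatile("movl %1, %0" : "=r" (val) : "m" (flag));
	return val;
}

Without the volatile, GCC would be free to delete the asm wherever val
goes unused, or to reuse one load for two calls.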


Segher



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Segher Boessenkool
Part of the motivation here is to fix heisenbugs.  If I knew where 
they


By the same token we should probably disable optimisations
altogether since that too can create heisenbugs.


Almost everything is a tradeoff; and so is this.  I don't
believe most people would find disabling all compiler
optimisations an acceptable price to pay for some peace
of mind.


Segher



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Segher Boessenkool

No; compilation units have nothing to do with it, GCC can optimise
across compilation unit boundaries just fine, if you tell it to
compile more than one compilation unit at once.


Last I checked, the Linux kernel build system did compile each .c file
as a separate compilation unit.


I have some patches to use -combine -fwhole-program for Linux.
Highly experimental, you need a patched bleeding edge toolchain.
If there's interest I'll clean it up and put it online.

David Woodhouse had some similar patches about a year ago.


Sounds exciting...  ;-)


Yeah, the breakage is *quite* spectacular :-)


In many cases, the compiler also has to assume that
msleep_interruptible() might call back into a function in the current
compilation unit, thus possibly modifying global static variables.


It most often is smart enough to see what compilation-unit-local
variables might be modified that way, though :-)


Yep.  For example, if it knows the current value of a given such local
variable, and if all code paths that would change some other variable
cannot be reached given that current value of the first variable.


Or the most common thing: if neither the address of the translation-
unit local variable nor the address of any function writing to that
variable can "escape" from that translation unit, nothing outside
the translation unit can write to the variable.


But there is usually at least one externally callable function in
a .c file.


Of course, but often none of those will (indirectly) write a certain
static variable.


Segher



Re: [PATCH 6/24] make atomic_read() behave consistently on frv

2007-08-15 Thread Nick Piggin

Segher Boessenkool wrote:

Please check the definition of "cache coherence".



Which of the twelve thousand such definitions?  :-)


Every definition I have seen says that writes to a single memory
location have a serial order as seen by all CPUs, and that a read
will return the most recent write in the sequence (with a bit of
extra mumbo jumbo to cover store queues and store forwarding).

Within such a definition, I don't see how it would be allowed for a
single CPU to execute reads out of order and have the second read
return an earlier write than the first read.

--
SUSE Labs, Novell Inc.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 08:53:16AM +0800, Herbert Xu wrote:
> On Wed, Aug 15, 2007 at 05:49:50PM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote:
> >
> > > Thanks.  But I don't need a summary of the thread, I'm asking
> > > for an extant code snippet in our kernel that benefits from
> > > the volatile change and is not part of a busy-wait.
> > 
> > Sorry, can't help you there.  I really do believe that the information
> > you need (as opposed to the specific item you are asking for) really
> > has been put forth in this thread.
> 
> That only leads me to believe that such a code snippet simply
> does not exist.

Whatever...

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Wed, Aug 15, 2007 at 05:59:41PM -0700, Christoph Lameter wrote:
> On Wed, 15 Aug 2007, Paul E. McKenney wrote:
> 
> > The volatile cast should not disable all that many optimizations,
> > for example, it is much less hurtful than barrier().  Furthermore,
> > the main optimizations disabled (pulling atomic_read() and atomic_set()
> > out of loops) really do need to be disabled.
> 
> In many cases you do not need a barrier. Having volatile there *will* 
> impact optimization because the compiler cannot use a register that may 
> contain the value that was fetched earlier. And the compiler cannot choose 
> freely when to fetch the value. The order of memory accesses is fixed if 
> you use volatile. If the variable is not volatile then the compiler can 
> arrange memory accesses any way it sees fit and thus generate better code.

Understood.  My point is not that the impact is precisely zero, but
rather that the impact on optimization is much less hurtful than the
problems that could arise otherwise, particularly as compilers become
more aggressive in their optimizations.

Thanx, Paul


[patch] sched: skip updating rq's next_balance under null SD

2007-08-15 Thread Siddha, Suresh B
Was playing with sched_smt_power_savings/sched_mc_power_savings and found
out that while the scheduler domains are reconstructed when sysfs settings
change, rebalance_domains() can get triggered with a null domain on other
cpus, which sets next_balance to jiffies + 60*HZ, resulting in no idle/busy
balancing for 60 seconds.

Fix this.

Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b8..74565c0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3020,6 +3020,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
+   int update_next_balance = 0;
 
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3056,8 +3057,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (sd->flags & SD_SERIALIZE)
spin_unlock();
 out:
-   if (time_after(next_balance, sd->last_balance + interval))
+   if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
+   update_next_balance = 1;
+   }
 
/*
 * Stop the load balance at this level. There is another
@@ -3067,7 +3070,14 @@ out:
if (!balance)
break;
}
-   rq->next_balance = next_balance;
+
+   /*
+* next_balance will be updated only when there is a need.
+* When the cpu is attached to null domain for ex, it will not be
+* updated.
+*/
+   if (likely(update_next_balance))
+   rq->next_balance = next_balance;
 }
 
 /*


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma


On Wed, 15 Aug 2007, Segher Boessenkool wrote:

> [...]
> > BTW:
> > 
> > #define atomic_read(a)  (*(volatile int *)&(a))
> > #define atomic_set(a,i) (*(volatile int *)&(a) = (i))
> > 
> > int a;
> > 
> > void func(void)
> > {
> > int b;
> > 
> > b = atomic_read(a);
> > atomic_set(a, 20);
> > b = atomic_read(a);
> > }
> > 
> > gives:
> > 
> > func:
> > pushl   %ebp
> > movl    a, %eax
> > movl    %esp, %ebp
> > movl    $20, a
> > movl    a, %eax
> > popl    %ebp
> > ret
> > 
> > so the first atomic_read() wasn't optimized away.
> 
> Of course.  It is executed by the abstract machine, so
> it will be executed by the actual machine.  On the other
> hand, try
> 
>   b = 0;
>   if (b)
>   b = atomic_read(a);
> 
> or similar.

Yup, obviously. Volatile accesses (or any access to volatile objects),
or even "__volatile__ asms" (which gcc normally promises never to elide)
can still be optimized away in cases such as these, where the compiler
can trivially determine that the code in question is not reachable.


Re: [PATCH 6/24] make atomic_read() behave consistently on frv

2007-08-15 Thread Nick Piggin

Paul E. McKenney wrote:

On Wed, Aug 15, 2007 at 11:30:05PM +1000, Nick Piggin wrote:



Especially since several big architectures don't have volatile in their
atomic_get and _set, I think it would be a step backwards to add them in
as a "just in case" thing now (unless there is a better reason).



Good point, except that I would expect gcc's optimization to continue
to improve.  I would like the kernel to be able to take advantage of
improved optimization, which means that we are going to have to make
a few changes.  Adding volatile to atomic_get() and atomic_set() is
IMHO one of those changes.


What optimisations? gcc already does most of the things you need a
barrier/volatile for, like reordering non-dependent loads and stores,
and eliminating mem ops completely by caching in registers.



As to your followup question of why to use it over ACCESS_ONCE. I
guess, aside from consistency with the rest of the barrier APIs, you
can use it in other primitives when you don't actually know what the
caller is going to do or if it even will make an access. You could
also use it between calls to _other_ primitives, etc... it just
seems more flexible to me, but I haven't actually used such a thing
in real code...

ACCESS_ONCE doesn't seem as descriptive. What it results in is the
memory location being loaded or stored (presumably exactly once),
but I think the more general underlying idea is a barrier point.



OK, first, I am not arguing that ACCESS_ONCE() can replace all current
uses of barrier().


OK. Well I also wasn't saying that ACCESS_ONCE should not be
implemented. But if we want something like it, then it would make
sense to have an equivalent barrier statement as well (ie. order()).
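
For reference, the ACCESS_ONCE() under discussion is essentially:

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))

and a hypothetical order(), as suggested above, might be sketched as a
per-object compiler barrier (a made-up name, not an existing primitive):

	#define order(x)	asm volatile("" : "+m" (x))

i.e. the compiler must assume x is read and written at that point, so
it flushes and reloads just that object, rather than all of memory the
way barrier() forces.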

--
SUSE Labs, Novell Inc.


[patch] sched: fix broken smt/mc optimizations with CFS

2007-08-15 Thread Siddha, Suresh B
Ingo, let me know if there any side effects of this change. Thanks.
---

On a four package system with HT - HT load balancing optimizations
were broken. For example, if two tasks end up running on two logical
threads of one of the packages, scheduler is not able to pull one of
the tasks to a completely idle package.

In this scenario, for nice-0 tasks, imbalance calculated by scheduler
will be 512 and find_busiest_queue() will return 0 (as each cpu's load
is 1024 > imbalance and has only one task running).

Similarly MC scheduler optimizations also get fixed with this patch.

Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]>
---

diff --git a/kernel/sched.c b/kernel/sched.c
index 45e17b8..c5ac710 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2494,7 +2494,7 @@ group_next:
 * a think about bumping its value to force at least one task to be
 * moved
 */
-   if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
+   if (*imbalance < busiest_load_per_task) {
unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
 


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma
Hi Herbert,


On Thu, 16 Aug 2007, Herbert Xu wrote:

> On Thu, Aug 16, 2007 at 06:28:42AM +0530, Satyam Sharma wrote:
> >
> > > The udelay itself certainly should have some form of cpu_relax in it.
> > 
> > Yes, a form of barrier() must be present in mdelay() or udelay() itself
> > as you say, having it in __const_udelay() is *not* enough (superfluous
> > actually, considering it is already a separate translation unit and
> > invisible to the compiler).
> 
> As long as __const_udelay does something which has the same
> effect as barrier it is enough even if it's in the same unit.

Only if __const_udelay() is inlined. But as I said, __const_udelay()
-- although marked "inline" -- will never be inlined anywhere in the
kernel in reality. It's an exported symbol, and never inlined from
modules. Even from built-in targets, the definition of __const_udelay
is invisible when gcc is compiling the compilation units of those
callsites. The compiler has no idea whether that function has barriers
or not, so we're saved here _only_ by the lucky fact that
__const_udelay() is in a different compilation unit.


> As a matter of fact it does on i386 where __delay either uses
> rep_nop or asm/volatile.

__delay() can be either delay_tsc() or delay_loop() on i386.

delay_tsc() uses the rep_nop() there for its own little busy
loop, actually. But for a call site that inlines __const_udelay()
-- if it were ever moved to a .h file and marked inline -- the
call to __delay() would _still_ be across compilation units. So,
again for this case, it does not matter if the callee function
has compiler barriers or not (it would've been a different story
if we were discussing real/CPU barriers, I think), what saves us
here is just the fact that a call is made to a function from a
different compilation unit, which is invisible to the compiler
when compiling the callsite, and hence acting as the compiler
barrier.

Regarding delay_loop(), it uses "volatile" for the "asm" which
has quite different semantics from the C language "volatile"
type-qualifier keyword and does not imply any compiler barrier
at all.
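
To spell out that distinction in one place (a generic sketch, not the
kernel's actual delay code):

/* volatile only: the asm must be emitted, but GCC may still keep
 * memory values cached in registers across it: */
#define relax_only()	asm volatile("rep; nop")

/* a "memory" clobber is what makes a compiler barrier; this is how
 * the kernel's own barrier() is defined: */
#define barrier()	__asm__ __volatile__("" : : : "memory")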


Satyam


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Wed, 15 Aug 2007, Paul E. McKenney wrote:

> The volatile cast should not disable all that many optimizations,
> for example, it is much less hurtful than barrier().  Furthermore,
> the main optimizations disabled (pulling atomic_read() and atomic_set()
> out of loops) really do need to be disabled.

In many cases you do not need a barrier. Having volatile there *will* 
impact optimization because the compiler cannot use a register that may 
contain the value that was fetched earlier. And the compiler cannot choose 
freely when to fetch the value. The order of memory accesses is fixed if 
you use volatile. If the variable is not volatile then the compiler can 
arrange memory accesses any way it sees fit and thus generate better code.



Re: RFC: do get_rtc_time() correctly

2007-08-15 Thread Rene Herman

On 08/16/2007 02:26 AM, David P. Reed wrote:


My mention of NMI (which by definition can't be masked) is because NMI


Well, not by the CPU it can't, but on a PC, masking NMIs is a simple matter 
of setting bit 7 of I/O port 0x70 to 1 (it seems the kernel does not provide 
an interface for it though?).
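
For the curious, the trick looks roughly like this -- a sketch only;
cmos_read_nmi_off() is a made-up name, not a kernel interface:

static unsigned char cmos_read_nmi_off(unsigned char reg)
{
	unsigned char val;

	outb(reg | 0x80, 0x70);	/* bit 7 set: NMI masked */
	val = inb(0x71);	/* read the CMOS data port */
	outb(reg & 0x7f, 0x70);	/* bit 7 clear: NMI unmasked again */
	return val;
}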


Rene.



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Wed, Aug 15, 2007 at 05:49:50PM -0700, Paul E. McKenney wrote:
> On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote:
>
> > Thanks.  But I don't need a summary of the thread, I'm asking
> > for an extant code snippet in our kernel that benefits from
> > the volatile change and is not part of a busy-wait.
> 
> Sorry, can't help you there.  I really do believe that the information
> you need (as opposed to the specific item you are asking for) really
> has been put forth in this thread.

That only leads me to believe that such a code snippet simply
does not exist.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Wed, Aug 15, 2007 at 05:42:07PM -0700, Christoph Lameter wrote:
> On Wed, 15 Aug 2007, Paul E. McKenney wrote:
> 
> > Seems to me that we face greater chance of confusion without the
> > volatile than with, particularly as compiler optimizations become
> > more aggressive.  Yes, we could simply disable optimization, but
> > optimization can be quite helpful.
> 
> A volatile default would disable optimizations for atomic_read. 
> atomic_read without volatile would allow for full optimization by the 
> compiler. Seems that this is what one wants in many cases.

The volatile cast should not disable all that many optimizations,
for example, it is much less hurtful than barrier().  Furthermore,
the main optimizations disabled (pulling atomic_read() and atomic_set()
out of loops) really do need to be disabled.

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 06:28:42AM +0530, Satyam Sharma wrote:
>
> > The udelay itself certainly should have some form of cpu_relax in it.
> 
> Yes, a form of barrier() must be present in mdelay() or udelay() itself
> > as you say, having it in __const_udelay() is *not* enough (superfluous
> actually, considering it is already a separate translation unit and
> invisible to the compiler).

As long as __const_udelay does something which has the same
effect as barrier it is enough even if it's in the same unit.
As a matter of fact it does on i386 where __delay either uses
rep_nop or asm/volatile.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote:
> On Wed, Aug 15, 2007 at 05:23:10PM -0700, Paul E. McKenney wrote:
> > On Thu, Aug 16, 2007 at 08:12:48AM +0800, Herbert Xu wrote:
> > > On Wed, Aug 15, 2007 at 04:53:35PM -0700, Paul E. McKenney wrote:
> > > >
> > > > > > Communicating between process context and interrupt/NMI handlers 
> > > > > > using
> > > > > > per-CPU variables.
> > > > > 
> > > > > Remember we're talking about atomic_read/atomic_set.  Please
> > > > > cite the actual file/function name you have in mind.
> > > > 
> > > > Yep, we are indeed talking about atomic_read()/atomic_set().
> > > > 
> > > > We have been through this issue already in this thread.
> > > 
> > > Sorry, but I must've missed it.  Could you cite the file or
> > > function for my benefit?
> > 
> > I might summarize the thread if there is interest, but I am not able to
> > do so right this minute.
> 
> Thanks.  But I don't need a summary of the thread, I'm asking
> for an extant code snippet in our kernel that benefits from
> the volatile change and is not part of a busy-wait.

Sorry, can't help you there.  I really do believe that the information
you need (as opposed to the specific item you are asking for) really
has been put forth in this thread.

Thanx, Paul


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Satyam Sharma
[ Sorry for the empty subject line in the previous mail. I intended to
  make a patch, so I cleared it in order to change it, but ultimately
  neither made a patch nor restored the subject line. Done that now. ]


On Thu, 16 Aug 2007, Herbert Xu wrote:

> On Thu, Aug 16, 2007 at 06:06:00AM +0530, Satyam Sharma wrote:
> > 
> > that are:
> > 
> > while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
> > mdelay(1);
> > msecs--;
> > }
> > 
> > where mdelay() becomes __const_udelay() which happens to be in another
> > translation unit (arch/i386/lib/delay.c) and hence saves this callsite
> > from being a bug :-)
> 
> The udelay itself certainly should have some form of cpu_relax in it.

Yes, a form of barrier() must be present in mdelay() or udelay() itself
as you say, having it in __const_udelay() is *not* enough (superfluous
actually, considering it is already a separate translation unit and
invisible to the compiler).

However, there are no compiler barriers on the macro-definition path
between mdelay(1) and __const_udelay(), so the only thing that saves
this callsite from being a bug here is indeed the
different-translation-unit concept.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Wed, 15 Aug 2007, Paul E. McKenney wrote:

> Seems to me that we face greater chance of confusion without the
> volatile than with, particularly as compiler optimizations become
> more aggressive.  Yes, we could simply disable optimization, but
> optimization can be quite helpful.

A volatile default would disable optimizations for atomic_read. 
atomic_read without volatile would allow for full optimization by the 
compiler. Seems that this is what one wants in many cases.



Re: drivers/infiniband/mlx/mad.c misplaced ;

2007-08-15 Thread Joe Perches
On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote:
> Signed-off-by: Dave Jones <[EMAIL PROTECTED]>
> 
> diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
> index 3330917..0ed02b7 100644
> --- a/drivers/infiniband/hw/mlx4/mad.c
> +++ b/drivers/infiniband/hw/mlx4/mad.c
> @@ -109,7 +109,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey,
>  in_modifier, op_modifier,
>  MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C);
>  
> - if (!err);
> + if (!err)

There's more than a few of these (not inspected).

$ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * 
arch/sh/boards/se/7343/io.c:if (0) ;
drivers/atm/iphase.c: if (!desc1) ;
drivers/infiniband/hw/mlx4/mad.c:   if (!err);
drivers/isdn/capi/capiutil.c:   else if (c <= 0x0f);
drivers/net/tokenring/ibmtr.c:  else if (ti->shared_ram_paging == 0xf);  /* No paging in adapter */
drivers/s390/scsi/zfcp_erp.c:   if (status == ZFCP_ERP_SUCCEEDED) ;  /* no further action */
fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4);
sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ;




Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Christoph Lameter
On Thu, 16 Aug 2007, Paul Mackerras wrote:

> Those barriers are for when we need ordering between atomic variables
> and other memory locations.  An atomic variable by itself doesn't and
> shouldn't need any barriers for other CPUs to be able to see what's
> happening to it.

It does not need any barriers. As soon as one cpu acquires the 
cacheline for write it will be invalidated in the caches of the others. So 
the other cpu will have to refetch. No need for volatile.

The issue here may be that the compiler has fetched the atomic variable 
earlier and put it into a register. However, that prefetching is limited 
because it cannot cross function calls etc. The only problem could be 
loops where the compiler does not refetch the variable, since it assumes 
that it does not change and there are no function calls in the body of the 
loop. But AFAIK these loops need cpu_relax and other measures anyway to 
avoid bad effects from busy waiting.
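
I.e. the loop shape in question (sketch; 'pending' stands in for some
atomic_t of interest):

	while (atomic_read(&pending) > 0)
		cpu_relax();	/* compiler barrier (plus a hint to the
				 * CPU), so 'pending' is refetched on
				 * every iteration */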



Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul E. McKenney
On Wed, Aug 15, 2007 at 05:26:34PM -0700, Christoph Lameter wrote:
> On Thu, 16 Aug 2007, Paul Mackerras wrote:
> 
> > In the kernel we use atomic variables in precisely those situations
> > where a variable is potentially accessed concurrently by multiple
> > CPUs, and where each CPU needs to see updates done by other CPUs in a
> > timely fashion.  That is what they are for.  Therefore the compiler
> > must not cache values of atomic variables in registers; each
> > atomic_read must result in a load and each atomic_set must result in a
> > store.  Anything else will just lead to subtle bugs.
> 
> This may have been the intent. However, today the visibility is controlled 
> using barriers. And we have barriers that we use with atomic operations. 
> Having volatile be the default just leads to confusion. Atomic read should 
> just read with no extras. Extras can be added by using variants like 
> atomic_read_volatile or so.

Seems to me that we face greater chance of confusion without the
volatile than with, particularly as compiler optimizations become
more aggressive.  Yes, we could simply disable optimization, but
optimization can be quite helpful.

Thanx, Paul


[PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped.

2007-08-15 Thread NeilBrown

When a raid1 array is reshaped (number of drives changed),
the list of devices is compacted, so that slots for missing
devices are filled with working devices from later slots.
This requires the "rd%d" symlinks in sysfs to be updated.


Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./drivers/md/raid1.c |   22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
--- .prev/drivers/md/raid1.c	2007-08-16 10:27:57.000000000 +1000
+++ ./drivers/md/raid1.c	2007-08-16 10:29:58.000000000 +1000
@@ -2154,11 +2154,25 @@ static int raid1_reshape(mddev_t *mddev)
oldpool = conf->r1bio_pool;
conf->r1bio_pool = newpool;
 
-   for (d=d2=0; d < conf->raid_disks; d++)
-   if (conf->mirrors[d].rdev) {
-   conf->mirrors[d].rdev->raid_disk = d2;
-   newmirrors[d2++].rdev = conf->mirrors[d].rdev;
+   for (d = d2 = 0; d < conf->raid_disks; d++) {
+   mdk_rdev_t *rdev = conf->mirrors[d].rdev;
+   if (rdev && rdev->raid_disk != d2) {
+   char nm[20];
+   sprintf(nm, "rd%d", rdev->raid_disk);
+   sysfs_remove_link(&mddev->kobj, nm);
+   rdev->raid_disk = d2;
+   sprintf(nm, "rd%d", rdev->raid_disk);
+   sysfs_remove_link(&mddev->kobj, nm);
+   if (sysfs_create_link(&mddev->kobj,
+ &rdev->kobj, nm))
+   printk(KERN_WARNING
+  "md/raid1: cannot register "
+  "%s for %s\n",
+  nm, mdname(mddev));
}
+   if (rdev)
+   newmirrors[d2++].rdev = rdev;
+   }
kfree(conf->mirrors);
conf->mirrors = newmirrors;
kfree(conf->poolinfo);


[PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when resyncing.

2007-08-15 Thread NeilBrown

Commit 1757128438d41670ded8bc3bc735325cc07dc8f9 was slightly bad.  If
an array has a write-intent bitmap, and you remove a drive, then re-add
it, only the changed parts should be resynced.  However after the
above commit, this only works if the array has not been shut down and
restarted.

This is because it sets 'fullsync' a little more often
than it should.  This patch is more careful.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>

### Diffstat output
 ./drivers/md/raid1.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c
--- .prev/drivers/md/raid1.c	2007-08-16 10:27:57.000000000 +1000
+++ ./drivers/md/raid1.c	2007-08-16 10:27:57.000000000 +1000
@@ -1972,7 +1972,8 @@ static int run(mddev_t *mddev)
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
mddev->degraded++;
-   conf->fullsync = 1;
+   if (disk->rdev)
+   conf->fullsync = 1;
}
}
if (mddev->degraded == conf->raid_disks) {
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 000 of 2] md: bug fixes for 2.6.23-rc

2007-08-15 Thread NeilBrown
The following 2 patches contain bugfixes for md.  Both apply to earlier
kernels, but probably aren't significant enough for -stable (no oops,
no data corruption, no security hole).
They should go in 2.6.23 though.

Thanks,
NeilBrown


 [PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when 
resyncing.
 [PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped.


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Paul Mackerras
Christoph Lameter writes:

> On Thu, 16 Aug 2007, Paul Mackerras wrote:
> 
> > In the kernel we use atomic variables in precisely those situations
> > where a variable is potentially accessed concurrently by multiple
> > CPUs, and where each CPU needs to see updates done by other CPUs in a
> > timely fashion.  That is what they are for.  Therefore the compiler
> > must not cache values of atomic variables in registers; each
> > atomic_read must result in a load and each atomic_set must result in a
> > store.  Anything else will just lead to subtle bugs.
> 
> This may have been the intent. However, today the visibility is controlled 
> using barriers. And we have barriers that we use with atomic operations. 

Those barriers are for when we need ordering between atomic variables
and other memory locations.  An atomic variable by itself doesn't and
shouldn't need any barriers for other CPUs to be able to see what's
happening to it.

Paul.


Re: your mail

2007-08-15 Thread Herbert Xu
On Thu, Aug 16, 2007 at 06:06:00AM +0530, Satyam Sharma wrote:
> 
> that are:
> 
>   while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
>   mdelay(1);
>   msecs--;
>   }
> 
> where mdelay() becomes __const_udelay() which happens to be in another
> translation unit (arch/i386/lib/delay.c) and hence saves this callsite
> from being a bug :-)

The udelay itself certainly should have some form of cpu_relax in it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures

2007-08-15 Thread Herbert Xu
On Wed, Aug 15, 2007 at 05:23:10PM -0700, Paul E. McKenney wrote:
> On Thu, Aug 16, 2007 at 08:12:48AM +0800, Herbert Xu wrote:
> > On Wed, Aug 15, 2007 at 04:53:35PM -0700, Paul E. McKenney wrote:
> > >
> > > > > Communicating between process context and interrupt/NMI handlers using
> > > > > per-CPU variables.
> > > > 
> > > > Remember we're talking about atomic_read/atomic_set.  Please
> > > > cite the actual file/function name you have in mind.
> > > 
> > > Yep, we are indeed talking about atomic_read()/atomic_set().
> > > 
> > > We have been through this issue already in this thread.
> > 
> > Sorry, but I must've missed it.  Could you cite the file or
> > function for my benefit?
> 
> I might summarize the thread if there is interest, but I am not able to
> do so right this minute.

Thanks.  But I don't need a summary of the thread, I'm asking
for an extant code snippet in our kernel that benefits from
the volatile change and is not part of a busy-wait.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH] i386: Fix a couple busy loops in mach_wakecpu.h:wait_for_init_deassert()

2007-08-15 Thread Satyam Sharma


On Wed, 15 Aug 2007, Heiko Carstens wrote:

> [...]
> Btw.: we still have
> 
> include/asm-i386/mach-es7000/mach_wakecpu.h:  while (!atomic_read(deassert));
> include/asm-i386/mach-default/mach_wakecpu.h: while (!atomic_read(deassert));
> 
> Looks like they need to be fixed as well.


[PATCH] i386: Fix a couple busy loops in mach_wakecpu.h:wait_for_init_deassert()

Use cpu_relax() in the busy loops, as atomic_read() doesn't automatically
imply volatility for i386 and x86_64. x86_64 doesn't have this issue because
it open-codes the while loop in smpboot.c:smp_callin() itself, which
already uses cpu_relax().

For i386, however, smpboot.c:smp_callin() calls wait_for_init_deassert()
which is buggy for mach-default and mach-es7000 cases.

[ I test-built a kernel -- smp_callin() itself got inlined in its only
  callsite, smpboot.c:start_secondary() -- and the relevant piece of
  code disassembles to the following:

0xc1019704:  mov    0xc144c4c8,%eax
0xc1019709:  test   %eax,%eax
0xc101970b:  je     0xc1019709

  init_deasserted (at 0xc144c4c8) gets fetched into %eax only once and
  then we loop over the test of the stale value in the register only,
  so these look like real bugs to me. With the fix below, this becomes:

0xc1019706:  pause
0xc1019708:  cmpl   $0x0,0xc144c4c8
0xc101970f:  je     0xc1019706

  which looks nice and healthy. ]

Thanks to Heiko Carstens for noticing this.

Signed-off-by: Satyam Sharma <[EMAIL PROTECTED]>

---

 include/asm-i386/mach-default/mach_wakecpu.h |    3 ++-
 include/asm-i386/mach-es7000/mach_wakecpu.h  |    3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/asm-i386/mach-default/mach_wakecpu.h b/include/asm-i386/mach-default/mach_wakecpu.h
index 673b85c..3ebb178 100644
--- a/include/asm-i386/mach-default/mach_wakecpu.h
+++ b/include/asm-i386/mach-default/mach_wakecpu.h
@@ -15,7 +15,8 @@
 
 static inline void wait_for_init_deassert(atomic_t *deassert)
 {
-   while (!atomic_read(deassert));
+   while (!atomic_read(deassert))
+   cpu_relax();
return;
 }
 
diff --git a/include/asm-i386/mach-es7000/mach_wakecpu.h b/include/asm-i386/mach-es7000/mach_wakecpu.h
index efc903b..84ff583 100644
--- a/include/asm-i386/mach-es7000/mach_wakecpu.h
+++ b/include/asm-i386/mach-es7000/mach_wakecpu.h
@@ -31,7 +31,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 static inline void wait_for_init_deassert(atomic_t *deassert)
 {
 #ifdef WAKE_SECONDARY_VIA_INIT
-   while (!atomic_read(deassert));
+   while (!atomic_read(deassert))
+   cpu_relax();
 #endif
return;
 }


Re: RFC: do get_rtc_time() correctly

2007-08-15 Thread David P. Reed

Alan:

Thanks for the comment. I will code a patch, and include a sanity check 
as you suggested, and send it for review. Just to clarify one concern 
your note raised:


I understand that SMM/SMI servicing can take a long time, but SMM/SMI 
shouldn't happen while interrupts are masked using local_irq_disable() 
[included in spin_lock_irq()], at least on x86 architectures.  If 
SMM/SMI can happen even then, the NMI fix below could be generalized.


My mention of NMI (which by definition can't be masked) is because NMI 
can happen even while interrupts are masked.  This is a timing problem 
that can't be dealt with by masking interrupts, and NMIs are used for 
watchdogs, etc. these days.  It seems like just a general good thing to 
be able to ask if an NMI has happened.  A per-cpu NMI eventcount that 
is incremented on every NMI would allow one to detect NMIs that happen 
during an otherwise masked code sequence, by reading it at the beginning 
and end of the code sequence.  I don't know if NMIs are common on other 
architectures, or if this is an architecture-dependent concern.


Perhaps I'm really talking about two patches here.  One for a mechanism 
to detect NMIs that happen during a critical piece of code (so it can be 
retried), and one that depends on that to be really proper in reading 
the RTC reliably.
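
Something like this, perhaps (purely a sketch -- none of these symbols
exist in the kernel as-is, and read_rtc_registers() is hypothetical):

static DEFINE_PER_CPU(unsigned long, nmi_count);

/* the NMI handler would do:	__get_cpu_var(nmi_count)++;	*/

	unsigned long seq;

	do {
		seq = __get_cpu_var(nmi_count);
		read_rtc_registers();	/* the critical, retryable part */
	} while (seq != __get_cpu_var(nmi_count));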


Alan Cox wrote:
So the proper way to read the RTC contents is to read the UIP flag, and 
if zero, read all the RTC registers with interrupts masked completely, 
so all reads happen in the 224 usec window.  (NMI can still be a 
problem, but you can have NMIs set a flag that forces a retry).



SMM/SMI is more likely to be what bumps you 224usec or more.

  
I'm happy to code and test a patch.   Rather than just submit a patch, I 
thought I'd request others' comments on this, since it affects so many 
architectures.  cc me, if you will, as I don't subscribe to LKML, just 
check it periodically.



Go for it. The other architectures generally inherit it by inheriting
similar bridge chips or in more modern times the RTC macrocell. It should
also be possible to debug by putting in an optional sanity check
initially which checks the read made sense compared to a few more

  


