Re: Linux 2.6.20.16
diff --git a/Makefile b/Makefile index 947ff3c..b3806cb 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 20 -EXTRAVERSION = .15 +EXTRAVERSION = .16 NAME = Homicidal Dwarf Hamster # *DOCUMENTATION* diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 5e47683..9bf056e 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -367,10 +367,6 @@ ENTRY(system_call) CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL GET_THREAD_INFO(%ebp) - testl $TF_MASK,PT_EFLAGS(%esp) - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) @@ -385,6 +381,10 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c index 3700eef..be4a9a8 100644 --- a/arch/i386/oprofile/nmi_int.c +++ b/arch/i386/oprofile/nmi_int.c @@ -131,7 +131,6 @@ static void nmi_save_registers(void * dummy) { int cpu = smp_processor_id(); struct op_msrs * msrs = _msrs[cpu]; - model->fill_in_addresses(msrs); nmi_cpu_save_registers(msrs); } @@ -195,6 +194,7 @@ static struct notifier_block profile_exceptions_nb = { static int nmi_setup(void) { int err=0; + int cpu; if (!allocate_msrs()) return -ENOMEM; @@ -207,6 +207,13 @@ static int nmi_setup(void) /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ + + /* Assume saved/restored counters are the same on all CPUs */ + model->fill_in_addresses(_msrs[0]); + 
for_each_possible_cpu (cpu) { + if (cpu != 0) + cpu_msrs[cpu] = cpu_msrs[0]; + } on_each_cpu(nmi_save_registers, NULL, 0, 1); on_each_cpu(nmi_cpu_setup, NULL, 0, 1); nmi_enabled = 1; diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index f72e8e8..a84304e 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, */ discard_lazy_cpu_state(); + /* +* Force reload of FP/VEC. +* This has to be done before copying stuff into current->thread.fpr/vr +* for the reasons explained in the previous comment. +*/ + regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC); + err |= __copy_from_user(>thread.fpr, >fp_regs, FP_REGS_SIZE); #ifdef CONFIG_ALTIVEC @@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig, current->thread.vrsave = 0; #endif /* CONFIG_ALTIVEC */ - /* Force reload of FP/VEC */ - regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC); - return err; } diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 2968b90..e67cc4f 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -72,6 +72,8 @@ void show_mem(void) for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; ++i) { + if (!pfn_valid(pgdat->node_start_pfn + i)) + continue; page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -766,3 +768,9 @@ int in_gate_area_no_task(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } + +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) +{ + return __alloc_bootmem_core(pgdat->bdata, size, + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); +} diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index 3ffa080..e4e0ccb 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -1102,6 +1102,7 @@ static void cyy_intr_chip(struct cyclades_card *cinfo, int chip, if 
(data & info->ignore_status_mask) { info->icount.rx++; + spin_unlock(&cinfo->card_lock); return; }
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Satyam Sharma wrote: > Hi Bill, > > > On Wed, 15 Aug 2007, Bill Fink wrote: > > > On Wed, 15 Aug 2007, Satyam Sharma wrote: > > > > > (C) > > > $ cat tp3.c > > > int a; > > > > > > void func(void) > > > { > > > *(volatile int *) = 10; > > > *(volatile int *) = 20; > > > } > > > $ gcc -Os -S tp3.c > > > $ cat tp3.s > > > ... > > > movl$10, a > > > movl$20, a > > > ... > > > > I'm curious about one minor tangential point. Why, instead of: > > > > b = *(volatile int *) > > > > why can't this just be expressed as: > > > > b = (volatile int)a; > > > > Isn't it the contents of a that's volatile, i.e. it's value can change > > invisibly to the compiler, and that's why you want to force a read from > > memory? Why do you need the "*(volatile int *)&" construct? > > "b = (volatile int)a;" doesn't help us because a cast to a qualified type > has the same effect as a cast to an unqualified version of that type, as > mentioned in 6.5.4:4 (footnote 86) of the standard. Note that "volatile" > is a type-qualifier, not a type itself, so a cast of the _object_ itself > to a qualified-type i.e. (volatile int) would not make the access itself > volatile-qualified. > > To serve our purposes, it is necessary for us to take the address of this > (non-volatile) object, cast the resulting _pointer_ to the corresponding > volatile-qualified pointer-type, and then dereference it. This makes that > particular _access_ be volatile-qualified, without the object itself being > such. Also note that the (dereferenced) result is also a valid lvalue and > hence can be used in "*(volatile int *) = b;" kind of construction > (which we use for the atomic_set case). Here, I should obviously admit that the semantics of *(volatile int *)& aren't any neater or well-defined in the _language standard_ at all. 
The standard does say (verbatim) "precisely what constitutes as access to object of volatile-qualified type is implementation-defined", but GCC does help us out here by doing the right thing. Accessing the non-volatile object there using the volatile-qualified pointer-type cast makes GCC treat the object stored at that memory address itself as if it were a volatile object, thus making the access end up having what we're calling as "volatility" semantics here. Honestly, given such confusion, and the propensity of the "volatile" type-qualifier keyword to be ill-defined (or at least poorly understood, often inconsistently implemented), I'd (again) express my opinion that it would be best to avoid its usage, given other alternatives do exist. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Linux 2.6.20.16
I've just released Linux 2.6.20.16. This version catches up with 2.6.21.7. I hope to issue newer releases soon with next batches of pending patches. I'll also be replying to this message with a copy of the patch between 2.6.20.15 and 2.6.20.16. The patch and changelog will appear soon at the following locations: ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/ ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/patch-2.6.20.16.bz2 ftp://ftp.all.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.20.16 Git repository: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.20.y.git http://www.kernel.org/pub/scm/linux/kernel/git/stable/linux-2.6.20.y.git Git repository through the gitweb interface: http://git.kernel.org/?p=linux/kernel/git/stable/linux-2.6.20.y.git Willy --- Makefile |2 arch/i386/kernel/entry.S |8 - arch/i386/oprofile/nmi_int.c |9 + arch/powerpc/kernel/signal_64.c | 10 - arch/x86_64/mm/init.c |8 + drivers/char/cyclades.c |1 drivers/md/bitmap.c | 17 +- drivers/md/dm-crypt.c | 50 ++ drivers/md/raid0.c|3 drivers/md/raid1.c| 21 +- drivers/md/raid10.c |6 drivers/media/video/saa7134/saa7134-tvaudio.c |2 drivers/net/e1000/e1000_main.c| 11 - drivers/net/sky2.c|9 + drivers/serial/mpsc.c |3 include/linux/bootmem.h |1 include/linux/pci_ids.h |4 include/linux/raid/bitmap.h |1 include/linux/sched.h |1 include/linux/workqueue.h |2 kernel/auditfilter.c |2 kernel/exit.c | 22 ++ kernel/futex.c| 205 -- kernel/rtmutex.c | 24 ++- kernel/sched.c| 22 +- mm/rmap.c | 24 --- mm/sparse.c | 11 + 27 files changed, 307 insertions(+), 172 deletions(-) Summary of changes from 2.6.20.15 to 2.6.20.16 Alexey Kuznetsov (1): pi-futex: Fix exit races and locking problems Andi Kleen (1): i386: Fix K8/core2 oprofile on multiple CPUs Auke Kok (1): e1000: disable polling before registering netdevice Bob Picco (1): sparsemem: fix oops in x86_64 show_mem Christoph Lameter (1): sched: fix next_interval determination in idle_balance() Hugh Dickins (1): mm: kill validate_anon_vma to avoid mapcount 
BUG Jason Gaston (1): pci_ids: update patch for Intel ICH9M Jason Wessel (1): i386: fix infinite loop with singlestep int80 syscalls Jay Lubomirski (1): serial: clear proper MPSC interrupt cause bits Jeff Mahoney (1): saa7134: fix thread shutdown handling Jiri Slaby (1): Char: cyclades, fix deadlock Mike Accetta (1): md: Fix bug in error handling during raid1 repair. Milan Broz (1): dm crypt: disable barriers NeilBrown (3): md: Avoid overflow in raid0 calculation with large components. md: Don't write more than is required of the last page of a bitmap md: Fix two raid10 bugs. Olaf Kirch (3): dm crypt: fix call to clone_init dm crypt: fix avoid cloned bio ref after free dm crypt: fix remove first_clone Oleg Nesterov (1): make freezeable workqueues singlethread Paul Mackerras (1): POWERPC: Fix subtle FP state corruption bug in signal return on SMP Stephen Hemminger (1): sky2: workaround for lost IRQ Thomas Gleixner (3): rt-mutex: Fix stale return value rt-mutex: Fix chain walk early wakeup bug FUTEX: Restore the dropped ERSCH fix Tony Jones (1): audit: fix oops removing watch if audit disabled Willy Tarreau (1): Linux 2.6.20.16 Zou Nan hai (1): x86_64: allocate sparsemem memmap above 4G - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 02:11:43PM +1000, Paul Mackerras wrote: > > The uses of atomic_read where one might want it to allow caching of > the result seem to me to fall into 3 categories: > > 1. Places that are buggy because of a race arising from the way it's >used. > > 2. Places where there is a race but it doesn't matter because we're >doing some clever trick. > > 3. Places where there is some locking in place that eliminates any >potential race. Agreed. > In case 1, adding volatile won't solve the race, of course, but it's > hard to argue that we shouldn't do something because it will slow down > buggy code. Case 2 is hopefully pretty rare and accompanied by large > comment blocks, and in those cases caching the result of atomic_read > explicitly in a local variable would probably make the code clearer. > And in case 3 there is no reason to use atomic_t at all; we might as > well just use an int. Since adding volatile doesn't help any of the 3 cases, and takes away optimisations from both 2 and 3, I wonder what is the point of the addition after all? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 02:34:25PM +1000, Paul Mackerras wrote: > > I'm talking about this situation: > > CPU 0 comes into __sk_stream_mem_reclaim, reads memory_allocated, but > then before it can do the store to *memory_pressure, CPUs 1-1023 all > go through sk_stream_mem_schedule, collectively increase > memory_allocated to more than sysctl_mem[2] and set *memory_pressure. > Finally CPU 0 gets to do its store and it sets *memory_pressure back > to 0, but by this stage memory_allocated is way larger than > sysctl_mem[2]. It doesn't matter. The memory pressure flag is an *advisory* flag. If we get it wrong the worst that'll happen is that we'd waste some time doing work that'll be thrown away. Please look at the places where it's used before jumping to conclusions. > Now, maybe it's the case that it doesn't really matter whether > *->memory_pressure is 0 or 1. But if so, why bother computing it at > all? As long as we get it right most of the time (and I think you would agree that we do get it right most of the time), then this flag has achieved its purpose. > People seem to think that using atomic_t means they don't need to use > a spinlock. That's fine if there is only one variable involved, but > as soon as there's more than one, there's the possibility of a race, > whether or not you use atomic_t, and whether or not atomic_read has > "volatile" behaviour. In any case, this actually illustrates why the addition of volatile is completely pointless. Even if this code was broken, which it definitely is not, having the volatile there wouldn't have helped at all. 
Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Documentation] Page Table Layout diagrams
On Wed, Aug 08, 2007 at 01:47:45PM -0500, Adam Litke wrote: > Hello all. In an effort to understand how the page tables are laid out > across various architectures I put together some diagrams. I have > posted them on the linux-mm wiki: http://linux-mm.org/PageTableStructure > and I hope they will be useful to others. > > Just to make sure I am not spreading misinformation, could a few of you > experts take a quick look at the three diagrams I've got finished so far > and point out any errors I have made? Thanks. Nice. Didn't spot any innaccuracies. -- David Gibson| I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 005 of 5] Merge blk_recount_segments into blk_recalc_rq_segments
blk_recalc_rq_segments calls blk_recount_segments on each bio, then does some extra calculations to handle segments that overlap two bios. If we merge the code from blk_recount_segments into blk_recalc_rq_segments, we can process the whole request one bio_vec at a time, and not need the messy cross-bio calculations. Then blk_recount_segments can be implemented by calling blk_recalc_rq_segments, passing it a simple on-stack request which stores just the bio. This function is only temporary and will go away completely by the end of this patch series. This allows us to remove rq_for_each_bio Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./block/ll_rw_blk.c | 125 --- ./include/linux/blkdev.h |3 - 2 files changed, 55 insertions(+), 73 deletions(-) diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c --- .prev/block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000 +++ ./block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000 @@ -42,6 +42,7 @@ static void drive_stat_acct(struct reque static void init_request_from_bio(struct request *req, struct bio *bio); static int __make_request(struct request_queue *q, struct bio *bio); static struct io_context *current_io_context(gfp_t gfp_flags, int node); +static void blk_recalc_rq_segments(struct request *rq); /* * For the allocated request tables @@ -1209,65 +1210,91 @@ EXPORT_SYMBOL(blk_dump_rq_flags); void blk_recount_segments(struct request_queue *q, struct bio *bio) { - struct bio_vec *bv, *bvprv = NULL; - int i, nr_phys_segs, nr_hw_segs, seg_size, hw_seg_size, cluster; + struct request rq; + struct bio *nxt = bio->bi_next; + rq.q = q; + rq.bio = rq.biotail = bio; + bio->bi_next = NULL; + blk_recalc_rq_segments(); + bio->bi_next = nxt; + bio->bi_phys_segments = rq.nr_phys_segments; + bio->bi_hw_segments = rq.nr_hw_segments; + bio->bi_flags |= (1 << BIO_SEG_VALID); +} +EXPORT_SYMBOL(blk_recount_segments); + +static void blk_recalc_rq_segments(struct request *rq) +{ + int nr_phys_segs; + int nr_hw_segs; + unsigned int phys_size; + 
unsigned int hw_size; + struct bio_vec bv; + struct bio_vec bvprv = {0}; + int seg_size; + int hw_seg_size; + int cluster; + struct req_iterator i; int high, highprv = 1; + struct request_queue *q = rq->q; - if (unlikely(!bio->bi_io_vec)) + if (!rq->bio) return; cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); - hw_seg_size = seg_size = nr_phys_segs = nr_hw_segs = 0; - bio_for_each_segment(bv, bio, i) { + hw_seg_size = seg_size = 0; + phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; + rq_for_each_segment(rq, i, bv) { /* * the trick here is making sure that a high page is never * considered part of another segment, since that might * change with the bounce page. */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; + high = page_to_pfn(bv.bv_page) > q->bounce_pfn; if (high || highprv) goto new_hw_segment; if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) + if (seg_size + bv.bv_len > q->max_segment_size) goto new_segment; - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) + if (!BIOVEC_PHYS_MERGEABLE(, )) goto new_segment; - if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) + if (!BIOVEC_SEG_BOUNDARY(q, , )) goto new_segment; - if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) + if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv.bv_len)) goto new_hw_segment; - seg_size += bv->bv_len; - hw_seg_size += bv->bv_len; + seg_size += bv.bv_len; + hw_seg_size += bv.bv_len; bvprv = bv; continue; } new_segment: - if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && - !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) { - hw_seg_size += bv->bv_len; - } else { + if (BIOVEC_VIRT_MERGEABLE(, ) && + !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv.bv_len)) + hw_seg_size += bv.bv_len; + else { new_hw_segment: - if (hw_seg_size > bio->bi_hw_front_size) - bio->bi_hw_front_size = hw_seg_size; - hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; +
[PATCH 004 of 5] Introduce rq_for_each_segment replacing rq_for_each_bio
(almost) every usage of rq_for_each_bio wraps a usage of bio_for_each_segment, so these can be combined into rq_for_each_segment. We get it to fill in a bio_vec structure rather than provide a pointer, as future changes to make bi_io_vec immutable will require that. The one place where rq_for_each_bio remains will be changed to use rq_for_each_segment in a subsequent patch. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./block/ll_rw_blk.c | 54 +++-- ./drivers/block/floppy.c | 84 ++- ./drivers/block/lguest_blk.c | 25 +-- ./drivers/block/nbd.c| 67 +++ ./drivers/block/xen-blkfront.c | 57 -- ./drivers/ide/ide-floppy.c | 62 +--- ./drivers/s390/block/dasd_diag.c | 40 -- ./drivers/s390/block/dasd_eckd.c | 47 ++--- ./drivers/s390/block/dasd_fba.c | 47 ++--- ./drivers/s390/char/tape_34xx.c | 33 ++- ./drivers/s390/char/tape_3590.c | 41 --- ./include/linux/blkdev.h | 18 12 files changed, 280 insertions(+), 295 deletions(-) diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c --- .prev/block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000 +++ ./block/ll_rw_blk.c 2007-08-16 15:02:31.0 +1000 @@ -1313,9 +1313,11 @@ static int blk_hw_contig_segment(struct int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sg) { - struct bio_vec *bvec, *bvprv; - struct bio *bio; - int nsegs, i, cluster; + struct bio_vec bvec; + struct bio_vec bvprv = { 0 }; + struct req_iterator i; + int nsegs; + int cluster; nsegs = 0; cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); @@ -1323,36 +1325,30 @@ int blk_rq_map_sg(struct request_queue * /* * for each bio in rq */ - bvprv = NULL; - rq_for_each_bio(bio, rq) { - /* -* for each segment in bio -*/ - bio_for_each_segment(bvec, bio, i) { - int nbytes = bvec->bv_len; + rq_for_each_segment(rq, i, bvec) { + int nbytes = bvec.bv_len; - if (bvprv && cluster) { - if (sg[nsegs - 1].length + nbytes > q->max_segment_size) - goto new_segment; - - if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) - goto new_segment; - if 
(!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) - goto new_segment; + if (bvprv.bv_page && cluster) { + if (sg[nsegs - 1].length + nbytes > q->max_segment_size) + goto new_segment; + + if (!BIOVEC_PHYS_MERGEABLE(, )) + goto new_segment; + if (!BIOVEC_SEG_BOUNDARY(q, , )) + goto new_segment; - sg[nsegs - 1].length += nbytes; - } else { + sg[nsegs - 1].length += nbytes; + } else { new_segment: - memset([nsegs],0,sizeof(struct scatterlist)); - sg[nsegs].page = bvec->bv_page; - sg[nsegs].length = nbytes; - sg[nsegs].offset = bvec->bv_offset; + memset([nsegs], 0, sizeof(struct scatterlist)); + sg[nsegs].page = bvec.bv_page; + sg[nsegs].length = nbytes; + sg[nsegs].offset = bvec.bv_offset; - nsegs++; - } - bvprv = bvec; - } /* segments in bio */ - } /* bios in rq */ + nsegs++; + } + bvprv = bvec; + } return nsegs; } diff .prev/drivers/block/floppy.c ./drivers/block/floppy.c --- .prev/drivers/block/floppy.c2007-08-16 15:02:30.0 +1000 +++ ./drivers/block/floppy.c2007-08-16 15:02:31.0 +1000 @@ -2450,23 +2450,20 @@ static void rw_interrupt(void) /* Compute maximal contiguous buffer size. */ static int buffer_chain_size(void) { - struct bio *bio; - struct bio_vec *bv; + struct bio_vec bv; int size; - int i; + struct req_iterator i; char *base; base = blk_rq_data(current_req); size = 0; - rq_for_each_bio(bio, current_req) { - bio_for_each_segment(bv, bio, i) { - if (page_address(bv->bv_page) + bv->bv_offset != -
[PATCH 002 of 5] Replace bio_data with blk_rq_data
Almost every call to bio_data is for the first bio in a request. A future patch will add some accounting information to 'struct request' which will need to be used to find the start of the request in the bio. So replace bio_data with blk_rq_data which takes a 'struct request *' The one exception is in dm-emc were using page_address(bio->bi_io_vec[0].bv_page); is appropriate. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./block/ll_rw_blk.c | 16 ./drivers/block/floppy.c |2 +- ./drivers/ide/ide-cd.c | 11 ++- ./drivers/ide/ide-io.c |2 +- ./drivers/md/dm-emc.c|2 +- ./drivers/message/fusion/mptsas.c|4 ++-- ./drivers/scsi/libsas/sas_expander.c |4 ++-- ./include/linux/bio.h|4 +--- ./include/linux/blkdev.h |1 + 9 files changed, 27 insertions(+), 19 deletions(-) diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c --- .prev/block/ll_rw_blk.c 2007-08-16 15:02:29.0 +1000 +++ ./block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000 @@ -2907,8 +2907,8 @@ static void init_request_from_bio(struct req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio); req->nr_phys_segments = bio_phys_segments(req->q, bio); req->nr_hw_segments = bio_hw_segments(req->q, bio); - req->buffer = bio_data(bio);/* see ->buffer comment above */ req->bio = req->biotail = bio; + req->buffer = blk_rq_data(req); /* see ->buffer comment above */ req->ioprio = bio_prio(bio); req->rq_disk = bio->bi_bdev->bd_disk; req->start_time = jiffies; @@ -2977,7 +2977,7 @@ static int __make_request(struct request * it didn't need a bounce buffer then it better * not touch req->buffer either... 
*/ - req->buffer = bio_data(bio); + req->buffer = blk_rq_data(req); req->current_nr_sectors = bio_cur_sectors(bio); req->hard_cur_sectors = req->current_nr_sectors; req->sector = req->hard_sector = bio->bi_sector; @@ -3373,7 +3373,7 @@ static void blk_recalc_rq_sectors(struct rq->nr_sectors = rq->hard_nr_sectors; rq->hard_cur_sectors = bio_cur_sectors(rq->bio); rq->current_nr_sectors = rq->hard_cur_sectors; - rq->buffer = bio_data(rq->bio); + rq->buffer = blk_rq_data(rq); } /* @@ -3678,14 +3678,22 @@ void blk_rq_bio_prep(struct request_queu rq->current_nr_sectors = bio_cur_sectors(bio); rq->hard_cur_sectors = rq->current_nr_sectors; rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); - rq->buffer = bio_data(bio); rq->data_len = bio->bi_size; rq->bio = rq->biotail = bio; + rq->buffer = blk_rq_data(rq); } EXPORT_SYMBOL(blk_rq_bio_prep); +void *blk_rq_data(struct request *rq) +{ + return page_address(bio_page(rq->bio)) + + bio_offset(rq->bio); +} +EXPORT_SYMBOL(blk_rq_data); + + int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); diff .prev/drivers/block/floppy.c ./drivers/block/floppy.c --- .prev/drivers/block/floppy.c2007-08-16 14:57:31.0 +1000 +++ ./drivers/block/floppy.c2007-08-16 15:02:30.0 +1000 @@ -2456,7 +2456,7 @@ static int buffer_chain_size(void) int i; char *base; - base = bio_data(current_req->bio); + base = blk_rq_data(current_req); size = 0; rq_for_each_bio(bio, current_req) { diff .prev/drivers/ide/ide-cd.c ./drivers/ide/ide-cd.c --- .prev/drivers/ide/ide-cd.c 2007-08-16 14:57:31.0 +1000 +++ ./drivers/ide/ide-cd.c 2007-08-16 15:02:30.0 +1000 @@ -1381,10 +1381,11 @@ static ide_startstop_t cdrom_start_seek start it over entirely, or even put it back on the request queue. 
*/ static void restore_request (struct request *rq) { - if (rq->buffer != bio_data(rq->bio)) { - sector_t n = (rq->buffer - (char *) bio_data(rq->bio)) / SECTOR_SIZE; + if (rq->buffer != blk_rq_data(rq)) { + sector_t n = (rq->buffer - (char *)blk_rq_data(rq)) + / SECTOR_SIZE; - rq->buffer = bio_data(rq->bio); + rq->buffer = blk_rq_data(rq); rq->nr_sectors += n; rq->sector -= n; } @@ -1659,7 +1660,7 @@ static void post_transform_command(struc return; if (req->bio) - ibuf = bio_data(req->bio); + ibuf = blk_rq_data(req); else ibuf = req->data; @@ -1768,7 +1769,7 @@ static ide_startstop_t cdrom_newpc_intr( *
[PATCH 003 of 5] Replace bio_cur_sectors with blk_rq_cur_sectors.
All calls to bio_cur_sectors are for the first bio in a 'struct request'. A future patch will make the discovery of this number dependant on information in the request. So change the function to take a 'struct request *' instread of a 'struct bio *', and make it a real function as more code will need to be added. One place wants the current bytes rather than sectors, so the 'real function' we create is blk_rq_cur_bytes, and blk_rq_cur_sectors divides this value by 512. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./block/ll_rw_blk.c | 18 -- ./drivers/ide/ide-cd.c | 11 ++- ./drivers/ide/ide-io.c |2 +- ./include/linux/bio.h|1 - ./include/linux/blkdev.h |5 + 5 files changed, 24 insertions(+), 13 deletions(-) diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c --- .prev/block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000 +++ ./block/ll_rw_blk.c 2007-08-16 15:02:30.0 +1000 @@ -2904,10 +2904,11 @@ static void init_request_from_bio(struct req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; req->hard_nr_sectors = req->nr_sectors = bio_sectors(bio); - req->current_nr_sectors = req->hard_cur_sectors = bio_cur_sectors(bio); + req->bio = req->biotail = bio; + req->current_nr_sectors = req->hard_cur_sectors = + blk_rq_cur_sectors(req); req->nr_phys_segments = bio_phys_segments(req->q, bio); req->nr_hw_segments = bio_hw_segments(req->q, bio); - req->bio = req->biotail = bio; req->buffer = blk_rq_data(req); /* see ->buffer comment above */ req->ioprio = bio_prio(bio); req->rq_disk = bio->bi_bdev->bd_disk; @@ -2978,7 +2979,7 @@ static int __make_request(struct request * not touch req->buffer either... 
*/ req->buffer = blk_rq_data(req); - req->current_nr_sectors = bio_cur_sectors(bio); + req->current_nr_sectors = blk_rq_cur_sectors(req); req->hard_cur_sectors = req->current_nr_sectors; req->sector = req->hard_sector = bio->bi_sector; req->nr_sectors = req->hard_nr_sectors += nr_sectors; @@ -3371,7 +3372,7 @@ static void blk_recalc_rq_sectors(struct (rq->sector <= rq->hard_sector)) { rq->sector = rq->hard_sector; rq->nr_sectors = rq->hard_nr_sectors; - rq->hard_cur_sectors = bio_cur_sectors(rq->bio); + rq->hard_cur_sectors = blk_rq_cur_sectors(rq); rq->current_nr_sectors = rq->hard_cur_sectors; rq->buffer = blk_rq_data(rq); } @@ -3675,13 +3676,13 @@ void blk_rq_bio_prep(struct request_queu rq->nr_phys_segments = bio_phys_segments(q, bio); rq->nr_hw_segments = bio_hw_segments(q, bio); - rq->current_nr_sectors = bio_cur_sectors(bio); - rq->hard_cur_sectors = rq->current_nr_sectors; rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); rq->data_len = bio->bi_size; rq->bio = rq->biotail = bio; rq->buffer = blk_rq_data(rq); + rq->current_nr_sectors = blk_rq_cur_sectors(rq); + rq->hard_cur_sectors = rq->current_nr_sectors; } EXPORT_SYMBOL(blk_rq_bio_prep); @@ -3693,6 +3694,11 @@ void *blk_rq_data(struct request *rq) } EXPORT_SYMBOL(blk_rq_data); +int blk_rq_cur_bytes(struct request *rq) +{ + return bio_iovec(rq->bio)->bv_len; +} +EXPORT_SYMBOL(blk_rq_cur_bytes); int kblockd_schedule_work(struct work_struct *work) { diff .prev/drivers/ide/ide-cd.c ./drivers/ide/ide-cd.c --- .prev/drivers/ide/ide-cd.c 2007-08-16 15:02:30.0 +1000 +++ ./drivers/ide/ide-cd.c 2007-08-16 15:02:30.0 +1000 @@ -1173,7 +1173,8 @@ static ide_startstop_t cdrom_read_intr ( /* First, figure out if we need to bit-bucket any of the leading sectors. */ - nskip = min_t(int, rq->current_nr_sectors - bio_cur_sectors(rq->bio), sectors_to_transfer); + nskip = min_t(int, rq->current_nr_sectors - blk_rq_cur_sectors(rq), + sectors_to_transfer); while (nskip > 0) { /* We need to throw away a sector. 
*/ @@ -1273,7 +1274,7 @@ static int cdrom_read_from_buffer (ide_d represent the number of sectors to skip at the start of a transfer will fail. I think that this will never happen, but let's be paranoid and check. */ - if (rq->current_nr_sectors < bio_cur_sectors(rq->bio) && + if (rq->current_nr_sectors < blk_rq_cur_sectors(rq) && (rq->sector & (sectors_per_frame - 1))) { printk(KERN_ERR "%s: cdrom_read_from_buffer: buffer botch (%ld)\n", drive->name, (long)rq->sector); @@ -1308,7 +1309,7 @@ static
[PATCH 001 of 5] Don't update bi_hw_*_size if we aren't going to merge.
ll_merge_requests_fn can update bi_hw_*_size in one case where we end up not merging. This is wrong. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./block/ll_rw_blk.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff .prev/block/ll_rw_blk.c ./block/ll_rw_blk.c --- .prev/block/ll_rw_blk.c 2007-08-16 14:57:39.0 +1000 +++ ./block/ll_rw_blk.c 2007-08-16 15:02:29.0 +1000 @@ -1518,11 +1518,13 @@ static int ll_merge_requests_fn(struct r /* * propagate the combined length to the end of the requests */ + total_hw_segments--; + if (total_hw_segments > q->max_hw_segments) + return 0; if (req->nr_hw_segments == 1) req->bio->bi_hw_front_size = len; if (next->nr_hw_segments == 1) next->biotail->bi_hw_back_size = len; - total_hw_segments--; } if (total_hw_segments > q->max_hw_segments) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 000 of 5] Introductory patches for bio refactor.
Hi Jens, I wonder if you would accept these patches the block layer. They are, as far as I can tell, quite uncontroversial and provide good cleanups. The first is a minor bug-fix. The next to replace helper function that take a bio (always the first bio of a request), to instead take a request. The last two combine to replace rq_for_each_bio with rq_for_each_segment which makes blk_recalc_rq_segments a lot cleaner, and improves (to a lesser degree) every other place that used rq_for_each_bio. The net result is a decrease in object code size (for my x86-64 compile) for block/built-in.o of 96 bytes, though there is some growth out in drivers making and over-all decrease of only 48 bytes. Thanks, NeilBrown [PATCH 001 of 5] Don't update bi_hw_*_size if we aren't going to merge. [PATCH 002 of 5] Replace bio_data with blk_rq_data [PATCH 003 of 5] Replace bio_cur_sectors with blk_rq_cur_sectors. [PATCH 004 of 5] Introduce rq_for_each_segment replacing rq_for_each_bio [PATCH 005 of 5] Merge blk_recount_segments into blk_recalc_rq_segments - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Hi Bill, On Wed, 15 Aug 2007, Bill Fink wrote: > On Wed, 15 Aug 2007, Satyam Sharma wrote: > > > (C) > > $ cat tp3.c > > int a; > > > > void func(void) > > { > > *(volatile int *)&a = 10; > > *(volatile int *)&a = 20; > > } > > $ gcc -Os -S tp3.c > > $ cat tp3.s > > ... > > movl $10, a > > movl $20, a > > ... > > I'm curious about one minor tangential point. Why, instead of: > > b = *(volatile int *)&a; > > why can't this just be expressed as: > > b = (volatile int)a; > > Isn't it the contents of a that's volatile, i.e. its value can change > invisibly to the compiler, and that's why you want to force a read from > memory? Why do you need the "*(volatile int *)&" construct? "b = (volatile int)a;" doesn't help us because a cast to a qualified type has the same effect as a cast to an unqualified version of that type, as mentioned in 6.5.4:4 (footnote 86) of the standard. Note that "volatile" is a type-qualifier, not a type itself, so a cast of the _object_ itself to a qualified-type i.e. (volatile int) would not make the access itself volatile-qualified. To serve our purposes, it is necessary for us to take the address of this (non-volatile) object, cast the resulting _pointer_ to the corresponding volatile-qualified pointer-type, and then dereference it. This makes that particular _access_ be volatile-qualified, without the object itself being such. Also note that the (dereferenced) result is also a valid lvalue and hence can be used in "*(volatile int *)&a = b;" kind of construction (which we use for the atomic_set case). Satyam - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [ckrm-tech] Regression in 2.6.23-rc2-mm2, mounting cpusets causes a hang
On 8/15/07, Paul Jackson <[EMAIL PROTECTED]> wrote: > Lee wrote: > > [altho' methinks CPUSET should select CONTAINERS rather than > > depend on it...] > > Good point -- what do you think, Paul Menage? That's how I made the configs originally; akpm asked me to invert the dependencies to use "depends" rather than "select" due to some general problems with using "select". I agree that "select" seems more natural. Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Thinking outside the box on file systems
Mmm, slow-as-dirt hotel wireless. What fun... On Aug 15, 2007, at 18:14:44, Phillip Susi wrote: Kyle Moffett wrote: I am well aware of that, I'm simply saying that sucks. Doing a recursive chmod or setfacl on a large directory tree is slow as all hell. Doing it in the kernel won't make it any faster. Right... I'm talking about getting rid of it entirely. Let me repeat myself here: Algorithmically you fundamentally CANNOT implement inheritance-based ACLs without one of the following (although if you have some other algorithm in mind, I'm listening): (A) Some kind of recursive operation *every* time you change an inheritable permission (B) A unified "starting point" from which you begin *every* access- control lookup (or one "starting point" per useful semantic grouping, like a namespace). The "(A)" is presently done in userspace and that's what you want to avoid. As to (B), I will attempt to prove below that you cannot implement "(B)" without breaking existing assumptions and restricting a very nice VFS model. Not necessarily. When I do "vim some-file-in-current-directory", for example, the kernel does *NOT* look up the path of my current directory. It does (in pseudocode): if (starts_with_slash(filename)) { entry = task->cwd; } else { entry = task->root; } while (have_components_left(filename) entry = lookup_next_component(filename); return entry; Right and task->cwd would have the effective acl in memory, ready to be combined with any acl set on the file. What ACL would "task->cwd" use? Options: (1.a) Use the one calculated during the original chdir() call. (1.b) Navigate "up" task->cwd building an ACL backwards. (1.c) $CAN_YOU_THINK_OF_SOMETHING_ELSE_HERE Unsolvable problems with each option: (1.a.I) You just broke all sorts of chrooted daemons. 
When I start bind in its chroot jail, it does the following: chdir("/private/bind9"); chroot("."); setgid(...); setuid(...); The "/private" directory is readable only by root, since root is the only one who will be navigating you into these chroots for any reason. You only switch UID/GID after the chroot() call, at which point you are inside of a sub-context and your cwd is fully accessible. If you stick an inheritable ACL on "/private", then the "cwd" ACL will not allow access by anybody but root and my bind won't be able to read any config files. You also break relative paths and directory-moving. Say a process does chdir("/foo/bar"). Now the ACL data in "cwd" is appropriate for /foo/bar. If you later chdir("../quux"), how do you unapply the changes made when you switched into that directory? For inheritable ACLs, you can't "unapply" such an ACL state change unless you save state for all the parent directories, except... What happens when you are in "/foo/bar" and another process does "mv /foo/bar /foobar/ quux"? Suddenly any "cwd" ACL data you have is completely invalid and you have to rebuild your ACLs from scratch. Moreover, if the directory you are in was moved to a portion of the filesystem not accessible from your current namespace then how do you deal with it? For example: NS1 has the / root dir of /dev/sdb1 mounted on /mnt NS2 has the /bar subdir of /dev/sdb1 mounted on /mnt Your process is in NS2 and does chdir("/mnt/quux"). A user in NS1 does: "mv /mnt/bar/quux /mnt/quux". Now your "cwd" is in a directory on a filesystem you have mounted, but it does not correspond *AT ALL* to any path available from your namespace. Another example: Your process has done dirfd=open("/media/cdrom/somestuff") when the admin does "umount -l /media/cdrom". You still have the CD-ROM open and accessible but IT HAS NO PATH. It isn't even mounted in *any* namespace, it's just kind of dangling waiting for its last users to go away. 
You can still do fchdir(dirfd), openat(dirfd, "foo/ bar", ...), open("./foo"), etc. In Linux the ONLY distinction between "relative" and "absolute" paths is that the "absolute" path begins with a magic slash which implies that you start at the hidden "root" fd the kernel manages. More detail on problems with the "building the ACL from scratch" part: That's not even paying attention to functions like "fchdir" or their interactions with "chroot" and namespaces. I can probably have an open directory handle to a volume in a completely different namespace, a volume which isn't even *MOUNTED* in my current fs namespace. Using that file-handle I believe I can "fchdir", "openat", etc, in a completely different namespace. I can do the same thing with a chroot, except there I can even "escape": /* Switch into chroot. Doesn't drop root privs */ chdir("/some/dir/somewhere"); chroot("."); /* Malicious code later on */ chdir("/"); chroot("another_dir"); chdir("../../../../../../../../.."); chroot("."); /* Now I'm back in the real root filesystem */ I don't see what
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu writes: > > You mean it's intended that *sk->sk_prot->memory_pressure can end up > > as 1 when sk->sk_prot->memory_allocated is small (less than > > ->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater > > than ->sysctl_mem[2])? Because that's the effect of the current code. > > If so I wonder why you bother computing it. > > You need to remember that there are three different limits: > minimum, pressure, and maximum. By default we should never > be in a situation where what you say can occur. > > If you set all three limits to the same thing, then yes it > won't work as intended but it's still well-behaved. I'm not talking about setting all three limits to the same thing. I'm talking about this situation: CPU 0 comes into __sk_stream_mem_reclaim, reads memory_allocated, but then before it can do the store to *memory_pressure, CPUs 1-1023 all go through sk_stream_mem_schedule, collectively increase memory_allocated to more than sysctl_mem[2] and set *memory_pressure. Finally CPU 0 gets to do its store and it sets *memory_pressure back to 0, but by this stage memory_allocated is way larger than sysctl_mem[2]. Yes, it's unlikely, but that is the nature of race conditions - they are unlikely, and only show up at inconvenient times, never when someone who could fix the bug is watching. :) Similarly it would be possible for other CPUs to decrease memory_allocated from greater than sysctl_mem[2] to less than sysctl_mem[0] in the interval between when we read memory_allocated and set *memory_pressure to 1. And it's quite possible for their setting of *memory_pressure to 0 to happen before our setting of it to 1, so that it ends up at 1 when it should be 0. Now, maybe it's the case that it doesn't really matter whether *->memory_pressure is 0 or 1. But if so, why bother computing it at all? People seem to think that using atomic_t means they don't need to use a spinlock. 
That's fine if there is only one variable involved, but as soon as there's more than one, there's the possibility of a race, whether or not you use atomic_t, and whether or not atomic_read has "volatile" behaviour. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: drivers/infiniband/mlx/mad.c misplaced ;
Joe Perches wrote: On Wed, 2007-08-15 at 19:19 -0700, Kok, Auke wrote: Joe Perches wrote: On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote: There's more than a few of these (not inspected). $ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * arch/sh/boards/se/7343/io.c:if (0) ; drivers/atm/iphase.c: if (!desc1) ; drivers/infiniband/hw/mlx4/mad.c: if (!err); drivers/isdn/capi/capiutil.c: else if (c <= 0x0f); drivers/net/tokenring/ibmtr.c: else if (ti->shared_ram_paging == 0xf); /* No paging in adapter */ drivers/s390/scsi/zfcp_erp.c: if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */ fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4); sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ; sounds like an excellent candidate check for checkpatch.pl !!! I think it's poor style and all of these should go away. the netfilter one appears to be a real error too. I was more thinking of making sure that none of these slip back in... Auke - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Satyam Sharma writes: > Anyway, the problem, of course, is that this conversion to a stronger / > safer-by-default behaviour doesn't happen with zero cost to performance. > Converting atomic ops to "volatile" behaviour did add ~2K to kernel text > for archs such as i386 (possibly to important codepaths) that didn't have > those semantics already so it would be constructive to actually look at > those differences and see if there were really any heisenbugs that got > rectified. Or if there were legitimate optimizations that got wrongly > disabled. Onus lies on those proposing the modifications, I'd say ;-) The uses of atomic_read where one might want it to allow caching of the result seem to me to fall into 3 categories: 1. Places that are buggy because of a race arising from the way it's used. 2. Places where there is a race but it doesn't matter because we're doing some clever trick. 3. Places where there is some locking in place that eliminates any potential race. In case 1, adding volatile won't solve the race, of course, but it's hard to argue that we shouldn't do something because it will slow down buggy code. Case 2 is hopefully pretty rare and accompanied by large comment blocks, and in those cases caching the result of atomic_read explicitly in a local variable would probably make the code clearer. And in case 3 there is no reason to use atomic_t at all; we might as well just use an int. So I don't see any good reason to make the atomic API more complex by having "volatile" and "non-volatile" versions of atomic_read. It should just have the "volatile" behaviour. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 01:48:32PM +1000, Paul Mackerras wrote: > Herbert Xu writes: > > > If you're referring to the code in sk_stream_mem_schedule > > then it's working as intended. The atomicity guarantees > > You mean it's intended that *sk->sk_prot->memory_pressure can end up > as 1 when sk->sk_prot->memory_allocated is small (less than > ->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater > than ->sysctl_mem[2])? Because that's the effect of the current code. > If so I wonder why you bother computing it. You need to remember that there are three different limits: minimum, pressure, and maximum. By default we should never be in a situation where what you say can occur. If you set all three limits to the same thing, then yes it won't work as intended but it's still well-behaved. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu writes: > If you're referring to the code in sk_stream_mem_schedule > then it's working as intended. The atomicity guarantees You mean it's intended that *sk->sk_prot->memory_pressure can end up as 1 when sk->sk_prot->memory_allocated is small (less than ->sysctl_mem[0]), or as 0 when ->memory_allocated is large (greater than ->sysctl_mem[2])? Because that's the effect of the current code. If so I wonder why you bother computing it. > that the atomic_add/atomic_sub won't be seen in parts by > other readers. > > We certainly do not need to see other atomic_add/atomic_sub > operations immediately. > > If you're referring to another code snippet please cite. > > > I'd go so far as to say that anywhere where you want a non-"volatile" > > atomic_read, either your code is buggy, or else an int would work just > > as well. > > An int won't work here because += and -= do not have the > atomicity guarantees that atomic_add/atomic_sub do. In > particular, this may cause an atomic_read on another CPU > to give a bogus reading. The point is that guaranteeing the atomicity of the increment or decrement does not suffice to make the code race-free. In this case the race arises from the fact that reading ->memory_allocated and setting *->memory_pressure are separate operations. To make that code work properly you need a lock. And once you have the lock an ordinary int would suffice for ->memory_allocated. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Satyam Sharma wrote: > (C) > $ cat tp3.c > int a; > > void func(void) > { > *(volatile int *)&a = 10; > *(volatile int *)&a = 20; > } > $ gcc -Os -S tp3.c > $ cat tp3.s > ... > movl $10, a > movl $20, a > ... I'm curious about one minor tangential point. Why, instead of: b = *(volatile int *)&a; why can't this just be expressed as: b = (volatile int)a; Isn't it the contents of a that's volatile, i.e. its value can change invisibly to the compiler, and that's why you want to force a read from memory? Why do you need the "*(volatile int *)&" construct? -Bill - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 01:15:05PM +1000, Paul Mackerras wrote: > > But others can also reduce the reservation. Also, the code sets and > clears *sk->sk_prot->memory_pressure nonatomically with respect to the > reads of sk->sk_prot->memory_allocated, so in fact the code doesn't > guarantee any particular relationship between the two. Yes others can reduce the reservation, but the point of this is that the code doesn't care. We'll either see the value before or after the reduction and in either case we'll do something sensible. The worst that can happen is when we're just below the hard limit and multiple CPUs fail to allocate but that's not really a problem because if the machine is making progress at all then we will eventually scale back and allow these allocations to succeed. As to the non-atomic operation on memory_pressure, that's OK because we only ever assign values to it and never do other operations such as += or -=. Remember that int/long assignments must be atomic or Linux won't run on your architecture. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Thinking outside the box on file systems
On Wed, 15 Aug 2007 15:48:15 PDT, Marc Perkel said: > > Consider the rules: > > > > peter '*a*' can create > > peter '*b*' cannot create > > > > Peter tries to create 'foo-ab-bar' - is he allowed > > to or not? > > > > First - I'm proposing a concept, not writing the > implementation of the concept. You are asking what > happens when someone write conflicting rules. That > depends on how you implement it. Conflicting rules can > cause unpredictable results. Good. Go work out what the rules have to be in order for the system to behave sanely. "Hand-waving concept" doesn't get anywhere. Fully fleshed-out concepts sometimes do - once they sprout code to actually implement them. > The point is that one can choose any rule system they > want and the rules applies to the names of the files > and the permissions of the users. No, you *can't* choose any rule system you want - some rule systems are unworkable because they create security exposures (usually of the "ln /etc/passwd /tmp/foo" variety, but sometimes race conditions as well). > > For an exersize, either write a program or do by > > hand: > All you would have to do is create a set of rules that > emulates the current rules and you would have the same > results. Good. Go create it. Let us know when you're done. Remember - not only do you need to have it generate the same results, you need to find a way to implement it so that it's somewhere near the same speed as the current code. If it's 10 times slower, it's not going to fly no matter *how* "cool" it is. pgp7EvsAH2cHY.pgp Description: PGP signature
Re: drivers/infiniband/mlx/mad.c misplaced ;
Thanks! Seems like checking for this is in the air, I just applied an identical patch from Ilpo Järvinen. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[GIT PULL] please pull infiniband.git for-linus branch
Linus, please pull from master.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git for-linus This tree is also available from kernel.org mirrors at: git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband.git for-linus This is basically a resend of the small fixes for 2.6.23 that I've already asked you to pull, with the addition of one patch that deletes a stray semicolon. Dotan Barak (3): IB: Include from IB: Include and from IB: Move the macro IB_UMEM_MAX_PAGE_CHUNK() to umem.c Eli Cohen (1): mlx4_core: Wait 1 second after reset before accessing device Hal Rosenstock (3): IB/mad: Fix error path if response alloc fails in ib_mad_recv_done_handler() IB/mad: Fix memory leak in switch handling in ib_mad_recv_done_handler() IB/mad: agent_send_response() should be void Ilpo Järvinen (1): IB/mlx4: Incorrect semicolon after if statement Jack Morgenstein (1): IPoIB: Fix leak in ipoib_transport_dev_init() error path Moni Shoua (1): IB/core: Ignore membership bit in ib_find_pkey() Raghava Kondapalli (1): IB/srp: Add OUI for new Cisco targets Roland Dreier (2): IB/sa: Don't need to check for default P_Key twice IB/srp: Wrap OUI checking for workarounds in helper functions Sean Hefty (1): IB/mad: Fix address handle leak in mad_rmpp Steve Wise (1): RDMA/cxgb3: Always call low level send function via cxgb3_ofld_send() Vu Pham (1): IB/mlx4: Fix opcode returned in RDMA read completion drivers/infiniband/core/agent.c| 24 + drivers/infiniband/core/agent.h|6 ++-- drivers/infiniband/core/device.c |2 +- drivers/infiniband/core/mad.c | 25 +++-- drivers/infiniband/core/mad_rmpp.c |8 +++--- drivers/infiniband/core/sa_query.c |4 +-- drivers/infiniband/core/umem.c |5 drivers/infiniband/hw/cxgb3/iwch_cm.c | 16 +++--- drivers/infiniband/hw/mlx4/cq.c|2 +- drivers/infiniband/hw/mlx4/mad.c |2 +- drivers/infiniband/ulp/ipoib/ipoib_verbs.c |1 + drivers/infiniband/ulp/srp/ib_srp.c| 31 +++ drivers/net/mlx4/reset.c |3 ++ include/rdma/ib_mad.h |2 + include/rdma/ib_verbs.h|7 + 15 files 
changed, 77 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index db2633e..ae7c288 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -78,15 +78,14 @@ ib_get_agent_port(struct ib_device *device, int port_num) return entry; } -int agent_send_response(struct ib_mad *mad, struct ib_grh *grh, - struct ib_wc *wc, struct ib_device *device, - int port_num, int qpn) +void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, +struct ib_wc *wc, struct ib_device *device, +int port_num, int qpn) { struct ib_agent_port_private *port_priv; struct ib_mad_agent *agent; struct ib_mad_send_buf *send_buf; struct ib_ah *ah; - int ret; struct ib_mad_send_wr_private *mad_send_wr; if (device->node_type == RDMA_NODE_IB_SWITCH) @@ -96,23 +95,21 @@ int agent_send_response(struct ib_mad *mad, struct ib_grh *grh, if (!port_priv) { printk(KERN_ERR SPFX "Unable to find port agent\n"); - return -ENODEV; + return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); - printk(KERN_ERR SPFX "ib_create_ah_from_wc error:%d\n", ret); - return ret; + printk(KERN_ERR SPFX "ib_create_ah_from_wc error\n"); + return; } send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_KERNEL); if (IS_ERR(send_buf)) { - ret = PTR_ERR(send_buf); - printk(KERN_ERR SPFX "ib_create_send_mad error:%d\n", ret); + printk(KERN_ERR SPFX "ib_create_send_mad error\n"); goto err1; } @@ -126,16 +123,15 @@ int agent_send_response(struct ib_mad *mad, struct ib_grh *grh, mad_send_wr->send_wr.wr.ud.port_num = port_num; } - if ((ret = ib_post_send_mad(send_buf, NULL))) { - printk(KERN_ERR SPFX "ib_post_send_mad error:%d\n", ret); + if (ib_post_send_mad(send_buf, NULL)) { + printk(KERN_ERR SPFX "ib_post_send_mad error\n"); goto err2; } - return 0; + return; err2:
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 01:23:06PM +1000, Paul Mackerras wrote: > > In particular, atomic_read seems to lend itself to buggy uses. People > seem to do things like: > > atomic_add(, something); > if (atomic_read() > something_else) ... If you're referring to the code in sk_stream_mem_schedule then it's working as intended. The atomicity guarantees that the atomic_add/atomic_sub won't be seen in parts by other readers. We certainly do not need to see other atomic_add/atomic_sub operations immediately. If you're referring to another code snippet please cite. > I'd go so far as to say that anywhere where you want a non-"volatile" > atomic_read, either your code is buggy, or else an int would work just > as well. An int won't work here because += and -= do not have the atomicity guarantees that atomic_add/atomic_sub do. In particular, this may cause an atomic_read on another CPU to give a bogus reading. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/3] Recursive reclaim (on __PF_MEMALLOC)
On Wed, Aug 15, 2007 at 03:12:06PM +0200, Peter Zijlstra wrote: > On Wed, 2007-08-15 at 14:22 +0200, Nick Piggin wrote: > > On Tue, Aug 14, 2007 at 07:21:03AM -0700, Christoph Lameter wrote: > > > The following patchset implements recursive reclaim. Recursive reclaim > > > is necessary if we run out of memory in the writeout patch from reclaim. > > > > > > This is f.e. important for stacked filesystems or anything that does > > > complicated processing in the writeout path. > > > > Filesystems (most of them) that require compilcated allocations at > > writeout time suck. That said, especially with network ones, it > > seems like making them preallocate or reserve required memory isn't > > progressing very smoothly. > > Mainly because we seem to go in circles :-( > > > I think these patchsets are definitely > > worth considering as an alternative. > > Honestly, I don't. They very much do not solve the problem, they just > displace it. Well perhaps it doesn't work for networked swap, because dirty accounting doesn't work the same way with anonymous memory... but for _filesystems_, right? I mean, it intuitively seems like a good idea to terminate the recursive allocation problem with an attempt to reclaim clean pages rather than immediately let them have-at our memory reserve that is used for other things as well. Any and all writepage() via reclaim is allowed to eat into all of memory (I hate that writepage() ever has to use any memory, and have prototyped how to fix that for simple block based filesystems in fsblock, but others will require it). > Christoph's suggestion to set min_free_kbytes to 20% is ridiculous - nor > does it solve all deadlocks :-( Well of course it doesn't, but it is a pragmatic way to reduce some memory depletion cases. I don't see too much harm in it (although I didn't see the 20% suggestion?) > > No substantial comments though. > > Please do ponder the problem and its proposed solutions, because I'm > going crazy here. 
Well yeah I think you simply have to reserve a minimum amount of memory in order to reclaim a page, and I don't see any other way to do it other than what you describe to be _technically_ deadlock free. But firstly, you don't _want_ to start dropping packets when you hit a tough patch in reclaim -- even if you are strictly deadlock free. And secondly, I think recursive reclaim could reduce the deadlocks in practice which is not a bad thing as your patches aren't merged. How are your deadlock patches going anyway? AFAIK they are mostly a network issue and I haven't been keeping up with them for a while. Do you really need networked swap and actually encounter the deadlock, or is it just a question of wanting to fix the bugs? If the former, what for, may I ask? > <> What Christoph is proposing is doing recursive reclaim and not > initiating writeout. This will only work _IFF_ there are clean pages > about. Which in the general case need not be true (memory might be > packed with anonymous pages - consider an MPI cluster doing computation > stuff). So this gets us a workload dependant solution - which IMHO is > bad! Although you will quite likely have at least a couple of MB worth of clean program text. The important part of recursive reclaim is that it doesn't so easily allow reclaim to blow all memory reserves (including interrupt context). Sure you still have theoretical deadlocks, but if I understand correctly, they are going to be lessened. I would be really interested to see if even just these recursive reclaim patches eliminate the problem in practice. > > I've been sick all week. > > Do get well. Thanks! - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
RE: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP portsfrom the host TCP port space.
>It's not about being a niche. It's about creating a maintainable >software net stack that has predictable behavior. > >Needing to reach out of the RDMA sandbox and reserve net stack resources >away from itself travels a path we've consistently avoided. We need to ensure that we're also creating a maintainable kernel. RDMA doesn't use sockets, but that doesn't mean it's not part of the networking support provided by the Linux kernel. Making blanket statements that RDMA should stay within a sandbox is equivalent to saying that RDMA should duplicate any network related functionality that it might need. >>> I will NACK any patch that opens up sockets to eat up ports or >>> anything stupid like that. > >Ditto for me as well. I agree that using a socket is the wrong approach, but my guess is that it was suggested as a possibility because of the attempt to keep RDMA in its 'sandbox'. The iWarp architecture implements RDMA over TCP; it just doesn't use sockets. The Linux network stack doesn't easily support this possibility. Are there any reasonable ways to enable this to the degree necessary for iWarp? - Sean - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] vfs: use list_for_each_entry instead
Signed-off-by: Denis Cheng <[EMAIL PROTECTED]> --- fs/super.c |7 ++- 1 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/super.c b/fs/super.c index fc8ebed..f287c15 100644 --- a/fs/super.c +++ b/fs/super.c @@ -334,15 +334,12 @@ struct super_block *sget(struct file_system_type *type, int (*set)(struct super_block *,void *), void *data) { - struct super_block *s = NULL; - struct list_head *p; + struct super_block *s = NULL, *old; int err; retry: spin_lock(&sb_lock); - if (test) list_for_each(p, &type->fs_supers) { - struct super_block *old; - old = list_entry(p, struct super_block, s_instances); + if (test) list_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) -- 1.5.3.rc4 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu writes: > > Are you sure? How do you know some other CPU hasn't changed the value > > in between? > > Yes I'm sure, because we don't care if others have increased > the reservation. But others can also reduce the reservation. Also, the code sets and clears *sk->sk_prot->memory_pressure nonatomically with respect to the reads of sk->sk_prot->memory_allocated, so in fact the code doesn't guarantee any particular relationship between the two. That code looks like a beautiful example of buggy, racy code where someone has sprinkled magic fix-the-races dust (otherwise known as atomic_t) around in a vain attempt to fix the races. That's assuming that all that stuff actually performs any useful purpose, of course, and that there isn't some lock held by the callers. In the latter case it is pointless using atomic_t. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Christoph Lameter writes: > > But I have to say that I still don't know of a single place > > where one would actually use the volatile variant. > > I suspect that what you say is true after we have looked at all callers. It seems that there could be a lot of places where atomic_t is used in a non-atomic fashion, and that those uses are either buggy, or there is some lock held at the time which guarantees that other CPUs aren't changing the value. In both cases there is no point in using atomic_t; we might as well just use an ordinary int. In particular, atomic_read seems to lend itself to buggy uses. People seem to do things like: atomic_add(, something); if (atomic_read() > something_else) ... and expect that there is some relationship between the value that the atomic_add stored and the value that the atomic_read will return, which there isn't. People seem to think that using atomic_t magically gets rid of races. It doesn't. I'd go so far as to say that anywhere where you want a non-"volatile" atomic_read, either your code is buggy, or else an int would work just as well. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Satyam Sharma writes: > I can't speak for this particular case, but there could be similar code > examples elsewhere, where we do the atomic ops on an atomic_t object > inside a higher-level locking scheme that would take care of the kind of > problem you're referring to here. It would be useful for such or similar > code if the compiler kept the value of that atomic object in a register. If there is a higher-level locking scheme then there is no point to using atomic_t variables. Atomic_t is specifically for the situation where multiple CPUs are updating a variable without locking. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [2.6 patch] remove Documentation/networking/net-modules.txt
On 8/14/07, Adrian Bunk <[EMAIL PROTECTED]> wrote: > According to git, the only one who touched this file during the last > 5 years was me when removing drivers... > > modinfo offers less ancient information. > > Signed-off-by: Adrian Bunk <[EMAIL PROTECTED]> > Fine by me for any of the old stuff I was responsible for... Acked-by: Paul Gortmaker <[EMAIL PROTECTED]> > - > - 8390 based Network Modules (Paul Gortmaker, Nov 12, 1995) > - -- > - > -(Includes: smc-ultra, ne, wd, 3c503, hp, hp-plus, e2100 and ac3200) > - > -The 8390 series of network drivers now support multiple card systems without > -reloading the same module multiple times (memory efficient!) This is done by > -specifying multiple comma separated values, such as: > - > - insmod 3c503.o io=0x280,0x300,0x330,0x350 xcvr=0,1,0,1 > - - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: drivers/infiniband/mlx/mad.c misplaced ;
On Wed, 2007-08-15 at 19:19 -0700, Kok, Auke wrote: > Joe Perches wrote: > > On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote: > > There's more than a few of these (not inspected). > > $ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * > > arch/sh/boards/se/7343/io.c:if (0) ; > > drivers/atm/iphase.c: if (!desc1) ; > > drivers/infiniband/hw/mlx4/mad.c: if (!err); > > drivers/isdn/capi/capiutil.c: else if (c <= 0x0f); > > drivers/net/tokenring/ibmtr.c: else if (ti->shared_ram_paging == 0xf); /* > > No paging in adapter */ > > drivers/s390/scsi/zfcp_erp.c: if (status == ZFCP_ERP_SUCCEEDED) ; > > /* no further action */ > > fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; > > net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4); > > sound/pci/au88x0/au88x0_synth.c:if (eax == > > 0) ; > > sounds like an excellent candidate check for checkpatch.pl !!! I think it's poor style and all of these should go away. the netfilter one appears to be a real error too. Signed-off-by: Joe Perches <[EMAIL PROTECTED]> diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index 74f9b14..bec4279 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -36,7 +36,7 @@ static bool u32_match_it(const struct xt_u32 *data, at = 0; pos = ct->location[0].number; - if (skb->len < 4 || pos > skb->len - 4); + if (skb->len < 4 || pos > skb->len - 4) return false; ret = skb_copy_bits(skb, pos, &n, sizeof(n)); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] /proc//pmaps - memory maps in granularity of pages
On Thu, Aug 16, 2007 at 09:37:01AM +0800, Fengguang Wu wrote: > 3) A 64bit number can be encoded in hex in 8 bytes, no more than its ~~ sorry, I'm a fool! > binary presentation. And often the text string will be much shorter > because of the common small values. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.
> Needing to reach out of the RDMA sandbox and reserve net stack > resources away from itself travels a path we've consistently avoided. Where did the idea of an "RDMA sandbox" come from? Obviously no one disagrees with keeping things clean and maintainable, but the idea that RDMA is a second-class citizen that doesn't get any input into the evolution of the networking code seems kind of offensive to me. - R. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: how can I get an account on master.kernel.org machine
On Thu, 16 Aug 2007 10:58:41 +0800 ye janboe wrote: > I want to down linux-arm git repos, but I do not have an account on > master machine. see http://www.kernel.org/faq/#account --- ~Randy *** Remember to use Documentation/SubmitChecklist when testing your code *** - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
how can I get an account on master.kernel.org machine
I want to down linux-arm git repos, but I do not have an account on master machine. thanks Yuanbo - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] calculation of pgoff in do_linear_fault() uses mixed units
On Wed, Aug 15, 2007 at 12:43:26PM -0500, Dean Nelson wrote: > The calculation of pgoff in do_linear_fault() should use PAGE_SHIFT and not > PAGE_CACHE_SHIFT since vma->vm_pgoff is in units of PAGE_SIZE and not > PAGE_CACHE_SIZE. At the moment linux/pagemap.h has PAGE_CACHE_SHIFT defined > as PAGE_SHIFT, but should that ever change this calculation would break. > > Signed-off-by: Dean Nelson <[EMAIL PROTECTED]> > Acked-by: Nick Piggin <[EMAIL PROTECTED]> > > Index: linux-2.6/mm/memory.c > === > --- linux-2.6.orig/mm/memory.c2007-08-14 06:42:18.322378070 -0500 > +++ linux-2.6/mm/memory.c 2007-08-15 12:30:51.621604739 -0500 > @@ -2465,7 +2465,7 @@ > int write_access, pte_t orig_pte) > { > pgoff_t pgoff = (((address & PAGE_MASK) > - - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; > + - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; > unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); > > return __do_fault(mm, vma, address, page_table, pmd, pgoff, - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/9] Reclaim during GFP_ATOMIC allocs
On Tue, Aug 14, 2007 at 08:30:21AM -0700, Christoph Lameter wrote: > This is the extended version of the reclaim patchset. It enables reclaim from > clean file backed pages during GFP_ATOMIC allocs. A bit invasive since > many locks must now be taken with saving flags. But it works. > > Tested by repeatedly allocating 12MB of memory from the timer interrupt. > > -- Just to clarify... I can see how recursive reclaim can prevent memory getting eaten up by reclaim (which thus causes allocations from interrupt handlers to fail)... But this patchset I don't see will do anything to prevent reclaim deadlocks, right? (because if there is reclaimable memory at hand, then kswapd should eventually reclaim it). - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Fix VM_FAULT flags conversion for hugetlb
On Tue, Aug 14, 2007 at 02:41:21PM -0500, Adam Litke wrote: > It seems a simple mistake was made when converting follow_hugetlb_page() > over to the VM_FAULT flags bitmask stuff: > (commit 83c54070ee1a2d05c89793884bea1a03f2851ed4). > > By using the wrong bitmask, hugetlb_fault() failures are not being > recognized. This results in an infinite loop whenever > follow_hugetlb_page is involved in a failed fault. > > Signed-off-by: Adam Litke <[EMAIL PROTECTED]> Thanks Adam. Looks good. > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index d7ca59d..de4cf45 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -643,7 +643,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct > vm_area_struct *vma, > spin_unlock(>page_table_lock); > ret = hugetlb_fault(mm, vma, vaddr, 0); > spin_lock(>page_table_lock); > - if (!(ret & VM_FAULT_MAJOR)) > + if (!(ret & VM_FAULT_ERROR)) > continue; > > remainder = 0; > > -- > Adam Litke - (agl at us.ibm.com) > IBM Linux Technology Center - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Make rcutorture RNG use temporal entropy
Repost of http://lkml.org/lkml/2007/8/10/472 made available by request. The locking used by get_random_bytes() can conflict with the preempt_disable() and synchronize_sched() form of RCU. This patch changes rcutorture's RNG to gather entropy from the new cpu_clock() interface (relying on interrupts, preemption, daemons, and rcutorture's reader thread's rock-bottom scheduling priority to provide useful entropy), and also adds and EXPORT_SYMBOL_GPL() to make that interface available to GPLed kernel modules such as rcutorture. Passes several hours of rcutorture. Signed-off-by: Paul E. McKenney <[EMAIL PROTECTED]> --- rcutorture.c |8 ++-- sched.c |2 ++ 2 files changed, 4 insertions(+), 6 deletions(-) diff -urpNa -X dontdiff linux-2.6.23-rc2/kernel/rcutorture.c linux-2.6.23-rc2-rcutorturesched/kernel/rcutorture.c --- linux-2.6.23-rc2/kernel/rcutorture.c2007-08-03 19:49:55.0 -0700 +++ linux-2.6.23-rc2-rcutorturesched/kernel/rcutorture.c2007-08-10 17:15:22.0 -0700 @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -166,16 +165,13 @@ struct rcu_random_state { /* * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from get_random_bytes(). + * generator, with occasional help from cpu_clock(). 
*/ static unsigned long rcu_random(struct rcu_random_state *rrsp) { - long refresh; - if (--rrsp->rrs_count < 0) { - get_random_bytes(&refresh, sizeof(refresh)); - rrsp->rrs_state += refresh; + rrsp->rrs_state += (unsigned long)cpu_clock(smp_processor_id()); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff -urpNa -X dontdiff linux-2.6.23-rc2/kernel/sched.c linux-2.6.23-rc2-rcutorturesched/kernel/sched.c --- linux-2.6.23-rc2/kernel/sched.c 2007-08-03 19:49:55.0 -0700 +++ linux-2.6.23-rc2-rcutorturesched/kernel/sched.c 2007-08-10 17:22:57.0 -0700 @@ -394,6 +394,8 @@ unsigned long long cpu_clock(int cpu) return now; } +EXPORT_SYMBOL_GPL(cpu_clock); + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Satyam Sharma wrote: > On Thu, 16 Aug 2007, Paul Mackerras wrote: > > Herbert Xu writes: > > > > > See sk_stream_mem_schedule in net/core/stream.c: > > > > > > /* Under limit. */ > > > if (atomic_read(sk->sk_prot->memory_allocated) < > > > sk->sk_prot->sysctl_mem[0]) { > > > if (*sk->sk_prot->memory_pressure) > > > *sk->sk_prot->memory_pressure = 0; > > > return 1; > > > } > > > > > > /* Over hard limit. */ > > > if (atomic_read(sk->sk_prot->memory_allocated) > > > > sk->sk_prot->sysctl_mem[2]) { > > > sk->sk_prot->enter_memory_pressure(); > > > goto suppress_allocation; > > > } > > > > > > We don't need to reload sk->sk_prot->memory_allocated here. > > > > Are you sure? How do you know some other CPU hasn't changed the value > > in between? > > I can't speak for this particular case, but there could be similar code > examples elsewhere, where we do the atomic ops on an atomic_t object > inside a higher-level locking scheme that would take care of the kind of > problem you're referring to here. It would be useful for such or similar > code if the compiler kept the value of that atomic object in a register. We might not be using atomic_t (and ops) if we already have a higher-level locking scheme, actually. So as Herbert mentioned, such cases might just not care. [ Too much of this thread, too little sleep, sorry! ] Anyway, the problem, of course, is that this conversion to a stronger / safer-by-default behaviour doesn't happen with zero cost to performance. Converting atomic ops to "volatile" behaviour did add ~2K to kernel text for archs such as i386 (possibly to important codepaths) that didn't have those semantics already so it would be constructive to actually look at those differences and see if there were really any heisenbugs that got rectified. Or if there were legitimate optimizations that got wrongly disabled. 
Onus lies on those proposing the modifications, I'd say ;-) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [ckrm-tech] Regression in 2.6.23-rc2-mm2, mounting cpusets causes a hang
Christoph wrote (and Lee quoted): > Looks like we need to fix cpuset nodemasks for the !NUMA case then? > It cannot expect to find valid nodemasks if !NUMA. The code in kernel/cpuset.c should be written as if there are always valid nodemasks. In the case of !NUMA, it will happen that there is only one node, but kernel/cpuset.c should not manifest that detail. -- I won't rest till it's the best ... Programmer, Linux Scalability Paul Jackson <[EMAIL PROTECTED]> 1.925.600.0401 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 10:11:05AM +0800, Herbert Xu wrote: > On Thu, Aug 16, 2007 at 12:05:56PM +1000, Paul Mackerras wrote: > > Herbert Xu writes: > > > > > See sk_stream_mem_schedule in net/core/stream.c: > > > > > > /* Under limit. */ > > > if (atomic_read(sk->sk_prot->memory_allocated) < > > > sk->sk_prot->sysctl_mem[0]) { > > > if (*sk->sk_prot->memory_pressure) > > > *sk->sk_prot->memory_pressure = 0; > > > return 1; > > > } > > > > > > /* Over hard limit. */ > > > if (atomic_read(sk->sk_prot->memory_allocated) > > > > sk->sk_prot->sysctl_mem[2]) { > > > sk->sk_prot->enter_memory_pressure(); > > > goto suppress_allocation; > > > } > > > > > > We don't need to reload sk->sk_prot->memory_allocated here. > > > > Are you sure? How do you know some other CPU hasn't changed the value > > in between? > > Yes I'm sure, because we don't care if others have increased > the reservation. > > Note that even if we did we'd be using barriers so volatile > won't do us any good here. If the load-coalescing is important to performance, why not load into a local variable? Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 06:41:40PM -0700, Christoph Lameter wrote: > On Wed, 15 Aug 2007, Paul E. McKenney wrote: > > > Understood. My point is not that the impact is precisely zero, but > > rather that the impact on optimization is much less hurtful than the > > problems that could arise otherwise, particularly as compilers become > > more aggressive in their optimizations. > > The problems arise because barriers are not used as required. Volatile > has wishy washy semantics and somehow marries memory barriers with data > access. It is clearer to separate the two. Conceptual cleanness usually > translates into better code. If one really wants the volatile then lets > make it explicit and use > > atomic_read_volatile() There are indeed architectures where you can cause gcc to emit memory barriers in response to volatile. I am assuming that we are -not- making gcc do this. Given this, then volatiles and memory barrier instructions are orthogonal -- one controls the compiler, the other controls the CPU. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 03:30:44AM +0200, Segher Boessenkool wrote: > >>>Part of the motivation here is to fix heisenbugs. If I knew where > >>>they > >> > >>By the same token we should probably disable optimisations > >>altogether since that too can create heisenbugs. > > > >Precisely the point -- use of volatile (whether in casts or on asms) > >in these cases are intended to disable those optimizations likely to > >result in heisenbugs. > > The only thing volatile on an asm does is create a side effect > on the asm statement; in effect, it tells the compiler "do not > remove this asm even if you don't need any of its outputs". > > It's not disabling optimisation likely to result in bugs, > heisen- or otherwise; _not_ putting the volatile on an asm > that needs it simply _is_ a bug :-) Yep. And the reason it is a bug is that it fails to disable the relevant compiler optimizations. So I suspect that we might actually be saying the same thing here. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Early printk behaviour
I was putting together an early printk implementation for the Blackfin, and was wondering what the expected behaviour was in this situation. When I set up my bootargs earlyprintk=serial,ttyBF0,57600 and have no console defined (no graphical console, no serial console). based on the patch: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=69331af79cf29e26d1231152a172a1a10c2df511 which no longer calls disable_early_printk, the earlyprintk console never gets turned off (because nothing else ever calls register_console). I get everything out the early console, until the init section is released (where the console structure is sitting), and it starts printing out garbage. Is this expected behaviour? -Robin - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 6/24] make atomic_read() behave consistently on frv
On Thu, Aug 16, 2007 at 11:09:25AM +1000, Nick Piggin wrote: > Paul E. McKenney wrote: > >On Wed, Aug 15, 2007 at 11:30:05PM +1000, Nick Piggin wrote: > > >>Especially since several big architectures don't have volatile in their > >>atomic_get and _set, I think it would be a step backwards to add them in > >>as a "just in case" thin now (unless there is a better reason). > > > >Good point, except that I would expect gcc's optimization to continue > >to improve. I would like the kernel to be able to take advantage of > >improved optimization, which means that we are going to have to make > >a few changes. Adding volatile to atomic_get() and atomic_set() is > >IMHO one of those changes. > > What optimisations? gcc already does most of the things you need a > barrier/volatile for, like reordering non-dependant loads and stores, > and eliminating mem ops completely by caching in registers. Yep, most of the currently practiced optimizations. Given that CPU clock frequencies are not rising as quickly as they once did, I would expect that there will be added effort on implementing the known more-aggressive optimization techniques, and on coming up with new ones as well. Some of these new optimizations will likely be inappropriate for kernel code, but I expect that some will be things that we want. > >>As to your followup question of why to use it over ACCESS_ONCE. I > >>guess, aside from consistency with the rest of the barrier APIs, you > >>can use it in other primitives when you don't actually know what the > >>caller is going to do or if it even will make an access. You could > >>also use it between calls to _other_ primitives, etc... it just > >>seems more flexible to me, but I haven't actually used such a thing > >>in real code... > >> > >>ACCESS_ONCE doesn't seem as descriptive. What it results in is the > >>memory location being loaded or stored (presumably once exactly), > >>but I think the more general underlying idea is a barrier point. 
> > > >OK, first, I am not arguing that ACCESS_ONCE() can replace all current > >uses of barrier(). > > OK. Well I also wasn't saying that ACCESS_ONCE should not be > implemented. But if we want something like it, then it would make > sense to have an equivalent barrier statement as well (ie. order()). And I am not arguing against use of asms to implement the volatility in atomic_read() and atomic_set(), and in fact it appears that one of the architectures will be taking this approach. Sounds like we might be in violent agreement. ;-) Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [ofa-general] Re: [PATCH RFC] RDMA/CMA: Allocate PS_TCP ports from the host TCP port space.
Steve Wise wrote: David Miller wrote: From: Sean Hefty <[EMAIL PROTECTED]> Date: Thu, 09 Aug 2007 14:40:16 -0700 Steve Wise wrote: Any more comments? Does anyone have ideas on how to reserve the port space without using a struct socket? How about we just remove the RDMA stack altogether? I am not at all kidding. If you guys can't stay in your sand box and need to cause problems for the normal network stack, it's unacceptable. We were told all along the if RDMA went into the tree none of this kind of stuff would be an issue. I think removing the RDMA stack is the wrong thing to do, and you shouldn't just threaten to yank entire subsystems because you don't like the technology. Lets keep this constructive, can we? RDMA should get the respect of any other technology in Linux. Maybe its a niche in your opinion, but come on, there's more RDMA users than say, the sparc64 port. Eh? It's not about being a niche. It's about creating a maintainable software net stack that has predictable behavior. Needing to reach out of the RDMA sandbox and reserve net stack resources away from itself travels a path we've consistently avoided. I will NACK any patch that opens up sockets to eat up ports or anything stupid like that. Got it. Ditto for me as well. Jeff - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Segher Boessenkool wrote: Part of the motivation here is to fix heisenbugs. If I knew where they By the same token we should probably disable optimisations altogether since that too can create heisenbugs. Almost everything is a tradeoff; and so is this. I don't believe most people would find disabling all compiler optimisations an acceptable price to pay for some peace of mind. So why is this a good tradeoff? I also think that just adding things to APIs in the hope it might fix up some bugs isn't really a good road to go down. Where do you stop? On the actual proposal to make atomic_operators volatile: I think the better approach in the long term, for both maintainability of the code and education of coders, is to make the use of barriers _more_ explicit rather than sprinkling these "just in case" ones around. You may get rid of a few atomic_read heisenbugs (in noise when compared to all bugs), but if the coder was using a regular atomic load, or a test_bit (which is also atomic), etc. then they're going to have problems. It would be better for Linux if everyone was to have better awareness of barriers than to hide some of the cases where they're required. A pretty large number of bugs I see in lock free code in the VM is due to memory ordering problems. It's hard to find those bugs, or even be aware when you're writing buggy code if you don't have some feel for barriers. -- SUSE Labs, Novell Inc. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 03:23:28AM +0200, Segher Boessenkool wrote: > No; compilation units have nothing to do with it, GCC can optimise > across compilation unit boundaries just fine, if you tell it to > compile more than one compilation unit at once. > >>> > >>>Last I checked, the Linux kernel build system did compile each .c > >>>file > >>>as a separate compilation unit. > >> > >>I have some patches to use -combine -fwhole-program for Linux. > >>Highly experimental, you need a patched bleeding edge toolchain. > >>If there's interest I'll clean it up and put it online. > >> > >>David Woodhouse had some similar patches about a year ago. > > > >Sounds exciting... ;-) > > Yeah, the breakage is *quite* spectacular :-) I bet!!! ;-) > >In many cases, the compiler also has to assume that > >msleep_interruptible() > >might call back into a function in the current compilation unit, > >thus > >possibly modifying global static variables. > > It most often is smart enough to see what compilation-unit-local > variables might be modified that way, though :-) > >>> > >>>Yep. For example, if it knows the current value of a given such > >>>local > >>>variable, and if all code paths that would change some other variable > >>>cannot be reached given that current value of the first variable. > >> > >>Or the most common thing: if neither the address of the translation- > >>unit local variable nor the address of any function writing to that > >>variable can "escape" from that translation unit, nothing outside > >>the translation unit can write to the variable. > > > >But there is usually at least one externally callable function in > >a .c file. > > Of course, but often none of those will (indirectly) write a certain > static variable. But there has to be some path to the static functions, assuming that they are not dead code. 
Yes, there can be cases where the compiler knows enough about the state of the variables to rule out some of code paths to them, but I have no idea how often this happens in kernel code. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Paul Mackerras wrote: > Herbert Xu writes: > > > See sk_stream_mem_schedule in net/core/stream.c: > > > > /* Under limit. */ > > if (atomic_read(sk->sk_prot->memory_allocated) < > > sk->sk_prot->sysctl_mem[0]) { > > if (*sk->sk_prot->memory_pressure) > > *sk->sk_prot->memory_pressure = 0; > > return 1; > > } > > > > /* Over hard limit. */ > > if (atomic_read(sk->sk_prot->memory_allocated) > > > sk->sk_prot->sysctl_mem[2]) { > > sk->sk_prot->enter_memory_pressure(); > > goto suppress_allocation; > > } > > > > We don't need to reload sk->sk_prot->memory_allocated here. > > Are you sure? How do you know some other CPU hasn't changed the value > in between? I can't speak for this particular case, but there could be similar code examples elsewhere, where we do the atomic ops on an atomic_t object inside a higher-level locking scheme that would take care of the kind of problem you're referring to here. It would be useful for such or similar code if the compiler kept the value of that atomic object in a register. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: drivers/infiniband/mlx/mad.c misplaced ;
Joe Perches wrote: On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote: Signed-off-by: Dave Jones <[EMAIL PROTECTED]> diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 3330917..0ed02b7 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -109,7 +109,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int ignore_mkey, int ignore_bkey, in_modifier, op_modifier, MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); - if (!err); + if (!err) There's more than a few of these (not inspected). $ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * arch/sh/boards/se/7343/io.c:if (0) ; drivers/atm/iphase.c: if (!desc1) ; drivers/infiniband/hw/mlx4/mad.c: if (!err); drivers/isdn/capi/capiutil.c: else if (c <= 0x0f); drivers/net/tokenring/ibmtr.c: else if (ti->shared_ram_paging == 0xf); /* No paging in adapter */ drivers/s390/scsi/zfcp_erp.c: if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */ fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4); sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ; sounds like an excellent candidate check for checkpatch.pl !!! Cheers, Auke - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu wrote: But I have to say that I still don't know of a single place where one would actually use the volatile variant. Given that many of the existing users do currently have "volatile", are you comfortable simply removing that behaviour from them? Are you sure that you will not introduce any issues? Forcing a re-read is only a performance penalty. Removing it can cause behavioural changes. I would be more comfortable making the default match the majority of the current implementations (ie: volatile semantics). Then, if someone cares about performance they can explicitly validate the call path and convert it over to the non-volatile version. Correctness before speed... Chris - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Herbert Xu wrote: > > Do we have a consensus here? (hoping against hope, probably :-) > > I can certainly agree with this. I agree too. > But I have to say that I still don't know of a single place > where one would actually use the volatile variant. I suspect that what you say is true after we have looked at all callers. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Christoph Lameter wrote: > On Thu, 16 Aug 2007, Paul Mackerras wrote: > > > > We don't need to reload sk->sk_prot->memory_allocated here. > > > > Are you sure? How do you know some other CPU hasn't changed the value > > in between? > > The cpu knows because the cacheline was not invalidated. Crap my statement above is wrong. We do not care that the value was changed otherwise we would have put a barrier in there. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Paul Mackerras wrote: > > We don't need to reload sk->sk_prot->memory_allocated here. > > Are you sure? How do you know some other CPU hasn't changed the value > in between? The cpu knows because the cacheline was not invalidated. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 12:05:56PM +1000, Paul Mackerras wrote: > Herbert Xu writes: > > > See sk_stream_mem_schedule in net/core/stream.c: > > > > /* Under limit. */ > > if (atomic_read(sk->sk_prot->memory_allocated) < > > sk->sk_prot->sysctl_mem[0]) { > > if (*sk->sk_prot->memory_pressure) > > *sk->sk_prot->memory_pressure = 0; > > return 1; > > } > > > > /* Over hard limit. */ > > if (atomic_read(sk->sk_prot->memory_allocated) > > > sk->sk_prot->sysctl_mem[2]) { > > sk->sk_prot->enter_memory_pressure(); > > goto suppress_allocation; > > } > > > > We don't need to reload sk->sk_prot->memory_allocated here. > > Are you sure? How do you know some other CPU hasn't changed the value > in between? Yes I'm sure, because we don't care if others have increased the reservation. Note that even if we did we'd be using barriers so volatile won't do us any good here. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 07:45:44AM +0530, Satyam Sharma wrote: > > Completely agreed, again. To summarize again (had done so about ~100 mails > earlier in this thread too :-) ... > > atomic_{read,set}_volatile() -- guarantees volatility also along with > atomicity (the two _are_ different concepts after all, irrespective of > whether callsites normally want one with the other or not) > > atomic_{read,set}_nonvolatile() -- only guarantees atomicity, compiler > free to elid / coalesce / optimize such accesses, can keep the object > in question cached in a local register, leads to smaller text, etc. > > As to which one should be the default atomic_read() is a question of > whether majority of callsites (more weightage to important / hot > codepaths, lesser to obscure callsites) want a particular behaviour. > > Do we have a consensus here? (hoping against hope, probably :-) I can certainly agree with this. But I have to say that I still don't know of a single place where one would actually use the volatile variant. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
A volatile default would disable optimizations for atomic_read. atomic_read without volatile would allow for full optimization by the compiler. Seems that this is what one wants in many cases. Name one such case. An atomic_read should do a load from memory. If the programmer puts an atomic_read() in the code then the compiler should emit a load for it, not re-use a value returned by a previous atomic_read. I do not believe it would ever be useful for the compiler to collapse two atomic_read statements into a single load. An atomic_read() implemented as a "normal" C variable read would allow that read to be combined with another "normal" read from that variable. This could perhaps be marginally useful, although I'd bet you cannot see it unless counting cycles on a simulator or counting bits in the binary size. With an asm() implementation, the compiler can not do this; with a "volatile" implementation (either volatile variable or volatile-cast), this invokes undefined behaviour (in both C and GCC). Segher - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Herbert Xu writes: > See sk_stream_mem_schedule in net/core/stream.c: > > /* Under limit. */ > if (atomic_read(sk->sk_prot->memory_allocated) < > sk->sk_prot->sysctl_mem[0]) { > if (*sk->sk_prot->memory_pressure) > *sk->sk_prot->memory_pressure = 0; > return 1; > } > > /* Over hard limit. */ > if (atomic_read(sk->sk_prot->memory_allocated) > > sk->sk_prot->sysctl_mem[2]) { > sk->sk_prot->enter_memory_pressure(); > goto suppress_allocation; > } > > We don't need to reload sk->sk_prot->memory_allocated here. Are you sure? How do you know some other CPU hasn't changed the value in between? Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Christoph Lameter wrote: > On Wed, 15 Aug 2007, Paul E. McKenney wrote: > > > Understood. My point is not that the impact is precisely zero, but > > rather that the impact on optimization is much less hurtful than the > > problems that could arise otherwise, particularly as compilers become > > more aggressive in their optimizations. > > The problems arise because barriers are not used as required. Volatile > has wishy washy semantics and somehow marries memory barriers with data > access. It is clearer to separate the two. Conceptual cleanness usually > translates into better code. If one really wants the volatile then lets > make it explicit and use > > atomic_read_volatile() Completely agreed, again. To summarize again (had done so about ~100 mails earlier in this thread too :-) ... atomic_{read,set}_volatile() -- guarantees volatility also along with atomicity (the two _are_ different concepts after all, irrespective of whether callsites normally want one with the other or not) atomic_{read,set}_nonvolatile() -- only guarantees atomicity, compiler free to elide / coalesce / optimize such accesses, can keep the object in question cached in a local register, leads to smaller text, etc. As to which one should be the default atomic_read() is a question of whether majority of callsites (more weightage to important / hot codepaths, lesser to obscure callsites) want a particular behaviour. Do we have a consensus here? (hoping against hope, probably :-) [ This thread has gotten completely out of hand ... for my mail client alpine as well, it now seems. Reminds of that 1000+ GPLv3 fest :-) ] - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 11:51:42AM +1000, Paul Mackerras wrote: > > Name one such case. See sk_stream_mem_schedule in net/core/stream.c: /* Under limit. */ if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) { if (*sk->sk_prot->memory_pressure) *sk->sk_prot->memory_pressure = 0; return 1; } /* Over hard limit. */ if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) { sk->sk_prot->enter_memory_pressure(); goto suppress_allocation; } We don't need to reload sk->sk_prot->memory_allocated here. Now where is your example again? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Christoph Lameter writes: > A volatile default would disable optimizations for atomic_read. > atomic_read without volatile would allow for full optimization by the > compiler. Seems that this is what one wants in many cases. Name one such case. An atomic_read should do a load from memory. If the programmer puts an atomic_read() in the code then the compiler should emit a load for it, not re-use a value returned by a previous atomic_read. I do not believe it would ever be useful for the compiler to collapse two atomic_read statements into a single load. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Paul E. McKenney wrote: > Understood. My point is not that the impact is precisely zero, but > rather that the impact on optimization is much less hurtful than the > problems that could arise otherwise, particularly as compilers become > more aggressive in their optimizations. The problems arise because barriers are not used as required. Volatile has wishy-washy semantics and somehow marries memory barriers with data access. It is clearer to separate the two. Conceptual cleanness usually translates into better code. If one really wants the volatile then let's make it explicit and use atomic_read_volatile() - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re:
"compilation unit" is a C standard term. It typically boils down to "single .c file". As you mentioned later, "single .c file with all the other files (headers or other .c files) that it pulls in via #include" is actually "translation unit", both in the C standard as well as gcc docs. Yeah. "single .c file after preprocessing". Same thing :-) "Compilation unit" doesn't seem to be nearly as standard a term, though in most places it is indeed meant to be same as "translation unit", but with the new gcc inter-module-analysis stuff that you referred to above, I suspect one may reasonably want to call a "compilation unit" as all that the compiler sees at a given instant. That would be a bit confusing, would it not? They'd better find some better name for that if they want to name it at all (remember, none of these optimisations should have any effect on the semantics of the program, you just get fewer .o files etc.). Segher - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[BUG -rt] circular locking deadlock
Hey Ingo, Thomas, I was playing with the latency tracer on 2.6.23-rc2-rt2 while a "make -j8" was going on in the background and the box hung with this on the console: [ BUG: circular locking deadlock detected! ] cc1/7161 is deadlocking current task cc1/7164 1) cc1/7164 is trying to acquire this lock: [8101fa375250] {>lock} .. ->owner: 8101ef7f7162 .. held by: cc1: 7161 [8101ef7f7160, 120] 2) cc1/7161 is blocked on this lock: [8102118e5900] {>lock} .. ->owner: 8100ddb3ace2 .. held by: cc1: 7164 [8100ddb3ace0, 120] cc1/7161's [blocked] stackdump: 8101f9423c68 0086 0001 0002 82205d00 0202 82189280 8101ef7f7160 8100ddb3ace0 8101f9423c10 0202 Call Trace: [] mcount+0x2c/0x3a [] schedule+0x3b/0x130 [] rt_spin_lock_slowlock+0xdb/0x210 [] __rt_spin_lock+0x3d/0x50 [] rt_spin_lock+0xe/0x10 [] __lock_list+0x42/0x70 [] lock_list_splice_init+0x33/0xb0 [] file_move+0xc8/0xd0 [] __dentry_open+0xae/0x1d0 [] nameidata_to_filp+0x44/0x60 [] do_filp_open+0x4d/0x60 [] __lock_text_start+0xe/0x10 [] get_unused_fd_flags+0x103/0x130 [] do_sys_open+0x61/0x100 [] sys_open+0x20/0x30 [] tracesys+0x151/0x1be --- | preempt count: 0002 ] | 2-level deep critical section nesting: .. [] __schedule+0x23/0x265 .[] .. ( <= schedule+0x3b/0x130) .. [] __spin_lock_irq+0x21/0x40 .[] .. ( <= __schedule+0xb4/0x265) cc1/7164's [current] stackdump: Call Trace: [] show_stack+0x13/0x20 [] debug_rt_mutex_print_deadlock+0x163/0x180 [] rt_spin_lock_slowlock+0xd6/0x210 [] __rt_spin_lock+0x3d/0x50 [] rt_spin_lock+0xe/0x10 [] __lock_list+0x42/0x70 [] lock_list_del_init+0x26/0x70 [] file_kill+0x4a/0x60 [] __fput+0x12e/0x1d0 [] fput+0x25/0x30 [] filp_close+0x61/0x90 [] sys_close+0xa9/0x100 [] tracesys+0x151/0x1be --- | preempt count: ] | 0-level deep critical section nesting: [ turning off deadlock detection.Please report this trace. 
] - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 1/2] i386: use asm() like the other atomic operations already do.
Herbert Xu wrote: On Wed, Aug 15, 2007 at 01:02:23PM -0400, Chris Snook wrote: Herbert Xu wrote: I'm still unconvinced why we need this because nobody has brought up any examples of kernel code that legitimately need this. There's plenty of kernel code that *wants* this though. If we can You keep saying this yet everytime I ask for an example I get nothing. Agreed. Simplest thing to do will be to post those patches which eliminate the register-clobbering barriers, shrink binaries, etc. for all that code that wants this. -- SUSE Labs, Novell Inc. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH] /proc//pmaps - memory maps in granularity of pages
On Tue, Aug 14, 2007 at 11:26:18AM -0500, Matt Mackall wrote: > On Tue, Aug 14, 2007 at 04:52:04PM +0800, Fengguang Wu wrote: > > Hello, > > > > Matt Mackall brings us many memory-footprint-optimization > > opportunities with his pagemap/kpagemap patches. However I wonder if > > the binary interface can be improved. > > > > This seq_file based implementation iterates in the unit of vmas. So > > super large vmas can be a problem. The next step is to to do address > > based iteration. That should not be hard. > > > > Andrew, please stop me early if it is at all a wrong interface :) > > > > Thank you, > > Fengguang > > === > > > > Show a process's memory maps page-by-page in /proc//pmaps. > > It helps to analyze application's memory footprint in a comprehensive way. > > > > Pages share the same states are grouped into a page range. > > For each page range, the following fields are exported: > > - first page index > > - number of pages in the range > > - well known page/pte flags > > - number of mmap users > > > > Only page flags not expected to disappear in the near future are exported: > > > > Y:young R:referenced A:active U:uptodate P:ptedirty D:dirty W:writeback > > > > Here is a sample output: > > > > # cat /proc/$$/pmaps > > 08048000-080c9000 r-xp 08048000 00:00 0 > > 32840 129 Y_A_P__ 1 > > 080c9000-080f6000 rwxp 080c9000 00:00 0 > > [heap] > > 32969 45 Y_A_P__ 1 > > f7dba000-f7dc3000 r-xp 03:00 176633 > > /lib/libnss_files-2.3.6.so > > 0 1 Y_AU___ 1 > > 1 1 YR_U___ 1 > > 5 1 YR_U___ 1 > > 8 1 Y_AU___ 1 > > f7dc3000-f7dc5000 rwxp 8000 03:00 176633 > > /lib/libnss_files-2.3.6.so > > 8 2 Y_A_P__ 1 > > f7dc5000-f7dcd000 r-xp 03:00 176635 > > /lib/libnss_nis-2.3.6.so > > 0 1 Y_AU___ 1 > > 1 1 YR_U___ 1 > > 4 1 YR_U___ 1 > > 7 1 Y_AU___ 1 > > f7dcd000-f7dcf000 rwxp 7000 03:00 176635 > > /lib/libnss_nis-2.3.6.so > > 7 2 Y_A_P__ 1 > > f7dcf000-f7de1000 r-xp 03:00 176630 > > /lib/libnsl-2.3.6.so > > 0 1 Y_AU___ 1 > > 1 3 YR_U___ 1 > > 16 1 YR_U___ 1 > > 
f7de1000-f7de3000 rwxp 00011000 03:00 176630 > > /lib/libnsl-2.3.6.so > > 17 2 Y_A_P__ 1 > > [...] > > That's a _lot_ of data to generate and parse. I doubt we can watch a Yes, but won't be too bad because it works in a sparse way. 1) It will only generate output for resident pages, that normally is much smaller than the mapped size. Take my shell for example, the (size:rss) ratio is (7:1)! wfg ~% cat /proc/$$/smaps |grep Size|sum sum 50552.000 avg777.723 wfg ~% cat /proc/$$/smaps |grep Rss|sum sum 7604.000 avg116.985 2) The page range trick suppresses more output. Look at these lines: > > 08048000-080c9000 r-xp 08048000 00:00 0 > > 32840 129 Y_A_P__ 1 > > 080c9000-080f6000 rwxp 080c9000 00:00 0 [heap] > > 32969 45 Y_A_P__ 1 It's interesting to see that the seq_file interface demands some more programming efforts, and provides this flexibility as well. 3) A 64bit number can be encoded in hex in 8 bytes, no more than its binary presentation. And often the text string will be much shorter because of the common small values. 4) String formatting is cheap. Much cheaper than pte/struct page walkings, and context switches. Not to mention its user friendliness. > larger app in realtime with this interface. And it doesn't give us > physical page numbers either. This could be a problem. Binary interface is discouraging, which is good and bad. The good thing about it is that we can safely export the raw PFN/page flag numbers without upsetting normal users. In this way a possibly useful kernel debugging interface can be shipped everywhere. If kpagemap is a reasonable kernel debug interface, I do not see the reason why kpagemap and pmaps cannot coexist :) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Part of the motivation here is to fix heisenbugs. If I knew where they By the same token we should probably disable optimisations altogether since that too can create heisenbugs. Precisely the point -- use of volatile (whether in casts or on asms) in these cases are intended to disable those optimizations likely to result in heisenbugs. The only thing volatile on an asm does is create a side effect on the asm statement; in effect, it tells the compiler "do not remove this asm even if you don't need any of its outputs". It's not disabling optimisation likely to result in bugs, heisen- or otherwise; _not_ putting the volatile on an asm that needs it simply _is_ a bug :-) Segher - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Part of the motivation here is to fix heisenbugs. If I knew where they By the same token we should probably disable optimisations altogether since that too can create heisenbugs. Almost everything is a tradeoff; and so is this. I don't believe most people would find disabling all compiler optimisations an acceptable price to pay for some peace of mind. Segher - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
No; compilation units have nothing to do with it, GCC can optimise across compilation unit boundaries just fine, if you tell it to compile more than one compilation unit at once. Last I checked, the Linux kernel build system did compile each .c file as a separate compilation unit. I have some patches to use -combine -fwhole-program for Linux. Highly experimental, you need a patched bleeding edge toolchain. If there's interest I'll clean it up and put it online. David Woodhouse had some similar patches about a year ago. Sounds exciting... ;-) Yeah, the breakage is *quite* spectacular :-) In many cases, the compiler also has to assume that msleep_interruptible() might call back into a function in the current compilation unit, thus possibly modifying global static variables. It most often is smart enough to see what compilation-unit-local variables might be modified that way, though :-) Yep. For example, if it knows the current value of a given such local variable, and if all code paths that would change some other variable cannot be reached given that current value of the first variable. Or the most common thing: if neither the address of the translation- unit local variable nor the address of any function writing to that variable can "escape" from that translation unit, nothing outside the translation unit can write to the variable. But there is usually at least one externally callable function in a .c file. Of course, but often none of those will (indirectly) write a certain static variable. Segher - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 6/24] make atomic_read() behave consistently on frv
Segher Boessenkool wrote: Please check the definition of "cache coherence". Which of the twelve thousand such definitions? :-) Every definition I have seen says that writes to a single memory location have a serial order as seen by all CPUs, and that a read will return the most recent write in the sequence (with a bit of extra mumbo jumbo to cover store queues and store forwarding). Within such a definition, I don't see how it would be allowed for a single CPU to execute reads out of order and have the second read return an earlier write than the first read. -- SUSE Labs, Novell Inc. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 08:53:16AM +0800, Herbert Xu wrote: > On Wed, Aug 15, 2007 at 05:49:50PM -0700, Paul E. McKenney wrote: > > On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote: > > > > > Thanks. But I don't need a summary of the thread, I'm asking > > > for an extant code snippet in our kernel that benefits from > > > the volatile change and is not part of a busy-wait. > > > > Sorry, can't help you there. I really do believe that the information > > you need (as opposed to the specific item you are asking for) really > > has been put forth in this thread. > > That only leads me to believe that such a code snippet simply > does not exist. Whatever... Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 05:59:41PM -0700, Christoph Lameter wrote: > On Wed, 15 Aug 2007, Paul E. McKenney wrote: > > > The volatile cast should not disable all that many optimizations, > > for example, it is much less hurtful than barrier(). Furthermore, > > the main optimizations disabled (pulling atomic_read() and atomic_set() > > out of loops) really do need to be disabled. > > In many cases you do not need a barrier. Having volatile there *will* > impact optimization because the compiler cannot use a register that may > contain the value that was fetched earlier. And the compiler cannot choose > freely when to fetch the value. The order of memory accesses are fixed if > you use volatile. If the variable is not volatile then the compiler can > arrange memory accesses any way they fit and thus generate better code. Understood. My point is not that the impact is precisely zero, but rather that the impact on optimization is much less hurtful than the problems that could arise otherwise, particularly as compilers become more aggressive in their optimizations. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch] sched: skip updating rq's next_balance under null SD
Was playing with sched_smt_power_savings/sched_mc_power_savings and found out that while the scheduler domains are reconstructed when sysfs settings change, rebalance_domains() can get triggered with null domain on other cpus, which is setting next_balance to jiffies + 60*HZ. Resulting in no idle/busy balancing for 60 seconds. Fix this. Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> --- diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b8..74565c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3020,6 +3020,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -3056,8 +3057,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. There is another @@ -3067,7 +3070,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* +* next_balance will be updated only when there is a need. +* When the cpu is attached to null domain for ex, it will not be +* updated. +*/ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Segher Boessenkool wrote: > [...] > > BTW: > > > > #define atomic_read(a) (*(volatile int *)&(a)) > > #define atomic_set(a,i) (*(volatile int *)&(a) = (i)) > > > > int a; > > > > void func(void) > > { > > int b; > > > > b = atomic_read(a); > > atomic_set(a, 20); > > b = atomic_read(a); > > } > > > > gives: > > > > func: > > pushl %ebp > > movla, %eax > > movl%esp, %ebp > > movl$20, a > > movla, %eax > > popl%ebp > > ret > > > > so the first atomic_read() wasn't optimized away. > > Of course. It is executed by the abstract machine, so > it will be executed by the actual machine. On the other > hand, try > > b = 0; > if (b) > b = atomic_read(a); > > or similar. Yup, obviously. Volatile accesses (or any access to volatile objects), or even "__volatile__ asms" (which gcc normally promises never to elid) can always be optimized for cases such as these where the compiler can trivially determine that the code in question is not reachable. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 6/24] make atomic_read() behave consistently on frv
Paul E. McKenney wrote: On Wed, Aug 15, 2007 at 11:30:05PM +1000, Nick Piggin wrote: Especially since several big architectures don't have volatile in their atomic_get and _set, I think it would be a step backwards to add them in as a "just in case" thin now (unless there is a better reason). Good point, except that I would expect gcc's optimization to continue to improve. I would like the kernel to be able to take advantage of improved optimization, which means that we are going to have to make a few changes. Adding volatile to atomic_get() and atomic_set() is IMHO one of those changes. What optimisations? gcc already does most of the things you need a barrier/volatile for, like reordering non-dependant loads and stores, and eliminating mem ops completely by caching in registers. As to your followup question of why to use it over ACCESS_ONCE. I guess, aside from consistency with the rest of the barrier APIs, you can use it in other primitives when you don't actually know what the caller is going to do or if it even will make an access. You could also use it between calls to _other_ primitives, etc... it just seems more flexible to me, but I haven't actually used such a thing in real code... ACCESS_ONCE doesn't seem as descriptive. What it results in is the memory location being loaded or stored (presumably once exactly), but I think the more general underlying idea is a barrier point. OK, first, I am not arguing that ACCESS_ONCE() can replace all current uses of barrier(). OK. Well I also wasn't saying that ACCESS_ONCE should not be implemented. But if we want something like it, then it would make sense to have an equivalent barrier statement as well (ie. order()). -- SUSE Labs, Novell Inc. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch] sched: fix broken smt/mc optimizations with CFS
Ingo, let me know if there any side effects of this change. Thanks. --- On a four package system with HT - HT load balancing optimizations were broken. For example, if two tasks end up running on two logical threads of one of the packages, scheduler is not able to pull one of the tasks to a completely idle package. In this scenario, for nice-0 tasks, imbalance calculated by scheduler will be 512 and find_busiest_queue() will return 0 (as each cpu's load is 1024 > imbalance and has only one task running). Similarly MC scheduler optimizations also get fixed with this patch. Signed-off-by: Suresh Siddha <[EMAIL PROTECTED]> --- diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b8..c5ac710 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2494,7 +2494,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Hi Herbert, On Thu, 16 Aug 2007, Herbert Xu wrote: > On Thu, Aug 16, 2007 at 06:28:42AM +0530, Satyam Sharma wrote: > > > > > The udelay itself certainly should have some form of cpu_relax in it. > > > > Yes, a form of barrier() must be present in mdelay() or udelay() itself > > as you say, having it in __const_udelay() is *not* enough (superflous > > actually, considering it is already a separate translation unit and > > invisible to the compiler). > > As long as __const_udelay does something which has the same > effect as barrier it is enough even if it's in the same unit. Only if __const_udelay() is inlined. But as I said, __const_udelay() -- although marked "inline" -- will never be inlined anywhere in the kernel in reality. It's an exported symbol, and never inlined from modules. Even from built-in targets, the definition of __const_udelay is invisible when gcc is compiling the compilation units of those callsites. The compiler has no idea that that function has barriers or not, so we're saved here _only_ by the lucky fact that __const_udelay() is in a different compilation unit. > As a matter of fact it does on i386 where __delay either uses > rep_nop or asm/volatile. __delay() can be either delay_tsc() or delay_loop() on i386. delay_tsc() uses the rep_nop() there for it's own little busy loop, actually. But for a call site that inlines __const_udelay() -- if it were ever moved to a .h file and marked inline -- the call to __delay() will _still_ be across compilation units. So, again for this case, it does not matter if the callee function has compiler barriers or not (it would've been a different story if we were discussing real/CPU barriers, I think), what saves us here is just the fact that a call is made to a function from a different compilation unit, which is invisible to the compiler when compiling the callsite, and hence acting as the compiler barrier. 
Regarding delay_loop(), it uses "volatile" for the "asm" which has quite different semantics from the C language "volatile" type-qualifier keyword and does not imply any compiler barrier at all. Satyam - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Paul E. McKenney wrote: > The volatile cast should not disable all that many optimizations, > for example, it is much less hurtful than barrier(). Furthermore, > the main optimizations disabled (pulling atomic_read() and atomic_set() > out of loops) really do need to be disabled. In many cases you do not need a barrier. Having volatile there *will* impact optimization because the compiler cannot use a register that may contain the value that was fetched earlier. And the compiler cannot choose freely when to fetch the value. The order of memory accesses are fixed if you use volatile. If the variable is not volatile then the compiler can arrange memory accesses any way they fit and thus generate better code. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: RFC: do get_rtc_time() correctly
On 08/16/2007 02:26 AM, David P. Reed wrote: My mention of NMI (which by definition can't be masked) is because NMI Well, not by the CPU it can't, but on a PC, masking NMIs is a simple matter of setting bit 7 of I/O port 0x70 to 1 (it seems the kernel does not provide an interface for it though?). Rene. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 05:49:50PM -0700, Paul E. McKenney wrote: > On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote: > > > Thanks. But I don't need a summary of the thread, I'm asking > > for an extant code snippet in our kernel that benefits from > > the volatile change and is not part of a busy-wait. > > Sorry, can't help you there. I really do believe that the information > you need (as opposed to the specific item you are asking for) really > has been put forth in this thread. That only leads me to believe that such a code snippet simply does not exist. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 05:42:07PM -0700, Christoph Lameter wrote: > On Wed, 15 Aug 2007, Paul E. McKenney wrote: > > > Seems to me that we face greater chance of confusion without the > > volatile than with, particularly as compiler optimizations become > > more aggressive. Yes, we could simply disable optimization, but > > optimization can be quite helpful. > > A volatile default would disable optimizations for atomic_read. > atomic_read without volatile would allow for full optimization by the > compiler. Seems that this is what one wants in many cases. The volatile cast should not disable all that many optimizations, for example, it is much less hurtful than barrier(). Furthermore, the main optimizations disabled (pulling atomic_read() and atomic_set() out of loops) really do need to be disabled. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 06:28:42AM +0530, Satyam Sharma wrote: > > > The udelay itself certainly should have some form of cpu_relax in it. > > Yes, a form of barrier() must be present in mdelay() or udelay() itself > as you say, having it in __const_udelay() is *not* enough (superfluous > actually, considering it is already a separate translation unit and > invisible to the compiler). As long as __const_udelay does something which has the same effect as barrier it is enough even if it's in the same unit. As a matter of fact it does on i386 where __delay either uses rep_nop or asm/volatile. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, Aug 16, 2007 at 08:30:23AM +0800, Herbert Xu wrote: > On Wed, Aug 15, 2007 at 05:23:10PM -0700, Paul E. McKenney wrote: > > On Thu, Aug 16, 2007 at 08:12:48AM +0800, Herbert Xu wrote: > > > On Wed, Aug 15, 2007 at 04:53:35PM -0700, Paul E. McKenney wrote: > > > > > > > > > > Communicating between process context and interrupt/NMI handlers > > > > > > using > > > > > > per-CPU variables. > > > > > > > > > > Remeber we're talking about atomic_read/atomic_set. Please > > > > > cite the actual file/function name you have in mind. > > > > > > > > Yep, we are indeed talking about atomic_read()/atomic_set(). > > > > > > > > We have been through this issue already in this thread. > > > > > > Sorry, but I must've missed it. Could you cite the file or > > > function for my benefit? > > > > I might summarize the thread if there is interest, but I am not able to > > do so right this minute. > > Thanks. But I don't need a summary of the thread, I'm asking > for an extant code snippet in our kernel that benefits from > the volatile change and is not part of a busy-wait. Sorry, can't help you there. I really do believe that the information you need (as opposed to the specific item you are asking for) really has been put forth in this thread. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
[ Sorry for empty subject line in previous mail. I intended to make a patch so cleared it to change it, but ultimately neither made a patch nor restored subject line. Done that now. ] On Thu, 16 Aug 2007, Herbert Xu wrote: > On Thu, Aug 16, 2007 at 06:06:00AM +0530, Satyam Sharma wrote: > > > > that are: > > > > while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { > > mdelay(1); > > msecs--; > > } > > > > where mdelay() becomes __const_udelay() which happens to be in another > > translation unit (arch/i386/lib/delay.c) and hence saves this callsite > > from being a bug :-) > > The udelay itself certainly should have some form of cpu_relax in it. Yes, a form of barrier() must be present in mdelay() or udelay() itself as you say, having it in __const_udelay() is *not* enough (superfluous actually, considering it is already a separate translation unit and invisible to the compiler). However, there are no compiler barriers on the macro-definition-path between mdelay(1) and __const_udelay(), so the only thing that saves us from being a bug here is indeed the different-translation-unit concept. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, 15 Aug 2007, Paul E. McKenney wrote: > Seems to me that we face greater chance of confusion without the > volatile than with, particularly as compiler optimizations become > more aggressive. Yes, we could simply disable optimization, but > optimization can be quite helpful. A volatile default would disable optimizations for atomic_read. atomic_read without volatile would allow for full optimization by the compiler. Seems that this is what one wants in many cases. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: drivers/infiniband/mlx/mad.c misplaced ;
On Wed, 2007-08-15 at 19:58 -0400, Dave Jones wrote: > Signed-off-by: Dave Jones <[EMAIL PROTECTED]> > > diff --git a/drivers/infiniband/hw/mlx4/mad.c > b/drivers/infiniband/hw/mlx4/mad.c > index 3330917..0ed02b7 100644 > --- a/drivers/infiniband/hw/mlx4/mad.c > +++ b/drivers/infiniband/hw/mlx4/mad.c > @@ -109,7 +109,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int > ignore_mkey, int ignore_bkey, > in_modifier, op_modifier, > MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C); > > - if (!err); > + if (!err) There's more than a few of these (not inspected). $ egrep -r --include=*.c "\bif[[:space:]]*\([^\)]*\)[[:space:]]*\;" * arch/sh/boards/se/7343/io.c:if (0) ; drivers/atm/iphase.c: if (!desc1) ; drivers/infiniband/hw/mlx4/mad.c: if (!err); drivers/isdn/capi/capiutil.c: else if (c <= 0x0f); drivers/net/tokenring/ibmtr.c: else if (ti->shared_ram_paging == 0xf); /* No paging in adapter */ drivers/s390/scsi/zfcp_erp.c: if (status == ZFCP_ERP_SUCCEEDED) ; /* no further action */ fs/hostfs/hostfs_user.c:if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; net/netfilter/xt_u32.c: if (skb->len < 4 || pos > skb->len - 4); sound/pci/au88x0/au88x0_synth.c:if (eax == 0) ; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Paul Mackerras wrote: > Those barriers are for when we need ordering between atomic variables > and other memory locations. An atomic variable by itself doesn't and > shouldn't need any barriers for other CPUs to be able to see what's > happening to it. It does not need any barriers. As soon as one cpu acquires the cacheline for write it will be invalidated in the caches of the others. So the other cpu will have to refetch. No need for volatile. The issue here may be that the compiler has fetched the atomic variable earlier and put it into a register. However, that prefetching is limited because it cannot cross function calls etc. The only problem could be loops where the compiler does not refetch the variable since it assumes that it does not change and there are no function calls in the body of the loop. But AFAIK these loops need cpu_relax and other measures anyways to avoid bad effects from busy waiting. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 05:26:34PM -0700, Christoph Lameter wrote: > On Thu, 16 Aug 2007, Paul Mackerras wrote: > > > In the kernel we use atomic variables in precisely those situations > > where a variable is potentially accessed concurrently by multiple > > CPUs, and where each CPU needs to see updates done by other CPUs in a > > timely fashion. That is what they are for. Therefore the compiler > > must not cache values of atomic variables in registers; each > > atomic_read must result in a load and each atomic_set must result in a > > store. Anything else will just lead to subtle bugs. > > This may have been the intend. However, today the visibility is controlled > using barriers. And we have barriers that we use with atomic operations. > Having volatile be the default just lead to confusion. Atomic read should > just read with no extras. Extras can be added by using variants like > atomic_read_volatile or so. Seems to me that we face greater chance of confusion without the volatile than with, particularly as compiler optimizations become more aggressive. Yes, we could simply disable optimization, but optimization can be quite helpful. Thanx, Paul - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped.
When a raid1 array is reshaped (number of drives changed), the list of devices is compacted, so that slots for missing devices are filled with working devices from later slots. This requires the "rd%d" symlinks in sysfs to be updated. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./drivers/md/raid1.c | 22 ++ 1 file changed, 18 insertions(+), 4 deletions(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 +++ ./drivers/md/raid1.c2007-08-16 10:29:58.0 +1000 @@ -2154,11 +2154,25 @@ static int raid1_reshape(mddev_t *mddev) oldpool = conf->r1bio_pool; conf->r1bio_pool = newpool; - for (d=d2=0; d < conf->raid_disks; d++) - if (conf->mirrors[d].rdev) { - conf->mirrors[d].rdev->raid_disk = d2; - newmirrors[d2++].rdev = conf->mirrors[d].rdev; + for (d = d2 = 0; d < conf->raid_disks; d++) { + mdk_rdev_t *rdev = conf->mirrors[d].rdev; + if (rdev && rdev->raid_disk != d2) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(>kobj, nm); + rdev->raid_disk = d2; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(>kobj, nm); + if (sysfs_create_link(>kobj, + >kobj, nm)) + printk(KERN_WARNING + "md/raid1: cannot register " + "%s for %s\n", + nm, mdname(mddev)); } + if (rdev) + newmirrors[d2++].rdev = rdev; + } kfree(conf->mirrors); conf->mirrors = newmirrors; kfree(conf->poolinfo); - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when resyncing.
Commit 1757128438d41670ded8bc3bc735325cc07dc8f9 was slightly bad. If an array has a write-intent bitmap, and you remove a drive, then readd it, only the changed parts should be resynced. However after the above commit, this only works if the array has not been shut down and restarted. This is because it sets 'fullsync' at little more often than it should. This patch is more careful. Signed-off-by: Neil Brown <[EMAIL PROTECTED]> ### Diffstat output ./drivers/md/raid1.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff .prev/drivers/md/raid1.c ./drivers/md/raid1.c --- .prev/drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 +++ ./drivers/md/raid1.c2007-08-16 10:27:57.0 +1000 @@ -1972,7 +1972,8 @@ static int run(mddev_t *mddev) !test_bit(In_sync, >rdev->flags)) { disk->head_position = 0; mddev->degraded++; - conf->fullsync = 1; + if (disk->rdev) + conf->fullsync = 1; } } if (mddev->degraded == conf->raid_disks) { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 000 of 2] md: bug fixes for 2.6.23-rc
Following 2 patches contain bugfixes for md. Both apply to earlier kernels, but probably aren't significant enough for -stable (no oops, no data corruption, no security hole). They should go in 2.6.23 though. Thanks, NeilBrown [PATCH 001 of 2] md: Make sure a re-add after a restart honours bitmap when resyncing. [PATCH 002 of 2] md: Correctly update sysfs when a raid1 is reshaped. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
Christoph Lameter writes: > On Thu, 16 Aug 2007, Paul Mackerras wrote: > > > In the kernel we use atomic variables in precisely those situations > > where a variable is potentially accessed concurrently by multiple > > CPUs, and where each CPU needs to see updates done by other CPUs in a > > timely fashion. That is what they are for. Therefore the compiler > > must not cache values of atomic variables in registers; each > > atomic_read must result in a load and each atomic_set must result in a > > store. Anything else will just lead to subtle bugs. > > This may have been the intend. However, today the visibility is controlled > using barriers. And we have barriers that we use with atomic operations. Those barriers are for when we need ordering between atomic variables and other memory locations. An atomic variable by itself doesn't and shouldn't need any barriers for other CPUs to be able to see what's happening to it. Paul. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: your mail
On Thu, Aug 16, 2007 at 06:06:00AM +0530, Satyam Sharma wrote: > > that are: > > while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { > mdelay(1); > msecs--; > } > > where mdelay() becomes __const_udelay() which happens to be in another > translation unit (arch/i386/lib/delay.c) and hence saves this callsite > from being a bug :-) The udelay itself certainly should have some form of cpu_relax in it. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Wed, Aug 15, 2007 at 05:23:10PM -0700, Paul E. McKenney wrote: > On Thu, Aug 16, 2007 at 08:12:48AM +0800, Herbert Xu wrote: > > On Wed, Aug 15, 2007 at 04:53:35PM -0700, Paul E. McKenney wrote: > > > > > > > > Communicating between process context and interrupt/NMI handlers using > > > > > per-CPU variables. > > > > > > > > Remeber we're talking about atomic_read/atomic_set. Please > > > > cite the actual file/function name you have in mind. > > > > > > Yep, we are indeed talking about atomic_read()/atomic_set(). > > > > > > We have been through this issue already in this thread. > > > > Sorry, but I must've missed it. Could you cite the file or > > function for my benefit? > > I might summarize the thread if there is interest, but I am not able to > do so right this minute. Thanks. But I don't need a summary of the thread, I'm asking for an extant code snippet in our kernel that benefits from the volatile change and is not part of a busy-wait. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmV>HI~} <[EMAIL PROTECTED]> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] i386: Fix a couple busy loops in mach_wakecpu.h:wait_for_init_deassert()
On Wed, 15 Aug 2007, Heiko Carstens wrote: > [...] > Btw.: we still have > > include/asm-i386/mach-es7000/mach_wakecpu.h: while (!atomic_read(deassert)); > include/asm-i386/mach-default/mach_wakecpu.h: while (!atomic_read(deassert)); > > Looks like they need to be fixed as well. [PATCH] i386: Fix a couple busy loops in mach_wakecpu.h:wait_for_init_deassert() Use cpu_relax() in the busy loops, as atomic_read() doesn't automatically imply volatility for i386 and x86_64. x86_64 doesn't have this issue because it open-codes the while loop in smpboot.c:smp_callin() itself that already uses cpu_relax(). For i386, however, smpboot.c:smp_callin() calls wait_for_init_deassert() which is buggy for mach-default and mach-es7000 cases. [ I test-built a kernel -- smp_callin() itself got inlined in its only callsite, smpboot.c:start_secondary() -- and the relevant piece of code disassembles to the following: 0xc1019704 :mov0xc144c4c8,%eax 0xc1019709 :test %eax,%eax 0xc101970b :je 0xc1019709 init_deasserted (at 0xc144c4c8) gets fetched into %eax only once and then we loop over the test of the stale value in the register only, so these look like real bugs to me. With the fix below, this becomes: 0xc1019706 :pause 0xc1019708 :cmpl $0x0,0xc144c4c8 0xc101970f :je 0xc1019706 which looks nice and healthy. ] Thanks to Heiko Carstens for noticing this. 
Signed-off-by: Satyam Sharma <[EMAIL PROTECTED]> --- include/asm-i386/mach-default/mach_wakecpu.h |3 ++- include/asm-i386/mach-es7000/mach_wakecpu.h |3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/asm-i386/mach-default/mach_wakecpu.h b/include/asm-i386/mach-default/mach_wakecpu.h index 673b85c..3ebb178 100644 --- a/include/asm-i386/mach-default/mach_wakecpu.h +++ b/include/asm-i386/mach-default/mach_wakecpu.h @@ -15,7 +15,8 @@ static inline void wait_for_init_deassert(atomic_t *deassert) { - while (!atomic_read(deassert)); + while (!atomic_read(deassert)) + cpu_relax(); return; } diff --git a/include/asm-i386/mach-es7000/mach_wakecpu.h b/include/asm-i386/mach-es7000/mach_wakecpu.h index efc903b..84ff583 100644 --- a/include/asm-i386/mach-es7000/mach_wakecpu.h +++ b/include/asm-i386/mach-es7000/mach_wakecpu.h @@ -31,7 +31,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) static inline void wait_for_init_deassert(atomic_t *deassert) { #ifdef WAKE_SECONDARY_VIA_INIT - while (!atomic_read(deassert)); + while (!atomic_read(deassert)) + cpu_relax(); #endif return; } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: RFC: do get_rtc_time() correctly
Alan: Thanks for the comment. I will code a patch, and include a sanity check as you suggested, and send it for review. Just to clarify one concern your note raised: I understand that SMM/SMI servicing can take a long time, but SMM/SMI shouldn't happen while interrupts are masked using local_irq_disable() [included in spin_lock_irq()], at least on x86-architectures. If SMM/SMI can happen even then, the NMI fix below could be generalized. My mention of NMI (which by definition can't be masked) is because NMI can happen even while interrupts are masked. This is a timing problem that can't be dealt with by masking interrupts, and NMI's are used for watchdogs, etc these days. It seems like just a general good thing to be able to ask if an NMI has happened. A per-cpu NMI eventcount that is incremented every NMI would allow one to detect NMI's that happen during an otherwise masked code sequence by reading it at the beginning and end of the code sequence. Don't know if NMIs are common on other architectures, or if this is an architecture dependent concern. Perhaps I'm really talking about two patches here. One for a mechanism to detect NMIs that happen during a critical piece of code (so it can be retried), and one that depends on that to be really proper in reading the RTC reliably. Alan Cox wrote: So the proper way to read the RTC contents is to read the UIP flag, and if zero, read all the RTC registers with interrupts masked completely, so all reads happen in the 224 usec window. (NMI can still be a problem, but you can have NMI's set a flag that forces a retry). SMM/SMI is more likely to be what bumps you 224usec or more. I'm happy to code and test a patch. Rather than just submit a patch, I thought I'd request others' comments on this, since it affects so many architectures. cc me, if you will, as I don't subscribe to LKML, just check it periodically. Go for it. 
The other architectures generally inherit it by inheriting similar bridge chips or in more modern times the RTC macrocell. It should also be possible to debug by putting in an optional sanity check initially which checks the read made sense compared to a few more - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/