Paul Davis wrote:
> 
> abramo - i've just noticed a possibly serious flaw with your test
> program.
> 
> in real-life, there is (period_duration-cpu_time_for_period) after the
> completion of a "proc" cycle during which the processor is free to run
> other threads.
> 
> in the test, i notice that for the sp case, my processor load on 1 cpu
> goes to 100% and stays there for the duration of the test (not
> surprising, it's running SCHED_RR and never sleeps). this means that you
> don't really have enough cache pollution effects from other processes
> at all in this case.
> 
> i can't test the running-on-UP case here because i have a dual CPU
> system, but there is a similar effect there as well. ctx code will own

Just boot it with the "nosmp" parameter.

> the CPU as long as it has a thread ready to run, which is essentially
> always. therefore no non-ctx thread ever gets to use the CPU, and the
> only cache-driven effects you are seeing comes from the data that
> *doesn't* fit into the cache. this would explain the slightly
> "catastrophic" curve that steve showed - as soon as the pollution data
> size exceeds the cache space "dedicated" to ctx, things get *much*
> worse. but in a h/w driven system, its quite likely that the effective
> cache size would be smaller because other threads would have used it
> between periods.
> 
> do you see what i'm getting at? does it make sense?
> 
> if i'm right, then in a real world scenario (some other process has
> used the cache between invocations of proc()), you could very well
> see dramatic slowdowns with memory footprints notably smaller than
> steve's curves suggest.

If the memory footprint threshold becomes smaller, the difference between
sp and mp decreases. I've just verified this effect with the attached ctx;
if you increase the worker area size you'll notice it too.
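
For example (illustrative invocations only, not from the original test
runs: "how" is 0/1/2 for single thread/multi thread/multi process, the
mhz value must match your CPU clock, and the last three sizes are in KB):

  ./ctx 2 4 10000 500 32 32 4     # multi process, default 32 KB worker areas
  ./ctx 2 4 10000 500 32 256 4    # multi process, 256 KB worker areas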

> but i could, as usual, be missing something obvious. you could test
> this by using usleep() to put the threads to sleep, measuring the
> cycles they actually sleep for, and subtracting that from the
> total. you'd also have to ensure a realistic work-load from "other
> processes", such as a GUI for each component (and thus the X server)
> that wants to update some screen displays.

The problem here is that a "realistic work-load" is hard to define,
because what matters is its memory footprint. Also consider that I don't
think the GUI would run every period, and that usleep() cannot sleep
(short of busy looping) for less than a jiffy.
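
For reference, a minimal standalone sketch (not part of ctx; the 1 ms
request is an arbitrary example) that shows this granularity by timing
how long usleep() really sleeps:

#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

int main(void)
{
        struct timeval t0, t1;
        long requested = 1000;          /* ask for 1 ms */
        long slept;

        gettimeofday(&t0, NULL);
        usleep(requested);
        gettimeofday(&t1, NULL);
        slept = (t1.tv_sec - t0.tv_sec) * 1000000L
                + (t1.tv_usec - t0.tv_usec);
        /* on a 100 Hz kernel this is typically rounded up to 10-20 ms */
        printf("requested %ld us, slept %ld us\n", requested, slept);
        return 0;
}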

I think that we might consider one worker as the GUI (at least on UP).
The GUI's real effect is surely less than that: if I'm not wrong, the
average cache pollution effect can be measured in cache lines per unit of
time, and although the GUI may have a greater footprint, its run
frequency is lower.
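
To make the cache lines/time argument concrete, a back-of-the-envelope
sketch (the 32-byte cache line, the 256 KB GUI working set and the 50 ms
redraw interval are assumed numbers, chosen only for illustration):

#include <stdio.h>

#define CACHE_LINE 32   /* bytes, assumed */

int main(void)
{
        double period_ms = 64 * 1000.0 / 48000.0;  /* 64 frames at 48 kHz */
        double worker_bytes = 32 * 1024;           /* one worker footprint per period */
        double gui_bytes = 256 * 1024;             /* assumed GUI working set */
        double gui_interval_ms = 50.0;             /* assumed redraw interval */

        /* average pollution rate in cache lines per millisecond */
        printf("worker: %.0f lines/ms\n", worker_bytes / CACHE_LINE / period_ms);
        printf("gui:    %.0f lines/ms\n", gui_bytes / CACHE_LINE / gui_interval_ms);
        return 0;
}

With these numbers the worker pollutes roughly 770 cache lines/ms against
roughly 160 for the GUI, so the lower run frequency more than compensates
for the larger footprint.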

-- 
Abramo Bagnara                       mailto:[EMAIL PROTECTED]

Opera Unica                          Phone: +39.546.656023
Via Emilia Interna, 140
48014 Castel Bolognese (RA) - Italy

ALSA project               http://www.alsa-project.org
It sounds good!
/*
 *  Audio component approach cost meter
 *  Copyright (c) 2001 by Abramo Bagnara <[EMAIL PROTECTED]>
 *
 *   This library is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Library General Public License as
 *   published by the Free Software Foundation; either version 2 of
 *   the License, or (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU Library General Public License for more details.
 *
 *   You should have received a copy of the GNU Library General Public
 *   License along with this library; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include <limits.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <assert.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <sys/wait.h>
#include <asm/msr.h>

#define MAX_WORKERS 16

#define KEY 0xabce

#define PERIOD_SIZE 64
#define RATE 48000
#define CHANNELS 26

#define BUFSIZE (PERIOD_SIZE*CHANNELS)

size_t STACK_FOOTPRINT = 4 * 1024;
size_t WORKER_FOOTPRINT = 32 * 1024;
size_t SHARED_FOOTPRINT = 32 * 1024;

struct space {
        float samples[BUFSIZE];
        char shared_area[0];
};

char *worker_area[MAX_WORKERS];

struct space *area;

void proc(unsigned int n)
{
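        /* One simulated worker invocation: dirty the stack, shared and
         * per-worker areas to reproduce their cache footprint, then do
         * the "real" work on the shared sample buffer. */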
        unsigned int k;
        float *samples = area->samples;
        {
                char stack_area[STACK_FOOTPRINT];
                memset(stack_area, 0xaa, STACK_FOOTPRINT);
        }
        memset(area->shared_area, 0xaa, SHARED_FOOTPRINT);
        memset(worker_area[n], 0xaa, WORKER_FOOTPRINT);
        for (k = 0; k < BUFSIZE; k++)
                samples[k]++;
}

unsigned long long singlethread(unsigned int workers, unsigned int periods)
{
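        /* All workers run sequentially in the calling thread; the TSC is
         * read around the whole periods loop to measure the cost. */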
        unsigned long long begin, end;
        unsigned int k;
        float *samples;
        unsigned int p;
        area = malloc(sizeof(*area) + SHARED_FOOTPRINT);
        samples = area->samples;
        for (k = 0; k < BUFSIZE; k++)
                samples[k] = 0.0;

        for (k = 0; k < workers; k++)
                worker_area[k] = malloc(WORKER_FOOTPRINT);

        p = periods;

        rdtscll(begin);
        while (p-- > 0) {
                int n;
                for (n = 0; n < workers; ++n) {
                        proc(n);
                }
        }
        rdtscll(end);

        for (k = 0; k < BUFSIZE; k++)
                assert(samples[k] == periods * workers);
        
        free(area);
        return end - begin;
}

struct thr {
        int worker;
        pthread_t thread;
        int in, out;
} threads[MAX_WORKERS];

void *thread(void *data)
{
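        /* Worker thread: wait for a token on the input pipe, do the same
         * work as proc(), then pass the token to the next pipe.  A zero
         * token means "shut down" and is forwarded before exiting. */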
        char buf[1];
        struct thr *t = data;
        int err;
        float *samples = area->samples;
        while (1) {
                unsigned int k;
                err = read(t->in, buf, 1);
                assert(err == 1);
                if (!buf[0]) {
                        err = write(t->out, buf, 1);
                        assert(err == 1);
                        break;
                }
                {
                        char stack_area[STACK_FOOTPRINT];
                        memset(stack_area, 0xaa, STACK_FOOTPRINT);
                }
                memset(area->shared_area, 0xaa, SHARED_FOOTPRINT);
                memset(worker_area[t->worker], 0xaa, WORKER_FOOTPRINT);
                for (k = 0; k < BUFSIZE; k++)
                        samples[k]++;
                err = write(t->out, buf, 1);
                assert(err == 1);
        }
        return 0;
}

unsigned long long multithread(unsigned int workers, unsigned int periods)
{
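        /* Workers are pthreads chained through pipes: the main thread
         * writes a token into the first pipe, each worker forwards it to
         * the next one, and the main thread reads it back from the last
         * pipe, so each period walks the token through every worker. */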
        unsigned long long begin, end;
        int err;
        unsigned int k;
        char buf[1] = { 1 };
        float *samples;
        unsigned int p;
        int fds[workers + 1][2];
        int in, out;
        area = malloc(sizeof(*area) + SHARED_FOOTPRINT);

        samples = area->samples;

        for (k = 0; k <= workers; ++k) {
                err = pipe(fds[k]);
                assert(err == 0);
        }
        for (k = 0; k < workers; ++k)
                worker_area[k] = malloc(WORKER_FOOTPRINT);

        out = fds[0][1];
        in = fds[workers][0];
        for (k = 0; k < workers; ++k) {
                threads[k].worker = k;
                threads[k].in = fds[k][0];
                threads[k].out = fds[k + 1][1];
                err = pthread_create(&threads[k].thread, NULL, thread, &threads[k]);
                assert(err == 0);
        }

        /* Ensure all is started */
        err = write(out, buf, 1);
        assert(err == 1);
        err = read(in, buf, 1);
        assert(err == 1);

        for (k = 0; k < BUFSIZE; k++)
                samples[k] = 0.0;
        p = periods;

        rdtscll(begin);
        while (p-- > 0) {
                err = write(out, buf, 1);
                assert(err == 1);
                err = read(in, buf, 1);
                assert(err == 1);
        }
        rdtscll(end);

        buf[0] = 0;
        err = write(out, buf, 1);
        assert(err == 1);

        for (k = 0; k < workers; ++k) {
                err = pthread_join(threads[k].thread, 0);
                assert(err == 0);
        }
        for (k = 0; k <= workers; ++k) {
                close(fds[k][0]);
                close(fds[k][1]);
        }

        for (k = 0; k < BUFSIZE; k++)
                assert(samples[k] == periods * workers);

        free(area);
        return end - begin;
}

struct pro {
        int worker;
        pid_t pid;
        int in, out;
} processes[MAX_WORKERS];

void process(struct pro *p)
{
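        /* Body of a forked worker: attach the shared sample buffer via
         * SysV shared memory and run the same token-forwarding loop as
         * the thread version.  worker_area[0] was allocated before the
         * fork, so each child dirties its own copy-on-write copy. */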
        char buf[1];
        int err;
        float *samples;
        int shmid = shmget(KEY, sizeof(*area) + SHARED_FOOTPRINT, 0666);
        assert(shmid >= 0);
        area = shmat(shmid, NULL, 0);
        assert(area != (void *) -1);
        samples = area->samples;
        while (1) {
                unsigned int k;
                err = read(p->in, buf, 1);
                assert(err == 1);
                if (!buf[0]) {
                        err = write(p->out, buf, 1);
                        assert(err == 1);
                        break;
                }
                {
                        char stack_area[STACK_FOOTPRINT];
                        memset(stack_area, 0xaa, STACK_FOOTPRINT);
                }
                memset(area->shared_area, 0xaa, SHARED_FOOTPRINT);
                memset(worker_area[0], 0xaa, WORKER_FOOTPRINT);
                for (k = 0; k < BUFSIZE; k++)
                        samples[k]++;
                err = write(p->out, buf, 1);
                assert(err == 1);
        
        }
        exit(0);
}

unsigned long long multiprocess(unsigned int workers, unsigned int periods)
{
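        /* Same pipe chain as multithread(), but the workers are forked
         * processes and the sample buffer lives in SysV shared memory so
         * that all of them update the same samples. */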
        unsigned long long begin, end;
        int err;
        unsigned int k;
        char buf[1] = { 1 };
        float *samples;
        int fds[workers + 1][2];
        int in, out;
        int shmid = shmget(KEY, sizeof(*area) + SHARED_FOOTPRINT, IPC_CREAT | 0666);
        unsigned int p;
        assert(shmid >= 0);
        area = shmat(shmid, NULL, 0);
        assert(area != (void *) -1);

        samples = area->samples;

        worker_area[0] = malloc(WORKER_FOOTPRINT);

        for (k = 0; k <= workers; ++k) {
                err = pipe(fds[k]);
                assert(err == 0);
        }

        out = fds[0][1];
        in = fds[workers][0];
        for (k = 0; k < workers; ++k) {
                processes[k].worker = k;
                processes[k].in = fds[k][0];
                processes[k].out = fds[k + 1][1];
                err = fork();
                assert(err >= 0);
                if (err == 0)
                        process(&processes[k]);
                processes[k].pid = err;
        }

        /* Ensure all is started */
        err = write(out, buf, 1);
        assert(err == 1);
        err = read(in, buf, 1);
        assert(err == 1);
        
        for (k = 0; k < BUFSIZE; k++)
                samples[k] = 0.0;
        p = periods;

        rdtscll(begin);
        while (p-- > 0) {
                err = write(out, buf, 1);
                assert(err == 1);
                err = read(in, buf, 1);
                assert(err == 1);
        }
        rdtscll(end);

        buf[0] = 0;
        err = write(out, buf, 1);
        assert(err == 1);

        for (k = 0; k < workers; ++k) {
                err = waitpid(processes[k].pid, NULL, 0);
                assert(err >= 0);
        }
        for (k = 0; k <= workers; ++k) {
                close(fds[k][0]);
                close(fds[k][1]);
        }

        for (k = 0; k < BUFSIZE; k++)
                assert(samples[k] == periods * workers);

        err = shmdt(area);
        assert(err >= 0);
        err = shmctl(shmid, IPC_RMID, 0);
        assert(err >= 0);

        return end - begin;
}

void setscheduler(void)
{
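        /* Switch the calling process to SCHED_RR at maximum priority,
         * as a real-time audio thread would run. */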
        struct sched_param sched_param;

        if (sched_getparam(0, &sched_param) < 0) {
                printf("Scheduler getparam failed\n");
                return;
        }
        sched_param.sched_priority = sched_get_priority_max(SCHED_RR);
        if (!sched_setscheduler(0, SCHED_RR, &sched_param)) {
                printf("Scheduler set to Round Robin with priority %i...\n",
                       sched_param.sched_priority);
                return;
        }
        printf("Scheduler set to Round Robin with priority %i failed\n",
               sched_param.sched_priority);
}


#define ST 0
#define MT 1
#define MP 2

/* Usage: ctx how workers periods mhz [shared] [worker] [stack]
 *   how:    0 = single thread, 1 = multi thread, 2 = multi process
 *   mhz:    CPU clock in MHz, used to convert cycles into %CPU
 *   shared, worker, stack: footprint sizes in KB (optional)
 */

int main(int argc, char **argv)
{
        unsigned long long t;
        double c;
        int k;
        char *mode;
        unsigned int how, workers, periods;
        double mhz;
        assert(argc >= 5);
        how = atoi(argv[1]);
        workers = atoi(argv[2]);
        assert(workers <= MAX_WORKERS);
        periods = atoi(argv[3]);
        mhz = atof(argv[4]);
        if (argc > 5)
                SHARED_FOOTPRINT = atoi(argv[5]) * 1024;
        if (argc > 6)
                WORKER_FOOTPRINT = atoi(argv[6]) * 1024;
        if (argc > 7)
                STACK_FOOTPRINT = atoi(argv[7]) * 1024;
        
        setscheduler();

        for (k = 0; k < 5; ++k) {
                switch (how) {
                case ST:
                        mode = "Single thread";
                        t = singlethread(workers, periods);
                        break;
                case MT:
                        mode = "Multi thread";
                        t = multithread(workers, periods);
                        break;
                case MP:
                        mode = "Multi process";
                        t = multiprocess(workers, periods);
                        break;
                default:
                        assert(0);
                        break;
                }

                c = (double) t / periods;
                printf("%s: workers=%u periods=%u shared=%zu worker=%zu stack=%zu\n",
                       mode, workers, periods, SHARED_FOOTPRINT,
                       WORKER_FOOTPRINT, STACK_FOOTPRINT);
                printf("Cycles per period: %f\n", c);
                printf("%%CPU usage: %f\n",
                       (100 * c / mhz) / (1000000.0 * PERIOD_SIZE / RATE));
        }
        return 0;
}
