Hi,

I have updated even more code. Both functions should be functional now. I was able to compile the code outside of OOo (with MS VC 6.0 - the 1998 version) and it worked correctly, though I have no idea whether it will integrate seamlessly into OOo.

[For the external compile I had to make some minor changes to the code, but the general structure was unchanged.]

Now both the single-matrix and the multiple-matrices variants work. Somebody with some OOo knowledge should now integrate the code where it belongs. A menu entry like 'One-Way Anova' seems reasonable. This would partially solve issue http://www.openoffice.org/issues/show_bug.cgi?id=4921 - at least the one-way ANOVA part. [This is the more important of the ANOVA variants.]

Kind regards,

Leonard Mada


Code included:

Calc-ANOVA-No_ARRAY.cpp
- contains the code for one-way ANOVA (NO block type),
- all input is in the form of:
 -- one data matrix (every column is a different variable) or
 -- multiple data matrices
- have removed most dynamic arrays
 (except for two, which are initialized and destroyed properly)

ANOVA.Test.Code.cpp
- contains the same C++ code adapted so that it can be compiled with MS VC 6.0 (independent of OOo code) - this way, it is possible to test the code without having to (re)compile OOo
- see also the 'ANOVA-Test.gnumeric' from my previous post

ANOVA-Test.gnumeric (see previous post for this file)
- gnumeric file to test the results of the ANOVA code
// This implementation uses a robust 2-pass algorithm.
// While one-pass algorithms are faster, they are notoriously
//   unstable, precluding their use in serious statistical
//   programs.

void ScInterpreter::ScANOVA()
{
        // WE GET EITHER A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE 
VARIABLE
        //    DISADVANTAGE:
        //       - ONLY ONE COLUMN PER VARIABLE
        //       - COLUMNS MUST BE CONTIGUOS
        //    ADVANTAGE:
        //       - FAST AND EASY TO WRITE
        //       - JUST ONE SELECTION
        // OR MULTIPLE MATRICES, EACH MATRIX IS ONE VARIABLE
        //    DISADVANTAGE:
        //      - [CALC FUNCTIONS ACCEPT ONLY 30 PARAMS
        //         SO THERE ARE AT MOST 30 VARIABLES]
        //      - CUMBERSOME TO WRITE/SELECT ALL MATRICES

        // STORES ACTUALLY THE NUMBER OF VARIABLES
        SCSIZE iVarNr   = GetByte() /* NUMBER OF PARAMETERS */;
        SCSIZE iVarTmp  = iVarNr;   // NUMBER OF DATA MATRICES

        if ( iVarNr == 0 /* NO PARAMETERS */)
                return;      // EXIT
        if ( iVarNr == 1 /* ONLY ONE PARAMETER */ ) {
                ScInterpreter::ScANOVAMono()
                return;
        }

        ScInterpreter::ScANOVAMultiple(iVarNr)
}


// One-way ANOVA for data passed as multiple matrices, one matrix per
// variable (group).
//
// Two passes are made over the data: the first computes the mean of every
// group and the grand mean, the second accumulates the between-group and
// within-group sums of squares. The F statistic (MSB / MSE) is handed to
// PushDouble(). Cells for which IsString() is true are skipped.
//
// iVarNr: number of data matrices (= number of variables).
//
// Errors:
//   - SetIllegalParameter() if a parameter does not yield a matrix;
//   - SetNoValue() if there are fewer than two variables, if a variable has
//     no numeric value, or if no degrees of freedom are left for the error
//     term (every variable has exactly one value).
void ScInterpreter::ScANOVAMultiple(SCSIZE iVarNr)
{
        if ( iVarNr < 2 ) {
                // ANOVA needs at least two variables; this also keeps
                // dfB = iVarNr - 1 meaningful when iVarNr is 0.
                SetNoValue();
                return;
        }

        // NOTE(review): GetMatrix() is assumed to consume its argument from
        // the interpreter stack, so every matrix is fetched exactly once and
        // kept here for the second pass - confirm against the interpreter.
        ScMatrixRef *pMats = new ScMatrixRef[iVarNr];
        SCSIZE *nVar  = new SCSIZE[iVarNr];  // number of data values per variable
        double *fSumX = new double[iVarNr];  // per-variable sum, then mean

        SCSIZE N      = 0;     // total number of data values
        SCSIZE nR, nC;         // matrix dimensions
        SCSIZE iCount = 0;     // index of the current variable
        double fSumM  = 0.0;   // grand total, later the grand mean
        bool   bValid = true;  // cleared as soon as an error was reported

        // PASS 1: group sums, group means and the grand total.
        for (iCount = 0; iCount < iVarNr; iCount++) {
                pMats[iCount] = GetMatrix();
                if (!pMats[iCount]) {
                        // NO DATA MATRIX - INVALID PARAMETERS
                        SetIllegalParameter();
                        bValid = false;
                        break;
                }
                pMats[iCount]->GetDimensions(nC, nR);

                SCSIZE jCount = 0;  // number of values for the current variable
                fSumX[iCount] = 0.0;
                for (SCSIZE i = 0; i < nC; i++)
                        for (SCSIZE j = 0; j < nR; j++)
                        {
                                if (!pMats[iCount]->IsString(i,j))
                                {
                                        fSumX[iCount] += pMats[iCount]->GetDouble(i,j);
                                        jCount++;
                                }
                        }

                if ( jCount == 0 ) {
                        // A group without numeric values has no mean.
                        SetNoValue();
                        bValid = false;
                        break;
                }

                fSumM += fSumX[iCount];
                fSumX[iCount] = fSumX[iCount] / jCount;  // this is the group mean
                nVar[iCount]  = jCount;
                N += jCount;
        }

        if ( bValid && N <= iVarNr ) {
                // dfE = N - iVarNr would be 0: the error variance is undefined.
                SetNoValue();
                bValid = false;
        }

        if ( bValid ) {
                const SCSIZE dfB = iVarNr - 1;  // degrees of freedom between groups
                const SCSIZE dfE = N - iVarNr;  // degrees of freedom within groups

                fSumM = fSumM / N;              // this is the grand mean

                double fMSB = 0.0;  // inter-group variance
                double fMSE = 0.0;  // intra-group variance (due to error)

                // PASS 2: sums of squares around the group means / grand mean.
                for (iCount = 0; iCount < iVarNr; iCount++) {
                        pMats[iCount]->GetDimensions(nC, nR);

                        for (SCSIZE i = 0; i < nC; i++)
                                for (SCSIZE j = 0; j < nR; j++)
                                {
                                        if (!pMats[iCount]->IsString(i,j))
                                        {
                                                const double fDiff =
                                                        pMats[iCount]->GetDouble(i,j) - fSumX[iCount];
                                                fMSE += fDiff * fDiff;
                                        }
                                }

                        fMSB += nVar[iCount] * (fSumM - fSumX[iCount])
                                             * (fSumM - fSumX[iCount]);
                }

                fMSB = fMSB / dfB;
                fMSE = fMSE / dfE;
                PushDouble( fMSB/fMSE );

                // TODO:
                //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
                //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)
        }

        // Arrays allocated with new[] must be released with delete[].
        delete[] pMats;
        delete[] nVar;
        delete[] fSumX;
}


void ScInterpreter::ScANOVAMono()
{
        // WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
        //    DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
        //    BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
        
        ScMatrixRef pMat = GetMatrix();
        if (!pMat) {
                // NO DATA MATRIX - INVALID PARAMETERS
                SetIllegalParameter();
                return;
        }

        SCSIZE iVarNr, nRMax;
        // WE HAVE ONLY ONE MATRIX
        // WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
        pMat->GetDimensions(iVarNr, nRMax);

        if( iVarNr == 1 ) {
                SetNoValue();
                return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
        }

        SCSIZE *nR = new unsigned int[iVarNr];
        // THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE

        SCSIZE dfB   = 0; // DEGREES OF FREEDOM
        SCSIZE dfE   = 0; // DEGREES OF FREEDOM
        SCSIZE N     = 0; // TOTAL NUMBER OF DATA VALUES

        SCSIZE iCount   = 0; // INDEX POINTING TO CURRENT VARIABLE
        SCSIZE jCount   = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE

        double  fSumM    = 0.0; // THIS IS THE GRAND MEAN
        
        // WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
        double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS

        double fValX;                                           // THE 
INDIVIDUAL VALUES


        for (iCount     = 0; iCount < iVarNr; iCount++) {
                fSumX[iCount] = 0.0; // INITIALIZE THE PARTIAL SUM
                        for (SCSIZE j = 0; j < nRMax; j++)
                        {
                                if (!pMat->IsString(iCount,j))
                                {
                                        fSumX[iCount]    += 
pMat->GetDouble(iCount,j);
                                        jCount++;
                                }
                        }
                        fSumM += fSumX[iCount];                                 
// GRAND TOTAL
                        fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE 
MEAN
                        nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE 
HAVE
                        N += jCount;             // THIS IS THE TOTAL NUMBER OF 
VALUES
                        jCount = 0;              // RESET jCount FOR NEXT 
VARIABLE
        } // END OUTER FOR LOOP

        dfB = iCount - 1;  // CALCULATE THE DEGREES OF FREEDOM (df)
        dfE = N - iCount;

        fSumM = fSumM / N; // THIS IS THE GRAND MEAN

        double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
        double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)

        for (iCount     = 0; iCount < iVarNr; iCount++) {
                for (jCount = 0; jCount < nRMax; jCount++)
                {
                        if (!pMat->IsString(iCount,jCount))
                        {
                                fValX = pMat->GetDouble(iCount,jCount);
                                fMSE += (fValX - fSumX[iCount]) * (fValX - 
fSumX[iCount]);
                        }
                }

                fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM - 
fSumX[iCount]);
        }   // END OUTER FOR LOOP

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB/fMSE );

        // TODO:
        //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
        //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)

        delete nR;
        delete fSumX;
}
// This application tests the ANOVA code, independently of OOo
// Just run the application and compare the F-statistic
//   it prints (the last value of the output) with the one
//   computed using gnumeric
// Compiled with MS VC 6.0 (1998)


//#include "stdafx.h"
#include <stdio.h>
#include "iostream"

using namespace std;

///////////////////


// We emulate some of the OOo routines and functions

// Unsigned size/index type used by the emulated routines and the ANOVA
// functions in this test program.
typedef unsigned int SCSIZE;

// Emulated error routine: reports "no value" by writing a message to wcout.
void SetNoValue()
{
        static const wchar_t kMessage[] = L"No Value";
        std::wcout << kMessage;
}

// Emulated error routine: reports an illegal parameter by writing a message
// to wcout.
void SetIllegalParameter()
{
        static const wchar_t kMessage[] = L"Illegal Parameter";
        std::wcout << kMessage;
}

// Emulated result routine: prints the value on a new line of wcout, labelled
// as the F statistic.
void PushDouble(double i)
{
        std::wcout << L"\nF-Statistic: ";
        std::wcout << i;
}


///////////////////////////////////

// More emulation

class ScMatrixRef {
private:
        const SCSIZE ic;
        const SCSIZE jc;
public:
        ScMatrixRef(SCSIZE i, SCSIZE j):
                ic(i),
                jc(j) {return;}
        void GetDimensions(SCSIZE &iR, SCSIZE &jR) {
                iR = ic;
                jR = jc;
                return;
        }
        bool IsString(SCSIZE i, SCSIZE j) {
                return false;
        }
        bool IsMatrix()    const {return true;}
        bool operator ! () const {return !IsMatrix();}
        double GetDouble(double i, double j) {
                i = (i > 1)?i:1.1;
                return j * (i-1)/(i+1);
        }
};

// Matrix handed out by GetMatrix(); set through GetMatrix(ScMatrixRef&).
ScMatrixRef* pMatrix;

// Registers MatrixClass as the matrix returned by subsequent GetMatrix()
// calls. Only its address is stored, so the caller keeps ownership.
void GetMatrix(ScMatrixRef &MatrixClass)
{
        pMatrix = &MatrixClass;
}

// Returns the registered matrix. A matrix must have been registered first:
// the stored pointer is dereferenced without a check.
ScMatrixRef &GetMatrix()
{
        return *pMatrix;
}


///////////////////////////////////


// New OOo CALC FUNCTIONS //


// THIS IS THE IMPORTANT CODE



// One-way ANOVA for data passed as multiple matrices, one matrix per
// variable (group) - test-harness version.
//
// Two passes are made over the data: the first computes the mean of every
// group and the grand mean, the second accumulates the between-group and
// within-group sums of squares. The F statistic (MSB / MSE) is handed to
// PushDouble(). Cells for which IsString() is true are skipped.
//
// FOR TESTING PURPOSES the cell values are read with the variable index
// (iCount) in place of the first cell index.
//
// iVarNr: number of data matrices (= number of variables).
//
// Errors:
//   - SetIllegalParameter() if GetMatrix() does not yield a matrix;
//   - SetNoValue() if there are fewer than two variables, if a variable has
//     no numeric value, or if no degrees of freedom are left for the error
//     term (every variable has exactly one value).
void ScANOVAMultiple(SCSIZE iVarNr)
{
        if ( iVarNr < 2 ) {
                // ANOVA needs at least two variables; this also keeps
                // dfB = iVarNr - 1 from wrapping around when iVarNr is 0.
                SetNoValue();
                return;
        }

        SCSIZE *nVar  = new SCSIZE[iVarNr];  // number of data values per variable
        double *fSumX = new double[iVarNr];  // per-variable sum, then mean

        SCSIZE N      = 0;     // total number of data values
        SCSIZE nR, nC;         // matrix dimensions
        SCSIZE iCount = 0;     // index of the current variable
        double fSumM  = 0.0;   // grand total, later the grand mean
        bool   bValid = true;  // cleared as soon as an error was reported

        // PASS 1: group sums, group means and the grand total.
        for (iCount = 0; iCount < iVarNr; iCount++) {
                ScMatrixRef pMat = GetMatrix();
                if (!pMat) {
                        // NO DATA MATRIX - INVALID PARAMETERS
                        SetIllegalParameter();
                        bValid = false;
                        break;
                }
                pMat.GetDimensions(nC, nR);

                SCSIZE jCount = 0;  // number of values for the current variable
                fSumX[iCount] = 0.0;
                for (SCSIZE i = 0; i < nC; i++)
                        for (SCSIZE j = 0; j < nR; j++)
                        {
                                if (!pMat.IsString(i,j))
                                {
                                        // FOR TESTING PURPOSES: iCount instead of i
                                        fSumX[iCount] += pMat.GetDouble(iCount,j);
                                        jCount++;
                                }
                        }

                if ( jCount == 0 ) {
                        // A group without numeric values has no mean.
                        SetNoValue();
                        bValid = false;
                        break;
                }

                fSumM += fSumX[iCount];
                fSumX[iCount] = fSumX[iCount] / jCount;  // this is the group mean
                nVar[iCount]  = jCount;
                N += jCount;
        }

        if ( bValid && N <= iVarNr ) {
                // dfE = N - iVarNr would be 0: the error variance is undefined.
                SetNoValue();
                bValid = false;
        }

        if ( bValid ) {
                const SCSIZE dfB = iVarNr - 1;  // degrees of freedom between groups
                const SCSIZE dfE = N - iVarNr;  // degrees of freedom within groups

                fSumM = fSumM / N;              // this is the grand mean

                double fMSB = 0.0;  // inter-group variance
                double fMSE = 0.0;  // intra-group variance (due to error)

                // PASS 2: sums of squares around the group means / grand mean.
                for (iCount = 0; bValid && iCount < iVarNr; iCount++) {
                        ScMatrixRef pMat = GetMatrix();
                        if (!pMat) {
                                SetIllegalParameter();
                                bValid = false;
                                break;
                        }
                        pMat.GetDimensions(nC, nR);

                        for (SCSIZE i = 0; i < nC; i++)
                                for (SCSIZE j = 0; j < nR; j++)
                                {
                                        if (!pMat.IsString(i,j))
                                        {
                                                // FOR TESTING PURPOSES: iCount instead of i
                                                const double fDiff =
                                                        pMat.GetDouble(iCount,j) - fSumX[iCount];
                                                fMSE += fDiff * fDiff;
                                        }
                                }

                        fMSB += nVar[iCount] * (fSumM - fSumX[iCount])
                                             * (fSumM - fSumX[iCount]);
                }

                if ( bValid ) {
                        fMSB = fMSB / dfB;
                        fMSE = fMSE / dfE;
                        PushDouble( fMSB/fMSE );
                }

                // TODO:
                //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
                //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)
        }

        // Arrays allocated with new[] must be released with delete[].
        delete[] nVar;
        delete[] fSumX;
}



/////////////////////////////

void ScANOVAMono()
{
        // WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
        //    DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
        //    BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
        
        ScMatrixRef pMat = GetMatrix();
        if (!pMat) {
                // NO DATA MATRIX - INVALID PARAMETERS
                SetIllegalParameter();
                return;
        }

        SCSIZE iVarNr /* nC */, nRMax;
        // WE HAVE ONLY ONE MATRIX
        // WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
        pMat.GetDimensions(iVarNr /* nC */, nRMax);
        // iVarNr = nC; // nC IS NOT USED ANY FURTHER

        if( iVarNr == 1 ) {
                SetNoValue();
                return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
        }

        SCSIZE *nR = new unsigned int[iVarNr];
        // THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE

        SCSIZE dfB   = 0; // DEGREES OF FREEDOM
        SCSIZE dfE   = 0; // DEGREES OF FREEDOM
        SCSIZE N     = 0; // TOTAL NUMBER OF DATA VALUES

        SCSIZE iCount   = 0; // INDEX POINTING TO CURRENT VARIABLE
        SCSIZE jCount   = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE

        double  fSumM    = 0.0; // THIS IS THE GRAND MEAN
        
        // WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
        double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS

        double fValX;                                           // THE 
INDIVIDUAL VALUES


        for (iCount     = 0; iCount < iVarNr; iCount++) {
                fSumX[iCount] = 0.0; // INITIALIZE THE SUM
                        for (SCSIZE j = 0; j < nRMax; j++)
                        {
                                if (!pMat.IsString(iCount,j))
                                {
                                        fSumX[iCount]    += 
pMat.GetDouble(iCount,j);
                                        jCount++;
                                }
                        }
                        fSumM += fSumX[iCount];                                 
// GRAND TOTAL
                        fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE 
MEAN
                        nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE 
HAVE
                        N += jCount; // THIS IS THE TOTAL NUMBER OF VALUES
                        jCount = 0;  // RESET jCount FOR NEXT VARIABLE

                        wcout << L"Partial Sum: " << fSumX[iCount] << L"\n"; // 
TEST

        } // END OUTER FOR LOOP

        dfB = iCount - 1;  // CALCULATE THE DEGREES OF FREEDOM (df)
        dfE = N - iCount;
        fSumM = fSumM / N; // THIS IS THE GRAND MEAN

        double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
        double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)

        for (iCount     = 0; iCount < iVarNr; iCount++) {
                for (jCount = 0; jCount < nRMax; jCount++)
                {
                        if (!pMat.IsString(iCount,jCount))
                        {
                                fValX = pMat.GetDouble(iCount,jCount);
                                fMSE += (fValX - fSumX[iCount]) * (fValX - 
fSumX[iCount]);
                        }
                }

                fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM - 
fSumX[iCount]);
                wcout << L"\n";
        }   // END OUTER FOR LOOP

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB/fMSE );

        // TODO:
        //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
        //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)

        delete nR;
        delete fSumX;
}



/////////////////////////////




// Test driver: runs both ANOVA variants on emulated matrices and then waits
// for ENTER so the printed results stay visible. The command-line arguments
// are not used.
int main(int argc, char* argv[])
{
        // TEST THE FUNCTIONS

        // Single-matrix variant: dimensions (5,10), i.e. 5 variables.
        ScMatrixRef MatrixClass(5,10);
        GetMatrix(MatrixClass);

        ScANOVAMono();

        // Multiple-matrices variant: the emulated GetMatrix() hands out this
        // same (1,10) matrix for each of the 5 variables.
        ScMatrixRef MatrixClassA(1,10);
        GetMatrix(MatrixClassA);

        ScANOVAMultiple(5);

        // Prompt through wcout, like all other output of this program, and
        // flush it so the text is visible before blocking on input.
        wcout << L"\n\nHit ENTER to terminate the program ";
        wcout.flush();
        getchar();

        return 0;
}


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to