Re: [sc-dev] Statistical Functions Implementation

Leonard Mada Sat, 03 Feb 2007 11:46:39 -0800


I have updated some of the ANOVA code.

Unfortunately, I am quite busy with my own work, hence the significant delay. But I was able to - hopefully - change the ANOVA code into something that works. There is still a need to adapt this code for the multiple-matrices scenario, implement a UI, and much more. BUT this is the core code implementing the ANOVA calculations. The rest can be done easily by those who understand the OOo code. Unfortunately, the lack of useful comments prevents me from gaining a deeper understanding of OOo. (I looked at ScInterpreter::IterateParameters(), BUT did not understand much.)



Calc-ANOVA-No_ARRAY.cpp
- contains the code for one-way ANOVA (NO block type),

- all input is in the form of one data matrix (every column is a different variable)

- I have removed most of the dynamic arrays
  (except for two, which are allocated and freed explicitly)

ANOVA.Test.Code.cpp

- contains the same C++ code adapted so that it can be compiled with MS VC 6.0 (independent of OOo code) - this way, it is possible to test the code without having to (re)compile OOo

- see also the 'ANOVA-Test.gnumeric'

ANOVA-Test.gnumeric
- gnumeric file to test the results of the ANOVA code
- the result produced by the ANOVA code agreed with the one computed by gnumeric


Kind regards,

Leonard Mada

void ScInterpreter::ScANOVA()
{
        // WE GET EITHER A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE 
VARIABLE
        //    DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
        //    ADVANTAGE: FAST AND EASY TO WRITE
        // OR MULTIPLE MATRICES, EACH MATRIX IS ONE VARIABLE
        //    DISADVANTAGE:
        //      [CALC FUNCTIONS ACCEPT ONLY 30 PARAMS
        //       SO THERE ARE AT MOST 30 VARIABLES]
        //      CUMBERSOME TO WRITE ALL MATRICES

        // STORES ACTUALLY THE NUMBER OF VARIABLES
        SCSIZE iVarNr   = GetByte() /* NUMBER OF PARAMETERS */;
        SCSIZE iVarTmp  = iVarNr;   // NUMBER OF DATA MATRICES

        if ( iVarNr == 0 /* NO PARAMETERS */)
                return; // EXIT
        if ( iVarNr == 1 /* ONLY ONE PARAMETER */ ) {
                // SEE ONE PARAMETER CASE ::ScANOVAMono()
                // CALL TO THAT FUNCTION OR USE SWITCH CASE
                return;
        }

        // ...
        // MORE COMPLEX CODE
        // IN PRINCIPLE SIMILAR TO THE MORE SIMPLE ScInterpreter::ScANOVAMono()
}


void ScInterpreter::ScANOVAMono()
{
        // WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
        //    DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
        //    BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
        
        ScMatrixRef pMat = GetMatrix();
        if (!pMat) {
                // NO DATA MATRIX - INVALID PARAMETERS
                SetIllegalParameter();
                return;
        }

        SCSIZE iVarNr, nRMax;
        // WE HAVE ONLY ONE MATRIX
        // WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
        pMat->GetDimensions(iVarNr, nRMax);

        if( iVarNr == 1 ) {
                SetNoValue();
                return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
        }

        SCSIZE *nR = new unsigned int[iVarNr];
        // THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE

        SCSIZE dfB   = 0; // DEGREES OF FREEDOM
        SCSIZE dfE   = 0; // DEGREES OF FREEDOM
        SCSIZE N     = 0; // TOTAL NUMBER OF DATA VALUES

        SCSIZE iCount   = 0; // INDEX POINTING TO CURRENT VARIABLE
        SCSIZE jCount   = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE

        double  fSumM    = 0.0; // THIS IS THE GRAND MEAN
        
        // WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
        double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS

        double fValX;                                           // THE 
INDIVIDUAL VALUES


        for (iCount     = 0; iCount < iVarNr; iCount++) {
                fSumX[iCount] = 0.0; // INITIALIZE THE PARTIAL SUM
                        for (SCSIZE j = 0; j < nRMax; j++)
                        {
                                if (!pMat->IsString(iCount,j))
                                {
                                        fSumX[iCount]    += 
pMat->GetDouble(iCount,j);
                                        jCount++;
                                }
                        }
                        fSumM += fSumX[iCount];                                 
// GRAND TOTAL
                        fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE 
MEAN
                        nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE 
HAVE
                        N += jCount;             // THIS IS THE TOTAL NUMBER OF 
VALUES
                        jCount = 0;              // RESET jCount FOR NEXT 
VARIABLE
        } // END OUTER FOR LOOP

        dfB = iCount - 1;  // CALCULATE THE DEGREES OF FREEDOM (df)
        dfE = N - iCount;

        fSumM = fSumM / N; // THIS IS THE GRAND MEAN

        double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
        double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)

        for (iCount     = 0; iCount < iVarNr; iCount++) {
                for (jCount = 0; jCount < nRMax; jCount++)
                {
                        if (!pMat->IsString(iCount,jCount))
                        {
                                fValX = pMat->GetDouble(iCount,jCount);
                                fMSE += (fValX - fSumX[iCount]) * (fValX - 
fSumX[iCount]);
                        }
                }

                fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM - 
fSumX[iCount]);
        }   // END OUTER FOR LOOP

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB/fMSE );

        // TODO:
        //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
        //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)

        delete nR;
        delete fSumX;
}

// This application tests the ANOVA code, independently of OOo
// Just run the application and compare the F-statistic
//   (the last value printed) with the one computed
//   using gnumeric
// Compiled with MS VC 6.0 (1998)


#include <stdio.h>
#include "iostream"

using namespace std;

///////////////////


// We emulate some of the OOo routines and functions

// Stand-in for the SCSIZE type that the ANOVA code uses for its
// counts and indices.
typedef unsigned int SCSIZE;

// Emulated error setter: just reports the condition on the wide
// standard output stream (no trailing newline).
void SetNoValue() {
        std::wcout << L"No Value";
}

// Emulated error setter: just reports the condition on the wide
// standard output stream (no trailing newline).
void SetIllegalParameter() {
        std::wcout << L"Illegal Parameter";
}

// Emulated result push: prints the value, preceded by a newline and the
// "F-Statistic: " label, on the wide standard output stream.
void PushDouble(double i) {
        std::wcout << L"\nF-Statistic: ";
        std::wcout << i;
}


///////////////////////////////////

// More emulation

class ScMatrixRef {
private:
        const SCSIZE ic;
        const SCSIZE jc;
public:
        ScMatrixRef(SCSIZE i, SCSIZE j):
                ic(i),
                jc(j) {return;}
        void GetDimensions(SCSIZE &iR, SCSIZE &jR) {
                iR = ic;
                jR = jc;
                return;
        }
        bool IsString(SCSIZE i, SCSIZE j) {
                return false;
        }
        bool IsMatrix()    const {return true;}
        bool operator ! () const {return !IsMatrix();}
        double GetDouble(double i, double j) {
                i = (i > 1)?i:1.1;
                return j * (i-1)/(i+1);
        }
};

// The single emulated data matrix, constructed with dimensions (5, 10).
ScMatrixRef MatrixClass(5,10);

// Emulated accessor: always hands out a reference to MatrixClass.
ScMatrixRef &GetMatrix()
{
        return MatrixClass;
}

///////////////////////////////////


int main(int argc, char* argv[])
{

        // THIS IS THE IMPORTANT CODE


        // WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
        //    DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
        //    BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
        
        ScMatrixRef pMat = GetMatrix();
        if (!pMat) {
                // NO DATA MATRIX - INVALID PARAMETERS
                SetIllegalParameter();
                return 1;
        }

        SCSIZE iVarNr /* nC */, nRMax;
        // WE HAVE ONLY ONE MATRIX
        // WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
        pMat.GetDimensions(iVarNr /* nC */, nRMax);
        // iVarNr = nC; // nC IS NOT USED ANY FURTHER

        if( iVarNr == 1 ) {
                SetNoValue();
                return 1; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
        }

        SCSIZE *nR = new unsigned int[iVarNr];
        // THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE

        SCSIZE dfB   = 0; // DEGREES OF FREEDOM
        SCSIZE dfE   = 0; // DEGREES OF FREEDOM
        SCSIZE N     = 0; // TOTAL NUMBER OF DATA VALUES

        SCSIZE iCount   = 0; // INDEX POINTING TO CURRENT VARIABLE
        SCSIZE jCount   = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE

        double  fSumM    = 0.0; // THIS IS THE GRAND MEAN
        
        // WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
        double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS

        double fValX;                                           // THE 
INDIVIDUAL VALUES


        for (iCount     = 0; iCount < iVarNr; iCount++) {
                fSumX[iCount] = 0.0; // INITIALIZE THE SUM
                        for (SCSIZE j = 0; j < nRMax; j++)
                        {
                                if (!pMat.IsString(iCount,j))
                                {
                                        fSumX[iCount]    += 
pMat.GetDouble(iCount,j);
                                        jCount++;
                                }
                        }
                        fSumM += fSumX[iCount];                                 
// GRAND TOTAL
                        fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE 
MEAN
                        nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE 
HAVE
                        N += jCount; // THIS IS THE TOTAL NUMBER OF VALUES
                        jCount = 0;  // RESET jCount FOR NEXT VARIABLE

                        wcout << L"Partial Sum: " << fSumX[iCount] << L"\n"; // 
TEST

        } // END OUTER FOR LOOP

        dfB = iCount - 1;  // CALCULATE THE DEGREES OF FREEDOM (df)
        dfE = N - iCount;
        fSumM = fSumM / N; // THIS IS THE GRAND MEAN

        double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
        double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)

        for (iCount     = 0; iCount < iVarNr; iCount++) {
                for (jCount = 0; jCount < nRMax; jCount++)
                {
                        if (!pMat.IsString(iCount,jCount))
                        {
                                fValX = pMat.GetDouble(iCount,jCount);
                                fMSE += (fValX - fSumX[iCount]) * (fValX - 
fSumX[iCount]);
                        }
                }

                fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM - 
fSumX[iCount]);
                wcout << L"\n";
        }   // END OUTER FOR LOOP

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB/fMSE );

        // TODO:
        //   - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
        //   - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)

        delete nR;
        delete fSumX;

        //
        //if(AutoExit!=YES)
                {wprintf(L"\n\nHit ENTER to terminate the program 
");_flushall();getchar();}
        //

        return 0;
}

ANOVA-Test.gnumeric
Description: application/gnumeric

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: [sc-dev] Statistical Functions Implementation

Reply via email to