Hello to everyone.
1. I have corrected some issues/bugs in the initial ANOVA code
2. Implemented some features NOT resolved in the first draft
3. split the code in two sections:
- the one matrix situation
- multiple matrices situation
[both could be implemented inside a single function using a switch case
statement]
Most things should be OK by now. One last issue remains: how should the
arrays be defined (and handled)? I am NOT an expert in this field. If
vectors were used, this would be easy to implement. I have not worked
much with the 'new' operator, so someone with more experience might want
to make the relevant changes.
Many thanks,
Leonard Mada
P.S.
The ANOVA code in R looks a little bit frightening to beginners (though
it is much more powerful), and as there is currently NO function in Calc
that permits performing *multiple comparisons* AND I see NO rapid
embedding of R into Calc, it seems pertinent to implement the ANOVA code.
//
// CALCULATING ANOVA
// (C) LEONARD MADA (2006)
//
// THIS IS FREE SOFTWARE
//
// THIS SOFTWARE IS RELEASED UNDER BOTH
// THE GNU GPL AS WELL AS THE LGPL
#include <vector>
void ScInterpreter::ScANOVA()
{
// WE GET EITHER A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE
VARIABLE
// DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
// OR MULTIPLE MATRICES, EACH MATRIX IS ONE VARIABLE
// DISADVANTAGE:
// CALC FUNCTIONS ACCEPT ONLY 30 PARAMS
// SO THERE ARE AT MOST 30 VARIABLES
// STORES ACTUALLY THE NUMBER OF VARIABLES
SCSIZE iVarNr = GetByte() /* NUMBER OF PARAMETERS */;
SCSIZE iVarTmp = iVarNr; // NUMBER OF DATA MATRICES
if ( iVarNr == 0 /* NO PARAMETERS */)
return; // EXIT
if ( iVarNr == 1 /* ONLY ONE PARAMETER */ ) {
// SEE ONE PARAMETER CASE ::ScANOVAMono()
// CALL TO THAT FUNCTION OR USE SWITCH CASE
return;
}
ScMatrixRef pMat[iVarNr]; // INITIALISE DATA-MATRICES
SCSIZE nR[iVarNr], nC[iVarNr]; // INITIALISE MATRIX DIMENSIONS
SCSIZE nMAX = 0; // MAX DIMENSION OF ANY OF THE MATRICES
for(SCSIZE i= 0; i<iVarNr; i++) {
ScMatrixRef pMat[i] = GetMatrix();
if (!pMat[i]) {
// NO DATA MATRIX - INVALID PARAMETERS
SetIllegalParameter();
return;
}
pMat[i]->GetDimensions(nC[i], nR[i]);
nMAX = (nMAX > nC[i]*nR[i])? nMAX : nC[i]*nR[i];
}
if( nMAX == 1 ) {
SetNoValue();
return; // ONLY ONE VALUE - ANOVA NOT POSSIBLE/MEANINGFUL
}
SCSIZE dfB = 0; // DEGREES OF FREEDOM
SCSIZE dfE = 0; // DEGREES OF FREEDOM
SCSIZE N = 0; // TOTAL NUMBER OF DATA VALUES
double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)
SCSIZE iCount = 0; // NUMBER OF THE CURRENT VARIABLE
SCSIZE jCount = 0; // NUMBER OF VALUES FOR EACH VARIABLE
double fSumM = 0.0; // THIS IS THE GRAND MEAN
double fSumX[iVarNr] ; // THIS STORES THE MEANS FOR THE INDIVIDUAL
VARIABLES
double fValX[iVarNr] [nMAX]; // THE VALUES
for (; iCount < iVarNr; iCount++) {
fSumX[iCount] = 0.0; // INITIALIZE THE SUM
for (SCSIZE i = 0; i < nC[iCount]; i++)
for (SCSIZE j = 0; j < nR[iCount]; j++)
{
if (!pMat[iCount]->IsString(i,j))
{
fValX[iCount][jCount] =
pMat[iCount]->GetDouble(i,j);
fSumX[iCount] +=
fValX[iCount][jCount];
jCount++;
}
}
fSumM += fSumX[iCount];
fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE MEAN
nR[iCount] = jCount; // STORE HOW MANY DATA VALUES EXIST FOR
THIS VAR
N += jCount; // ADD FOR GRAND TOTAL = TOTAL No OF DATA VALUES
jCount = 0; // RESET jCount FOR NEXT VARIABLE
} // END OUTER FOR LOOP
dfB = iVarNr -1 ; // THESE ARE THE DEGREES OF FREEDOM
dfE = N - iVarNr;
fSumM = fSumM / N; // THIS IS THE GRAND MEAN
for(SCSIZE i = 0; i < iVarNR; i++) {
// GROUPS MAY HAVE DIFFERENT SIZES
// READ INDIVIDUAL GROUP SIZE
for(SCSIZE j = 0; j < nR[i] /* INDIVIDUAL GROUP SIZE */; j++) {
fMSE += (fValX[i][j] - fSumX[i]) * (fValX[i][j] -
fSumX[i]);
// fMSB += (fSumM - fSumX[i]) * (fSumM - fSumX[i]);
// TO AVOID MORE COMPUTATIONS WE CAN CALCULATE fMSB
OUTSIDE THIS LOOP
}
fMSB += nR[i] * (fSumM - fSumX[i]) * (fSumM - fSumX[i]);
}
fMSB = fMSB / dfB;
fMSE = fMSE / dfE;
PushDouble( fMSB/fMSE );
// WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
}
void ScInterpreter::ScANOVAMono()
{
// WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
// DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
// BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
ScMatrixRef pMat = GetMatrix();
if (!pMat) {
// NO DATA MATRIX - INVALID PARAMETERS
SetIllegalParameter();
return;
}
SCSIZE iVarNr /* nC */, nRMax;
// WE HAVE ONLY ONE MATRIX
// WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
pMat->GetDimensions(iVarNr /* nC */, nRMax);
// iVarNr = nC; // nC IS NOT USED ANY FURTHER
if( iVarNr == 1 ) {
SetNoValue();
return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
}
SCSIZE nR[iVarNr];
// THIS WILL STORE HOW MANY DATA VALUES DO WE HAVE FOR EACH VARIABLE
SCSIZE dfB = 0; // DEGREES OF FREEDOM
SCSIZE dfE = 0; // DEGREES OF FREEDOM
SCSIZE N = 0; // TOTAL NUMBER OF DATA VALUES
double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)
SCSIZE iCount = 0; // NUMBER OF THE CURRENT VARIABLE
SCSIZE jCount = 0; // NUMBER OF VALUES FOR EACH VARIABLE
double fSumM = 0.0; // THIS IS THE GRAND MEAN
// FOR THE NEXT 2 VARS WE NEED ARRAYS
// IT COULD BE IMPLEMENTED WITH VECTORS, TOO
double *fSumX[iVarNr] = new double[iVarNr]; // THE MEANS FOR THE
INDIVIDUAL VARS
double *fValX[iVarNr] [nRMax] = new double[iVarNr] [nRMax]; // THE
VALUES
for (; iCount < iVarNr; iCount++) {
fSumX[iCount] = 0.0; // INITIALIZE THE SUM
for (SCSIZE j = 0; j < nRMax; j++)
{
if (!pMat->IsString(iCount,j))
{
fValX[iCount][jCount] =
pMat->GetDouble(iCount,j);
fSumX[iCount] +=
fValX[iCount][jCount];
jCount++;
}
}
fSumM += fSumX[iCount];
fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE
MEAN
nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE
HAVE
N += jCount; // THIS IS GRAND TOTAL
jCount = 0; // RESET jCount FOR NEXT VARIABLE
} // END OUTER FOR LOOP
dfB = iCount - 1; // CALCULATE THE DEGREES OF FREEDOM
dfE = N - iCount;
fSumM = fSumM / N; // THIS IS THE GRAND MEAN
for(SCSIZE i = 0; i < iVarNr; i++) {
// GROUPS MAY HAVE DIFFERENT SIZES
// READ INDIVIDUAL GROUP SIZE
for(SCSIZE j = 0; j < nR[i] /* INDIVIDUAL GROUP SIZE */; j++) {
fMSE += (fValX[i][j] - fSumX[i]) * (fValX[i][j] -
fSumX[i]);
}
fMSB += nR[i] * (fSumM - fSumX[i]) * (fSumM - fSumX[i]);
}
fMSB = fMSB / dfB;
fMSE = fMSE / dfE;
PushDouble( fMSB/fMSE );
// WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]