Hi,
I have updated the code further; both functions should now work.
I compiled the code outside of OOo (with MS VC 6.0 - the 1998 version)
and it ran correctly, though I have no idea whether it will integrate
seamlessly into OOo.
[For the external compile I had to make some minor changes to the code,
but the general structure was unchanged.]
Both the single-matrix and the multiple-matrix variants now work.
Somebody with some OOo knowledge should now integrate the code where it
belongs. A Menu Entry like 'One-Way Anova' seems reasonable. This would
partially solve issue
http://www.openoffice.org/issues/show_bug.cgi?id=4921 , well at least
the one way ANOVA. [This is the more important of the ANOVA variants.]
Kind regards,
Leonard Mada
Code included:
Calc-ANOVA-No_ARRAY.cpp
- contains the code for one-way ANOVA (NO block type),
- all input is in the form of:
-- one data matrix (every column is a different variable) or
-- multiple data matrices
- have removed most dynamic arrays
(except for two, which are initialized and destroyed properly)
ANOVA.Test.Code.cpp
- contains the same C++ code adapted so that it can be compiled with MS
VC 6.0 (independent of OOo code)
- this way, it is possible to test the code without having to
(re)compile OOo
- see also the 'ANOVA-Test.gnumeric' from my previous post
ANOVA-Test.gnumeric (see previous post for this file)
- gnumeric file to test the results of the ANOVA code
// This implementation uses a robust 2-pass algorithm.
// While one-pass algorithms are faster, they are notoriously
// unstable, precluding their use in serious statistical
// programs.
void ScInterpreter::ScANOVA()
{
// WE GET EITHER A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE
VARIABLE
// DISADVANTAGE:
// - ONLY ONE COLUMN PER VARIABLE
// - COLUMNS MUST BE CONTIGUOS
// ADVANTAGE:
// - FAST AND EASY TO WRITE
// - JUST ONE SELECTION
// OR MULTIPLE MATRICES, EACH MATRIX IS ONE VARIABLE
// DISADVANTAGE:
// - [CALC FUNCTIONS ACCEPT ONLY 30 PARAMS
// SO THERE ARE AT MOST 30 VARIABLES]
// - CUMBERSOME TO WRITE/SELECT ALL MATRICES
// STORES ACTUALLY THE NUMBER OF VARIABLES
SCSIZE iVarNr = GetByte() /* NUMBER OF PARAMETERS */;
SCSIZE iVarTmp = iVarNr; // NUMBER OF DATA MATRICES
if ( iVarNr == 0 /* NO PARAMETERS */)
return; // EXIT
if ( iVarNr == 1 /* ONLY ONE PARAMETER */ ) {
ScInterpreter::ScANOVAMono()
return;
}
ScInterpreter::ScANOVAMultiple(iVarNr)
}
// One-way ANOVA where every parameter is a separate matrix holding the data
// values of one variable (group).
//
// iVarNr is the number of variables, i.e. the number of matrices that are
// fetched with GetMatrix(). Within a matrix every cell for which IsString()
// is false counts as a data value.
//
// On success the F ratio MSB / MSE is pushed with PushDouble(), where
//   MSB = sum over groups of n_k * (grand mean - mean_k)^2  /  (iVarNr - 1)
//   MSE = sum over all values of (x - mean of its group)^2  /  (N - iVarNr)
//
// SetIllegalParameter() is called instead (and nothing is pushed) when
//   - fewer than two variables are given,
//   - a matrix is missing,
//   - a matrix contains no data value, or
//   - there are not more data values than variables (no error degrees of
//     freedom left).
//
// TODO:
//  - the F ratio still has to be interpreted using the F statistics;
//    this is done using: =FDIST(fMSB/fMSE; dfB; dfE)
void ScInterpreter::ScANOVAMultiple(SCSIZE iVarNr)
{
    if ( iVarNr < 2 )
    {
        // iVarNr - 1 degrees of freedom would be zero or wrap around.
        SetIllegalParameter();
        return;
    }

    SCSIZE* nVar  = new SCSIZE[iVarNr];   // number of data values per variable
    double* fSumX = new double[iVarNr];   // mean of every variable

    SCSIZE N      = 0;      // total number of data values
    double fSumM  = 0.0;    // sum of all values, later the grand mean
    double fMSE   = 0.0;    // intra-group sum of squares, later the variance
    bool   bValid = true;   // cleared as soon as the input proves unusable
    SCSIZE iCount;          // index of the current variable

    for ( iCount = 0; iCount < iVarNr; iCount++ )
    {
        ScMatrixRef pMat = GetMatrix();
        if ( !pMat )
        {
            // No data matrix - invalid parameters.
            bValid = false;
            break;
        }

        SCSIZE nC, nR;      // matrix dimensions
        SCSIZE i, j;        // cell indices, shared by both passes below
        pMat->GetDimensions( nC, nR );

        // Pass 1: sum and count the data values of this variable.
        double fSum   = 0.0;
        SCSIZE jCount = 0;
        for ( i = 0; i < nC; i++ )
        {
            for ( j = 0; j < nR; j++ )
            {
                if ( !pMat->IsString( i, j ) )
                {
                    fSum += pMat->GetDouble( i, j );
                    jCount++;
                }
            }
        }

        if ( jCount == 0 )
        {
            // A variable without any data value has no mean.
            bValid = false;
            break;
        }

        const double fMean = fSum / jCount;

        // Pass 2: squared deviations from the group mean. This only needs the
        // mean of the current group, so it is done while the matrix is still
        // at hand and every parameter is fetched exactly once.
        // NOTE(review): presumably each GetMatrix() call consumes one
        // parameter, which would make a second fetch loop invalid - confirm.
        for ( i = 0; i < nC; i++ )
        {
            for ( j = 0; j < nR; j++ )
            {
                if ( !pMat->IsString( i, j ) )
                {
                    const double fDiff = pMat->GetDouble( i, j ) - fMean;
                    fMSE += fDiff * fDiff;
                }
            }
        }

        fSumM        += fSum;
        fSumX[iCount] = fMean;
        nVar[iCount]  = jCount;
        N            += jCount;
    }

    // N > iVarNr guarantees at least one error degree of freedom.
    if ( bValid && N > iVarNr )
    {
        const SCSIZE dfB = iVarNr - 1;    // degrees of freedom between groups
        const SCSIZE dfE = N - iVarNr;    // degrees of freedom within groups

        fSumM = fSumM / N;                // this is the grand mean

        double fMSB = 0.0;                // inter-group variance
        for ( iCount = 0; iCount < iVarNr; iCount++ )
        {
            const double fDiff = fSumM - fSumX[iCount];
            fMSB += nVar[iCount] * fDiff * fDiff;
        }

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB / fMSE );
    }
    else
    {
        SetIllegalParameter();
    }

    // Arrays allocated with new[] must be released with delete[].
    delete[] nVar;
    delete[] fSumX;
}
void ScInterpreter::ScANOVAMono()
{
// WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
// DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
// BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
ScMatrixRef pMat = GetMatrix();
if (!pMat) {
// NO DATA MATRIX - INVALID PARAMETERS
SetIllegalParameter();
return;
}
SCSIZE iVarNr, nRMax;
// WE HAVE ONLY ONE MATRIX
// WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
pMat->GetDimensions(iVarNr, nRMax);
if( iVarNr == 1 ) {
SetNoValue();
return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
}
SCSIZE *nR = new unsigned int[iVarNr];
// THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE
SCSIZE dfB = 0; // DEGREES OF FREEDOM
SCSIZE dfE = 0; // DEGREES OF FREEDOM
SCSIZE N = 0; // TOTAL NUMBER OF DATA VALUES
SCSIZE iCount = 0; // INDEX POINTING TO CURRENT VARIABLE
SCSIZE jCount = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE
double fSumM = 0.0; // THIS IS THE GRAND MEAN
// WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS
double fValX; // THE
INDIVIDUAL VALUES
for (iCount = 0; iCount < iVarNr; iCount++) {
fSumX[iCount] = 0.0; // INITIALIZE THE PARTIAL SUM
for (SCSIZE j = 0; j < nRMax; j++)
{
if (!pMat->IsString(iCount,j))
{
fSumX[iCount] +=
pMat->GetDouble(iCount,j);
jCount++;
}
}
fSumM += fSumX[iCount];
// GRAND TOTAL
fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE
MEAN
nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE
HAVE
N += jCount; // THIS IS THE TOTAL NUMBER OF
VALUES
jCount = 0; // RESET jCount FOR NEXT
VARIABLE
} // END OUTER FOR LOOP
dfB = iCount - 1; // CALCULATE THE DEGREES OF FREEDOM (df)
dfE = N - iCount;
fSumM = fSumM / N; // THIS IS THE GRAND MEAN
double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)
for (iCount = 0; iCount < iVarNr; iCount++) {
for (jCount = 0; jCount < nRMax; jCount++)
{
if (!pMat->IsString(iCount,jCount))
{
fValX = pMat->GetDouble(iCount,jCount);
fMSE += (fValX - fSumX[iCount]) * (fValX -
fSumX[iCount]);
}
}
fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM -
fSumX[iCount]);
} // END OUTER FOR LOOP
fMSB = fMSB / dfB;
fMSE = fMSE / dfE;
PushDouble( fMSB/fMSE );
// TODO:
// - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
// - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)
delete nR;
delete fSumX;
}
// This application tests the ANOVA code independently of OOo.
// Just run the application and compare the F-statistic it prints
// (the last value output) with the one computed using gnumeric.
// Compiled with MS VC 6.0 (1998)
//#include "stdafx.h"
#include <stdio.h>
#include "iostream"
using namespace std;
///////////////////
// We emulate some of the OOo routines and functions
// Stand-in for the size/index type used by the ANOVA routines in this file
// (counts of values, matrix dimensions and cell indices).
typedef unsigned int SCSIZE;
// Test stub: reports the "no value" condition by writing a fixed message to
// the wide-character standard output stream.
void SetNoValue()
{
    std::wcout << L"No Value";
}
// Test stub: reports an illegal-parameter condition by writing a fixed
// message to the wide-character standard output stream.
void SetIllegalParameter()
{
    std::wcout << L"Illegal Parameter";
}
// Test stub: instead of pushing a result, prints the value on a new line,
// labelled as the F-statistic, on the wide-character standard output stream.
//
// i - the value to print.
void PushDouble(double i)
{
    std::wcout << L"\nF-Statistic: ";
    std::wcout << i;
}
///////////////////////////////////
// More emulation
// Test stand-in for a matrix: it stores only two dimensions and synthesises
// its cell values from the indices passed to GetDouble().
class ScMatrixRef
{
public:
    // i and j are the two dimensions later reported by GetDimensions().
    ScMatrixRef(SCSIZE i, SCSIZE j)
        : ic(i), jc(j)
    {
    }

    // Writes the two stored dimensions to iR and jR, in constructor order.
    void GetDimensions(SCSIZE &iR, SCSIZE &jR)
    {
        iR = ic;
        jR = jc;
    }

    // No cell of the emulated matrix is ever a string.
    bool IsString(SCSIZE /* i */, SCSIZE /* j */)
    {
        return false;
    }

    // The emulated object always counts as a matrix ...
    bool IsMatrix() const
    {
        return true;
    }

    // ... so the "is missing" test used by the ANOVA routines never fires.
    bool operator ! () const
    {
        return !IsMatrix();
    }

    // Synthetic cell value j * (i - 1) / (i + 1); any i that is not greater
    // than 1 is replaced by 1.1 first.
    double GetDouble(double i, double j)
    {
        double fFirst = 1.1;
        if (i > 1)
        {
            fFirst = i;
        }
        return j * (fFirst - 1) / (fFirst + 1);
    }

private:
    const SCSIZE ic;    // first dimension
    const SCSIZE jc;    // second dimension
};
// The matrix that GetMatrix() hands out; set with GetMatrix(ScMatrixRef&).
ScMatrixRef* pMatrix;

// Registers MatrixClass as the matrix returned by later GetMatrix() calls.
// Only the address is stored, so MatrixClass has to stay alive while it is
// registered.
void GetMatrix(ScMatrixRef &MatrixClass)
{
    pMatrix = &MatrixClass;
}

// Returns the currently registered matrix. A matrix has to be registered
// first, because the stored pointer is dereferenced without a check.
ScMatrixRef &GetMatrix()
{
    return *pMatrix;
}
///////////////////////////////////
// New OOo CALC FUNCTIONS //
// THIS IS THE IMPORTANT CODE
// One-way ANOVA where every variable (group) comes from its own matrix.
//
// iVarNr is the number of variables, i.e. the number of matrices that are
// fetched with GetMatrix(). Within a matrix every cell for which IsString()
// is false counts as a data value.
//
// On success the F ratio MSB / MSE is passed to PushDouble(), where
//   MSB = sum over groups of n_k * (grand mean - mean_k)^2  /  (iVarNr - 1)
//   MSE = sum over all values of (x - mean of its group)^2  /  (N - iVarNr)
//
// SetIllegalParameter() is called instead (and nothing is pushed) when
//   - fewer than two variables are given,
//   - a matrix is missing,
//   - a matrix contains no data value, or
//   - there are not more data values than variables (no error degrees of
//     freedom left).
//
// TODO:
//  - the F ratio still has to be interpreted using the F statistics;
//    this is done using: =FDIST(fMSB/fMSE; dfB; dfE)
void ScANOVAMultiple(SCSIZE iVarNr)
{
    if ( iVarNr < 2 )
    {
        // iVarNr - 1 degrees of freedom would be zero or wrap around.
        SetIllegalParameter();
        return;
    }

    SCSIZE* nVar  = new SCSIZE[iVarNr];   // number of data values per variable
    double* fSumX = new double[iVarNr];   // mean of every variable

    SCSIZE N      = 0;      // total number of data values
    double fSumM  = 0.0;    // sum of all values, later the grand mean
    double fMSE   = 0.0;    // intra-group sum of squares, later the variance
    bool   bValid = true;   // cleared as soon as the input proves unusable
    SCSIZE iCount;          // index of the current variable

    for ( iCount = 0; iCount < iVarNr; iCount++ )
    {
        ScMatrixRef pMat = GetMatrix();
        if ( !pMat )
        {
            // No data matrix - invalid parameters.
            bValid = false;
            break;
        }

        SCSIZE nC, nR;      // matrix dimensions
        SCSIZE i, j;        // cell indices, shared by both passes below
        pMat.GetDimensions( nC, nR );

        // Pass 1: sum and count the data values of this variable.
        // For testing purposes the value is read with iCount in place of i,
        // so that every variable gets different values from the same matrix.
        double fSum   = 0.0;
        SCSIZE jCount = 0;
        for ( i = 0; i < nC; i++ )
        {
            for ( j = 0; j < nR; j++ )
            {
                if ( !pMat.IsString( i, j ) )
                {
                    fSum += pMat.GetDouble( /* i */ iCount, j );
                    jCount++;
                }
            }
        }

        if ( jCount == 0 )
        {
            // A variable without any data value has no mean.
            bValid = false;
            break;
        }

        const double fMean = fSum / jCount;

        // Pass 2: squared deviations from the group mean. This only needs the
        // mean of the current group, so it is done while the matrix is still
        // at hand and every matrix is fetched exactly once.
        for ( i = 0; i < nC; i++ )
        {
            for ( j = 0; j < nR; j++ )
            {
                if ( !pMat.IsString( i, j ) )
                {
                    const double fDiff =
                        pMat.GetDouble( /* i */ iCount, j ) - fMean;
                    fMSE += fDiff * fDiff;
                }
            }
        }

        fSumM        += fSum;
        fSumX[iCount] = fMean;
        nVar[iCount]  = jCount;
        N            += jCount;
    }

    // N > iVarNr guarantees at least one error degree of freedom.
    if ( bValid && N > iVarNr )
    {
        const SCSIZE dfB = iVarNr - 1;    // degrees of freedom between groups
        const SCSIZE dfE = N - iVarNr;    // degrees of freedom within groups

        fSumM = fSumM / N;                // this is the grand mean

        double fMSB = 0.0;                // inter-group variance
        for ( iCount = 0; iCount < iVarNr; iCount++ )
        {
            const double fDiff = fSumM - fSumX[iCount];
            fMSB += nVar[iCount] * fDiff * fDiff;
        }

        fMSB = fMSB / dfB;
        fMSE = fMSE / dfE;
        PushDouble( fMSB / fMSE );
    }
    else
    {
        SetIllegalParameter();
    }

    // Arrays allocated with new[] must be released with delete[].
    delete[] nVar;
    delete[] fSumX;
}
/////////////////////////////
void ScANOVAMono()
{
// WE GOT A SINGLE MATRIX WHERE EVERY COLUMN IS A SEPARATE VARIABLE
// DISADVANTAGE: ONLY ONE COLUMN PER VARIABLE
// BUT IT IS EASYER TO USE AND IT IS NOT LIMITED TO 30 VARIABLES
ScMatrixRef pMat = GetMatrix();
if (!pMat) {
// NO DATA MATRIX - INVALID PARAMETERS
SetIllegalParameter();
return;
}
SCSIZE iVarNr /* nC */, nRMax;
// WE HAVE ONLY ONE MATRIX
// WE CONSIDER EVERY COLUMN AS A SEPARATE DATA SET
pMat.GetDimensions(iVarNr /* nC */, nRMax);
// iVarNr = nC; // nC IS NOT USED ANY FURTHER
if( iVarNr == 1 ) {
SetNoValue();
return; // ONLY ONE VARIABLE - ANOVA NOT POSSIBLE
}
SCSIZE *nR = new unsigned int[iVarNr];
// THIS WILL STORE THE NUMBER OF DATA VALUES FOR EACH VARIABLE
SCSIZE dfB = 0; // DEGREES OF FREEDOM
SCSIZE dfE = 0; // DEGREES OF FREEDOM
SCSIZE N = 0; // TOTAL NUMBER OF DATA VALUES
SCSIZE iCount = 0; // INDEX POINTING TO CURRENT VARIABLE
SCSIZE jCount = 0; // NUMBER OF VALUES FOR CURRENT VARIABLE
double fSumM = 0.0; // THIS IS THE GRAND MEAN
// WE NEED AN ARRAY TO STORE THE MEAN FOR EVERY GROUP (VARIABLE)
double *fSumX = new double[iVarNr]; // THE MEANS FOR THE INDIVIDUAL VARS
double fValX; // THE
INDIVIDUAL VALUES
for (iCount = 0; iCount < iVarNr; iCount++) {
fSumX[iCount] = 0.0; // INITIALIZE THE SUM
for (SCSIZE j = 0; j < nRMax; j++)
{
if (!pMat.IsString(iCount,j))
{
fSumX[iCount] +=
pMat.GetDouble(iCount,j);
jCount++;
}
}
fSumM += fSumX[iCount];
// GRAND TOTAL
fSumX[iCount] = fSumX[iCount] / jCount; // THIS IS THE
MEAN
nR[iCount] = jCount; // STORE HOW MANY DATA VALUES WE
HAVE
N += jCount; // THIS IS THE TOTAL NUMBER OF VALUES
jCount = 0; // RESET jCount FOR NEXT VARIABLE
wcout << L"Partial Sum: " << fSumX[iCount] << L"\n"; //
TEST
} // END OUTER FOR LOOP
dfB = iCount - 1; // CALCULATE THE DEGREES OF FREEDOM (df)
dfE = N - iCount;
fSumM = fSumM / N; // THIS IS THE GRAND MEAN
double fMSB = 0.0; // THIS IS INTER-GROUP VARIANCE
double fMSE = 0.0; // THIS IS INTRA-GROUP VARIANCE (DUE TO ERROR)
for (iCount = 0; iCount < iVarNr; iCount++) {
for (jCount = 0; jCount < nRMax; jCount++)
{
if (!pMat.IsString(iCount,jCount))
{
fValX = pMat.GetDouble(iCount,jCount);
fMSE += (fValX - fSumX[iCount]) * (fValX -
fSumX[iCount]);
}
}
fMSB += nR[iCount] * (fSumM - fSumX[iCount]) * (fSumM -
fSumX[iCount]);
wcout << L"\n";
} // END OUTER FOR LOOP
fMSB = fMSB / dfB;
fMSE = fMSE / dfE;
PushDouble( fMSB/fMSE );
// TODO:
// - WE STILL NEED TO INTERPRET fMSB/fMSE USING THE F STATISTICS
// - THIS IS DONE USING: =FDIST(fMSB/fMSE; dfB; dfE)
delete nR;
delete fSumX;
}
/////////////////////////////
// Test driver: runs both ANOVA variants on emulated matrices and then waits
// for ENTER so that the printed results stay visible.
//
// argc and argv are not used. Always returns 0.
int main(int argc, char* argv[])
{
    (void) argc;
    (void) argv;

    // Single-matrix variant: dimensions 5 and 10, which ScANOVAMono() reads
    // as 5 variables with 10 values each.
    ScMatrixRef MatrixClass(5,10);
    GetMatrix(MatrixClass);
    ScANOVAMono();

    // Multiple-matrix variant: the same 1 x 10 matrix is handed out for each
    // of the 5 variables.
    ScMatrixRef MatrixClassA(1,10);
    GetMatrix(MatrixClassA);
    ScANOVAMultiple(5);

    // The prompt goes through wcout like all other output of this program
    // and is flushed explicitly so that it is visible before blocking.
    std::wcout << L"\n\nHit ENTER to terminate the program " << std::flush;
    getchar();

    return 0;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]