Author: psteitz
Date: Sun Jun 24 14:10:19 2007
New Revision: 550285
URL: http://svn.apache.org/viewvc?view=rev&rev=550285
Log:
Added two-sample (binned comparison) ChiSquare test
JIRA: MATH-160
Thanks to: Matthias Hummel
Modified:
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTest.java
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/TestUtils.java
jakarta/commons/proper/math/trunk/src/test/org/apache/commons/math/stat/inference/ChiSquareTestTest.java
jakarta/commons/proper/math/trunk/xdocs/changes.xml
Modified:
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTest.java
URL:
http://svn.apache.org/viewvc/jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTest.java?view=diff&rev=550285&r1=550284&r2=550285
==============================================================================
---
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTest.java
(original)
+++
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTest.java
Sun Jun 24 14:10:19 2007
@@ -211,4 +211,118 @@
*/
boolean chiSquareTest(long[][] counts, double alpha)
throws IllegalArgumentException, MathException;
+
+ /**
+ * <p>Computes a
+ * <a href="http://www.itl.nist.gov/div898/software/dataplot/refman1/auxillar/chi2samp.htm">
+ * Chi-Square two sample test statistic</a> comparing bin frequency counts
+ * in <code>observed1</code> and <code>observed2</code>. The
+ * sums of frequency counts in the two samples are not required to be the
+ * same. The formula used to compute the test statistic is</p>
+ * <code>
+ * &sum;[(K * observed1[i] - observed2[i]/K)<sup>2</sup> / (observed1[i] + observed2[i])]
+ * </code> where
+ * <br/><code>K = &radic;[&sum;(observed2) / &sum;(observed1)]</code>
+ * <p>This statistic can be used to perform a Chi-Square test evaluating
+ * the null hypothesis that both observed counts follow the same
+ * distribution.</p>
+ * <p><strong>Preconditions</strong>:</p> <ul>
+ * <li>Observed counts must be non-negative.
+ * </li>
+ * <li>Observed counts for a specific bin must not both be zero.
+ * </li>
+ * <li>Observed counts for a specific sample must not all be 0.
+ * </li>
+ * <li>The arrays <code>observed1</code> and <code>observed2</code> must
+ * have the same length and their common length must be at least 2.
+ * </li></ul>
+ * <p>If any of the preconditions are not met, an
+ * <code>IllegalArgumentException</code> is thrown.</p>
+ *
+ * @param observed1 array of observed frequency counts of the first data set
+ * @param observed2 array of observed frequency counts of the second data set
+ * @return chiSquare statistic
+ * @throws IllegalArgumentException if preconditions are not met
+ */
+ double chiSquareDataSetsComparison(long[] observed1, long[] observed2)
+ throws IllegalArgumentException;
+
+ /**
+ * <p>Returns the <i>observed significance level</i>, or <a href=
+ * "http://www.cas.lancs.ac.uk/glossary_v1.1/hyptest.html#pvalue">
+ * p-value</a>, associated with a Chi-Square two sample test comparing
+ * bin frequency counts in <code>observed1</code> and
+ * <code>observed2</code>.
+ * </p>
+ * <p>The number returned is the smallest significance level at which one
+ * can reject the null hypothesis that the observed counts conform to the
+ * same distribution.
+ * </p>
+ * <p>See {@link #chiSquareDataSetsComparison(long[], long[])} for details
+ * on the formula used to compute the test statistic. The number of degrees
+ * of freedom used to perform the test is one less than the common length
+ * of the input observed count arrays.
+ * </p>
+ * <p><strong>Preconditions</strong>:</p> <ul>
+ * <li>Observed counts must be non-negative.
+ * </li>
+ * <li>Observed counts for a specific bin must not both be zero.
+ * </li>
+ * <li>Observed counts for a specific sample must not all be 0.
+ * </li>
+ * <li>The arrays <code>observed1</code> and <code>observed2</code> must
+ * have the same length and their common length must be at least 2.
+ * </li></ul>
+ * <p>If any of the preconditions are not met, an
+ * <code>IllegalArgumentException</code> is thrown.</p>
+ *
+ * @param observed1 array of observed frequency counts of the first data set
+ * @param observed2 array of observed frequency counts of the second data set
+ * @return p-value
+ * @throws IllegalArgumentException if preconditions are not met
+ * @throws MathException if an error occurs computing the p-value
+ */
+ double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2)
+ throws IllegalArgumentException, MathException;
+
+ /**
+ * <p>Performs a Chi-Square two sample test comparing two binned data
+ * sets. The test evaluates the null hypothesis that the two lists of
+ * observed counts conform to the same frequency distribution, with
+ * significance level <code>alpha</code>. Returns true iff the null
+ * hypothesis can be rejected with 100 * (1 - alpha) percent confidence.
+ * </p>
+ * <p>See {@link #chiSquareDataSetsComparison(long[], long[])} for
+ * details on the formula used to compute the Chi-Square statistic used
+ * in the test. The number of degrees of freedom used to perform the test
+ * is one less than the common length of the input observed count arrays.
+ * </p>
+ * <p><strong>Preconditions</strong>:</p> <ul>
+ * <li>Observed counts must be non-negative.
+ * </li>
+ * <li>Observed counts for a specific bin must not both be zero.
+ * </li>
+ * <li>Observed counts for a specific sample must not all be 0.
+ * </li>
+ * <li>The arrays <code>observed1</code> and <code>observed2</code> must
+ * have the same length and their common length must be at least 2.
+ * </li>
+ * <li> <code> 0 &lt; alpha &lt;= 0.5 </code>
+ * </li></ul>
+ * <p>If any of the preconditions are not met, an
+ * <code>IllegalArgumentException</code> is thrown.</p>
+ *
+ * @param observed1 array of observed frequency counts of the first data set
+ * @param observed2 array of observed frequency counts of the second data set
+ * @param alpha significance level of the test
+ * @return true iff null hypothesis can be rejected with confidence
+ * 1 - alpha
+ * @throws IllegalArgumentException if preconditions are not met
+ * @throws MathException if an error occurs performing the test
+ */
+ boolean chiSquareTestDataSetsComparison(long[] observed1, long[]
observed2, double alpha)
+ throws IllegalArgumentException, MathException;
+
}
Modified:
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java
URL:
http://svn.apache.org/viewvc/jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java?view=diff&rev=550285&r1=550284&r2=550285
==============================================================================
---
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java
(original)
+++
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/ChiSquareTestImpl.java
Sun Jun 24 14:10:19 2007
@@ -173,6 +173,99 @@
}
/**
+     * Computes the Chi-Square two sample test statistic comparing the bin
+     * frequency counts in <code>observed1</code> and <code>observed2</code>.
+     * <p>
+     * Each bin contributes
+     * <code>dev<sup>2</sup> / (observed1[i] + observed2[i])</code>. When the
+     * two count sums are equal, <code>dev = observed1[i] - observed2[i]</code>;
+     * otherwise <code>dev = observed1[i] / w - observed2[i] * w</code> with
+     * <code>w = sqrt(sum(observed1) / sum(observed2))</code>.</p>
+     *
+     * @param observed1 array of observed frequency counts of the first data set
+     * @param observed2 array of observed frequency counts of the second data set
+     * @return chi-square test statistic
+     * @throws IllegalArgumentException if the arrays differ in length or have
+     * fewer than 2 entries, any count is negative, either sample sums to 0,
+     * or both counts of some bin are 0
+     */
+    public double chiSquareDataSetsComparison(long[] observed1, long[] observed2)
+        throws IllegalArgumentException {
+
+        // Make sure lengths are same
+        if ((observed1.length < 2) || (observed1.length != observed2.length)) {
+            throw new IllegalArgumentException(
+                    "observed1, observed2 array lengths incorrect");
+        }
+        // Ensure non-negative counts
+        if (!isNonNegative(observed1) || !isNonNegative(observed2)) {
+            throw new IllegalArgumentException(
+                    "observed counts must be non-negative");
+        }
+        // Compute count sums
+        long countSum1 = 0;
+        long countSum2 = 0;
+        for (int i = 0; i < observed1.length; i++) {
+            countSum1 += observed1[i];
+            countSum2 += observed2[i];
+        }
+        // Ensure neither sample is uniformly 0. The sums are tested
+        // individually: their product can overflow long and wrap to 0 even
+        // when both sums are non-zero.
+        if (countSum1 == 0 || countSum2 == 0) {
+            throw new IllegalArgumentException(
+                    "observed counts cannot all be 0");
+        }
+        // Compare and compute weight only if different
+        boolean unequalCounts = (countSum1 != countSum2);
+        double weight = 0.0;
+        if (unequalCounts) {
+            weight = Math.sqrt((double) countSum1 / (double) countSum2);
+        }
+        // Compute ChiSquare statistic
+        double sumSq = 0.0d;
+        for (int i = 0; i < observed1.length; i++) {
+            if (observed1[i] == 0 && observed2[i] == 0) {
+                // an empty bin would give a zero denominator below
+                throw new IllegalArgumentException(
+                        "observed counts must not both be zero");
+            }
+            double obs1 = (double) observed1[i];
+            double obs2 = (double) observed2[i];
+            double dev;
+            if (unequalCounts) { // apply weights
+                dev = obs1/weight - obs2 * weight;
+            } else {
+                dev = obs1 - obs2;
+            }
+            sumSq += (dev * dev) / (obs1 + obs2);
+        }
+        return sumSq;
+    }
+
+    /**
+     * Computes the p-value of the Chi-Square two sample test comparing the
+     * bin frequency counts in <code>observed1</code> and
+     * <code>observed2</code>, using <code>observed1.length - 1</code> degrees
+     * of freedom.
+     *
+     * @param observed1 array of observed frequency counts of the first data set
+     * @param observed2 array of observed frequency counts of the second data set
+     * @return p-value
+     * @throws IllegalArgumentException if preconditions are not met
+     * @throws MathException if an error occurs computing the p-value
+     */
+    public double chiSquareTestDataSetsComparison(long[] observed1, long[] observed2)
+        throws IllegalArgumentException, MathException {
+        // Compute the statistic first: this validates the inputs, so a failed
+        // precondition does not leave the distribution reconfigured.
+        double statistic = chiSquareDataSetsComparison(observed1, observed2);
+        distribution.setDegreesOfFreedom((double) observed1.length - 1);
+        return 1 - distribution.cumulativeProbability(statistic);
+    }
+
+    /**
+     * Runs the Chi-Square two sample test at significance level
+     * <code>alpha</code>: the null hypothesis is rejected when the p-value
+     * of the two-sample comparison is strictly below <code>alpha</code>.
+     *
+     * @param observed1 array of observed frequency counts of the first data set
+     * @param observed2 array of observed frequency counts of the second data set
+     * @param alpha significance level of the test
+     * @return true iff null hypothesis can be rejected with confidence
+     * 1 - alpha
+     * @throws IllegalArgumentException if preconditions are not met
+     * @throws MathException if an error occurs performing the test
+     */
+    public boolean chiSquareTestDataSetsComparison(long[] observed1, long[] observed2,
+            double alpha) throws IllegalArgumentException, MathException {
+        boolean alphaOutOfRange = (alpha > 0.5) || (alpha <= 0);
+        if (alphaOutOfRange) {
+            throw new IllegalArgumentException(
+                    "bad significance level: " + alpha);
+        }
+        double pValue = chiSquareTestDataSetsComparison(observed1, observed2);
+        return pValue < alpha;
+    }
+
+ /**
* Checks to make sure that the input long[][] array is rectangular,
* has at least 2 rows and 2 columns, and has all non-negative entries,
* throwing IllegalArgumentException if any of these checks fail.
@@ -281,10 +374,12 @@
}
return true;
}
-
+
/**
* Modify the distribution used to compute inference statistics.
- * @param value the new distribution
+ *
+ * @param value
+ * the new distribution
* @since 1.2
*/
public void setDistribution(ChiSquaredDistribution value) {
Modified:
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/TestUtils.java
URL:
http://svn.apache.org/viewvc/jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/TestUtils.java?view=diff&rev=550285&r1=550284&r2=550285
==============================================================================
---
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/TestUtils.java
(original)
+++
jakarta/commons/proper/math/trunk/src/java/org/apache/commons/math/stat/inference/TestUtils.java
Sun Jun 24 14:10:19 2007
@@ -276,4 +276,31 @@
return chiSquareTest. chiSquareTest(counts);
}
+ /**
+ * Delegates to the <code>chiSquareTest</code> instance used by this class.
+ *
+ * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareDataSetsComparison(long[], long[])
+ */
+ public static double chiSquareDataSetsComparison(long[] observed1, long[]
observed2)
+ throws IllegalArgumentException {
+ return chiSquareTest.chiSquareDataSetsComparison(observed1, observed2);
+ }
+
+ /**
+ * Delegates to the <code>chiSquareTest</code> instance used by this class.
+ *
+ * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTestDataSetsComparison(long[], long[])
+ */
+ public static double chiSquareTestDataSetsComparison(long[] observed1,
long[] observed2)
+ throws IllegalArgumentException, MathException {
+ return chiSquareTest.chiSquareTestDataSetsComparison(observed1,
observed2);
+ }
+
+
+ /**
+ * Delegates to the <code>chiSquareTest</code> instance used by this class.
+ *
+ * @see org.apache.commons.math.stat.inference.ChiSquareTest#chiSquareTestDataSetsComparison(long[], long[], double)
+ */
+ public static boolean chiSquareTestDataSetsComparison(long[] observed1,
long[] observed2,
+ double alpha)
+ throws IllegalArgumentException, MathException {
+ return chiSquareTest.chiSquareTestDataSetsComparison(observed1,
observed2, alpha);
+ }
+
+
}
Modified:
jakarta/commons/proper/math/trunk/src/test/org/apache/commons/math/stat/inference/ChiSquareTestTest.java
URL:
http://svn.apache.org/viewvc/jakarta/commons/proper/math/trunk/src/test/org/apache/commons/math/stat/inference/ChiSquareTestTest.java?view=diff&rev=550285&r1=550284&r2=550285
==============================================================================
---
jakarta/commons/proper/math/trunk/src/test/org/apache/commons/math/stat/inference/ChiSquareTestTest.java
(original)
+++
jakarta/commons/proper/math/trunk/src/test/org/apache/commons/math/stat/inference/ChiSquareTestTest.java
Sun Jun 24 14:10:19 2007
@@ -193,4 +193,70 @@
assertEquals("chi-square p-value", 0.0462835770603,
testStatistic.chiSquareTest(counts), 1E-9);
}
+
+    /**
+     * Two samples with equal total counts.
+     * Target values verified using DATAPLOT version 2006.3
+     */
+    public void testChiSquareDataSetsComparisonEqualCounts()
+        throws Exception {
+        final long[] sampleA = {10, 12, 12, 10};
+        final long[] sampleB = {5, 15, 14, 10};
+
+        final double pValue =
+            testStatistic.chiSquareTestDataSetsComparison(sampleA, sampleB);
+        assertEquals("chi-square p value", 0.541096, pValue, 1E-6);
+
+        final double statistic =
+            testStatistic.chiSquareDataSetsComparison(sampleA, sampleB);
+        assertEquals("chi-square test statistic", 2.153846, statistic, 1E-6);
+
+        final boolean rejected =
+            testStatistic.chiSquareTestDataSetsComparison(sampleA, sampleB, 0.4);
+        assertFalse("chi-square test result", rejected);
+    }
+
+    /**
+     * Two samples with different total counts, so the weighted form of the
+     * statistic is exercised.
+     * Target values verified using DATAPLOT version 2006.3
+     */
+    public void testChiSquareDataSetsComparisonUnEqualCounts()
+        throws Exception {
+        final long[] sampleA = {10, 12, 12, 10, 15};
+        final long[] sampleB = {15, 10, 10, 15, 5};
+
+        final double pValue =
+            testStatistic.chiSquareTestDataSetsComparison(sampleA, sampleB);
+        assertEquals("chi-square p value", 0.124115, pValue, 1E-6);
+
+        final double statistic =
+            testStatistic.chiSquareDataSetsComparison(sampleA, sampleB);
+        assertEquals("chi-square test statistic", 7.232189, statistic, 1E-6);
+
+        // the p-value lies between the two significance levels below
+        final boolean rejectedAtUpper =
+            testStatistic.chiSquareTestDataSetsComparison(sampleA, sampleB, 0.13);
+        assertTrue("chi-square test result", rejectedAtUpper);
+
+        final boolean rejectedAtLower =
+            testStatistic.chiSquareTestDataSetsComparison(sampleA, sampleB, 0.12);
+        assertFalse("chi-square test result", rejectedAtLower);
+    }
+
+    /**
+     * Each invalid pair of count arrays must be rejected with an
+     * IllegalArgumentException: a negative count, a bin that is zero in
+     * both samples, and a sample whose counts are all zero.
+     */
+    public void testChiSquareDataSetsComparisonBadCounts()
+        throws Exception {
+        long[][][] badPairs = {
+            {{10, -1, 12, 10, 15}, {15, 10, 10, 15, 5}},
+            {{10, 0, 12, 10, 15}, {15, 0, 10, 15, 5}},
+            {{10, 10, 12, 10, 15}, {0, 0, 0, 0, 0}}
+        };
+        String[] failureMessages = {
+            "Expecting IllegalArgumentException - negative count",
+            "Expecting IllegalArgumentException - double 0's",
+            "Expecting IllegalArgumentException - vanishing counts"
+        };
+        for (int i = 0; i < badPairs.length; i++) {
+            try {
+                testStatistic.chiSquareTestDataSetsComparison(
+                        badPairs[i][0], badPairs[i][1]);
+                fail(failureMessages[i]);
+            } catch (IllegalArgumentException ex) {
+                // expected
+            }
+        }
+    }
}
Modified: jakarta/commons/proper/math/trunk/xdocs/changes.xml
URL:
http://svn.apache.org/viewvc/jakarta/commons/proper/math/trunk/xdocs/changes.xml?view=diff&rev=550285&r1=550284&r2=550285
==============================================================================
--- jakarta/commons/proper/math/trunk/xdocs/changes.xml (original)
+++ jakarta/commons/proper/math/trunk/xdocs/changes.xml Sun Jun 24 14:10:19 2007
@@ -84,6 +84,9 @@
<action dev="psteitz" type="update" issue="MATH-158" due-to="Hasan Diwan">
Added log function to MathUtils.
</action>
+ <action dev="psteitz" type="update" issue="MATH-160" due-to="Matthias Hummel">
+ Added two sample (binned comparison) ChiSquare test.
+ </action>
</release>
<release version="1.1" date="2005-12-17"
description="This is a maintenance release containing bug fixes and
enhancements.
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]