add chi2 and chi2_test
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/d3009be5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/d3009be5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/d3009be5

Branch: refs/heads/JIRA-22/pr-385
Commit: d3009be59bcf314b373038e3db8903a041396931
Parents: 6f9b4fa
Author: amaya <[email protected]>
Authored: Fri Sep 16 16:00:58 2016 +0900
Committer: amaya <[email protected]>
Committed: Fri Sep 16 16:00:58 2016 +0900

----------------------------------------------------------------------
 .../ftvec/selection/ChiSquareTestUDF.java       |  25 +++++
 .../hivemall/ftvec/selection/ChiSquareUDF.java  |  25 +++++
 .../ftvec/selection/DissociationDegreeUDF.java  | 108 ++++++++++++++++++++
 .../java/hivemall/utils/math/StatsUtils.java    |  46 ++++++++
 4 files changed, 204 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
new file mode 100644
index 0000000..d367085
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
@@ -0,0 +1,25 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+
+import javax.annotation.Nonnull;
+
+/**
+ * Hive UDF {@code chi2_test}: returns the p-value computed by
+ * {@link StatsUtils#chiSquareTest(double[], double[])} for the two input arrays.
+ */
+@Description(name = "chi2_test",
+        value = "_FUNC_(array<number> expected, array<number> observed) - Returns p-value as double")
+public class ChiSquareTestUDF extends DissociationDegreeUDF {
+    @Override
+    double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        return StatsUtils.chiSquareTest(expected, observed);
+    }
+
+    @Override
+    @Nonnull
+    String getFuncName() {
+        return "chi2_test";
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
new file mode 100644
index 0000000..937b1bd
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
@@ -0,0 +1,25 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+
+import javax.annotation.Nonnull;
+
+/**
+ * Hive UDF {@code chi2}: returns the chi-square value computed by
+ * {@link StatsUtils#chiSquare(double[], double[])} for the two input arrays.
+ */
+@Description(name = "chi2",
+        value = "_FUNC_(array<number> expected, array<number> observed) - Returns chi2-value as double")
+public class ChiSquareUDF extends DissociationDegreeUDF {
+    @Override
+    double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        return StatsUtils.chiSquare(expected, observed);
+    }
+
+    @Override
+    @Nonnull
+    String getFuncName() {
+        return "chi2";
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
new file mode 100644
index 0000000..0acae82
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
@@ -0,0 +1,108 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.Preconditions;
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+import javax.annotation.Nonnull;
+
+/**
+ * Base class for UDFs that take two numeric arrays, {@code expected} and {@code observed},
+ * and return one double computed from them by {@link #calcDissociation(double[], double[])}.
+ */
+@Description(name = "",
+        value = "_FUNC_(array<number> expected, array<number> observed) - Returns dissociation degree as double")
+public abstract class DissociationDegreeUDF extends GenericUDF {
+    private ListObjectInspector expectedOI;
+    // Element inspectors are kept as PrimitiveObjectInspector: initialize() accepts any numeric
+    // element type, so narrowing to DoubleObjectInspector would fail for e.g. array<int>.
+    private PrimitiveObjectInspector expectedElOI;
+    private ListObjectInspector observedOI;
+    private PrimitiveObjectInspector observedElOI;
+
+    /**
+     * Validates that exactly two {@code array<number>} arguments are given and remembers their
+     * object inspectors.
+     *
+     * @return the writable double object inspector describing the UDF result
+     * @throws UDFArgumentException if the argument count or an argument type is not acceptable
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
+        if (OIs.length != 2) {
+            throw new UDFArgumentLengthException("Specify two arguments.");
+        }
+
+        if (!HiveUtils.isListOI(OIs[0])
+                || !HiveUtils.isNumberOI(((ListObjectInspector) OIs[0]).getListElementObjectInspector())) {
+            throw new UDFArgumentTypeException(0, "Only array<number> type argument is acceptable but "
+                    + OIs[0].getTypeName() + " was passed as `expected`");
+        }
+
+        if (!HiveUtils.isListOI(OIs[1])
+                || !HiveUtils.isNumberOI(((ListObjectInspector) OIs[1]).getListElementObjectInspector())) {
+            throw new UDFArgumentTypeException(1, "Only array<number> type argument is acceptable but "
+                    + OIs[1].getTypeName() + " was passed as `observed`");
+        }
+
+        expectedOI = (ListObjectInspector) OIs[0];
+        expectedElOI = (PrimitiveObjectInspector) expectedOI.getListElementObjectInspector();
+        observedOI = (ListObjectInspector) OIs[1];
+        observedElOI = (PrimitiveObjectInspector) observedOI.getListElementObjectInspector();
+
+        return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+    }
+
+    /**
+     * Converts both arguments to {@code double[]}, requires them to be non-null and of equal
+     * length, and wraps the value of {@link #calcDissociation(double[], double[])}.
+     */
+    @Override
+    public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException {
+        final double[] expected = HiveUtils.asDoubleArray(dObj[0].get(), expectedOI, expectedElOI);
+        final double[] observed = HiveUtils.asDoubleArray(dObj[1].get(), observedOI, observedElOI);
+
+        Preconditions.checkNotNull(expected);
+        Preconditions.checkNotNull(observed);
+        Preconditions.checkArgument(expected.length == observed.length);
+
+        final double dissociation = calcDissociation(expected, observed);
+
+        return new DoubleWritable(dissociation);
+    }
+
+    /** Renders the call as {@code funcName(child0, child1, ...)}. */
+    @Override
+    public String getDisplayString(String[] children) {
+        final StringBuilder sb = new StringBuilder();
+        sb.append(getFuncName());
+        sb.append("(");
+        if (children.length > 0) {
+            sb.append(children[0]);
+            for (int i = 1; i < children.length; i++) {
+                sb.append(", ");
+                sb.append(children[i]);
+            }
+        }
+        sb.append(")");
+        return sb.toString();
+    }
+
+    /** Computes the statistic for one pair of equally long arrays. */
+    abstract double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed);
+
+    /** Returns the function name used by {@link #getDisplayString(String[])}. */
+    @Nonnull
+    abstract String getFuncName();
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/utils/math/StatsUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/math/StatsUtils.java b/core/src/main/java/hivemall/utils/math/StatsUtils.java
index 42a2c90..ffccea3 100644
--- a/core/src/main/java/hivemall/utils/math/StatsUtils.java
+++ b/core/src/main/java/hivemall/utils/math/StatsUtils.java
@@ -22,6 +22,7 @@ import hivemall.utils.lang.Preconditions;
 
 import javax.annotation.Nonnull;
 
+import org.apache.commons.math3.distribution.ChiSquaredDistribution;
 import org.apache.commons.math3.linear.DecompositionSolver;
 import org.apache.commons.math3.linear.LUDecomposition;
 import org.apache.commons.math3.linear.RealMatrix;
@@ -189,4 +190,49 @@ public final class StatsUtils {
         return 1.d - numerator / denominator;
     }
 
+    /**
+     * Computes Pearson's chi-square statistic, {@code sum((observed - expected)^2 / expected)}.
+     * When the totals of the two arrays differ by more than 1e-5, {@code expected} is first
+     * rescaled by {@code sum(observed) / sum(expected)} so that both totals agree.
+     *
+     * @param expected expected values; must have the same length as {@code observed}
+     * @param observed observed values
+     * @return chi2-value
+     */
+    public static double chiSquare(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        Preconditions.checkArgument(expected.length == observed.length);
+
+        double sumExpected = 0.0D;
+        double sumObserved = 0.0D;
+        for (int i = 0; i < observed.length; i++) {
+            sumExpected += expected[i];
+            sumObserved += observed[i];
+        }
+
+        // A scale of exactly 1.0 leaves every term unchanged, so no separate unscaled branch is needed.
+        final double scale = (Math.abs(sumExpected - sumObserved) > 1.0E-5D)
+                ? sumObserved / sumExpected : 1.0D;
+
+        double sumSq = 0.0D;
+        for (int i = 0; i < observed.length; i++) {
+            final double scaledExpected = scale * expected[i];
+            final double dev = observed[i] - scaledExpected;
+            sumSq += dev * dev / scaledExpected;
+        }
+        return sumSq;
+    }
+
+    /**
+     * Computes the p-value of the chi-square statistic of the two arrays, using a chi-squared
+     * distribution with {@code expected.length - 1} degrees of freedom.
+     *
+     * @param expected expected values; must have the same length as {@code observed}
+     * @param observed observed values
+     * @return p-value
+     */
+    public static double chiSquareTest(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        final double degreesOfFreedom = expected.length - 1.0D;
+        final ChiSquaredDistribution distribution = new ChiSquaredDistribution(null, degreesOfFreedom);
+        return 1.0D - distribution.cumulativeProbability(chiSquare(expected, observed));
+    }
 }
