add chi2 and chi2_test


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/d3009be5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/d3009be5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/d3009be5

Branch: refs/heads/JIRA-22/pr-385
Commit: d3009be59bcf314b373038e3db8903a041396931
Parents: 6f9b4fa
Author: amaya <g...@sapphire.in.net>
Authored: Fri Sep 16 16:00:58 2016 +0900
Committer: amaya <g...@sapphire.in.net>
Committed: Fri Sep 16 16:00:58 2016 +0900

----------------------------------------------------------------------
 .../ftvec/selection/ChiSquareTestUDF.java       | 21 +++++
 .../hivemall/ftvec/selection/ChiSquareUDF.java  | 21 +++++
 .../ftvec/selection/DissociationDegreeUDF.java  | 88 ++++++++++++++++++++
 .../java/hivemall/utils/math/StatsUtils.java    | 49 +++++++++++
 4 files changed, 179 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java 
b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
new file mode 100644
index 0000000..d367085
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareTestUDF.java
@@ -0,0 +1,21 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+
+import javax.annotation.Nonnull;
+
+/**
+ * UDF {@code chi2_test}: hands the expected and observed arrays to
+ * {@link StatsUtils#chiSquareTest(double[], double[])} and returns its result.
+ */
+@Description(name = "chi2_test",
+        value = "_FUNC_(array<number> expected, array<number> observed)"
+                + " - Returns p-value as double")
+public class ChiSquareTestUDF extends DissociationDegreeUDF {
+
+    /**
+     * @return the function name, {@code chi2_test}
+     */
+    @Nonnull
+    @Override
+    String getFuncName() {
+        return "chi2_test";
+    }
+
+    /**
+     * @param expected expected values
+     * @param observed observed values
+     * @return the value computed by {@code StatsUtils.chiSquareTest}
+     */
+    @Override
+    double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        final double pValue = StatsUtils.chiSquareTest(expected, observed);
+        return pValue;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java 
b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
new file mode 100644
index 0000000..937b1bd
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/ChiSquareUDF.java
@@ -0,0 +1,21 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+
+import javax.annotation.Nonnull;
+
+/**
+ * UDF {@code chi2}: hands the expected and observed arrays to
+ * {@link StatsUtils#chiSquare(double[], double[])} and returns its result.
+ */
+@Description(name = "chi2",
+        value = "_FUNC_(array<number> expected, array<number> observed)"
+                + " - Returns chi2-value as double")
+public class ChiSquareUDF extends DissociationDegreeUDF {
+
+    /**
+     * @return the function name, {@code chi2}
+     */
+    @Nonnull
+    @Override
+    String getFuncName() {
+        return "chi2";
+    }
+
+    /**
+     * @param expected expected values
+     * @param observed observed values
+     * @return the value computed by {@code StatsUtils.chiSquare}
+     */
+    @Override
+    double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        final double chi2 = StatsUtils.chiSquare(expected, observed);
+        return chi2;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java 
b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
new file mode 100644
index 0000000..0acae82
--- /dev/null
+++ b/core/src/main/java/hivemall/ftvec/selection/DissociationDegreeUDF.java
@@ -0,0 +1,88 @@
+package hivemall.ftvec.selection;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.lang.Preconditions;
+import hivemall.utils.math.StatsUtils;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+import javax.annotation.Nonnull;
+
+/**
+ * Base class for UDFs that take two arrays, {@code expected} and {@code observed}, and return a
+ * single double computed by {@link #calcDissociation(double[], double[])}.
+ */
+@Description(name = "",
+        value = "_FUNC_(array<number> expected, array<number> observed)"
+                + " - Returns dissociation degree as double")
+public abstract class DissociationDegreeUDF extends GenericUDF {
+    private ListObjectInspector expectedOI;
+    private DoubleObjectInspector expectedElOI;
+    private ListObjectInspector observedOI;
+    private DoubleObjectInspector observedElOI;
+
+    /**
+     * Validates the argument inspectors and remembers them for {@link #evaluate}.
+     *
+     * @param OIs inspectors of the call arguments; exactly two are required
+     * @return the writable double inspector describing the UDF result
+     * @throws UDFArgumentException if the argument count is not two, or an argument is not an
+     *         array whose elements can be read through a {@link DoubleObjectInspector}
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] OIs) throws UDFArgumentException {
+        if (OIs.length != 2) {
+            throw new UDFArgumentLengthException("Specify two arguments.");
+        }
+
+        expectedElOI = doubleElementOI(OIs[0], 0, "expected");
+        observedElOI = doubleElementOI(OIs[1], 1, "observed");
+        // Safe casts: doubleElementOI only returns for list inspectors.
+        expectedOI = (ListObjectInspector) OIs[0];
+        observedOI = (ListObjectInspector) OIs[1];
+
+        return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector;
+    }
+
+    /**
+     * Checks that {@code argOI} describes an array of numbers whose element inspector is a
+     * {@link DoubleObjectInspector}, and returns that element inspector.
+     *
+     * The element inspector is stored as a {@code DoubleObjectInspector}, so numeric element
+     * types with a different inspector are rejected here with a typed argument error instead
+     * of failing later with a {@code ClassCastException}.
+     */
+    private static DoubleObjectInspector doubleElementOI(ObjectInspector argOI, int argIndex,
+            String argName) throws UDFArgumentTypeException {
+        if (!HiveUtils.isListOI(argOI)
+                || !HiveUtils.isNumberOI(((ListObjectInspector) argOI).getListElementObjectInspector())) {
+            throw new UDFArgumentTypeException(argIndex,
+                "Only array<number> type argument is acceptable but " + argOI.getTypeName()
+                        + " was passed as `" + argName + "`");
+        }
+
+        final ObjectInspector elementOI = ((ListObjectInspector) argOI).getListElementObjectInspector();
+        if (!(elementOI instanceof DoubleObjectInspector)) {
+            throw new UDFArgumentTypeException(argIndex,
+                "Only array<double> is currently supported but " + argOI.getTypeName()
+                        + " was passed as `" + argName + "`");
+        }
+        return (DoubleObjectInspector) elementOI;
+    }
+
+    /**
+     * Converts both arguments to {@code double[]}, requires them to be non-null and of equal
+     * length, and wraps the result of {@link #calcDissociation(double[], double[])} in a
+     * {@link DoubleWritable}.
+     */
+    @Override
+    public Object evaluate(GenericUDF.DeferredObject[] dObj) throws HiveException {
+        final double[] expected = HiveUtils.asDoubleArray(dObj[0].get(), expectedOI, expectedElOI);
+        final double[] observed = HiveUtils.asDoubleArray(dObj[1].get(), observedOI, observedElOI);
+
+        Preconditions.checkNotNull(expected);
+        Preconditions.checkNotNull(observed);
+        Preconditions.checkArgument(expected.length == observed.length);
+
+        final double dissociation = calcDissociation(expected, observed);
+
+        return new DoubleWritable(dissociation);
+    }
+
+    /**
+     * Renders the call as {@code funcName(child0, child1, ...)}.
+     */
+    @Override
+    public String getDisplayString(String[] children) {
+        final StringBuilder sb = new StringBuilder();
+        sb.append(getFuncName());
+        sb.append("(");
+        if (children.length > 0) {
+            sb.append(children[0]);
+            for (int i = 1; i < children.length; i++) {
+                sb.append(", ");
+                sb.append(children[i]);
+            }
+        }
+        sb.append(")");
+        return sb.toString();
+    }
+
+    /**
+     * Computes the UDF result from two arrays of equal length.
+     *
+     * @param expected expected values
+     * @param observed observed values
+     * @return the value returned to the caller of the UDF
+     */
+    abstract double calcDissociation(@Nonnull final double[] expected, @Nonnull final double[] observed);
+
+    /**
+     * @return the function name used by {@link #getDisplayString(String[])}
+     */
+    @Nonnull
+    abstract String getFuncName();
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/d3009be5/core/src/main/java/hivemall/utils/math/StatsUtils.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/hivemall/utils/math/StatsUtils.java 
b/core/src/main/java/hivemall/utils/math/StatsUtils.java
index 42a2c90..ffccea3 100644
--- a/core/src/main/java/hivemall/utils/math/StatsUtils.java
+++ b/core/src/main/java/hivemall/utils/math/StatsUtils.java
@@ -22,6 +22,7 @@ import hivemall.utils.lang.Preconditions;
 
 import javax.annotation.Nonnull;
 
+import org.apache.commons.math3.distribution.ChiSquaredDistribution;
 import org.apache.commons.math3.linear.DecompositionSolver;
 import org.apache.commons.math3.linear.LUDecomposition;
 import org.apache.commons.math3.linear.RealMatrix;
@@ -189,4 +190,52 @@ public final class StatsUtils {
         return 1.d - numerator / denominator;
     }
 
+    /**
+     * Computes the chi-square statistic between expected and observed values, i.e. the sum over
+     * all indices of {@code (observed - expected)^2 / expected}.
+     *
+     * When the totals of the two arrays differ by more than 1e-5, every expected value is first
+     * multiplied by {@code sum(observed) / sum(expected)}.
+     *
+     * @param expected expected values
+     * @param observed observed values; must have the same length as {@code expected}
+     * @return chi2-value
+     */
+    public static double chiSquare(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        Preconditions.checkArgument(expected.length == observed.length);
+
+        final int size = observed.length;
+        double totalExpected = 0.d;
+        double totalObserved = 0.d;
+        for (int i = 0; i < size; i++) {
+            totalExpected += expected[i];
+            totalObserved += observed[i];
+        }
+
+        // Rescale the expected values only when the two totals disagree noticeably.
+        final boolean rescale = Math.abs(totalExpected - totalObserved) > 1.0E-5D;
+        final double scale = rescale ? totalObserved / totalExpected : 1.d;
+
+        double chi2 = 0.d;
+        for (int i = 0; i < size; i++) {
+            // Multiplying by a scale of exactly 1.0 leaves the expected value unchanged.
+            final double scaledExpected = scale * expected[i];
+            final double diff = observed[i] - scaledExpected;
+            chi2 += diff * diff / scaledExpected;
+        }
+        return chi2;
+    }
+
+    /**
+     * Computes the p-value of the chi-square statistic for the given expected and observed
+     * values, using a chi-squared distribution with {@code expected.length - 1} degrees of
+     * freedom.
+     *
+     * @param expected expected values
+     * @param observed observed values
+     * @return p-value, i.e. one minus the cumulative probability of the chi2-value
+     */
+    public static double chiSquareTest(@Nonnull final double[] expected, @Nonnull final double[] observed) {
+        // NOTE(review): fewer than two values gives non-positive degrees of freedom;
+        // confirm how ChiSquaredDistribution reacts to that.
+        final double degreesOfFreedom = (double) expected.length - 1.0D;
+        // No random generator is passed; only cumulativeProbability is used here.
+        final ChiSquaredDistribution chi2Dist = new ChiSquaredDistribution(null, degreesOfFreedom);
+        final double chi2 = chiSquare(expected, observed);
+        return 1.0D - chi2Dist.cumulativeProbability(chi2);
+    }
 }

Reply via email to