subject:"\[GitHub\] incubator\-hivemall pull request #107\: \[HIVEMALL\-132\] Generalize f1score UDAF..."

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-09-13 Thread asfgit

Github user asfgit closed the pull request at:

https://github.com/apache/incubator-hivemall/pull/107


---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-09-13 Thread myui

Github user myui commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r138613348
  
--- Diff: docs/gitbook/eval/auc.md ---
@@ -100,7 +100,7 @@ Note that `floor(prob / 0.2)` means that the rows are 
distributed to 5 bins for
 
 # Difference between AUC and Logarithmic Loss
 
-Hivemall has another metric called [Logarithmic 
Loss](stat_eval.html#logarithmic-loss) for binary classification. Both AUC and 
Logarithmic Loss compute scores for probability-label pairs. 
+Hivemall has another metric called [Logarithmic 
Loss](stat_eval.html#logarithmic-loss) for binary classification. Both AUC and 
Logarithmic Loss compute scores for probability-label pairs.
--- End diff --

Broken link: `stat_eval.html` has been deleted, so this link target no longer exists.


---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread myui

Github user myui commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135693139
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean actual , array | int | 
boolean predicted, String) - Return a F-measure (f1score is the special with 
beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The second argument `array/int/boolean predicted` is 
invalid form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument 
`actual`'s type is "
++ typeInfo[0] + ", but the second argument 
`predicted`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread myui

Github user myui commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135514455
  
--- Diff: core/src/main/java/hivemall/evaluation/F1ScoreUDAF.java ---
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.evaluation;
+
+import hivemall.utils.hadoop.WritableUtils;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDAF;
+import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+
+@SuppressWarnings("deprecation")
+@Description(name = "f1score", value = "_FUNC_(array[int], array[int]) - 
Return a F1 score")
+public final class F1ScoreUDAF extends UDAF {
+
+public static class Evaluator implements UDAFEvaluator {
+
+public static class PartialResult {
+long tp;
+/** tp + fn */
+long totalActual;
+/** tp + fp */
+long totalPredicted;
+
+PartialResult() {
+this.tp = 0L;
+this.totalPredicted = 0L;
+this.totalActual = 0L;
+}
+
+void updateScore(final List actual, final 
List predicted) {
+final int numActual = actual.size();
+final int numPredicted = predicted.size();
+int countTp = 0;
+for (int i = 0; i < numPredicted; i++) {
+IntWritable p = predicted.get(i);
+if (actual.contains(p)) {
+countTp++;
+}
+}
+this.tp += countTp;
+this.totalActual += numActual;
+this.totalPredicted += numPredicted;
+}
+
+void merge(PartialResult other) {
--- End diff --

oops. This should be fixed.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread nzw0301

Github user nzw0301 commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135471771
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,261 @@
+
+
+
+
+# Binary problems
+
+Binary classification problem is the task to predict the label of each 
data given two categorized dataset.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If your classifier outputs probability rather than 0/1 label, evaluation 
based on [Area Under the ROC Curve](./auc.md) would be more appropriate.
+
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
+and center column includes predicted label by a binary classifier.
+
+| truth label| predicted label | |
+|:---:|:---:|:---:|
+| 1 | 0 |False Negative|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+| 1 | 1 |True Positive|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+
+## Preliminary metrics
+
+Some evaluation metrics are calculated based on 4 values:
+
+- True Positive (TP): truth label is positive and predicted label is also 
positive
+- True Negative (TN): truth label is negative and predicted label is also 
negative
+- False Positive (FP): truth label is negative but predicted label is 
positive
+- False Negative (FN): truth label is positive but predicted label is 
negative
+
+`TP` and `TN` represent correct classification, and `FP` and `FN` 
illustrate incorrect ones.
+
+In this example, we can obtain those values:
+
+- TP: 1
+- TN: 2
+- FP: 2
+- FN: 1
+
+If you want to know more about those metrics, Wikipedia provides [more detailed 
information](https://en.wikipedia.org/wiki/Sensitivity_and_specificity).
+
+### Recall
+
+Recall indicates the true positive rate in truth positive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FN}}
+$$
+
+In the previous example, $$\mathrm{recall} = \frac{1}{2}$$.
+
+### Precision
+
+Precision indicates the true positive rate in positive predictive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FP}}
+$$
+
+In the previous example, $$\mathrm{precision} = \frac{1}{3}$$.
+
+# Metrics
+
+## F1-score
+
+F1-score is the harmonic mean of recall and precision.
+F1-score is computed by the following equation:
+
+$$
+\mathrm{F}_1 = 2 \frac{\mathrm{precision} * 
\mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}
+$$
+
+Hivemall's `fmeasure` function provides the option which can switch 
`micro`(default) or `binary` by passing `average` argument.
+
+
+>  Caution
+> Hivemall also provides `f1score` function, but it is old function to 
obtain F1-score. The value of `f1score` is based on set operation. So, we 
recommend to use `fmeasure` function to get F1-score based on this article.
+
+You can learn more about this from the following external resource:
+
+- [scikit-learn's 
F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
+
+
+### Micro average
+
+If `micro` is passed to `average`, 
+recall and precision are modified to consider True Negative.
+So, micro f1score is calculated by those modified recall and precision.
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#TP} + \mathrm{\#TN}}{\mathrm{\#TP} + 
\mathrm{\#FN} + \mathrm{\#TN}}
+$$
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#TP} + \mathrm{\#TN}}{\mathrm{\#TP} + 
\mathrm{\#FP} + \mathrm{\#TN}}
+$$
+
+If the `average` argument is omitted, `fmeasure` uses the default value: `'-average 
micro'`.
+
+The following query shows the example to obtain F1-score.
+Each row value has the same type (`int` or `boolean`).
+If row value's type is `int`, `1` is considered as the positive label, and 
`-1` or `0` is considered as the negative label.
+
+
+```sql
+WITH data as (
+  select 1 as truth, 0 as predicted
+union all
+  select 0 as truth, 1 as predicted
+union all
+  select 0 as truth, 0 as

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread nzw0301

Github user nzw0301 commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135464601
  
--- Diff: core/src/test/java/hivemall/evaluation/FMeasureUDAFTest.java ---
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.evaluation;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+public class FMeasureUDAFTest {
+FMeasureUDAF fmeasure;
+GenericUDAFEvaluator evaluator;
+ObjectInspector[] inputOIs;
+FMeasureUDAF.FMeasureAggregationBuffer agg;
+
+@Before
+public void setUp() throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta 1.")};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void setUpWithArguments(double beta, String average) throws 
Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta " + beta
++ " -average " + average)};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void binarySetUp(Object actual, Object predicted, double beta, 
String average)
+throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[3];
+
+String actualClassName = actual.getClass().getName();
+if (actualClassName.equals("java.lang.Integer")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (actualClassName.equals("java.lang.Boolean")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((actualClassName.equals("java.lang.String"))) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+String predicatedClassName = predicted.getClass().getName();

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135453504
  
--- Diff: core/src/main/java/hivemall/evaluation/F1ScoreUDAF.java ---
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.evaluation;
+
+import hivemall.utils.hadoop.WritableUtils;
+
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDAF;
+import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
+
+@SuppressWarnings("deprecation")
+@Description(name = "f1score", value = "_FUNC_(array[int], array[int]) - 
Return a F1 score")
+public final class F1ScoreUDAF extends UDAF {
+
+public static class Evaluator implements UDAFEvaluator {
+
+public static class PartialResult {
+long tp;
+/** tp + fn */
+long totalActual;
+/** tp + fp */
+long totalPredicted;
+
+PartialResult() {
+this.tp = 0L;
+this.totalPredicted = 0L;
+this.totalActual = 0L;
+}
+
+void updateScore(final List actual, final 
List predicted) {
+final int numActual = actual.size();
+final int numPredicted = predicted.size();
+int countTp = 0;
+for (int i = 0; i < numPredicted; i++) {
+IntWritable p = predicted.get(i);
+if (actual.contains(p)) {
+countTp++;
+}
+}
+this.tp += countTp;
+this.totalActual += numActual;
+this.totalPredicted += numPredicted;
+}
+
+void merge(PartialResult other) {
--- End diff --

Oh, there is a bug here!

Correct:

```java
void merge(PartialResult other) {
this.tp += other.tp;
this.totalActual += other.totalActual;
this.totalPredicted += other.totalPredicted;
}
```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135447663
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean actual , array | int | 
boolean predicted, String) - Return a F-measure (f1score is the special with 
beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The second argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
--- End diff --

Typo: `predicated` => `predicted`


---
If your project is set up for it, you can reply to this email and have your
reply

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135450159
  
--- Diff: core/src/test/java/hivemall/evaluation/FMeasureUDAFTest.java ---
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.evaluation;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+public class FMeasureUDAFTest {
+FMeasureUDAF fmeasure;
+GenericUDAFEvaluator evaluator;
+ObjectInspector[] inputOIs;
+FMeasureUDAF.FMeasureAggregationBuffer agg;
+
+@Before
+public void setUp() throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta 1.")};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void setUpWithArguments(double beta, String average) throws 
Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta " + beta
++ " -average " + average)};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void binarySetUp(Object actual, Object predicted, double beta, 
String average)
+throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[3];
+
+String actualClassName = actual.getClass().getName();
+if (actualClassName.equals("java.lang.Integer")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (actualClassName.equals("java.lang.Boolean")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((actualClassName.equals("java.lang.String"))) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+String predicatedClassName = predicted.getClass().getName();

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135450892
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,261 @@
+
+
+
+
+# Binary problems
+
+Binary classification problem is the task to predict the label of each 
data given two categorized dataset.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If your classifier outputs probability rather than 0/1 label, evaluation 
based on [Area Under the ROC Curve](./auc.md) would be more appropriate.
+
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
+and center column includes predicted label by a binary classifier.
+
+| truth label| predicted label | |
+|:---:|:---:|:---:|
+| 1 | 0 |False Negative|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+| 1 | 1 |True Positive|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+
+## Preliminary metrics
+
+Some evaluation metrics are calculated based on 4 values:
+
+- True Positive (TP): truth label is positive and predicted label is also 
positive
+- True Negative (TN): truth label is negative and predicted label is also 
negative
+- False Positive (FP): truth label is negative but predicted label is 
positive
+- False Negative (FN): truth label is positive but predicted label is 
negative
+
+`TP` and `TN` represent correct classification, and `FP` and `FN` 
illustrate incorrect ones.
+
+In this example, we can obtain those values:
+
+- TP: 1
+- TN: 2
+- FP: 2
+- FN: 1
+
+If you want to know more about those metrics, Wikipedia provides [more detailed 
information](https://en.wikipedia.org/wiki/Sensitivity_and_specificity).
+
+### Recall
+
+Recall indicates the true positive rate in truth positive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FN}}
+$$
+
+In the previous example, $$\mathrm{recall} = \frac{1}{2}$$.
+
+### Precision
+
+Precision indicates the true positive rate in positive predictive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FP}}
+$$
+
+In the previous example, $$\mathrm{precision} = \frac{1}{3}$$.
+
+# Metrics
+
+## F1-score
+
+F1-score is the harmonic mean of recall and precision.
+F1-score is computed by the following equation:
+
+$$
+\mathrm{F}_1 = 2 \frac{\mathrm{precision} * 
\mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}
+$$
+
+Hivemall's `fmeasure` function provides the option which can switch 
`micro`(default) or `binary` by passing `average` argument.
+
+
+>  Caution
+> Hivemall also provides `f1score` function, but it is old function to 
obtain F1-score. The value of `f1score` is based on set operation. So, we 
recommend to use `fmeasure` function to get F1-score based on this article.
+
+You can learn more about this from the following external resource:
+
+- [scikit-learn's 
F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
+
+
+### Micro average
+
+If `micro` is passed to `average`, 
--- End diff --

(emoji reaction; the original character was lost to a text-encoding error)


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135454567
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,261 @@
+
+
+
+
+# Binary problems
+
+A binary classification problem is the task of predicting the label of each 
sample in a dataset that has two categories.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If your classifier outputs probability rather than 0/1 label, evaluation 
based on [Area Under the ROC Curve](./auc.md) would be more appropriate.
+
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
+and center column includes predicted label by a binary classifier.
+
+| truth label| predicted label | |
+|:---:|:---:|:---:|
+| 1 | 0 |False Negative|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+| 1 | 1 |True Positive|
+| 0 | 1 |False Positive|
+| 0 | 0 |True Negative|
+
+## Preliminary metrics
+
+Some evaluation metrics are calculated based on 4 values:
+
+- True Positive (TP): truth label is positive and predicted label is also 
positive
+- True Negative (TN): truth label is negative and predicted label is also 
negative
+- False Positive (FP): truth label is negative but predicted label is 
positive
+- False Negative (FN): truth label is positive but predicted label is 
negative
+
+`TP` and `TN` represent correct classification, and `FP` and `FN` 
illustrate incorrect ones.
+
+In this example, we can obtain those values:
+
+- TP: 1
+- TN: 2
+- FP: 2
+- FN: 1
+
+If you want to know more about those metrics, Wikipedia provides [more detailed 
information](https://en.wikipedia.org/wiki/Sensitivity_and_specificity).
+
+### Recall
+
+Recall indicates the true positive rate in truth positive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FN}}
+$$
+
+In the previous example, $$\mathrm{recall} = \frac{1}{2}$$.
+
+### Precision
+
+Precision indicates the true positive rate in positive predictive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#TP}}{\mathrm{\#TP} + \mathrm{\#FP}}
+$$
+
+In the previous example, $$\mathrm{precision} = \frac{1}{3}$$.
+
+# Metrics
+
+## F1-score
+
+F1-score is the harmonic mean of recall and precision.
+F1-score is computed by the following equation:
+
+$$
+\mathrm{F}_1 = 2 \frac{\mathrm{precision} * 
\mathrm{recall}}{\mathrm{precision} + \mathrm{recall}}
+$$
+
+Hivemall's `fmeasure` function provides an option to switch between 
`micro` (default) and `binary` averaging by passing the `average` argument.
+
+
+>  Caution
+> Hivemall also provides the `f1score` function, but it is an older function for 
obtaining the F1-score. The value of `f1score` is based on set operations, so we 
recommend using the `fmeasure` function to get the F1-score described in this article.
+
+You can learn more about this from the following external resource:
+
+- [scikit-learn's 
F1-score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html)
+
+
+### Micro average
+
+If `micro` is passed to `average`, 
+recall and precision are modified to consider True Negative.
+So, the micro F1-score is calculated from those modified recall and precision values.
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#TP} + \mathrm{\#TN}}{\mathrm{\#TP} + 
\mathrm{\#FN} + \mathrm{\#TN}}
+$$
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#TP} + \mathrm{\#TN}}{\mathrm{\#TP} + 
\mathrm{\#FP} + \mathrm{\#TN}}
+$$
+
+If the `average` argument is omitted, `fmeasure` uses the default value: `'-average 
micro'`.
+
+The following query shows the example to obtain F1-score.
+Each row value has the same type (`int` or `boolean`).
+If row value's type is `int`, `1` is considered as the positive label, and 
`-1` or `0` is considered as the negative label.
+
+
+```sql
+WITH data as (
+  select 1 as truth, 0 as predicted
+union all
+  select 0 as truth, 1 as predicted
+union all
+  select 0 as truth, 0 as

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-28 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135449941
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean actual , array | int | 
boolean predicted, String) - Return a F-measure (f1score is the special with 
beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The second argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-27 Thread nzw0301

Github user nzw0301 commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135444092
  
--- Diff: docs/gitbook/eval/multilabel_classification_measures.md ---
@@ -0,0 +1,144 @@
+
+
+
+
+# Multi-label classification
+
+
+Multi-label classification problem is the task to predict the labels given 
categorized dataset.
+Each sample $$i$$ has $$l_i$$ labels, where $$L$$ is the number of unique 
labels in the dataset, and $$0 \leq  l_i \leq |L| $$.
--- End diff --

Yes, I fixed it. Thanks!


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-27 Thread nzw0301

Github user nzw0301 commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135432462
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-25 Thread nzw0301

Github user nzw0301 commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135227946
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-24 Thread myui

Github user myui commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r135045093
  
--- Diff: resources/ddl/define-all.spark ---
@@ -530,6 +530,9 @@ sqlContext.sql("CREATE TEMPORARY FUNCTION lr_datagen AS 
'hivemall.dataset.Logist
 sqlContext.sql("DROP TEMPORARY FUNCTION IF EXISTS f1score")
 sqlContext.sql("CREATE TEMPORARY FUNCTION f1score AS 
'hivemall.evaluation.FMeasureUDAF'")
--- End diff --

It is better to copy the old FMeasureUDAF.java as F1ScoreUDAF.java for 
backward compatibility.

Then, `CREATE TEMPORARY FUNCTION f1score AS 
'hivemall.evaluation.F1ScoreUDAF'`.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-22 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134491966
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134138339
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,232 @@
+
+
+
+
+# Binary problems
+
+A binary classification problem is the task of predicting the label of each 
sample in a dataset that has two categories.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If you want to know about Area Under the ROC Curve, please check 
[AUC](./auc.md) page.
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
+and the right column includes the labels predicted by a binary classifier.
+
+| truth label| predicted label |
+|:---:|:---:|
+| 1 | 0 |
+| 0 | 1 |
+| 0 | 0 |
+| 1 | 1 |
+| 0 | 1 |
+| 0 | 0 |
+
+## Preliminary metrics
+
+Some evaluation metrics are calculated based on 4 values:
+
+- True Positive: truth label is positive and predicted label is also 
positive
+- True Negative: truth label is negative and predicted label is also 
negative
+- False Positive: truth label is negative but predicted label is positive
+- False Negative: truth label is positive but predicted label is negative
+
+In this example, we can obtain those values:
+
+- True Positive: 1
--- End diff --

Values (or table) are incorrect; the above table should be TP=1, TN=2, 
FN=1, FP=2

Since TP, FP, FN and TN are complicated for beginners, it's better to show 
which row corresponds to each of them as:

| truth label| predicted label | |
|:---:|:---:|:---|
| 1 | 0 |False Negative|
| 0 | 1 |False Positive|
| 0 | 0 |True Negative|
| 1 | 1 |True Positive|
| 0 | 1 |False Positive|
| 0 | 0 |True Negative|

In addition, I would recommend you to clearly describe "TP and TN represent 
**correct** classification, and FP and FN illustrate **incorrect** ones."

Furthermore, adding a link to external page which explains TP, FP, TN, FN 
somewhere like Wikipedia would be better.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134156437
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134145129
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134141170
  
--- Diff: docs/gitbook/eval/multilabel_classification_measures.md ---
@@ -0,0 +1,144 @@
+
+
+
+
+# Multi-label classification
+
+
+Multi-label classification problem is the task to predict the labels given 
categorized dataset.
+Each sample $$i$$ has $$l_i$$ labels, where $$L$$ is the number of unique 
labels in the dataset, and $$0 \leq  l_i \leq |L| $$.
--- End diff --

You mean, "L is **a set of** unique labels in the dataset"?


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134145190
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134142978
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
--- End diff --

Copy-and-paste error. The correct message is: "The **second** argument 
\`array/int/boolean **predicted**\` is invalid form: "...


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134135290
  
--- Diff: core/src/main/java/hivemall/UDAFEvaluatorWithOptions.java ---
@@ -0,0 +1,97 @@
+package hivemall;
+
+import hivemall.utils.lang.CommandLineUtils;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.mapred.Counters;
+import org.apache.hadoop.mapred.Reporter;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.io.PrintWriter;
+import java.io.StringWriter;
+
+public abstract class UDAFEvaluatorWithOptions extends 
GenericUDAFEvaluator {
+
+@Nullable
+protected MapredContext mapredContext;
+
+@Override
+public final void configure(MapredContext mapredContext) {
+this.mapredContext = mapredContext;
+}
+
+@Nullable
+protected final Reporter getReporter() {
+if (mapredContext == null) {
+return null;
+}
+return mapredContext.getReporter();
+}
+
+protected static void reportProgress(@Nullable Reporter reporter) {
+if (reporter != null) {
+synchronized (reporter) {
+reporter.progress();
+}
+}
+}
+
+protected static void setCounterValue(@Nullable Counters.Counter 
counter, long value) {
--- End diff --

Since `org.apache.hadoop.mapred.Counters` is only used to refer to 
`org.apache.hadoop.mapred.Counters.Counter`, you can import 
`org.apache.hadoop.mapred.Counters.Counter` directly, as `UDTFWithOptions` does.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134146143
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,232 @@
+
+
+
+
+# Binary problems
+
+Binary classification problem is the task to predict the label of each 
data given two categorized dataset.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If you want to know about Area Under the ROC Curve, please check 
[AUC](./auc.md) page.
--- End diff --

It would be better to describe why AUC is documented separately. For example: "If 
your classifier outputs a probability rather than a 0/1 label, evaluation based on 
[Area Under the ROC Curve](./auc.md) would be more appropriate."


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134151609
  
--- Diff: core/src/test/java/hivemall/evaluation/FMeasureUDAFTest.java ---
@@ -0,0 +1,355 @@
+package hivemall.evaluation;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+public class FMeasureUDAFTest {
+FMeasureUDAF fmeasure;
+GenericUDAFEvaluator evaluator;
+ObjectInspector[] inputOIs;
+FMeasureUDAF.FMeasureAggregationBuffer agg;
+
+@Before
+public void setUp() throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta 1.")};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void setUpWithArguments(double beta, String average) throws 
Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta " + beta
++ " -average " + average)};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void binarySetUp(Object actual, Object predicted, double beta, 
String average)
+throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[3];
+
+String actualClassName = actual.getClass().getName();
+if (actualClassName.equals("java.lang.Integer")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (actualClassName.equals("java.lang.Boolean")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((actualClassName.equals("java.lang.String"))) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+String predicatedClassName = predicted.getClass().getName();
+if (predicatedClassName.equals("java.lang.Integer")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (predicatedClassName.equals("java.lang.Boolean")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((predicatedClassName.equals("java.lang.String"))) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+inputOIs[2] = ObjectInspectorUtils.getConstantObjectInspector(
+PrimitiveObjectInspectorFactory.javaStringObjectInspector,

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134146276
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134157657
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134141649
  
--- Diff: docs/gitbook/eval/multilabel_classification_measures.md ---
@@ -0,0 +1,144 @@
+
+
+
+
+# Multi-label classification
+
+
+Multi-label classification problem is the task to predict the labels given 
categorized dataset.
+Each sample $$i$$ has $$l_i$$ labels, where $$L$$ is the number of unique 
labels in the dataset, and $$0 \leq  l_i \leq |L| $$.
+
+This page focuses on evaluation of the results from such multi-label 
classification problems.
+
+# Example
+
+For the metrics explanation, this page introduces toy example dataset.
+
+## Data
+
+The following table shows the sample of multi-label classification's 
prediction.
+Animal names represent the tags of blog post.
+Left column includes supervised labels,
+Right column includes are predicted labels by a Multi-label classifier.
+
+| truth labels| predicted labels |
+|:---:|:---:|
+|cat, dog | cat, bird |
+| cat, bird | cat, dog |
+| | cat |
+| bird | bird |
+| bird, cat | bird, cat |
+| cat, dog, bird | cat, dog |
+| dog | dog, bird|
+
+
+# Evaluation metrics for multi-label classification
--- End diff --

Much easier to understand than the explanation of binary 
classification 👍


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134154140
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134149484
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134145321
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134141399
  
--- Diff: docs/gitbook/eval/multilabel_classification_measures.md ---
@@ -0,0 +1,144 @@
+
+
+
+
+# Multi-label classification
+
+
+Multi-label classification problem is the task to predict the labels given 
categorized dataset.
+Each sample $$i$$ has $$l_i$$ labels, where $$L$$ is the set of unique 
labels in the dataset, and $$0 \leq  l_i \leq |L| $$.
+
+This page focuses on evaluation of the results from such multi-label 
classification problems.
+
+# Example
+
+For the metrics explanation, this page introduces toy example dataset.
+
+## Data
+
+The following table shows the sample of multi-label classification's 
prediction.
+Animal names represent the tags of blog post.
+Left column includes supervised labels,
+and right column includes labels predicted by a multi-label classifier.
+
+| truth labels| predicted labels |
+|:---:|:---:|
+|cat, dog | cat, bird |
+| cat, bird | cat, dog |
+| | cat |
--- End diff --

If you intend to represent "no truth labels" here, writing *(no truth 
label)* in the cell would be better.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134140877
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,232 @@
+
+
+
+
+# Binary problems
+
+Binary classification problem is the task to predict the label of each 
data given two categorized dataset.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If you want to know about Area Under the ROC Curve, please check 
[AUC](./auc.md) page.
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
+Right column includes are predicted label by a binary classifier.w
+
+| truth label| predicted label |
+|:---:|:---:|
+| 1 | 0 |
+| 0 | 1 |
+| 0 | 0 |
+| 1 | 1 |
+| 0 | 1 |
+| 0 | 0 |
+
+## Preliminary metrics
+
+Some evaluation metrics are calculated based on 4 values:
+
+- True Positive: truth label is positive and predicted label is also 
positive
+- True Negative: truth label is negative and predicted label is also 
negative
+- False Positive: truth label is negative but predicted label is positive
+- False Negative: truth label is positive but predicted label is negative
+
+In this example, we can obtain those values:
+
+- True Positive: 1
+- True Negative: 2
+- False Positive: 2
+- False Negative: 1
+
+### Recall
+
+Recall indicates the true positive rate in truth positive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{recall} = \frac{\mathrm{\#true\ positive}}{\mathrm{\#true\ 
positive} + \mathrm{\#false\ negative}}
+$$
+
+In the previous example, $$\mathrm{recall} = \frac{1}{2}$$.
+
+### Precision
+
+Precision indicates the true positive rate in positive predictive labels.
+The value is computed by the following equation:
+
+$$
+\mathrm{precision} = \frac{\mathrm{\#true\ positive}}{\mathrm{\#true\ 
positive} + \mathrm{\#false\ positive}}
+$$
+
+In the previous example, $$\mathrm{precision} = \frac{1}{3}$$.
+
+# Metrics
+
+## F1-score
--- End diff --

I feel that the difference between the `-average` options is hard for users to understand.

> true positive includes true positive and false negative (: predicted 
label matches truth label) in above equations.

> TP only includes true positive in above equations.

It sounds strange... From a reader's point of view, "true positive" is 
"true positive," and "false negative" is "false negative," isn't it? *"true 
positive includes true positive and false negative"* and *"TP only includes 
true positive"* are really surprising expressions for readers.

Could you explain the option more precisely? Moreover, users probably want 
to know "which one of `micro` and `binary` is better (appropriate)," so 
describing the difference between them from a practical point of view would be 
better if it's possible.

Minor things:
- Adding a link to scikit-learn's F1 score documentation would be helpful
- Please clearly state "True Positive (TP)" so that readers know "TP" is the 
abbreviation of "true positive"


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134136143
  
--- Diff: core/src/test/java/hivemall/evaluation/FMeasureUDAFTest.java ---
@@ -0,0 +1,355 @@
+package hivemall.evaluation;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+public class FMeasureUDAFTest {
+FMeasureUDAF fmeasure;
+GenericUDAFEvaluator evaluator;
+ObjectInspector[] inputOIs;
+FMeasureUDAF.FMeasureAggregationBuffer agg;
+
+@Before
+public void setUp() throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta 1.")};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void setUpWithArguments(double beta, String average) throws 
Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta " + beta
++ " -average " + average)};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void binarySetUp(Object actual, Object predicted, double beta, 
String average)
+throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[3];
+
+String actualClassName = actual.getClass().getName();
+if (actualClassName.equals("java.lang.Integer")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (actualClassName.equals("java.lang.Boolean")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((actualClassName.equals("java.lang.String"))) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+String predicatedClassName = predicted.getClass().getName();
+if (predicatedClassName.equals("java.lang.Integer")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (predicatedClassName.equals("java.lang.Boolean")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((predicatedClassName.equals("java.lang.String"))) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+inputOIs[2] = ObjectInspectorUtils.getConstantObjectInspector(
+PrimitiveObjectInspectorFactory.javaStringObjectInspector,

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134137472
  
--- Diff: docs/gitbook/eval/binary_classification_measures.md ---
@@ -0,0 +1,232 @@
+
+
+
+
+# Binary problems
+
+Binary classification problem is the task to predict the label of each 
data given two categorized dataset.
+
+Hivemall provides some tutorials to deal with binary classification 
problems as follows:
+
+- [Online advertisement click prediction](../binaryclass/general.html)
+- [News classification](../binaryclass/news20_dataset.html)
+
+This page focuses on the evaluation of the results from such binary 
classification problems.
+If you want to know about Area Under the ROC Curve, please check 
[AUC](./auc.md) page.
+
+# Example
+
+For the metrics explanation, this page introduces toy example data and two 
metrics.
+
+## Data
+
+The following table shows the sample of binary classification's prediction.
+In this case, `1` means positive label and `0` means negative label.
+Left column includes supervised label data,
--- End diff --

Please fix the typo and the strange capitalization, e.g.: "Left column includes 
supervised label data, and right column includes predicted label by a binary 
classifier."


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134144074
  
--- Diff: core/src/main/java/hivemall/evaluation/FMeasureUDAF.java ---
@@ -18,118 +18,387 @@
  */
 package hivemall.evaluation;
 
-import hivemall.utils.hadoop.WritableUtils;
+import hivemall.UDAFEvaluatorWithOptions;
+import hivemall.utils.hadoop.HiveUtils;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 
+import hivemall.utils.lang.Primitives;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.Options;
+
 import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDAF;
-import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
-import org.apache.hadoop.io.IntWritable;
-
-@SuppressWarnings("deprecation")
-@Description(name = "f1score",
-value = "_FUNC_(array[int], array[int]) - Return a F-measure/F1 
score")
-public final class FMeasureUDAF extends UDAF {
-
-public static class Evaluator implements UDAFEvaluator {
-
-public static class PartialResult {
-long tp;
-/** tp + fn */
-long totalAcutal;
-/** tp + fp */
-long totalPredicted;
-
-PartialResult() {
-this.tp = 0L;
-this.totalPredicted = 0L;
-this.totalAcutal = 0L;
-}
+import org.apache.hadoop.hive.serde2.objectinspector.*;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
+import org.apache.hadoop.io.LongWritable;
 
-void updateScore(final List actual, final 
List predicted) {
-final int numActual = actual.size();
-final int numPredicted = predicted.size();
-int countTp = 0;
-for (int i = 0; i < numPredicted; i++) {
-IntWritable p = predicted.get(i);
-if (actual.contains(p)) {
-countTp++;
-}
+import javax.annotation.Nonnull;
+
+@Description(
+name = "fmeasure",
+value = "_FUNC_(array | int | boolean, array | int | boolean, 
String) - Return a F-measure (f1score is the special with beta=1.)")
+public final class FMeasureUDAF extends AbstractGenericUDAFResolver {
+@Override
+public GenericUDAFEvaluator getEvaluator(@Nonnull TypeInfo[] typeInfo) 
throws SemanticException {
+if (typeInfo.length != 2 && typeInfo.length != 3) {
+throw new UDFArgumentTypeException(typeInfo.length - 1,
+"_FUNC_ takes two or three arguments");
+}
+
+boolean isArg1ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[0])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[0])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[0]);
+if (!isArg1ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(0,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[0]);
+}
+
+boolean isArg2ListOrIntOrBoolean = 
HiveUtils.isListTypeInfo(typeInfo[1])
+|| HiveUtils.isIntegerTypeInfo(typeInfo[1])
+|| HiveUtils.isBooleanTypeInfo(typeInfo[1]);
+if (!isArg2ListOrIntOrBoolean) {
+throw new UDFArgumentTypeException(1,
+"The first argument `array/int/boolean actual` is invalid 
form: " + typeInfo[1]);
+}
+
+if (typeInfo[0] != typeInfo[1]) {
+throw new UDFArgumentTypeException(1, "The first argument's 
`actual` type is "
++ typeInfo[0] + ", but the second argument 
`predicated`'s type is not match: "
++ typeInfo[1]);
+}
+
+return new Evaluator();
+}
+
+public static class Evaluator extends

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-21 Thread takuti

Github user takuti commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r134168379
  
--- Diff: core/src/test/java/hivemall/evaluation/FMeasureUDAFTest.java ---
@@ -0,0 +1,355 @@
+package hivemall.evaluation;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import 
org.apache.hadoop.hive.ql.udf.generic.SimpleGenericUDAFParameterInfo;
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Arrays;
+import java.util.List;
+
+
+public class FMeasureUDAFTest {
+FMeasureUDAF fmeasure;
+GenericUDAFEvaluator evaluator;
+ObjectInspector[] inputOIs;
+FMeasureUDAF.FMeasureAggregationBuffer agg;
+
+@Before
+public void setUp() throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta 1.")};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void setUpWithArguments(double beta, String average) throws 
Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[] {
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+
ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableLongObjectInspector),
+ObjectInspectorUtils.getConstantObjectInspector(
+
PrimitiveObjectInspectorFactory.javaStringObjectInspector, "-beta " + beta
++ " -average " + average)};
+
+evaluator = fmeasure.getEvaluator(new 
SimpleGenericUDAFParameterInfo(inputOIs, false, false));
+agg = (FMeasureUDAF.FMeasureAggregationBuffer) 
evaluator.getNewAggregationBuffer();
+}
+
+private void binarySetUp(Object actual, Object predicted, double beta, 
String average)
+throws Exception {
+fmeasure = new FMeasureUDAF();
+inputOIs = new ObjectInspector[3];
+
+String actualClassName = actual.getClass().getName();
+if (actualClassName.equals("java.lang.Integer")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (actualClassName.equals("java.lang.Boolean")) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((actualClassName.equals("java.lang.String"))) {
+inputOIs[0] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+String predicatedClassName = predicted.getClass().getName();
+if (predicatedClassName.equals("java.lang.Integer")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.INT);
+} else if (predicatedClassName.equals("java.lang.Boolean")) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.BOOLEAN);
+} else if ((predicatedClassName.equals("java.lang.String"))) {
+inputOIs[1] = 
PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(PrimitiveObjectInspector.PrimitiveCategory.STRING);
+}
+
+inputOIs[2] = ObjectInspectorUtils.getConstantObjectInspector(
+PrimitiveObjectInspectorFactory.javaStringObjectInspector,

[GitHub] incubator-hivemall pull request #107: [HIVEMALL-132] Generalize f1score UDAF...

2017-08-02 Thread myui

Github user myui commented on a diff in the pull request:

https://github.com/apache/incubator-hivemall/pull/107#discussion_r130797590
  
--- Diff: resources/ddl/define-all.hive ---
@@ -543,8 +543,8 @@ create temporary function lr_datagen as 
'hivemall.dataset.LogisticRegressionData
 -- Evaluating functions --
 --
 
-drop temporary function if exists f1score;
-create temporary function f1score as 'hivemall.evaluation.FMeasureUDAF';
+drop temporary function if exists fmeasure;
+create temporary function fmeasure as 'hivemall.evaluation.FMeasureUDAF';
--- End diff --

Could you keep an alias for `f1score` in the DDLs for backward compatibility?

```sql
-- alias for backward compatibility
drop temporary function if exists f1score;
create temporary function f1score as 'hivemall.evaluation.FMeasureUDAF';

drop temporary function if exists fmeasure;
...
```


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

39 matches

Mail list logo