Repository: incubator-hivemall Updated Branches: refs/heads/master 79a099e98 -> 9257a3509
Applied refactoring for #145 Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/9257a350 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/9257a350 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/9257a350 Branch: refs/heads/master Commit: 9257a3509bf3fb3bfb34a00e2ebd8d7c3d86f459 Parents: 7a6595c Author: Makoto Yui <[email protected]> Authored: Thu Apr 26 15:46:07 2018 +0900 Committer: Makoto Yui <[email protected]> Committed: Thu Apr 26 15:49:28 2018 +0900 ---------------------------------------------------------------------- .../ftvec/trans/QuantifiedFeaturesUDTF.java | 20 +++++------ core/src/test/java/hivemall/TestUtils.java | 1 + .../ftvec/trans/QuantifiedFeaturesUDTFTest.java | 1 + .../hivemall/nlp/tokenizer/KuromojiUDF.java | 36 +++++++++++--------- .../hivemall/nlp/tokenizer/KuromojiUDFTest.java | 5 +-- 5 files changed, 32 insertions(+), 31 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9257a350/core/src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java b/core/src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java index 5b2eefe..c036855 100644 --- a/core/src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java +++ b/core/src/main/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTF.java @@ -48,7 +48,8 @@ public final class QuantifiedFeaturesUDTF extends GenericUDTF { private Identifier<String>[] identifiers; private DoubleWritable[] columnValues; - private Object[] forwardObjs; + // lazy instantiation to avoid org.apache.hive.com.esotericsoftware.kryo.KryoException: java.lang.NullPointerException + private transient Object[] forwardObjs; @SuppressWarnings("unchecked") @Override @@ -87,37 +88,32 @@ public final class QuantifiedFeaturesUDTF extends GenericUDTF { @Override public void process(Object[] args) throws HiveException { - int outputSize = args.length - 1; boolean outputRow = boolOI.get(args[0]); if (outputRow) { + final DoubleWritable[] values = this.columnValues; if (forwardObjs == null) { - // forwardObjs internally references columnValues - List<DoubleWritable> column = new ArrayList<>(outputSize); - this.forwardObjs = new Object[] {column}; - for (int i = 0; i < outputSize; i++) { - column.add(columnValues[i]); - } + this.forwardObjs = new Object[] {Arrays.asList(values)}; } // updating columnValues simultaneously changes forwardObjs - for (int i = 0; i < outputSize; i++) { + for (int i = 0, outputSize = args.length - 1; i < outputSize; i++) { Object arg = args[i + 1]; Identifier<String> identifier = identifiers[i]; if (identifier == null) { double v = PrimitiveObjectInspectorUtils.getDouble(arg, doubleOIs[i]); - columnValues[i].set(v); + values[i].set(v); } else { if (arg == null) { throw new HiveException("Found Null in the input: " + Arrays.toString(args)); } else { String k = arg.toString(); int id = identifier.valueOf(k); - columnValues[i].set(id); + values[i].set(id); } } } forward(forwardObjs); } else {// load only - for (int i = 0; i < outputSize; i++) { + for (int i = 0, outputSize = args.length - 1; i < outputSize; i++) { Identifier<String> identifier = identifiers[i]; if (identifier != null) { Object arg = args[i + 1]; http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9257a350/core/src/test/java/hivemall/TestUtils.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/TestUtils.java b/core/src/test/java/hivemall/TestUtils.java index 7bd1ac3..1f1f1da 100644 --- a/core/src/test/java/hivemall/TestUtils.java +++ b/core/src/test/java/hivemall/TestUtils.java @@ -66,6 +66,7 @@ public final class TestUtils { udf.close(); } + @SuppressWarnings("deprecation") public static <T extends GenericUDTF> void testGenericUDTFSerialization( @Nonnull Class<T> clazz, @Nonnull ObjectInspector[] ois, @Nonnull Object[][] rows) throws HiveException { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9257a350/core/src/test/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTFTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTFTest.java b/core/src/test/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTFTest.java index 4cfc20e..9d5574f 100644 --- a/core/src/test/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTFTest.java +++ b/core/src/test/java/hivemall/ftvec/trans/QuantifiedFeaturesUDTFTest.java @@ -49,6 +49,7 @@ public class QuantifiedFeaturesUDTFTest { udtf.setCollector(new Collector() { public void collect(Object input) throws HiveException { Object[] row = (Object[]) input; + @SuppressWarnings("unchecked") List<DoubleWritable> column = (List<DoubleWritable>) row[0]; List<Double> quantifiedInput = new ArrayList<>(); for (DoubleWritable elem : column) { http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9257a350/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java index 745ec30..cdf6dab 100644 --- a/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/KuromojiUDF.java @@ -72,10 +72,12 @@ public final class KuromojiUDF extends GenericUDF { private static final int READ_TIMEOUT_MS = 60000; // 60 sec private static final long MAX_INPUT_STREAM_SIZE = 32L * 1024L * 1024L; // ~32MB - private String _modeString; + private Mode _mode; + @Nullable private String[] _stopWordsArray; - private String[] _stopTagsArray; - private Object _userDictObj; + private Set<String> _stopTags; + @Nullable + private Object _userDictObj; // String[] or String // workaround to avoid org.apache.hive.com.esotericsoftware.kryo.KryoException: java.util.ConcurrentModificationException private transient JapaneseAnalyzer _analyzer; @@ -88,19 +90,15 @@ public final class KuromojiUDF extends GenericUDF { + arglen); } - this._modeString = (arglen >= 2) ? HiveUtils.getConstString(arguments[1]) : "NORMAL"; + this._mode = (arglen >= 2) ? tokenizationMode(arguments[1]) : Mode.NORMAL; - this._stopWordsArray = null; if (arglen >= 3 && !HiveUtils.isVoidOI(arguments[2])) { this._stopWordsArray = HiveUtils.getConstStringArray(arguments[2]); } - this._stopTagsArray = null; - if (arglen >= 4 && !HiveUtils.isVoidOI(arguments[3])) { - this._stopTagsArray = HiveUtils.getConstStringArray(arguments[3]); - } + this._stopTags = (arglen >= 4) ? stopTags(arguments[3]) + : JapaneseAnalyzer.getDefaultStopTags(); - this._userDictObj = null; if (arglen >= 5) { if (HiveUtils.isConstListOI(arguments[4])) { this._userDictObj = HiveUtils.getConstStringArray(arguments[4]); @@ -120,9 +118,7 @@ public final class KuromojiUDF extends GenericUDF { @Override public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { if (_analyzer == null) { - Mode mode = tokenizationMode(_modeString); CharArraySet stopWords = stopWords(_stopWordsArray); - Set<String> stopTags = stopTags(_stopTagsArray); UserDictionary userDict = null; if (_userDictObj instanceof String[]) { @@ -131,7 +127,7 @@ public final class KuromojiUDF extends GenericUDF { userDict = userDictionary((String) _userDictObj); } - this._analyzer = new JapaneseAnalyzer(userDict, mode, stopWords, stopTags); + this._analyzer = new JapaneseAnalyzer(userDict, _mode, stopWords, _stopTags); } Object arg0 = arguments[0].get(); @@ -162,7 +158,9 @@ public final class KuromojiUDF extends GenericUDF { } @Nonnull - private static Mode tokenizationMode(@Nullable final String arg) throws UDFArgumentException { + private static Mode tokenizationMode(@Nonnull final ObjectInspector oi) + throws UDFArgumentException { + String arg = HiveUtils.getConstString(oi); if (arg == null) { return Mode.NORMAL; } @@ -191,12 +189,16 @@ public final class KuromojiUDF extends GenericUDF { if (array.length == 0) { return CharArraySet.EMPTY_SET; } - CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true); - return results; + return new CharArraySet(Arrays.asList(array), /* ignoreCase */true); } @Nonnull - private static Set<String> stopTags(@Nullable final String[] array) throws UDFArgumentException { + private static Set<String> stopTags(@Nonnull final ObjectInspector oi) + throws UDFArgumentException { + if (HiveUtils.isVoidOI(oi)) { + return JapaneseAnalyzer.getDefaultStopTags(); + } + final String[] array = HiveUtils.getConstStringArray(oi); if (array == null) { return JapaneseAnalyzer.getDefaultStopTags(); } http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/9257a350/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java index 1c3db9f..356507d 100644 --- a/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/KuromojiUDFTest.java @@ -18,12 +18,12 @@ */ package hivemall.nlp.tokenizer; +import hivemall.TestUtils; + import java.io.IOException; import java.util.ArrayList; import java.util.List; -import hivemall.TestUtils; - import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; @@ -395,6 +395,7 @@ public class KuromojiUDFTest { public void prepare(int arg) throws HiveException {} }; List<Text> tokens = udf.evaluate(args); + Assert.assertNotNull(tokens); // serialization after evaluation serialized = TestUtils.serializeObjectByKryo(udf);
