Repository: incubator-hivemall Updated Branches: refs/heads/master 9876d0631 -> ec6d945fe
Added tokenize_cn UDF, using org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5eb80373 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5eb80373 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5eb80373 Branch: refs/heads/master Commit: 5eb80373641920428d9f95f54b726995e89e8443 Parents: 9f01ebf Author: partyyoung <[email protected]> Authored: Thu Jun 29 18:32:05 2017 +0800 Committer: partyyoung <[email protected]> Committed: Thu Jun 29 18:32:05 2017 +0800 ---------------------------------------------------------------------- nlp/pom.xml | 13 ++ .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 137 +++++++++++++++++++ .../hivemall/nlp/tokenizer/SmartcnUDFTest.java | 85 ++++++++++++ 3 files changed, 235 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/pom.xml ---------------------------------------------------------------------- diff --git a/nlp/pom.xml b/nlp/pom.xml index b6ea409..021cd6d 100644 --- a/nlp/pom.xml +++ b/nlp/pom.xml @@ -117,6 +117,12 @@ <version>5.3.1</version> <scope>compile</scope> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-smartcn</artifactId> + <version>5.3.1</version> + <scope>compile</scope> + </dependency> <!-- test scope --> <dependency> @@ -171,6 +177,7 @@ <includes> <include>io.github.myui:hivemall-core</include> <include>org.apache.lucene:lucene-analyzers-kuromoji</include> + <include>org.apache.lucene:lucene-analyzers-smartcn</include> <include>org.apache.lucene:lucene-analyzers-common</include> <include>org.apache.lucene:lucene-core</include> </includes> @@ -183,6 +190,12 @@ </includes> </filter> <filter> + <artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact> + <includes> + <include>**</include> + </includes> + </filter> + <filter> <artifact>org.apache.lucene:lucene-analyzers-common</artifact> <includes> <include>**</include> http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java ---------------------------------------------------------------------- diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java new file mode 100644 index 0000000..3d148c9 --- /dev/null +++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.nlp.tokenizer; + +import hivemall.utils.hadoop.HiveUtils; +import hivemall.utils.io.IOUtils; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import javax.annotation.Nonnull; + +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.UDFType; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.CharArraySet; + +@Description( + name = "tokenize_cn", + value = "_FUNC_(String line [, const list<string> stopWords])" + + " - returns tokenized strings in array<string>") +@UDFType(deterministic = true, stateful = false) +public final class SmartcnUDF extends GenericUDF { + + private String[] _stopWordsArray; + + private transient SmartChineseAnalyzer _analyzer; + + @Override + public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + final int arglen = arguments.length; + if (arglen < 1 || arglen > 2) { + throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: " + + arglen); + } + + this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null; + this._analyzer = null; + + return ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.writableStringObjectInspector); + } + + @Override + public List<Text> evaluate(DeferredObject[] arguments) throws HiveException { + SmartChineseAnalyzer analyzer = _analyzer; + if (analyzer == null) { + CharArraySet stopwords = stopWords(_stopWordsArray); + analyzer= new SmartChineseAnalyzer(stopwords); + this._analyzer = analyzer; + } + + Object arg0 = arguments[0].get(); + if (arg0 == null) { + return null; + } + String line = arg0.toString(); + + final List<Text> results = new ArrayList<Text>(32); + TokenStream stream = null; + try { + stream = analyzer.tokenStream("", line); + if (stream != null) { + analyzeTokens(stream, results); + } + } catch (IOException e) { + IOUtils.closeQuietly(analyzer); + throw new HiveException(e); + } finally { + IOUtils.closeQuietly(stream); + } + return results; + } + + @Override + public void close() throws IOException { + IOUtils.closeQuietly(_analyzer); + } + + + @Nonnull + private static CharArraySet stopWords(@Nonnull final String[] array) + throws UDFArgumentException { + if (array == null) { + return SmartChineseAnalyzer.getDefaultStopSet(); + } + if (array.length == 0) { + return CharArraySet.EMPTY_SET; + } + CharArraySet results = new CharArraySet(Arrays.asList(array), /* ignoreCase */true); + return results; + } + + private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results) + throws IOException { + // instantiate an attribute placeholder once + CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class); + stream.reset(); + + while (stream.incrementToken()) { + String term = termAttr.toString(); + results.add(new Text(term)); + } + } + + @Override + public String getDisplayString(String[] children) { + return "tokenize_cn(" + Arrays.toString(children) + ')'; + } + +} http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java ---------------------------------------------------------------------- diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java new file mode 100644 index 0000000..720e532 --- /dev/null +++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package hivemall.nlp.tokenizer; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.hive.ql.exec.UDFArgumentException; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.io.Text; +import org.junit.Assert; +import org.junit.Test; + +public class SmartcnUDFTest { + + @Test + public void testOneArgment() throws UDFArgumentException, IOException { + GenericUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[1]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + udf.initialize(argOIs); + udf.close(); + } + + @Test + public void testTwoArgment() throws UDFArgumentException, IOException { + GenericUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[2]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector; + // stopWords + argOIs[1] = ObjectInspectorFactory + .getStandardConstantListObjectInspector( + PrimitiveObjectInspectorFactory.javaStringObjectInspector, + null); + udf.initialize(argOIs); + udf.close(); + } + + @Test + public void testEvalauteOneRow() throws IOException, HiveException { + SmartcnUDF udf = new SmartcnUDF(); + ObjectInspector[] argOIs = new ObjectInspector[1]; + // line + argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector; + udf.initialize(argOIs); + + DeferredObject[] args = new DeferredObject[1]; + args[0] = new DeferredObject() { + public Text get() throws HiveException { + return new Text( + "Smartcn为Apache2.0åè®®ç弿ºä¸æåè¯ç³»ç»ï¼Javaè¯è¨ç¼åï¼ä¿®æ¹çä¸ç§é¢è®¡ç®æICTCLASåè¯ç³»ç»ã"); + } + + @Override + public void prepare(int arg) throws HiveException { + } + }; + List<Text> tokens = udf.evaluate(args); + Assert.assertNotNull(tokens); + udf.close(); + } +}
