Repository: incubator-hivemall
Updated Branches:
  refs/heads/master 9876d0631 -> ec6d945fe


Added tokenize_cn UDF, using 
org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer


Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/5eb80373
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/5eb80373
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/5eb80373

Branch: refs/heads/master
Commit: 5eb80373641920428d9f95f54b726995e89e8443
Parents: 9f01ebf
Author: partyyoung <[email protected]>
Authored: Thu Jun 29 18:32:05 2017 +0800
Committer: partyyoung <[email protected]>
Committed: Thu Jun 29 18:32:05 2017 +0800

----------------------------------------------------------------------
 nlp/pom.xml                                     |  13 ++
 .../java/hivemall/nlp/tokenizer/SmartcnUDF.java | 137 +++++++++++++++++++
 .../hivemall/nlp/tokenizer/SmartcnUDFTest.java  |  85 ++++++++++++
 3 files changed, 235 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/pom.xml
----------------------------------------------------------------------
diff --git a/nlp/pom.xml b/nlp/pom.xml
index b6ea409..021cd6d 100644
--- a/nlp/pom.xml
+++ b/nlp/pom.xml
@@ -117,6 +117,12 @@
                        <version>5.3.1</version>
                        <scope>compile</scope>
                </dependency>
+               <!-- Lucene Smart Chinese analyzer; SmartcnUDF (tokenize_cn) imports
+                    org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer -->
+               <dependency>
+                       <groupId>org.apache.lucene</groupId>
+                       <artifactId>lucene-analyzers-smartcn</artifactId>
+                       <version>5.3.1</version>
+                       <scope>compile</scope>
+               </dependency>
 
                <!-- test scope -->
                <dependency>
@@ -171,6 +177,7 @@
                                                                <includes>
                                                                        
<include>io.github.myui:hivemall-core</include>
                                                                        
<include>org.apache.lucene:lucene-analyzers-kuromoji</include>
+                                                                       
<include>org.apache.lucene:lucene-analyzers-smartcn</include>
                                                                        
<include>org.apache.lucene:lucene-analyzers-common</include>
                                                                        
<include>org.apache.lucene:lucene-core</include>
                                                                </includes>
@@ -183,6 +190,12 @@
                                                                        
</includes>
                                                                </filter>
                                                                <filter>
+                                                                       
<artifact>org.apache.lucene:lucene-analyzers-smartcn</artifact>
+                                                                       
<includes>
+                                                                               
<include>**</include>
+                                                                       
</includes>
+                                                               </filter>
+                                                               <filter>
                                                                        
<artifact>org.apache.lucene:lucene-analyzers-common</artifact>
                                                                        
<includes>
                                                                                
<include>**</include>

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
----------------------------------------------------------------------
diff --git a/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java 
b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
new file mode 100644
index 0000000..3d148c9
--- /dev/null
+++ b/nlp/src/main/java/hivemall/nlp/tokenizer/SmartcnUDF.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import hivemall.utils.hadoop.HiveUtils;
+import hivemall.utils.io.IOUtils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import javax.annotation.Nonnull;
+
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.UDFType;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.util.CharArraySet;
+
+/**
+ * Hive generic UDF {@code tokenize_cn}: tokenizes a line of text with Lucene's
+ * {@link SmartChineseAnalyzer} and returns the terms as {@code array<string>}.
+ *
+ * <p>Usage: {@code tokenize_cn(line [, const array<string> stopWords])}. When the stop-word
+ * argument is omitted the analyzer's default stop set is used; an empty array disables
+ * stop-word filtering; otherwise the given words are matched case-insensitively.
+ */
+@Description(
+        name = "tokenize_cn",
+        value = "_FUNC_(String line [, const list<string> stopWords])"
+                + " - returns tokenized strings in array<string>")
+@UDFType(deterministic = true, stateful = false)
+public final class SmartcnUDF extends GenericUDF {
+
+    /** Stop words read from the optional second argument; {@code null} if it was not passed. */
+    private String[] _stopWordsArray;
+
+    /** Created lazily on the first non-null row in {@link #evaluate} and reused afterwards. */
+    private transient SmartChineseAnalyzer _analyzer;
+
+    /**
+     * Validates the argument count and captures the optional constant stop-word array.
+     *
+     * @param arguments inspectors for {@code line} and, optionally, {@code stopWords}
+     * @return a list-of-writable-string object inspector describing the result
+     * @throws UDFArgumentException if the number of arguments is not 1 or 2
+     */
+    @Override
+    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+        final int arglen = arguments.length;
+        if (arglen < 1 || arglen > 2) {
+            throw new UDFArgumentException("Invalid number of arguments for `tokenize_cn`: "
+                    + arglen);
+        }
+
+        this._stopWordsArray = (arglen >= 2) ? HiveUtils.getConstStringArray(arguments[1]) : null;
+        // The analyzer depends on the stop words, so it is (re)built lazily in evaluate().
+        this._analyzer = null;
+
+        return ObjectInspectorFactory.getStandardListObjectInspector(
+            PrimitiveObjectInspectorFactory.writableStringObjectInspector);
+    }
+
+    /**
+     * Tokenizes one input row.
+     *
+     * @param arguments {@code arguments[0]} supplies the line to tokenize
+     * @return the tokens in stream order, or {@code null} when the input line is {@code null}
+     * @throws HiveException if the underlying token stream fails with an {@link IOException}
+     */
+    @Override
+    public List<Text> evaluate(DeferredObject[] arguments) throws HiveException {
+        final Object arg0 = arguments[0].get();
+        if (arg0 == null) {
+            return null;
+        }
+        final String line = arg0.toString();
+
+        SmartChineseAnalyzer analyzer = _analyzer;
+        if (analyzer == null) {
+            analyzer = new SmartChineseAnalyzer(stopWords(_stopWordsArray));
+            this._analyzer = analyzer;
+        }
+
+        final List<Text> results = new ArrayList<Text>(32);
+        TokenStream stream = null;
+        try {
+            stream = analyzer.tokenStream("", line);
+            if (stream != null) {
+                analyzeTokens(stream, results);
+            }
+        } catch (IOException e) {
+            // Release the analyzer and drop the cached reference so that a later row builds
+            // a fresh instance instead of reusing one that has already been closed.
+            IOUtils.closeQuietly(analyzer);
+            this._analyzer = null;
+            throw new HiveException(e);
+        } finally {
+            IOUtils.closeQuietly(stream);
+        }
+        return results;
+    }
+
+    /** Releases the cached analyzer, if any, and forgets it. */
+    @Override
+    public void close() throws IOException {
+        IOUtils.closeQuietly(_analyzer);
+        this._analyzer = null;
+    }
+
+    /**
+     * Builds the stop-word set handed to the analyzer.
+     *
+     * @param array user-supplied stop words; may be {@code null}
+     * @return the analyzer's default stop set for {@code null}, an empty set for an empty
+     *         array, otherwise a case-insensitive set of the given words
+     */
+    @Nonnull
+    private static CharArraySet stopWords(final String[] array) {
+        if (array == null) {
+            return SmartChineseAnalyzer.getDefaultStopSet();
+        }
+        if (array.length == 0) {
+            return CharArraySet.EMPTY_SET;
+        }
+        return new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
+    }
+
+    /**
+     * Drains {@code stream} and appends every term to {@code results}.
+     *
+     * <p>Follows the TokenStream consumer workflow: {@code reset()}, {@code incrementToken()}
+     * until it returns false, then {@code end()}. Closing the stream is left to the caller.
+     */
+    private static void analyzeTokens(@Nonnull TokenStream stream, @Nonnull List<Text> results)
+            throws IOException {
+        // instantiate an attribute placeholder once
+        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        stream.reset();
+
+        while (stream.incrementToken()) {
+            String term = termAttr.toString();
+            results.add(new Text(term));
+        }
+        // signal end-of-stream before the caller closes it
+        stream.end();
+    }
+
+    @Override
+    public String getDisplayString(String[] children) {
+        return "tokenize_cn(" + Arrays.toString(children) + ')';
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/5eb80373/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
----------------------------------------------------------------------
diff --git a/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java 
b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
new file mode 100644
index 0000000..720e532
--- /dev/null
+++ b/nlp/src/test/java/hivemall/nlp/tokenizer/SmartcnUDFTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package hivemall.nlp.tokenizer;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.Text;
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Unit tests for {@link SmartcnUDF} ({@code tokenize_cn}). */
+public class SmartcnUDFTest {
+
+    /** initialize() accepts a single argument: the line to tokenize. */
+    @Test
+    public void testOneArgment() throws UDFArgumentException, IOException {
+        GenericUDF udf = new SmartcnUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[1];
+        // line
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        udf.initialize(argOIs);
+        udf.close();
+    }
+
+    /** initialize() accepts a second argument: a constant list of stop words. */
+    @Test
+    public void testTwoArgment() throws UDFArgumentException, IOException {
+        GenericUDF udf = new SmartcnUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[2];
+        // line
+        argOIs[0] = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
+        // stopWords
+        argOIs[1] = ObjectInspectorFactory.getStandardConstantListObjectInspector(
+            PrimitiveObjectInspectorFactory.javaStringObjectInspector, null);
+        udf.initialize(argOIs);
+        udf.close();
+    }
+
+    /** evaluate() produces at least one token for a non-empty mixed Chinese/Latin sentence. */
+    @Test
+    public void testEvalauteOneRow() throws IOException, HiveException {
+        SmartcnUDF udf = new SmartcnUDF();
+        ObjectInspector[] argOIs = new ObjectInspector[1];
+        // line
+        argOIs[0] = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+        udf.initialize(argOIs);
+
+        DeferredObject[] args = new DeferredObject[1];
+        args[0] = new DeferredObject() {
+            @Override
+            public Text get() throws HiveException {
+                return new Text(
+                    "Smartcn为Apache2.0协议的开源中文分词系统,Java语言编写,修改的中科院计算所ICTCLAS分词系统。");
+            }
+
+            @Override
+            public void prepare(int arg) throws HiveException {
+            }
+        };
+        try {
+            List<Text> tokens = udf.evaluate(args);
+            Assert.assertNotNull(tokens);
+            // a non-null but empty result would mean nothing was tokenized
+            Assert.assertFalse(tokens.isEmpty());
+        } finally {
+            // release the analyzer even when an assertion fails
+            udf.close();
+        }
+    }
+}

Reply via email to